From 6cb4b332180b4f21ed26123b3479c9ad0c38437c Mon Sep 17 00:00:00 2001 From: "Frank J. T. Wojcik" Date: Wed, 20 Jul 2022 19:50:08 -0700 Subject: [PATCH] Bulk code reformatting (NFC) This is mostly automatic, but some manual changes have been done. --- hashes/EXAMPLE-mit.cpp | 34 +- hashes/EXAMPLE.cpp | 34 +- hashes/aesnihash.cpp | 64 +- hashes/aesrng.cpp | 278 +- hashes/ascon.cpp | 671 ++-- hashes/badhash.cpp | 122 +- hashes/beamsplitter.cpp | 644 ++-- hashes/blake2.cpp | 603 ++-- hashes/blake2/compress-portable.h | 193 +- hashes/blake2/compress-sse2-plus.h | 860 +++--- hashes/blake3.cpp | 992 +++--- hashes/blake3/compress-portable.h | 277 +- hashes/blake3/compress-sse2.h | 992 +++--- hashes/blake3/compress-sse41.h | 981 +++--- hashes/blockpearson.cpp | 314 +- hashes/chaskey.cpp | 529 ++-- hashes/cityhash.cpp | 1304 ++++---- hashes/clhash.cpp | 393 +-- hashes/crap.cpp | 172 +- hashes/crc.cpp | 225 +- hashes/discohash.cpp | 266 +- hashes/donothing.cpp | 164 +- hashes/falcon_oaat.cpp | 103 +- hashes/falkhash.cpp | 253 +- hashes/farmhash.cpp | 3093 ++++++++++--------- hashes/farsh.cpp | 378 +-- hashes/fasthash.cpp | 88 +- hashes/fletcher.cpp | 236 +- hashes/floppsyhash.cpp | 112 +- hashes/fnv.cpp | 412 +-- hashes/halftimehash.cpp | 2240 +++++++------- hashes/hasshe2.cpp | 282 +- hashes/jodyhash.cpp | 110 +- hashes/khash.cpp | 110 +- hashes/komihash.cpp | 191 +- hashes/lookup3.cpp | 144 +- hashes/md5.cpp | 345 +-- hashes/meowhash.cpp | 307 +- hashes/metrohash.cpp | 541 ++-- hashes/multiply_shift.cpp | 247 +- hashes/mum_mir.cpp | 1625 +++++----- hashes/murmur_oaat.cpp | 41 +- hashes/murmurhash1.cpp | 67 +- hashes/murmurhash2.cpp | 297 +- hashes/murmurhash3.cpp | 516 ++-- hashes/mx3.cpp | 108 +- hashes/nmhash.cpp | 491 +-- hashes/o1hash.cpp | 61 +- hashes/pearson.cpp | 849 ++--- hashes/pengyhash.cpp | 45 +- hashes/perlhashes.cpp | 216 +- hashes/pmp_multilinear.cpp | 4629 ++++++++++++++-------------- hashes/poly_mersenne.cpp | 197 +- hashes/prvhash.cpp | 255 +- 
hashes/rmd.cpp | 1336 ++++---- hashes/seahash.cpp | 222 +- hashes/sha1.cpp | 734 ++--- hashes/sha2.cpp | 1257 ++++---- hashes/sha3.cpp | 375 +-- hashes/siphash.cpp | 608 ++-- hashes/spookyhash.cpp | 454 ++- hashes/superfasthash.cpp | 83 +- hashes/t1ha.cpp | 2027 ++++++------ hashes/tabulation.cpp | 485 +-- hashes/umash.cpp | 912 +++--- hashes/vmac.cpp | 756 ++--- hashes/wyhash.cpp | 345 ++- hashes/x17.cpp | 41 +- hashes/xxhash.cpp | 860 +++--- hashes/xxhash/xxh3-arm.h | 101 +- hashes/xxhash/xxh3-avx2.h | 126 +- hashes/xxhash/xxh3-avx512.h | 85 +- hashes/xxhash/xxh3-ppc.h | 84 +- hashes/xxhash/xxh3-sse2.h | 106 +- include/common/Hashinfo.h | 209 +- include/common/Intrinsics.h | 232 +- include/hashlib/AES-aesni.h | 73 +- include/hashlib/AES-arm.h | 34 +- include/hashlib/AES-portable.h | 531 ++-- include/hashlib/AES-ppc.h | 24 +- include/hashlib/AES.h | 34 +- include/hashlib/Hashlib.h | 59 +- include/hashlib/Mathmult.h | 364 +-- lib/Hashinfo.cpp | 100 +- lib/Hashlib.cpp | 228 +- lib/Mathmult.cpp | 320 +- main.cpp | 969 +++--- misc/exactcoll.c | 39 +- tests/AvalancheTest.cpp | 370 ++- tests/AvalancheTest.h | 4 +- tests/BadSeedsTest.cpp | 202 +- tests/BadSeedsTest.h | 4 +- tests/BitIndependenceTest.cpp | 483 ++- tests/BitIndependenceTest.h | 4 +- tests/CyclicKeysetTest.cpp | 95 +- tests/CyclicKeysetTest.h | 4 +- tests/DiffDistributionTest.cpp | 133 +- tests/DiffDistributionTest.h | 4 +- tests/DifferentialTest.cpp | 251 +- tests/DifferentialTest.h | 4 +- tests/HashMapTest.cpp | 363 ++- tests/HashMapTest.h | 4 +- tests/PRNGTest.cpp | 56 +- tests/PRNGTest.h | 4 +- tests/PerlinNoiseTest.cpp | 93 +- tests/PerlinNoiseTest.h | 4 +- tests/PermutationKeysetTest.cpp | 370 ++- tests/PermutationKeysetTest.h | 4 +- tests/PopcountTest.cpp | 439 +-- tests/PopcountTest.h | 4 +- tests/SanityTest.cpp | 329 +- tests/SanityTest.h | 4 +- tests/SeedTest.cpp | 148 +- tests/SeedTest.h | 4 +- tests/SparseKeysetTest.cpp | 125 +- tests/SparseKeysetTest.h | 4 +- tests/SpeedTest.cpp | 343 +-- 
tests/SpeedTest.h | 6 +- tests/TextKeysetTest.cpp | 250 +- tests/TextKeysetTest.h | 4 +- tests/TwoBytesKeysetTest.cpp | 49 +- tests/TwoBytesKeysetTest.h | 4 +- tests/WindowedKeysetTest.cpp | 112 +- tests/WindowedKeysetTest.h | 4 +- tests/ZeroesKeysetTest.cpp | 46 +- tests/ZeroesKeysetTest.h | 4 +- util/Analyze.cpp | 1288 ++++---- util/Analyze.h | 22 +- util/Blob.h | 642 ++-- util/Blobsort.cpp | 279 +- util/Blobsort.h | 292 +- util/Instantiate.h | 14 +- util/Platform.cpp | 6 +- util/Random.h | 177 +- util/Stats.cpp | 1304 ++++---- util/Stats.h | 38 +- util/TestGlobals.h | 58 +- util/VCode.cpp | 1848 +++++------ util/VCode.h | 141 +- 139 files changed, 27085 insertions(+), 26273 deletions(-) diff --git a/hashes/EXAMPLE-mit.cpp b/hashes/EXAMPLE-mit.cpp index 248eedfc..ae78eb85 100644 --- a/hashes/EXAMPLE-mit.cpp +++ b/hashes/EXAMPLE-mit.cpp @@ -33,29 +33,29 @@ // hashes/Hashsrc.cmake, keeping the list sorted by size! //------------------------------------------------------------ -//###YOURHASHCODE +// ###YOURHASHCODE //------------------------------------------------------------ -template < bool bswap > -static void ###YOURHASHNAMEHash(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void ###YOURHASHNAMEHash( const void * in, const size_t len, const seed_t seed, void * out ) { PUT_U64(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(###YOURHASHFAMILYNAME, - $.src_url = "###YOURREPOSITORYURL", - $.src_status = HashFamilyInfo::SRC_###YOURSRCSTATUS -); + $.src_url = "###YOURREPOSITORYURL", + $.src_status = HashFamilyInfo::SRC_###YOURSRCSTATUS + ); REGISTER_HASH(###YOURHASHNAME, - $.desc = "", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x0, - $.verification_BE = 0x0, - $.hashfn_native = ###YOURHASHNAMEHash, - $.hashfn_bswap = ###YOURHASHNAMEHash -); + $.desc = "", + $.hash_flags = + 0, + $.impl_flags = + 
FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x0, + $.verification_BE = 0x0, + $.hashfn_native = ###YOURHASHNAMEHash, + $.hashfn_bswap = ###YOURHASHNAMEHash + ); diff --git a/hashes/EXAMPLE.cpp b/hashes/EXAMPLE.cpp index a63d1021..42b494ae 100644 --- a/hashes/EXAMPLE.cpp +++ b/hashes/EXAMPLE.cpp @@ -14,29 +14,29 @@ // hashes/Hashsrc.cmake, keeping the list sorted by size! //------------------------------------------------------------ -//###YOURHASHCODE +// ###YOURHASHCODE //------------------------------------------------------------ -template < bool bswap > -static void ###YOURHASHNAMEHash(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void ###YOURHASHNAMEHash( const void * in, const size_t len, const seed_t seed, void * out ) { PUT_U64(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(###YOURHASHFAMILYNAME, - $.src_url = "###YOURREPOSITORYURL", - $.src_status = HashFamilyInfo::SRC_###YOURSRCSTATUS -); + $.src_url = "###YOURREPOSITORYURL", + $.src_status = HashFamilyInfo::SRC_###YOURSRCSTATUS + ); REGISTER_HASH(###YOURHASHNAME, - $.desc = "", - $.hash_flags = - 0, - $.impl_flags = - 0, - $.bits = 32, - $.verification_LE = 0x0, - $.verification_BE = 0x0, - $.hashfn_native = ###YOURHASHNAMEHash, - $.hashfn_bswap = ###YOURHASHNAMEHash -); + $.desc = "", + $.hash_flags = + 0, + $.impl_flags = + 0, + $.bits = 32, + $.verification_LE = 0x0, + $.verification_BE = 0x0, + $.hashfn_native = ###YOURHASHNAMEHash, + $.hashfn_bswap = ###YOURHASHNAMEHash + ); diff --git a/hashes/aesnihash.cpp b/hashes/aesnihash.cpp index 62af7825..54ae7ef0 100644 --- a/hashes/aesnihash.cpp +++ b/hashes/aesnihash.cpp @@ -36,29 +36,29 @@ #include "Hashlib.h" #if defined(HAVE_X86_64_AES) -#include "Intrinsics.h" + #include "Intrinsics.h" -template < bool bswap > -static void aesnihash(const void * inv, const size_t len, const seed_t seed, void * out) { - const uint8_t * in = (uint8_t *)inv; - 
uint64_t src_sz = len; +template +static void aesnihash( const void * inv, const size_t len, const seed_t seed, void * out ) { + const uint8_t * in = (uint8_t *)inv; + uint64_t src_sz = len; - uint8_t tmp_buf[16] = {0}; - __m128i rk0 = {UINT64_C(0x736f6d6570736575), UINT64_C(0x646f72616e646f6d)}; - __m128i rk1 = {UINT64_C(0x1231236570743245), UINT64_C(0x126f12321321456d)}; + uint8_t tmp_buf[16] = { 0 }; + __m128i rk0 = { UINT64_C(0x736f6d6570736575), UINT64_C(0x646f72616e646f6d) }; + __m128i rk1 = { UINT64_C(0x1231236570743245), UINT64_C(0x126f12321321456d) }; // Homegrown seeding for SMHasher3 - __m128i seed128 = {(int64_t)seed, 0}; - __m128i hash = _mm_xor_si128(rk0, seed128); + __m128i seed128 = { (int64_t)seed, 0 }; + __m128i hash = _mm_xor_si128(rk0, seed128); while (src_sz >= 16) { - onemoretry: + onemoretry: __m128i piece = _mm_loadu_si128((__m128i *)in); // Arbitrarily chose 64-bit wordlen if (bswap) { piece = mm_bswap64(piece); } - in += 16; + in += 16; src_sz -= 16; - hash = _mm_aesenc_si128(_mm_xor_si128(hash, piece), rk0); - hash = _mm_aesenc_si128(hash, rk1); + hash = _mm_aesenc_si128(_mm_xor_si128(hash, piece), rk0); + hash = _mm_aesenc_si128(hash, rk1); } if (src_sz > 0) { @@ -67,7 +67,7 @@ static void aesnihash(const void * inv, const size_t len, const seed_t seed, voi tmp_buf[i] = in[i]; } src_sz = 16; - in = &tmp_buf[0]; + in = &tmp_buf[0]; goto onemoretry; } @@ -80,25 +80,25 @@ static void aesnihash(const void * inv, const size_t len, const seed_t seed, voi } REGISTER_FAMILY(aesnihash, - $.src_url = "https://gist.github.com/majek/96dd615ed6c8aa64f60aac14e3f6ab5a", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://gist.github.com/majek/96dd615ed6c8aa64f60aac14e3f6ab5a", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(aesnihash, - $.desc = "majek's aesnihash", - $.hash_flags = - FLAG_HASH_NO_SEED | - FLAG_HASH_AES_BASED, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_LICENSE_BSD, - $.bits = 64, - 
$.verification_LE = 0xA68E0D42, - $.verification_BE = 0xEBC48EDA, - $.hashfn_native = aesnihash, - $.hashfn_bswap = aesnihash, - $.badseeds = {0x70736575} -); + $.desc = "majek's aesnihash", + $.hash_flags = + FLAG_HASH_NO_SEED | + FLAG_HASH_AES_BASED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_LICENSE_BSD, + $.bits = 64, + $.verification_LE = 0xA68E0D42, + $.verification_BE = 0xEBC48EDA, + $.hashfn_native = aesnihash, + $.hashfn_bswap = aesnihash, + $.badseeds = { 0x70736575 } + ); #else REGISTER_FAMILY(aesnihash); diff --git a/hashes/aesrng.cpp b/hashes/aesrng.cpp index 9b88ba8d..1e8181b0 100644 --- a/hashes/aesrng.cpp +++ b/hashes/aesrng.cpp @@ -31,12 +31,12 @@ #include -// ------------------------------------------------------------ +//------------------------------------------------------------ // This is not strictly AES CTR mode, it is based on that plus the ARS // RNG constructions. static thread_local uint64_t ctr[2], oldctr[2]; -static const uint64_t incr[2] = {UINT64_C(1), UINT64_C(-1)}; +static const uint64_t incr[2] = { UINT64_C(1), UINT64_C(-1) }; static uint32_t round_keys[44]; // only modified on main thread // A little ugly... 
@@ -45,8 +45,10 @@ extern seed_t g_seed; /* K1 is golden ratio - 1, K2 is sqrt(3) - 1 */ #define K1 UINT64_C(0x9E3779B97F4A7C15) #define K2 UINT64_C(0xBB67AE8584CAA73B) -static bool aesrng_init(void) { + +static bool aesrng_init( void ) { uint8_t key[16]; + if (isLE()) { PUT_U64(g_seed + K2, key, 0); PUT_U64(g_seed + K1, key, 8); @@ -59,8 +61,9 @@ static bool aesrng_init(void) { return true; } -static uint64_t rnd64(void) { +static uint64_t rnd64( void ) { uint8_t result[16]; + if (isLE()) { PUT_U64(ctr[0], result, 0); PUT_U64(ctr[1], result, 8); @@ -75,16 +78,17 @@ static uint64_t rnd64(void) { return GET_U64(result, 0); } -static void rng_ffwd(int64_t ffwd) { +static void rng_ffwd( int64_t ffwd ) { ctr[0] += ffwd; ctr[1] -= ffwd; } -static void rng_setctr(uint64_t stream, uint64_t seq) { +static void rng_setctr( uint64_t stream, uint64_t seq ) { ctr[0] = seq; ctr[1] = stream; } // This variable is _not_ thread-local static uint64_t hash_mode; + // These complications are intended to make this "hash" return the // same results if threading is enabled or not. It makes the following // assumptions about the rest of the code: @@ -110,14 +114,14 @@ static uint64_t hash_mode; // thread's results should be unaffected if threading is enabled or // disabled, or if the possibly-threaded tests are skipped, and the // per-thread results should be unaffected by the number of threads. -static seed_t aesrng_seedfix(const HashInfo * hinfo, const seed_t hint) { +static seed_t aesrng_seedfix( const HashInfo * hinfo, const seed_t hint ) { if (hash_mode == hint) { oldctr[0] = ctr[0]; oldctr[1] = ctr[1]; } else { hash_mode = hint; - ctr[0] = oldctr[0]; - ctr[1] = oldctr[1]; + ctr[0] = oldctr[0]; + ctr[1] = oldctr[1]; } return 0; } @@ -131,7 +135,8 @@ static seed_t aesrng_seedfix(const HashInfo * hinfo, const seed_t hint) { // Hash_mode 2 is for Avalanche, which is very hard to fool in a // consistent way, so we have some magic knowledge of how it calls us. 
static thread_local uint64_t callcount; -static void rng_keyseq(const void * key, size_t len, uint64_t seed) { + +static void rng_keyseq( const void * key, size_t len, uint64_t seed ) { if (hash_mode == 2) { if (callcount-- != 0) { return; @@ -140,15 +145,15 @@ static void rng_keyseq(const void * key, size_t len, uint64_t seed) { } uint64_t s = 0; memcpy(&s, key, len > 8 ? 8 : len); - s = COND_BSWAP(s, isBE()); - s ^= len * K2; + s = COND_BSWAP(s, isBE()); + s ^= len * K2; seed ^= s * K1; - s ^= seed * K2; + s ^= seed * K2; rng_setctr(s, seed); } -template < uint32_t nbytes > -static void rng_impl(void * out) { +template +static void rng_impl( void * out ) { assert((nbytes >= 0) && (nbytes <= 39)); uint8_t * result = (uint8_t *)out; if (nbytes >= 8) { @@ -177,140 +182,141 @@ static void rng_impl(void * out) { } } -template < uint32_t hashbits > -static void aesrng(const void * in, const size_t len, const seed_t seed, void * out) { - if (hash_mode != 0) - rng_keyseq(in, len, seed); +template +static void aesrng( const void * in, const size_t len, const seed_t seed, void * out ) { + if (hash_mode != 0) { + rng_keyseq(in, len, seed); + } rng_impl<(hashbits >> 3)>(out); } REGISTER_FAMILY(aesrng, - $.src_url = "https://gitlab.com/fwojcik/smhasher3/-/blob/main/hashes/aesrng.cpp", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://gitlab.com/fwojcik/smhasher3/-/blob/main/hashes/aesrng.cpp", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); REGISTER_HASH(aesrng_32, - $.desc = "32-bit RNG using AES in CTR mode; not a hash", - $.hash_flags = - FLAG_HASH_MOCK | - FLAG_HASH_AES_BASED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_SEED_WITH_HINT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0xED1590AC, - $.verification_BE = 0xED1590AC, - $.hashfn_native = aesrng<32>, - $.hashfn_bswap = aesrng<32>, - $.initfn = aesrng_init, - $.seedfixfn = aesrng_seedfix, - 
$.sort_order = 50 -); + $.desc = "32-bit RNG using AES in CTR mode; not a hash", + $.hash_flags = + FLAG_HASH_MOCK | + FLAG_HASH_AES_BASED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_SEED_WITH_HINT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0xED1590AC, + $.verification_BE = 0xED1590AC, + $.hashfn_native = aesrng<32>, + $.hashfn_bswap = aesrng<32>, + $.initfn = aesrng_init, + $.seedfixfn = aesrng_seedfix, + $.sort_order = 50 + ); REGISTER_HASH(aesrng_64, - $.desc = "64-bit RNG using AES in CTR mode; not a hash", - $.hash_flags = - FLAG_HASH_MOCK | - FLAG_HASH_AES_BASED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_SEED_WITH_HINT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xAE36B667, - $.verification_BE = 0xAE36B667, - $.hashfn_native = aesrng<64>, - $.hashfn_bswap = aesrng<64>, - $.initfn = aesrng_init, - $.seedfixfn = aesrng_seedfix, - $.sort_order = 50 -); + $.desc = "64-bit RNG using AES in CTR mode; not a hash", + $.hash_flags = + FLAG_HASH_MOCK | + FLAG_HASH_AES_BASED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_SEED_WITH_HINT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xAE36B667, + $.verification_BE = 0xAE36B667, + $.hashfn_native = aesrng<64>, + $.hashfn_bswap = aesrng<64>, + $.initfn = aesrng_init, + $.seedfixfn = aesrng_seedfix, + $.sort_order = 50 + ); REGISTER_HASH(aesrng_128, - $.desc = "128-bit RNG using AES in CTR mode; not a hash", - $.hash_flags = - FLAG_HASH_MOCK | - FLAG_HASH_AES_BASED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_SEED_WITH_HINT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x2D1A1DB5, - $.verification_BE = 0x2D1A1DB5, - $.hashfn_native = aesrng<128>, - $.hashfn_bswap = 
aesrng<128>, - $.initfn = aesrng_init, - $.seedfixfn = aesrng_seedfix, - $.sort_order = 50 -); + $.desc = "128-bit RNG using AES in CTR mode; not a hash", + $.hash_flags = + FLAG_HASH_MOCK | + FLAG_HASH_AES_BASED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_SEED_WITH_HINT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x2D1A1DB5, + $.verification_BE = 0x2D1A1DB5, + $.hashfn_native = aesrng<128>, + $.hashfn_bswap = aesrng<128>, + $.initfn = aesrng_init, + $.seedfixfn = aesrng_seedfix, + $.sort_order = 50 + ); REGISTER_HASH(aesrng_160, - $.desc = "160-bit RNG using AES in CTR mode; not a hash", - $.hash_flags = - FLAG_HASH_MOCK | - FLAG_HASH_AES_BASED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_SEED_WITH_HINT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 160, - $.verification_LE = 0x3FC284C3, - $.verification_BE = 0x3FC284C3, - $.hashfn_native = aesrng<160>, - $.hashfn_bswap = aesrng<160>, - $.initfn = aesrng_init, - $.seedfixfn = aesrng_seedfix, - $.sort_order = 50 -); + $.desc = "160-bit RNG using AES in CTR mode; not a hash", + $.hash_flags = + FLAG_HASH_MOCK | + FLAG_HASH_AES_BASED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_SEED_WITH_HINT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 160, + $.verification_LE = 0x3FC284C3, + $.verification_BE = 0x3FC284C3, + $.hashfn_native = aesrng<160>, + $.hashfn_bswap = aesrng<160>, + $.initfn = aesrng_init, + $.seedfixfn = aesrng_seedfix, + $.sort_order = 50 + ); REGISTER_HASH(aesrng_224, - $.desc = "224-bit RNG using AES in CTR mode; not a hash", - $.hash_flags = - FLAG_HASH_MOCK | - FLAG_HASH_AES_BASED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_SEED_WITH_HINT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 224, - $.verification_LE = 0x9288A516, 
- $.verification_BE = 0x9288A516, - $.hashfn_native = aesrng<224>, - $.hashfn_bswap = aesrng<224>, - $.initfn = aesrng_init, - $.seedfixfn = aesrng_seedfix, - $.sort_order = 50 -); + $.desc = "224-bit RNG using AES in CTR mode; not a hash", + $.hash_flags = + FLAG_HASH_MOCK | + FLAG_HASH_AES_BASED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_SEED_WITH_HINT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 224, + $.verification_LE = 0x9288A516, + $.verification_BE = 0x9288A516, + $.hashfn_native = aesrng<224>, + $.hashfn_bswap = aesrng<224>, + $.initfn = aesrng_init, + $.seedfixfn = aesrng_seedfix, + $.sort_order = 50 + ); REGISTER_HASH(aesrng_256, - $.desc = "256-bit RNG using AES in CTR mode; not a hash", - $.hash_flags = - FLAG_HASH_MOCK | - FLAG_HASH_AES_BASED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_SEED_WITH_HINT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 256, - $.verification_LE = 0x2816EEC1, - $.verification_BE = 0x2816EEC1, - $.hashfn_native = aesrng<256>, - $.hashfn_bswap = aesrng<256>, - $.initfn = aesrng_init, - $.seedfixfn = aesrng_seedfix, - $.sort_order = 50 -); + $.desc = "256-bit RNG using AES in CTR mode; not a hash", + $.hash_flags = + FLAG_HASH_MOCK | + FLAG_HASH_AES_BASED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_SEED_WITH_HINT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 256, + $.verification_LE = 0x2816EEC1, + $.verification_BE = 0x2816EEC1, + $.hashfn_native = aesrng<256>, + $.hashfn_bswap = aesrng<256>, + $.initfn = aesrng_init, + $.seedfixfn = aesrng_seedfix, + $.sort_order = 50 + ); diff --git a/hashes/ascon.cpp b/hashes/ascon.cpp index 5fabc4d5..c79d490d 100644 --- a/hashes/ascon.cpp +++ b/hashes/ascon.cpp @@ -134,11 +134,11 @@ #include "Platform.h" #include "Hashlib.h" -//#define CRYPTO_VERSION "1.2.6" +// #define CRYPTO_VERSION "1.2.6" 
//------------------------------------------------------------ typedef struct { - uint64_t x[5]; + uint64_t x[5]; } state_t; #define ASCON_HASH_RATE 8 @@ -147,38 +147,39 @@ typedef struct { #define P_ROUNDS_XOF 12 #define P_ROUNDS_XOFA 8 -static FORCE_INLINE void ROUND(state_t * s, uint8_t C) { +static FORCE_INLINE void ROUND( state_t * s, uint8_t C ) { state_t t; + /* round constant */ s->x[2] ^= C; /* s-box layer */ s->x[0] ^= s->x[4]; s->x[4] ^= s->x[3]; s->x[2] ^= s->x[1]; - t.x[0] = s->x[0] ^ (~s->x[1] & s->x[2]); - t.x[2] = s->x[2] ^ (~s->x[3] & s->x[4]); - t.x[4] = s->x[4] ^ (~s->x[0] & s->x[1]); - t.x[1] = s->x[1] ^ (~s->x[2] & s->x[3]); - t.x[3] = s->x[3] ^ (~s->x[4] & s->x[0]); - t.x[1] ^= t.x[0]; - t.x[3] ^= t.x[2]; - t.x[0] ^= t.x[4]; + t.x[0] = s->x[0] ^ (~s->x[1] & s->x[2]); + t.x[2] = s->x[2] ^ (~s->x[3] & s->x[4]); + t.x[4] = s->x[4] ^ (~s->x[0] & s->x[1]); + t.x[1] = s->x[1] ^ (~s->x[2] & s->x[3]); + t.x[3] = s->x[3] ^ (~s->x[4] & s->x[0]); + t.x[1] ^= t.x[0]; + t.x[3] ^= t.x[2]; + t.x[0] ^= t.x[4]; /* linear layer */ - s->x[2] = t.x[2] ^ ROTR64(t.x[2], 6 - 1); - s->x[3] = t.x[3] ^ ROTR64(t.x[3], 17 - 10); - s->x[4] = t.x[4] ^ ROTR64(t.x[4], 41 - 7); - s->x[0] = t.x[0] ^ ROTR64(t.x[0], 28 - 19); - s->x[1] = t.x[1] ^ ROTR64(t.x[1], 61 - 39); - s->x[2] = t.x[2] ^ ROTR64(s->x[2], 1); - s->x[3] = t.x[3] ^ ROTR64(s->x[3], 10); - s->x[4] = t.x[4] ^ ROTR64(s->x[4], 7); - s->x[0] = t.x[0] ^ ROTR64(s->x[0], 19); - s->x[1] = t.x[1] ^ ROTR64(s->x[1], 39); - s->x[2] = ~s->x[2]; + s->x[2] = t.x[2] ^ ROTR64(t.x [2], 6 - 1); + s->x[3] = t.x[3] ^ ROTR64(t.x [3], 17 - 10); + s->x[4] = t.x[4] ^ ROTR64(t.x [4], 41 - 7); + s->x[0] = t.x[0] ^ ROTR64(t.x [0], 28 - 19); + s->x[1] = t.x[1] ^ ROTR64(t.x [1], 61 - 39); + s->x[2] = t.x[2] ^ ROTR64(s->x[2], 1); + s->x[3] = t.x[3] ^ ROTR64(s->x[3], 10); + s->x[4] = t.x[4] ^ ROTR64(s->x[4], 7); + s->x[0] = t.x[0] ^ ROTR64(s->x[0], 19); + s->x[1] = t.x[1] ^ ROTR64(s->x[1], 39); + s->x[2] = ~s->x[2]; } -template < uint32_t rounds 
> -static FORCE_INLINE void P(state_t * s) { +template +static FORCE_INLINE void P( state_t * s ) { if (rounds > MAX_P_ROUNDS) { return; } const uint8_t RC[MAX_P_ROUNDS] = { @@ -192,8 +193,8 @@ static FORCE_INLINE void P(state_t * s) { } // Homegrown seeding for SMHasher3 -template < bool XOFa > -static FORCE_INLINE void ascon_initxof(state_t * s, uint64_t seed) { +template +static FORCE_INLINE void ascon_initxof( state_t * s, uint64_t seed ) { if (XOFa) { s->x[0] = UINT64_C(0x44906568b77b9832); s->x[1] = UINT64_C(0xcd8d6cae53455532); @@ -209,14 +210,14 @@ static FORCE_INLINE void ascon_initxof(state_t * s, uint64_t seed) { } } -template < bool XOFa, bool bswap > -static FORCE_INLINE void ascon_absorb(state_t* s, const uint8_t* in, uint64_t inlen) { +template +static FORCE_INLINE void ascon_absorb( state_t * s, const uint8_t * in, uint64_t inlen ) { /* absorb full plaintext blocks */ while (inlen >= ASCON_HASH_RATE) { s->x[0] ^= GET_U64(in, 0); P(s); - in += ASCON_HASH_RATE; - inlen -= ASCON_HASH_RATE; + in += ASCON_HASH_RATE; + inlen -= ASCON_HASH_RATE; } /* absorb final plaintext block */ if (inlen) { @@ -228,12 +229,12 @@ static FORCE_INLINE void ascon_absorb(state_t* s, const uint8_t* in, uint64_t in s->x[0] ^= UINT64_C(0x80) << (56 - 8 * inlen); } -template < bool XOFa, bool bswap > -static void ascon_squeeze(state_t * s, uint8_t * out, uint64_t outlen) { +template +static void ascon_squeeze( state_t * s, uint8_t * out, uint64_t outlen ) { while (outlen > ASCON_HASH_RATE) { PUT_U64(s->x[0], out, 0); P(s); - out += ASCON_HASH_RATE; + out += ASCON_HASH_RATE; outlen -= ASCON_HASH_RATE; } uint8_t buf[8]; @@ -242,9 +243,10 @@ static void ascon_squeeze(state_t * s, uint8_t * out, uint64_t outlen) { } //------------------------------------------------------------ -template < uint64_t outbits, bool XOFa, bool bswap > -static void ascon_xof(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void ascon_xof( const void * in, const 
size_t len, const seed_t seed, void * out ) { state_t s; + ascon_initxof(&s, seed); ascon_absorb(&s, (const uint8_t *)in, (uint64_t)len); P(&s); // Always! Never P_ROUNDS_XOFA @@ -263,84 +265,187 @@ static void ascon_xof(const void * in, const size_t len, const seed_t seed, void // ascon using `./genkat_crypto_hash_asconxofv12_opt64` and // `./genkat_crypto_hash_asconxofav12_opt64`. #define KAT_NUM 17 -static const uint8_t KAT[KAT_NUM][2][256/8] = { +static const uint8_t KAT[KAT_NUM][2][256 / 8] = { { - { 0x5D, 0x4C, 0xBD, 0xE6, 0x35, 0x0E, 0xA4, 0xC1, 0x74, 0xBD, 0x65, 0xB5, 0xB3, 0x32, 0xF8, 0x40, 0x8F, 0x99, 0x74, 0x0B, 0x81, 0xAA, 0x02, 0x73, 0x5E, 0xAE, 0xFB, 0xCF, 0x0B, 0xA0, 0x33, 0x9E, }, - { 0x7C, 0x10, 0xDF, 0xFD, 0x6B, 0xB0, 0x3B, 0xE2, 0x62, 0xD7, 0x2F, 0xBE, 0x1B, 0x0F, 0x53, 0x00, 0x13, 0xC6, 0xC4, 0xEA, 0xDA, 0xAB, 0xDE, 0x27, 0x8D, 0x6F, 0x29, 0xD5, 0x79, 0xE3, 0x90, 0x8D, }, + { + 0x5D, 0x4C, 0xBD, 0xE6, 0x35, 0x0E, 0xA4, 0xC1, 0x74, 0xBD, 0x65, 0xB5, 0xB3, 0x32, 0xF8, 0x40, + 0x8F, 0x99, 0x74, 0x0B, 0x81, 0xAA, 0x02, 0x73, 0x5E, 0xAE, 0xFB, 0xCF, 0x0B, 0xA0, 0x33, 0x9E, + }, + { + 0x7C, 0x10, 0xDF, 0xFD, 0x6B, 0xB0, 0x3B, 0xE2, 0x62, 0xD7, 0x2F, 0xBE, 0x1B, 0x0F, 0x53, 0x00, + 0x13, 0xC6, 0xC4, 0xEA, 0xDA, 0xAB, 0xDE, 0x27, 0x8D, 0x6F, 0x29, 0xD5, 0x79, 0xE3, 0x90, 0x8D, + }, }, { - { 0xB2, 0xED, 0xBB, 0x27, 0xAC, 0x83, 0x97, 0xA5, 0x5B, 0xC8, 0x3D, 0x13, 0x7C, 0x15, 0x1D, 0xE9, 0xED, 0xE0, 0x48, 0x33, 0x8F, 0xE9, 0x07, 0xF0, 0xD3, 0x62, 0x9E, 0x71, 0x78, 0x46, 0xFE, 0xDC, }, - { 0x96, 0x54, 0x45, 0xC4, 0x6C, 0x8E, 0x9B, 0x94, 0x8E, 0xDF, 0xEF, 0x7B, 0x58, 0x79, 0xE0, 0x6A, 0xB5, 0xF0, 0x23, 0x77, 0x0E, 0xA8, 0x92, 0xFA, 0x4B, 0x54, 0x52, 0x50, 0x08, 0x46, 0x7E, 0xA3, }, + { + 0xB2, 0xED, 0xBB, 0x27, 0xAC, 0x83, 0x97, 0xA5, 0x5B, 0xC8, 0x3D, 0x13, 0x7C, 0x15, 0x1D, 0xE9, + 0xED, 0xE0, 0x48, 0x33, 0x8F, 0xE9, 0x07, 0xF0, 0xD3, 0x62, 0x9E, 0x71, 0x78, 0x46, 0xFE, 0xDC, + }, + { + 0x96, 0x54, 0x45, 0xC4, 0x6C, 0x8E, 0x9B, 0x94, 0x8E, 0xDF, 0xEF, 
0x7B, 0x58, 0x79, 0xE0, 0x6A, + 0xB5, 0xF0, 0x23, 0x77, 0x0E, 0xA8, 0x92, 0xFA, 0x4B, 0x54, 0x52, 0x50, 0x08, 0x46, 0x7E, 0xA3, + }, }, { - { 0xD1, 0x96, 0x46, 0x1C, 0x29, 0x9D, 0xB7, 0x14, 0xD7, 0x8C, 0x26, 0x79, 0x24, 0xB5, 0x78, 0x6E, 0xE2, 0x6F, 0xC4, 0x3B, 0x3E, 0x64, 0x0D, 0xAA, 0x53, 0x97, 0xE3, 0x8E, 0x39, 0xD3, 0x9D, 0xC6, }, - { 0x48, 0xEB, 0x41, 0xB7, 0xA4, 0x35, 0x2A, 0xFB, 0x89, 0x43, 0xB7, 0x65, 0x65, 0x48, 0x55, 0xB1, 0xD7, 0x10, 0x4B, 0x22, 0xE9, 0x81, 0xE5, 0x12, 0x0D, 0xA9, 0x96, 0x25, 0x79, 0xA7, 0xBA, 0xE6, }, + { + 0xD1, 0x96, 0x46, 0x1C, 0x29, 0x9D, 0xB7, 0x14, 0xD7, 0x8C, 0x26, 0x79, 0x24, 0xB5, 0x78, 0x6E, + 0xE2, 0x6F, 0xC4, 0x3B, 0x3E, 0x64, 0x0D, 0xAA, 0x53, 0x97, 0xE3, 0x8E, 0x39, 0xD3, 0x9D, 0xC6, + }, + { + 0x48, 0xEB, 0x41, 0xB7, 0xA4, 0x35, 0x2A, 0xFB, 0x89, 0x43, 0xB7, 0x65, 0x65, 0x48, 0x55, 0xB1, + 0xD7, 0x10, 0x4B, 0x22, 0xE9, 0x81, 0xE5, 0x12, 0x0D, 0xA9, 0x96, 0x25, 0x79, 0xA7, 0xBA, 0xE6, + }, }, { - { 0x1D, 0x18, 0xB9, 0xDD, 0x8F, 0xF9, 0xA1, 0xBF, 0x59, 0x75, 0x1B, 0x88, 0xD3, 0x27, 0x66, 0xC5, 0xE0, 0x54, 0x91, 0x0F, 0x49, 0x7B, 0xFF, 0x40, 0x92, 0xAF, 0xC4, 0x7F, 0x58, 0x85, 0x52, 0x3B, }, - { 0x5C, 0xFD, 0x8A, 0xCE, 0x65, 0x3E, 0x21, 0x27, 0x57, 0xD4, 0xA4, 0xAC, 0x3B, 0x6F, 0xAD, 0x31, 0xAB, 0xCB, 0xFA, 0x3F, 0x9E, 0x0F, 0x92, 0x24, 0x46, 0xF7, 0x6A, 0xF3, 0x72, 0xC5, 0x3E, 0xED, }, + { + 0x1D, 0x18, 0xB9, 0xDD, 0x8F, 0xF9, 0xA1, 0xBF, 0x59, 0x75, 0x1B, 0x88, 0xD3, 0x27, 0x66, 0xC5, + 0xE0, 0x54, 0x91, 0x0F, 0x49, 0x7B, 0xFF, 0x40, 0x92, 0xAF, 0xC4, 0x7F, 0x58, 0x85, 0x52, 0x3B, + }, + { + 0x5C, 0xFD, 0x8A, 0xCE, 0x65, 0x3E, 0x21, 0x27, 0x57, 0xD4, 0xA4, 0xAC, 0x3B, 0x6F, 0xAD, 0x31, + 0xAB, 0xCB, 0xFA, 0x3F, 0x9E, 0x0F, 0x92, 0x24, 0x46, 0xF7, 0x6A, 0xF3, 0x72, 0xC5, 0x3E, 0xED, + }, }, { - { 0x66, 0xFB, 0x74, 0x17, 0x47, 0x82, 0xAF, 0xED, 0x89, 0x84, 0x78, 0xAA, 0x72, 0x90, 0x58, 0xD5, 0xC3, 0x0A, 0xF1, 0x9A, 0xF2, 0xF5, 0xD4, 0xE1, 0xCE, 0x65, 0xCD, 0x32, 0x05, 0x94, 0xEF, 0x66, }, - { 0xE2, 0xFE, 0xE1, 0x11, 0xA8, 
0xE4, 0xB6, 0x22, 0x46, 0x2F, 0x89, 0x7D, 0xA4, 0x8C, 0x02, 0xB8, 0x07, 0xCA, 0xDD, 0xC2, 0x80, 0x17, 0x18, 0x6D, 0xC8, 0x56, 0xD8, 0xCF, 0x3D, 0xC2, 0x02, 0x48, }, + { + 0x66, 0xFB, 0x74, 0x17, 0x47, 0x82, 0xAF, 0xED, 0x89, 0x84, 0x78, 0xAA, 0x72, 0x90, 0x58, 0xD5, + 0xC3, 0x0A, 0xF1, 0x9A, 0xF2, 0xF5, 0xD4, 0xE1, 0xCE, 0x65, 0xCD, 0x32, 0x05, 0x94, 0xEF, 0x66, + }, + { + 0xE2, 0xFE, 0xE1, 0x11, 0xA8, 0xE4, 0xB6, 0x22, 0x46, 0x2F, 0x89, 0x7D, 0xA4, 0x8C, 0x02, 0xB8, + 0x07, 0xCA, 0xDD, 0xC2, 0x80, 0x17, 0x18, 0x6D, 0xC8, 0x56, 0xD8, 0xCF, 0x3D, 0xC2, 0x02, 0x48, + }, }, { - { 0xF4, 0x73, 0xC7, 0xA7, 0xD9, 0xF1, 0x40, 0xAA, 0x1A, 0xFB, 0x2D, 0xD0, 0xA0, 0xEC, 0xC2, 0x63, 0x5B, 0x01, 0x74, 0x94, 0x2A, 0x70, 0x94, 0xEC, 0x34, 0xF4, 0xD8, 0x02, 0x5B, 0x9F, 0xC3, 0x91, }, - { 0x05, 0x2E, 0xA9, 0x65, 0x27, 0x96, 0xB2, 0xD7, 0xBA, 0x5B, 0x63, 0x05, 0xAD, 0x3E, 0x42, 0x91, 0x27, 0x71, 0x30, 0x25, 0x29, 0xBA, 0xDF, 0x73, 0x51, 0x7C, 0x54, 0xC7, 0xDA, 0xD9, 0x5F, 0xDF, }, + { + 0xF4, 0x73, 0xC7, 0xA7, 0xD9, 0xF1, 0x40, 0xAA, 0x1A, 0xFB, 0x2D, 0xD0, 0xA0, 0xEC, 0xC2, 0x63, + 0x5B, 0x01, 0x74, 0x94, 0x2A, 0x70, 0x94, 0xEC, 0x34, 0xF4, 0xD8, 0x02, 0x5B, 0x9F, 0xC3, 0x91, + }, + { + 0x05, 0x2E, 0xA9, 0x65, 0x27, 0x96, 0xB2, 0xD7, 0xBA, 0x5B, 0x63, 0x05, 0xAD, 0x3E, 0x42, 0x91, + 0x27, 0x71, 0x30, 0x25, 0x29, 0xBA, 0xDF, 0x73, 0x51, 0x7C, 0x54, 0xC7, 0xDA, 0xD9, 0x5F, 0xDF, + }, }, { - { 0xD7, 0x65, 0x8B, 0x24, 0xB9, 0x88, 0x60, 0x57, 0xB8, 0x82, 0x75, 0x18, 0xA2, 0xA3, 0x67, 0x15, 0xA1, 0xB7, 0x32, 0x56, 0xE6, 0x5D, 0x04, 0x93, 0xDD, 0x0A, 0xF3, 0xE2, 0x73, 0x87, 0xDF, 0x40, }, - { 0x30, 0xBC, 0x8D, 0x20, 0xC4, 0xAA, 0x4D, 0xF5, 0x39, 0xE9, 0xE6, 0xB5, 0x8A, 0x45, 0x2C, 0xAC, 0x9E, 0x5E, 0x98, 0xF9, 0x4C, 0x6C, 0x90, 0xBF, 0x6C, 0x3B, 0xC9, 0xCF, 0x57, 0x3E, 0xB9, 0xED, }, + { + 0xD7, 0x65, 0x8B, 0x24, 0xB9, 0x88, 0x60, 0x57, 0xB8, 0x82, 0x75, 0x18, 0xA2, 0xA3, 0x67, 0x15, + 0xA1, 0xB7, 0x32, 0x56, 0xE6, 0x5D, 0x04, 0x93, 0xDD, 0x0A, 0xF3, 0xE2, 0x73, 0x87, 0xDF, 0x40, + }, + { 
+ 0x30, 0xBC, 0x8D, 0x20, 0xC4, 0xAA, 0x4D, 0xF5, 0x39, 0xE9, 0xE6, 0xB5, 0x8A, 0x45, 0x2C, 0xAC, + 0x9E, 0x5E, 0x98, 0xF9, 0x4C, 0x6C, 0x90, 0xBF, 0x6C, 0x3B, 0xC9, 0xCF, 0x57, 0x3E, 0xB9, 0xED, + }, }, { - { 0x1D, 0xB7, 0x47, 0x6C, 0xD7, 0x20, 0x64, 0xC6, 0x8E, 0x73, 0x6D, 0x82, 0x1E, 0xA6, 0xF0, 0xC9, 0x36, 0x10, 0xFE, 0x22, 0x32, 0x67, 0x54, 0xF5, 0x36, 0x68, 0x36, 0x87, 0x1A, 0x6F, 0x5A, 0x10, }, - { 0x00, 0x75, 0x5B, 0x9D, 0x72, 0xB2, 0x63, 0x2D, 0x88, 0xCB, 0x69, 0x45, 0xD5, 0x36, 0x38, 0x2C, 0x1E, 0x0B, 0x49, 0x57, 0xB4, 0xA4, 0x4B, 0xB5, 0x1C, 0x14, 0x88, 0x6A, 0x6F, 0xB3, 0x1A, 0x45, }, + { + 0x1D, 0xB7, 0x47, 0x6C, 0xD7, 0x20, 0x64, 0xC6, 0x8E, 0x73, 0x6D, 0x82, 0x1E, 0xA6, 0xF0, 0xC9, + 0x36, 0x10, 0xFE, 0x22, 0x32, 0x67, 0x54, 0xF5, 0x36, 0x68, 0x36, 0x87, 0x1A, 0x6F, 0x5A, 0x10, + }, + { + 0x00, 0x75, 0x5B, 0x9D, 0x72, 0xB2, 0x63, 0x2D, 0x88, 0xCB, 0x69, 0x45, 0xD5, 0x36, 0x38, 0x2C, + 0x1E, 0x0B, 0x49, 0x57, 0xB4, 0xA4, 0x4B, 0xB5, 0x1C, 0x14, 0x88, 0x6A, 0x6F, 0xB3, 0x1A, 0x45, + }, }, { - { 0x18, 0x42, 0x7D, 0x2D, 0x29, 0xDF, 0x1E, 0x02, 0x02, 0x64, 0x9F, 0x03, 0x2F, 0x20, 0x80, 0x36, 0x3F, 0xEC, 0x5D, 0xE7, 0x2E, 0xCA, 0xE1, 0x1B, 0x4F, 0x98, 0xCC, 0xC7, 0x58, 0x43, 0xE7, 0xCC, }, - { 0x91, 0xC7, 0x2F, 0x62, 0x73, 0xB6, 0xED, 0x44, 0x4B, 0xF5, 0x60, 0xF2, 0xFA, 0xC9, 0x9E, 0x8F, 0xED, 0xDD, 0xF3, 0x01, 0x62, 0x68, 0x8B, 0x86, 0x55, 0x3E, 0xB5, 0x7F, 0x1C, 0x98, 0xC2, 0x0E, }, + { + 0x18, 0x42, 0x7D, 0x2D, 0x29, 0xDF, 0x1E, 0x02, 0x02, 0x64, 0x9F, 0x03, 0x2F, 0x20, 0x80, 0x36, + 0x3F, 0xEC, 0x5D, 0xE7, 0x2E, 0xCA, 0xE1, 0x1B, 0x4F, 0x98, 0xCC, 0xC7, 0x58, 0x43, 0xE7, 0xCC, + }, + { + 0x91, 0xC7, 0x2F, 0x62, 0x73, 0xB6, 0xED, 0x44, 0x4B, 0xF5, 0x60, 0xF2, 0xFA, 0xC9, 0x9E, 0x8F, + 0xED, 0xDD, 0xF3, 0x01, 0x62, 0x68, 0x8B, 0x86, 0x55, 0x3E, 0xB5, 0x7F, 0x1C, 0x98, 0xC2, 0x0E, + }, }, { - { 0xCE, 0x60, 0x6E, 0x3F, 0xFC, 0xEE, 0x53, 0xB1, 0x13, 0xAA, 0x5A, 0x5C, 0xA3, 0xA1, 0x63, 0x76, 0xA3, 0xDE, 0x36, 0x43, 0x52, 0x87, 0x5D, 0x33, 0x60, 0xE1, 0x31, 
0x66, 0x6A, 0x56, 0x72, 0x48, }, - { 0x7E, 0x79, 0x76, 0x8F, 0x37, 0xD2, 0x13, 0xB1, 0x1B, 0x41, 0x93, 0xE1, 0xD6, 0x2D, 0x33, 0x99, 0x54, 0xA3, 0xB9, 0xE1, 0x6C, 0xCE, 0xF0, 0x5F, 0xD5, 0x74, 0xE1, 0x33, 0x06, 0x68, 0xB6, 0x28, }, + { + 0xCE, 0x60, 0x6E, 0x3F, 0xFC, 0xEE, 0x53, 0xB1, 0x13, 0xAA, 0x5A, 0x5C, 0xA3, 0xA1, 0x63, 0x76, + 0xA3, 0xDE, 0x36, 0x43, 0x52, 0x87, 0x5D, 0x33, 0x60, 0xE1, 0x31, 0x66, 0x6A, 0x56, 0x72, 0x48, + }, + { + 0x7E, 0x79, 0x76, 0x8F, 0x37, 0xD2, 0x13, 0xB1, 0x1B, 0x41, 0x93, 0xE1, 0xD6, 0x2D, 0x33, 0x99, + 0x54, 0xA3, 0xB9, 0xE1, 0x6C, 0xCE, 0xF0, 0x5F, 0xD5, 0x74, 0xE1, 0x33, 0x06, 0x68, 0xB6, 0x28, + }, }, { - { 0xAA, 0x1F, 0x11, 0xB1, 0x73, 0x85, 0xCC, 0xEB, 0xDC, 0x06, 0x5F, 0x20, 0xA6, 0x19, 0x5A, 0xB6, 0x54, 0x0D, 0x98, 0xA1, 0xCA, 0xBE, 0x6D, 0xBB, 0x35, 0x81, 0x33, 0x3E, 0x70, 0x32, 0xD0, 0xDB, }, - { 0xB1, 0x9D, 0x75, 0xF2, 0x26, 0x60, 0x8F, 0xBB, 0x58, 0x30, 0x72, 0x44, 0x49, 0x0A, 0xC6, 0x7E, 0x96, 0x3A, 0x66, 0x44, 0x43, 0x94, 0x1F, 0xD6, 0xB1, 0xEE, 0x03, 0x71, 0xB7, 0x6F, 0x45, 0xF3, }, + { + 0xAA, 0x1F, 0x11, 0xB1, 0x73, 0x85, 0xCC, 0xEB, 0xDC, 0x06, 0x5F, 0x20, 0xA6, 0x19, 0x5A, 0xB6, + 0x54, 0x0D, 0x98, 0xA1, 0xCA, 0xBE, 0x6D, 0xBB, 0x35, 0x81, 0x33, 0x3E, 0x70, 0x32, 0xD0, 0xDB, + }, + { + 0xB1, 0x9D, 0x75, 0xF2, 0x26, 0x60, 0x8F, 0xBB, 0x58, 0x30, 0x72, 0x44, 0x49, 0x0A, 0xC6, 0x7E, + 0x96, 0x3A, 0x66, 0x44, 0x43, 0x94, 0x1F, 0xD6, 0xB1, 0xEE, 0x03, 0x71, 0xB7, 0x6F, 0x45, 0xF3, + }, }, { - { 0xB7, 0x4A, 0xC0, 0x1F, 0xBE, 0xCE, 0xA5, 0x2A, 0x80, 0x11, 0xDD, 0x6F, 0x94, 0x71, 0x47, 0x39, 0x56, 0x03, 0x4D, 0xF5, 0x47, 0xA7, 0x81, 0x13, 0x92, 0x4D, 0x73, 0x69, 0xB6, 0xB1, 0xDC, 0x0D, }, - { 0x1C, 0x93, 0xD3, 0xA4, 0x48, 0xEC, 0x29, 0x44, 0xCC, 0x74, 0x05, 0x60, 0x08, 0xE5, 0x2B, 0x1D, 0x8F, 0xCC, 0xA9, 0x78, 0x4C, 0x80, 0x63, 0x3B, 0xCB, 0xF5, 0x74, 0x5B, 0x57, 0xA2, 0xFD, 0x58, }, + { + 0xB7, 0x4A, 0xC0, 0x1F, 0xBE, 0xCE, 0xA5, 0x2A, 0x80, 0x11, 0xDD, 0x6F, 0x94, 0x71, 0x47, 0x39, + 0x56, 0x03, 0x4D, 0xF5, 0x47, 0xA7, 
0x81, 0x13, 0x92, 0x4D, 0x73, 0x69, 0xB6, 0xB1, 0xDC, 0x0D, + }, + { + 0x1C, 0x93, 0xD3, 0xA4, 0x48, 0xEC, 0x29, 0x44, 0xCC, 0x74, 0x05, 0x60, 0x08, 0xE5, 0x2B, 0x1D, + 0x8F, 0xCC, 0xA9, 0x78, 0x4C, 0x80, 0x63, 0x3B, 0xCB, 0xF5, 0x74, 0x5B, 0x57, 0xA2, 0xFD, 0x58, + }, }, { - { 0x46, 0x50, 0xC5, 0x70, 0x93, 0x29, 0x66, 0x08, 0x25, 0xA9, 0xA5, 0xDA, 0xED, 0x9F, 0xA5, 0x0B, 0xE5, 0xAB, 0xAB, 0xAA, 0x9D, 0x37, 0x32, 0x71, 0x9A, 0x01, 0xBF, 0x29, 0xD7, 0xBF, 0xE5, 0x43, }, - { 0x20, 0x91, 0x42, 0xD4, 0xB9, 0x49, 0xBF, 0xFA, 0xC2, 0x8D, 0xB9, 0x79, 0xAF, 0x84, 0xC9, 0xC2, 0x91, 0xF8, 0x75, 0x40, 0x41, 0x0F, 0x2C, 0xC6, 0xBF, 0x96, 0xAA, 0x63, 0x7B, 0x45, 0x85, 0x64, }, + { + 0x46, 0x50, 0xC5, 0x70, 0x93, 0x29, 0x66, 0x08, 0x25, 0xA9, 0xA5, 0xDA, 0xED, 0x9F, 0xA5, 0x0B, + 0xE5, 0xAB, 0xAB, 0xAA, 0x9D, 0x37, 0x32, 0x71, 0x9A, 0x01, 0xBF, 0x29, 0xD7, 0xBF, 0xE5, 0x43, + }, + { + 0x20, 0x91, 0x42, 0xD4, 0xB9, 0x49, 0xBF, 0xFA, 0xC2, 0x8D, 0xB9, 0x79, 0xAF, 0x84, 0xC9, 0xC2, + 0x91, 0xF8, 0x75, 0x40, 0x41, 0x0F, 0x2C, 0xC6, 0xBF, 0x96, 0xAA, 0x63, 0x7B, 0x45, 0x85, 0x64, + }, }, { - { 0x6E, 0x68, 0x23, 0xD3, 0xC0, 0x4E, 0xA3, 0xBC, 0x20, 0xB4, 0x3B, 0xEC, 0xEB, 0x5B, 0x42, 0x85, 0x4E, 0xF8, 0x40, 0xEE, 0x47, 0x7B, 0x58, 0x70, 0x94, 0x49, 0xBB, 0x8D, 0x8F, 0x63, 0xEE, 0x78, }, - { 0xF8, 0x4E, 0x89, 0xA3, 0xE9, 0x07, 0x0A, 0xAE, 0xFE, 0x86, 0x0D, 0x49, 0x83, 0x80, 0x7E, 0x07, 0xD1, 0xFB, 0xF6, 0x5D, 0xAB, 0x2F, 0x1B, 0x81, 0x51, 0x34, 0x7F, 0x82, 0x8C, 0x9F, 0x0F, 0xC0, }, + { + 0x6E, 0x68, 0x23, 0xD3, 0xC0, 0x4E, 0xA3, 0xBC, 0x20, 0xB4, 0x3B, 0xEC, 0xEB, 0x5B, 0x42, 0x85, + 0x4E, 0xF8, 0x40, 0xEE, 0x47, 0x7B, 0x58, 0x70, 0x94, 0x49, 0xBB, 0x8D, 0x8F, 0x63, 0xEE, 0x78, + }, + { + 0xF8, 0x4E, 0x89, 0xA3, 0xE9, 0x07, 0x0A, 0xAE, 0xFE, 0x86, 0x0D, 0x49, 0x83, 0x80, 0x7E, 0x07, + 0xD1, 0xFB, 0xF6, 0x5D, 0xAB, 0x2F, 0x1B, 0x81, 0x51, 0x34, 0x7F, 0x82, 0x8C, 0x9F, 0x0F, 0xC0, + }, }, { - { 0x3D, 0x02, 0xF6, 0x79, 0xEF, 0x69, 0xD3, 0x3D, 0xF1, 0x7C, 0xC8, 0x04, 0x0A, 0xBC, 0xAC, 
0xDD, 0xF8, 0x13, 0x3A, 0x04, 0xE0, 0xD8, 0x9E, 0x3C, 0xF1, 0x0D, 0xAD, 0x74, 0xE0, 0x08, 0x04, 0xD9, }, - { 0x82, 0xE2, 0x74, 0x4E, 0xE7, 0xD9, 0x32, 0x76, 0xD1, 0x74, 0xE9, 0x87, 0x7A, 0x42, 0x6A, 0x83, 0x0D, 0xF9, 0x1A, 0xAE, 0x41, 0x24, 0x57, 0x6A, 0x7E, 0xC5, 0x2E, 0xE8, 0x47, 0xEB, 0x0B, 0xC0, }, + { + 0x3D, 0x02, 0xF6, 0x79, 0xEF, 0x69, 0xD3, 0x3D, 0xF1, 0x7C, 0xC8, 0x04, 0x0A, 0xBC, 0xAC, 0xDD, + 0xF8, 0x13, 0x3A, 0x04, 0xE0, 0xD8, 0x9E, 0x3C, 0xF1, 0x0D, 0xAD, 0x74, 0xE0, 0x08, 0x04, 0xD9, + }, + { + 0x82, 0xE2, 0x74, 0x4E, 0xE7, 0xD9, 0x32, 0x76, 0xD1, 0x74, 0xE9, 0x87, 0x7A, 0x42, 0x6A, 0x83, + 0x0D, 0xF9, 0x1A, 0xAE, 0x41, 0x24, 0x57, 0x6A, 0x7E, 0xC5, 0x2E, 0xE8, 0x47, 0xEB, 0x0B, 0xC0, + }, }, { - { 0x39, 0x9E, 0x6B, 0xE5, 0x84, 0xDE, 0x50, 0x91, 0xF4, 0x97, 0x11, 0xED, 0x6C, 0x19, 0x5F, 0x0D, 0xE0, 0xEE, 0x81, 0x11, 0x13, 0xC6, 0x8B, 0x37, 0x23, 0x99, 0xDB, 0xBF, 0xF2, 0x8F, 0x11, 0x73, }, - { 0x75, 0xF6, 0x13, 0x59, 0xF0, 0x4C, 0x77, 0xFF, 0x4D, 0xE5, 0x8A, 0x10, 0xF9, 0xF8, 0x7B, 0x31, 0xB5, 0xB8, 0xDA, 0x33, 0x73, 0xF6, 0x23, 0x0F, 0xE1, 0x73, 0x50, 0x33, 0x44, 0x6B, 0x99, 0x48, }, + { + 0x39, 0x9E, 0x6B, 0xE5, 0x84, 0xDE, 0x50, 0x91, 0xF4, 0x97, 0x11, 0xED, 0x6C, 0x19, 0x5F, 0x0D, + 0xE0, 0xEE, 0x81, 0x11, 0x13, 0xC6, 0x8B, 0x37, 0x23, 0x99, 0xDB, 0xBF, 0xF2, 0x8F, 0x11, 0x73, + }, + { + 0x75, 0xF6, 0x13, 0x59, 0xF0, 0x4C, 0x77, 0xFF, 0x4D, 0xE5, 0x8A, 0x10, 0xF9, 0xF8, 0x7B, 0x31, + 0xB5, 0xB8, 0xDA, 0x33, 0x73, 0xF6, 0x23, 0x0F, 0xE1, 0x73, 0x50, 0x33, 0x44, 0x6B, 0x99, 0x48, + }, }, { - { 0xC8, 0x61, 0xA8, 0x9C, 0xFB, 0x13, 0x35, 0xF2, 0x78, 0xC9, 0x6C, 0xF7, 0xFF, 0xC9, 0x75, 0x3C, 0x29, 0x0C, 0xBE, 0x1A, 0x4E, 0x18, 0x6D, 0x29, 0x23, 0xB4, 0x96, 0xBB, 0x4E, 0xA5, 0xE5, 0x19, }, - { 0x94, 0x24, 0xB7, 0xAE, 0x5F, 0xA7, 0x2D, 0x3E, 0xE4, 0xA2, 0x66, 0x11, 0x2E, 0x7A, 0xBC, 0x40, 0x92, 0xE8, 0x15, 0xAE, 0x29, 0xFA, 0xB2, 0x6D, 0xA6, 0x66, 0xC1, 0x48, 0x5B, 0xA9, 0x2B, 0xDC, }, + { + 0xC8, 0x61, 0xA8, 0x9C, 0xFB, 0x13, 0x35, 0xF2, 0x78, 0xC9, 
0x6C, 0xF7, 0xFF, 0xC9, 0x75, 0x3C, + 0x29, 0x0C, 0xBE, 0x1A, 0x4E, 0x18, 0x6D, 0x29, 0x23, 0xB4, 0x96, 0xBB, 0x4E, 0xA5, 0xE5, 0x19, + }, + { + 0x94, 0x24, 0xB7, 0xAE, 0x5F, 0xA7, 0x2D, 0x3E, 0xE4, 0xA2, 0x66, 0x11, 0x2E, 0x7A, 0xBC, 0x40, + 0x92, 0xE8, 0x15, 0xAE, 0x29, 0xFA, 0xB2, 0x6D, 0xA6, 0x66, 0xC1, 0x48, 0x5B, 0xA9, 0x2B, 0xDC, + }, }, }; -static bool ascon_xof_selftest(void) { +static bool ascon_xof_selftest( void ) { uint8_t input[KAT_NUM - 1]; + for (int i = 0; i < sizeof(input); i++) { input[i] = (uint8_t)i; } bool passed = true; for (int i = 0; i < KAT_NUM; i++) { - uint8_t output[256/8]; + uint8_t output[256 / 8]; if (isLE()) { ascon_xof<256, true, true>(input, i, 0, output); @@ -349,9 +454,9 @@ static bool ascon_xof_selftest(void) { } if (0 != memcmp(KAT[i][1], output, sizeof(output))) { printf("Mismatch with XOFa len %d\n Expected:", i); - for (int j = 0; j < 256/8; j++) { printf(" %02x", KAT[i][1][j]); } + for (int j = 0; j < 256 / 8; j++) { printf(" %02x", KAT[i][1][j]); } printf("\n Found :"); - for (int j = 0; j < 256/8; j++) { printf(" %02x", output[j]); } + for (int j = 0; j < 256 / 8; j++) { printf(" %02x", output[j]); } printf("\n\n"); passed = false; } @@ -363,9 +468,9 @@ static bool ascon_xof_selftest(void) { } if (0 != memcmp(KAT[i][0], output, sizeof(output))) { printf("Mismatch with XOF len %d\n Expected:", i); - for (int j = 0; j < 256/8; j++) { printf(" %02x", KAT[i][0][j]); } + for (int j = 0; j < 256 / 8; j++) { printf(" %02x", KAT[i][0][j]); } printf("\n Found :"); - for (int j = 0; j < 256/8; j++) { printf(" %02x", output[j]); } + for (int j = 0; j < 256 / 8; j++) { printf(" %02x", output[j]); } printf("\n\n"); passed = false; } @@ -376,234 +481,234 @@ static bool ascon_xof_selftest(void) { //------------------------------------------------------------ REGISTER_FAMILY(ascon, - $.src_url = "https://github.com/ascon/ascon-c", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/ascon/ascon-c", + 
$.src_status = HashFamilyInfo::SRC_STABLEISH + ); REGISTER_HASH(ascon_XOF_32, - $.desc = "ascon v1.2 (XOF, 32 bits)", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 32, - $.verification_LE = 0x1124BD16, - $.verification_BE = 0xED22753E, - $.initfn = ascon_xof_selftest, - $.hashfn_native = ascon_xof<32,false,false>, - $.hashfn_bswap = ascon_xof<32,false,true> -); + $.desc = "ascon v1.2 (XOF, 32 bits)", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 32, + $.verification_LE = 0x1124BD16, + $.verification_BE = 0xED22753E, + $.initfn = ascon_xof_selftest, + $.hashfn_native = ascon_xof<32, false, false>, + $.hashfn_bswap = ascon_xof<32, false, true> + ); REGISTER_HASH(ascon_XOFa_32, - $.desc = "ascon v1.2 (XOFa, 32 bits)", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 32, - $.verification_LE = 0x8F5BB129, - $.verification_BE = 0x44EBDFB6, - $.initfn = ascon_xof_selftest, - $.hashfn_native = ascon_xof<32,true,false>, - $.hashfn_bswap = ascon_xof<32,true,true> -); + $.desc = "ascon v1.2 (XOFa, 32 bits)", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 32, + $.verification_LE = 0x8F5BB129, + $.verification_BE = 0x44EBDFB6, + $.initfn = ascon_xof_selftest, + $.hashfn_native = ascon_xof<32, true, false>, + $.hashfn_bswap = ascon_xof<32, true, 
true> + ); REGISTER_HASH(ascon_XOF_64, - $.desc = "ascon v1.2 (XOF, 64 bits)", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0xCDAAB40E, - $.verification_BE = 0xAC65EB36, - $.initfn = ascon_xof_selftest, - $.hashfn_native = ascon_xof<64,false,false>, - $.hashfn_bswap = ascon_xof<64,false,true> -); + $.desc = "ascon v1.2 (XOF, 64 bits)", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0xCDAAB40E, + $.verification_BE = 0xAC65EB36, + $.initfn = ascon_xof_selftest, + $.hashfn_native = ascon_xof<64, false, false>, + $.hashfn_bswap = ascon_xof<64, false, true> + ); REGISTER_HASH(ascon_XOFa_64, - $.desc = "ascon v1.2 (XOFa, 64 bits)", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0x43ACD116, - $.verification_BE = 0xACFB3C9F, - $.initfn = ascon_xof_selftest, - $.hashfn_native = ascon_xof<64,true,false>, - $.hashfn_bswap = ascon_xof<64,true,true> -); + $.desc = "ascon v1.2 (XOFa, 64 bits)", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0x43ACD116, + $.verification_BE = 0xACFB3C9F, + $.initfn = ascon_xof_selftest, + $.hashfn_native = ascon_xof<64, true, false>, + $.hashfn_bswap = ascon_xof<64, true, true> + ); 
REGISTER_HASH(ascon_XOF_128, - $.desc = "ascon v1.2 (XOF, 128 bits)", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 128, - $.verification_LE = 0x9B2F9305, - $.verification_BE = 0x6C15FBDF, - $.initfn = ascon_xof_selftest, - $.hashfn_native = ascon_xof<128,false,false>, - $.hashfn_bswap = ascon_xof<128,false,true> -); + $.desc = "ascon v1.2 (XOF, 128 bits)", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 128, + $.verification_LE = 0x9B2F9305, + $.verification_BE = 0x6C15FBDF, + $.initfn = ascon_xof_selftest, + $.hashfn_native = ascon_xof<128, false, false>, + $.hashfn_bswap = ascon_xof<128, false, true> + ); REGISTER_HASH(ascon_XOFa_128, - $.desc = "ascon v1.2 (XOFa, 128 bits)", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 128, - $.verification_LE = 0x5701888C, - $.verification_BE = 0x10B381AE, - $.initfn = ascon_xof_selftest, - $.hashfn_native = ascon_xof<128,true,false>, - $.hashfn_bswap = ascon_xof<128,true,true> -); + $.desc = "ascon v1.2 (XOFa, 128 bits)", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 128, + $.verification_LE = 0x5701888C, + $.verification_BE = 0x10B381AE, + $.initfn = ascon_xof_selftest, + $.hashfn_native = ascon_xof<128, true, false>, + $.hashfn_bswap = ascon_xof<128, true, true> + ); 
REGISTER_HASH(ascon_XOF_160, - $.desc = "ascon v1.2 (XOF, 160 bits)", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 160, - $.verification_LE = 0x3B726110, - $.verification_BE = 0x3215F456, - $.initfn = ascon_xof_selftest, - $.hashfn_native = ascon_xof<160,false,false>, - $.hashfn_bswap = ascon_xof<160,false,true> -); + $.desc = "ascon v1.2 (XOF, 160 bits)", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 160, + $.verification_LE = 0x3B726110, + $.verification_BE = 0x3215F456, + $.initfn = ascon_xof_selftest, + $.hashfn_native = ascon_xof<160, false, false>, + $.hashfn_bswap = ascon_xof<160, false, true> + ); REGISTER_HASH(ascon_XOFa_160, - $.desc = "ascon v1.2 (XOFa, 160 bits)", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 160, - $.verification_LE = 0xA4E9A794, - $.verification_BE = 0x387FC024, - $.initfn = ascon_xof_selftest, - $.hashfn_native = ascon_xof<160,true,false>, - $.hashfn_bswap = ascon_xof<160,true,true> -); + $.desc = "ascon v1.2 (XOFa, 160 bits)", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 160, + $.verification_LE = 0xA4E9A794, + $.verification_BE = 0x387FC024, + $.initfn = ascon_xof_selftest, + $.hashfn_native = ascon_xof<160, true, false>, + $.hashfn_bswap = ascon_xof<160, true, true> + ); 
REGISTER_HASH(ascon_XOF_224, - $.desc = "ascon v1.2 (XOF, 224 bits)", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 224, - $.verification_LE = 0x83EAEBCC, - $.verification_BE = 0x9929AC99, - $.initfn = ascon_xof_selftest, - $.hashfn_native = ascon_xof<224,false,false>, - $.hashfn_bswap = ascon_xof<224,false,true> -); + $.desc = "ascon v1.2 (XOF, 224 bits)", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 224, + $.verification_LE = 0x83EAEBCC, + $.verification_BE = 0x9929AC99, + $.initfn = ascon_xof_selftest, + $.hashfn_native = ascon_xof<224, false, false>, + $.hashfn_bswap = ascon_xof<224, false, true> + ); REGISTER_HASH(ascon_XOFa_224, - $.desc = "ascon v1.2 (XOFa, 224 bits)", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 224, - $.verification_LE = 0x618744B2, - $.verification_BE = 0x2D9AFDE5, - $.initfn = ascon_xof_selftest, - $.hashfn_native = ascon_xof<224,true,false>, - $.hashfn_bswap = ascon_xof<224,true,true> -); + $.desc = "ascon v1.2 (XOFa, 224 bits)", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 224, + $.verification_LE = 0x618744B2, + $.verification_BE = 0x2D9AFDE5, + $.initfn = ascon_xof_selftest, + $.hashfn_native = ascon_xof<224, true, false>, + $.hashfn_bswap = ascon_xof<224, true, true> + ); 
REGISTER_HASH(ascon_XOF_256, - $.desc = "ascon v1.2 (XOF, 256 bits)", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 256, - $.verification_LE = 0xC6629453, - $.verification_BE = 0x6D8F406F, - $.initfn = ascon_xof_selftest, - $.hashfn_native = ascon_xof<256,false,false>, - $.hashfn_bswap = ascon_xof<256,false,true> -); + $.desc = "ascon v1.2 (XOF, 256 bits)", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 256, + $.verification_LE = 0xC6629453, + $.verification_BE = 0x6D8F406F, + $.initfn = ascon_xof_selftest, + $.hashfn_native = ascon_xof<256, false, false>, + $.hashfn_bswap = ascon_xof<256, false, true> + ); REGISTER_HASH(ascon_XOFa_256, - $.desc = "ascon v1.2 (XOFa, 256 bits)", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 256, - $.verification_LE = 0x2ACF11FE, - $.verification_BE = 0xE5CD2E9B, - $.initfn = ascon_xof_selftest, - $.hashfn_native = ascon_xof<256,true,false>, - $.hashfn_bswap = ascon_xof<256,true,true> -); + $.desc = "ascon v1.2 (XOFa, 256 bits)", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 256, + $.verification_LE = 0x2ACF11FE, + $.verification_BE = 0xE5CD2E9B, + $.initfn = ascon_xof_selftest, + $.hashfn_native = ascon_xof<256, true, false>, + $.hashfn_bswap = ascon_xof<256, true, true> + ); diff --git 
a/hashes/badhash.cpp b/hashes/badhash.cpp index 8e215e4d..660055e2 100644 --- a/hashes/badhash.cpp +++ b/hashes/badhash.cpp @@ -27,11 +27,11 @@ #include "Platform.h" #include "Hashlib.h" -template < bool bswap > -static void BadHash(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void BadHash( const void * in, const size_t len, const seed_t seed, void * out ) { const uint8_t * data = (const uint8_t *)in; const uint8_t * const end = &data[len]; - uint32_t h = seed; + uint32_t h = seed; while (data < end) { h ^= h >> 3; @@ -42,11 +42,11 @@ static void BadHash(const void * in, const size_t len, const seed_t seed, void * PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void sumhash8(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void sumhash8( const void * in, const size_t len, const seed_t seed, void * out ) { const uint8_t * data = (const uint8_t *)in; const uint8_t * const end = &data[len]; - uint32_t h = seed; + uint32_t h = seed; while (data < end) { h += *data++; @@ -55,11 +55,11 @@ static void sumhash8(const void * in, const size_t len, const seed_t seed, void PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void sumhash32(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void sumhash32( const void * in, const size_t len, const seed_t seed, void * out ) { const uint32_t * data = (const uint32_t *)in; - const uint32_t * const end = &data[len/4]; - uint32_t h = seed; + const uint32_t * const end = &data[len / 4]; + uint32_t h = seed; while (data < end) { h += GET_U32((const uint8_t *)data, 0); @@ -67,8 +67,8 @@ static void sumhash32(const void * in, const size_t len, const seed_t seed, void } if (len & 3) { - uint8_t * dc = (uint8_t*)data; //byte stepper - const uint8_t * const endc = &((const uint8_t*)in)[len]; + uint8_t * dc = (uint8_t *)data; // byte stepper + const uint8_t * const endc = &((const uint8_t *)in)[len]; 
while (dc < endc) { h += *dc++ * UINT64_C(11400714819323198485); } @@ -78,58 +78,58 @@ static void sumhash32(const void * in, const size_t len, const seed_t seed, void } REGISTER_FAMILY(badhash, - $.src_url = "https://github.com/rurban/smhasher/blob/master/Hashes.cpp", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/rurban/smhasher/blob/master/Hashes.cpp", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(badhash, - $.desc = "very simple XOR shift", - $.hash_flags = - FLAG_HASH_MOCK, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0xAB432E23, - $.verification_BE = 0x241F49BE, - $.hashfn_native = BadHash, - $.hashfn_bswap = BadHash, - $.seedfixfn = excludeBadseeds, - $.badseeds = { 0 }, - $.sort_order = 20 -); + $.desc = "very simple XOR shift", + $.hash_flags = + FLAG_HASH_MOCK, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0xAB432E23, + $.verification_BE = 0x241F49BE, + $.hashfn_native = BadHash, + $.hashfn_bswap = BadHash, + $.seedfixfn = excludeBadseeds, + $.badseeds = { 0 }, + $.sort_order = 20 + ); REGISTER_HASH(sum8hash, - $.desc = "sum all 8-bit bytes", - $.hash_flags = - FLAG_HASH_MOCK, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_SANITY_FAILS , - $.bits = 32, - $.verification_LE = 0x0000A9AC, - $.verification_BE = 0xACA90000, - $.hashfn_native = sumhash8, - $.hashfn_bswap = sumhash8, - $.seedfixfn = excludeBadseeds, - $.badseeds = { 0 }, - $.sort_order = 30 -); + $.desc = "sum all 8-bit bytes", + $.hash_flags = + FLAG_HASH_MOCK, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_SANITY_FAILS, + $.bits = 32, + $.verification_LE = 0x0000A9AC, + $.verification_BE = 0xACA90000, + $.hashfn_native = sumhash8, + $.hashfn_bswap = sumhash8, + $.seedfixfn = excludeBadseeds, + $.badseeds = { 0 }, + $.sort_order = 30 + ); REGISTER_HASH(sum32hash, - $.desc = "sum all 32-bit words", - 
$.hash_flags = - FLAG_HASH_MOCK, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY, - $.bits = 32, - $.verification_LE = 0x3D6DC280, - $.verification_BE = 0x00A10D9E, - $.hashfn_native = sumhash32, - $.hashfn_bswap = sumhash32, - $.seedfixfn = excludeZeroSeed, - $.badseeds = { UINT64_C(0x9e3779b97f4a7c15) }, - $.sort_order = 31 -); + $.desc = "sum all 32-bit words", + $.hash_flags = + FLAG_HASH_MOCK, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY, + $.bits = 32, + $.verification_LE = 0x3D6DC280, + $.verification_BE = 0x00A10D9E, + $.hashfn_native = sumhash32, + $.hashfn_bswap = sumhash32, + $.seedfixfn = excludeZeroSeed, + $.badseeds = { UINT64_C (0x9e3779b97f4a7c15) }, + $.sort_order = 31 + ); diff --git a/hashes/beamsplitter.cpp b/hashes/beamsplitter.cpp index b3926a80..3368b55a 100644 --- a/hashes/beamsplitter.cpp +++ b/hashes/beamsplitter.cpp @@ -30,309 +30,310 @@ // gotten from random.org // as hex bytes that I formatted into 64-bit values static const uint64_t T[1024] = { - UINT64_C(0x6fa74b1b15047628), UINT64_C(0xa2b5ee64e9e8f629), UINT64_C(0xd0937853bdd0edca), UINT64_C(0x4e9fb2b2b0a637a6), - UINT64_C(0x26ac5a8fac69497e), UINT64_C(0x51e127f0db14aa48), UINT64_C(0xea5b9f512d8d6a09), UINT64_C(0xf3af1406a87de6a9), - UINT64_C(0x3b36e2ed14818955), UINT64_C(0xb0ac19ef2dde986c), UINT64_C(0xd34ed04929f8f66d), UINT64_C(0xe99978cff2b324ea), - UINT64_C(0x4032cb3ecff8cb38), UINT64_C(0xfa52274072d86042), UINT64_C(0x27437346dec26105), UINT64_C(0xec1cbf04b76aec71), - UINT64_C(0x6dd57b3dac56cd39), UINT64_C(0x34e9021797e95aad), UINT64_C(0xdc8d3363540c5999), UINT64_C(0x773d283eeeabf4ab), - UINT64_C(0x373c522657461aaf), UINT64_C(0x154cfe0f497d7f78), UINT64_C(0x6d377183b5ca6550), UINT64_C(0x614da5f6055e904b), - UINT64_C(0xd77b66b34896f00e), UINT64_C(0x122538125d6adaef), UINT64_C(0x1021e161206d9091), UINT64_C(0x38407c4313aefdfa), - UINT64_C(0xd941cc5dafc66162), UINT64_C(0xfc2432a6ea885315), 
UINT64_C(0x5576dc02b68b10ed), UINT64_C(0xd8449f9d4ab139a2), - UINT64_C(0xd333cbcd49cbacba), UINT64_C(0x700d20430e06eeb8), UINT64_C(0xdeb34810d6d0320a), UINT64_C(0x6743363d6cc8ba68), - UINT64_C(0xbd183cb526e6e936), UINT64_C(0xee62bf5ee97de5ea), UINT64_C(0xf6b855e743e76853), UINT64_C(0x83ac16a35d132df9), - UINT64_C(0x2046f2c70c2130b1), UINT64_C(0xaadd5007102b5ee4), UINT64_C(0x8eedac842e63cdac), UINT64_C(0xba02956e43c18608), - UINT64_C(0xd2688af010adbeaf), UINT64_C(0x4aaa5295377c17be), UINT64_C(0x83792382ba198f10), UINT64_C(0x6fc42849961a25b6), - UINT64_C(0x3501677f06fb1311), UINT64_C(0x1e18b89705c224dd), UINT64_C(0xa0a0b8684aa2e12d), UINT64_C(0x30d19aac3d40898e), - UINT64_C(0x41dd335a29272e9b), UINT64_C(0x5c5d445a07426e3f), UINT64_C(0x6f13080e67946fdc), UINT64_C(0x3ddabae21609bf08), - UINT64_C(0x8e6146d3cde11ca5), UINT64_C(0x9eff76a4c39eacf4), UINT64_C(0x71c66d0a423a21b7), UINT64_C(0x68515c0b712bbc4f), - UINT64_C(0x5edd17cec412a735), UINT64_C(0xa444f487c96f896c), UINT64_C(0xc161d16d4e54041a), UINT64_C(0x3a2d84d3e09bafb9), - UINT64_C(0x63a406b157a5f2f1), UINT64_C(0x18292d6007f839ba), UINT64_C(0xcaac5789618f2aac), UINT64_C(0x6f516d95f749dd97), - UINT64_C(0xb5784409560e219f), UINT64_C(0x12f0f0d6fbdcb81c), UINT64_C(0x993d6c2a47089679), UINT64_C(0xcc9247b35870aebf), - UINT64_C(0xa1ca8eff8b1bca70), UINT64_C(0x7a1d015397e558cc), UINT64_C(0xc504a4d4815f8722), UINT64_C(0x3e44258e93472b26), - UINT64_C(0x11bd0578a36c8044), UINT64_C(0x84c7087603a0a6ea), UINT64_C(0x457d0c59e84c9ac8), UINT64_C(0x32129275ee63dd95), - UINT64_C(0x66269220e943024d), UINT64_C(0x197de12f9d6e5c72), UINT64_C(0x06fdd09a4d6157dd), UINT64_C(0xf8c1a8b51fe95716), - UINT64_C(0x41eeb6129149f6cf), UINT64_C(0x42f510887a61de1b), UINT64_C(0xf3d2aa6e4fe5949d), UINT64_C(0xc0799007b85373aa), - UINT64_C(0x81577b167de515c3), UINT64_C(0x01f424fc6b856270), UINT64_C(0xff6247ed0658caa8), UINT64_C(0x63ad005e620fe4bb), - UINT64_C(0xdb919b9f63c93174), UINT64_C(0x5693dbd6c76c7683), UINT64_C(0xdaa9b82e85e0355a), 
UINT64_C(0x424c5c4e5672fc73), - UINT64_C(0x9de3ca332ba818f1), UINT64_C(0xb28f375a58bc6c1e), UINT64_C(0xef0af1e6041b9cd4), UINT64_C(0x0418afb53ef5408f), - UINT64_C(0x9a37634585d3330a), UINT64_C(0x3ab5aec014b097cd), UINT64_C(0x384a0739a3ff7dc8), UINT64_C(0x0ff31c11226e5d5a), - UINT64_C(0x71070735f1c16bb4), UINT64_C(0xc4f78905f49a3840), UINT64_C(0x561f68d6a5f44a81), UINT64_C(0xb09bd8cd8d932357), - UINT64_C(0xf270b47652354fdb), UINT64_C(0x47d6ca7bba50c2c7), UINT64_C(0x2720590d7b2b7b54), UINT64_C(0xcaac35df08cab300), - UINT64_C(0xd05759dee169d9fd), UINT64_C(0xdb8d0d0403a6aafb), UINT64_C(0xcd3ab85684ba537c), UINT64_C(0xad69c4e5240c158f), - UINT64_C(0x65427c4ff3637db2), UINT64_C(0x085ecbbf903a45ae), UINT64_C(0xeafed57a94384c62), UINT64_C(0xc99972367cd21eba), - UINT64_C(0xc1e2cf52270b20eb), UINT64_C(0x825dad5142681653), UINT64_C(0x47e99edc5e141d94), UINT64_C(0x125813bc26e42e07), - UINT64_C(0x06f41d2441b172ca), UINT64_C(0x5e9e640ed911730e), UINT64_C(0x5900403342f0f362), UINT64_C(0x57a600d157ee9945), - UINT64_C(0xbcc5d702f02dc7e0), UINT64_C(0x8258cf5a1a6435ab), UINT64_C(0xdf885b6a0343a3e0), UINT64_C(0xadd74c04a503b09a), - UINT64_C(0x0ea210122eeef589), UINT64_C(0x5217fd50f3ecaf85), UINT64_C(0xd0c39849df6b4756), UINT64_C(0xf66d9e1c91bd0981), - UINT64_C(0x0f355b00f40e3e6b), UINT64_C(0xc01dabcd14518520), UINT64_C(0x58691b4fa9e7d327), UINT64_C(0x357616c77c22fffe), - UINT64_C(0xb9fbf8de2ed23303), UINT64_C(0x0195932bc205c466), UINT64_C(0xef0763590a08a50d), UINT64_C(0xf546866c0028a938), - UINT64_C(0x41cc8732eaad496a), UINT64_C(0xadc61f16374896c6), UINT64_C(0x5eb8f93f25ad0457), UINT64_C(0x240f00f5db3fae25), - UINT64_C(0xcc48503596dc01ef), UINT64_C(0x351baaa904a306d5), UINT64_C(0x7111179ae328bb19), UINT64_C(0x6789a31719d5d453), - UINT64_C(0xf5318492c9613de6), UINT64_C(0xa0e8c24f3f0da716), UINT64_C(0xac15d68d54401b9d), UINT64_C(0xadafb35cf63092ee), - UINT64_C(0xceb5f8d63c7fec4c), UINT64_C(0x1ae71929b980fc9d), UINT64_C(0x6efdc5693ef4ee2a), UINT64_C(0xbedd8334cade7855), - 
UINT64_C(0x06f1b768b476a249), UINT64_C(0x9e614bedf41dd639), UINT64_C(0x9eca9c6c9e389a5d), UINT64_C(0x76999bf01b912df2), - UINT64_C(0x04d52fb2ac70ab31), UINT64_C(0xe467ea8172f5d066), UINT64_C(0x356ed51bb0e094ae), UINT64_C(0xab2047c21b54d8ba), - UINT64_C(0x21dbbfa0a6157474), UINT64_C(0x7de36edec62f1997), UINT64_C(0x306ef59f5204a58c), UINT64_C(0x954135a769d5b72e), - UINT64_C(0x9d7774a0c2d29380), UINT64_C(0xc03acfd63ac6b88c), UINT64_C(0x9989d5ee565322e6), UINT64_C(0x19d1a58324bdd145), - UINT64_C(0xe74685383cc6b27c), UINT64_C(0xf9edffe1c4d81108), UINT64_C(0x94950b5b6247cb43), UINT64_C(0xe3fa8c6468d419eb), - UINT64_C(0x29981bd802f77ac5), UINT64_C(0x6cf1a6cab28c1c36), UINT64_C(0x1d34a382a5d48973), UINT64_C(0xcd1d5d546e5e4d3d), - UINT64_C(0x4ad78b4a37e52322), UINT64_C(0x24da17671ab463f2), UINT64_C(0x527504b7c7bc5537), UINT64_C(0x7ba1d92e1969b2b5), - UINT64_C(0x53a130812c49d64a), UINT64_C(0x503af48d9510f1d7), UINT64_C(0x719db8a348dee165), UINT64_C(0xa85e4fad1f343e67), - UINT64_C(0xdafc1fa9203d2d45), UINT64_C(0x7730f245c903a407), UINT64_C(0xb7c04e53f913aeae), UINT64_C(0x39ed817e1e039153), - UINT64_C(0xf415ea2b3efc7606), UINT64_C(0x15e3c53fe43f104d), UINT64_C(0x1b71e4d83ccba83c), UINT64_C(0xfe088f4c90812841), - UINT64_C(0x1ff8e2ee0a04b6ae), UINT64_C(0xf4f4a23612b9eed2), UINT64_C(0xc596a66051b8aca1), UINT64_C(0xbc898edd3370a8dd), - UINT64_C(0xce7638a7a2f9152e), UINT64_C(0xd99192635c0d5c92), UINT64_C(0x62038c87c094a1ff), UINT64_C(0xa73f1bcaac7343af), - UINT64_C(0x93c797804faa5ff3), UINT64_C(0x9da7407c705da1f0), UINT64_C(0xa52cde7d37fef9f0), UINT64_C(0xb93a7db97e3fa7ff), - UINT64_C(0x75ee91392c60fb6b), UINT64_C(0x4d7f8e3db9383ae0), UINT64_C(0xe0aec397d5290d06), UINT64_C(0x159a20f22d740d81), - UINT64_C(0x231416cff9a9b014), UINT64_C(0x71ed3a6e513b4795), UINT64_C(0x190b08ebcb87f3bc), UINT64_C(0x36bb0bcb0e8df593), - UINT64_C(0xc1e63cdc4d78dfb3), UINT64_C(0x36e2c57ba6799460), UINT64_C(0x280c0618b19f63dc), UINT64_C(0xca2b8e49d6c71d2d), - UINT64_C(0xc881e59705270f09), 
UINT64_C(0x26fdf0dbb5f2f451), UINT64_C(0xc6d1a3697ca86855), UINT64_C(0xd00755a203980eb5), - UINT64_C(0xa85962163dd7de95), UINT64_C(0x622b7a1d2531d00e), UINT64_C(0xb6c1cfba74436ef7), UINT64_C(0x9578891a720bf317), - UINT64_C(0x5e325058bd3a343a), UINT64_C(0x9a468a5a888a475f), UINT64_C(0xa57f0edb414a0589), UINT64_C(0xa044aef7ea680f8c), - UINT64_C(0x2036717cee9b991a), UINT64_C(0x3925631ec66cb8aa), UINT64_C(0xdcb6a5da6b2fc78f), UINT64_C(0x17a8cd724b7b5e26), - UINT64_C(0x1c704c6a48a2dae0), UINT64_C(0x87d8f6738a0c30bc), UINT64_C(0xd8580262a4801240), UINT64_C(0x5812cea521ffaeaf), - UINT64_C(0x21b6ff923871f14c), UINT64_C(0x922dbd45c2b307d1), UINT64_C(0x5c67ecbaace24d31), UINT64_C(0xb90f5e3acfaeff9b), - UINT64_C(0xea5aa9f2f14efeb1), UINT64_C(0x08003af95ab5ce92), UINT64_C(0x5a39361e05692622), UINT64_C(0xd4b8cddc309e44da), - UINT64_C(0xe20bfe5f0a1343d9), UINT64_C(0x13848357d100b2b3), UINT64_C(0x912a1b220fa678f5), UINT64_C(0x7631242b7f6d6365), - UINT64_C(0x5a9f9a3284d95674), UINT64_C(0x0d5b02c98afd4279), UINT64_C(0xede70dbc04a7a3d9), UINT64_C(0xadb3f72865ba580e), - UINT64_C(0xc4a3c11163562e90), UINT64_C(0x482e567c69b6b128), UINT64_C(0x38ec96bfcb4d965d), UINT64_C(0x923fe02a6b4bdabe), - UINT64_C(0x0ae0ca91a2be0579), UINT64_C(0x137401e7f2acf3e8), UINT64_C(0xfdad100e85bc5622), UINT64_C(0x9c07483343c8030f), - UINT64_C(0x71872f8555dbd0a8), UINT64_C(0x8de5873dbfa538e0), UINT64_C(0x2922d0d9a2d9eb02), UINT64_C(0x2744006cfc375d0c), - UINT64_C(0xa82c09537574f583), UINT64_C(0x2ab2d255e73f6f83), UINT64_C(0x6cc5f73b682b3701), UINT64_C(0x6e59fc51ee28845d), - UINT64_C(0xe536b381533cc4cf), UINT64_C(0xfd2ac9f30025e109), UINT64_C(0xc26cdfa60b8be153), UINT64_C(0x62da136e08f0f885), - UINT64_C(0xeb6a7a065b640357), UINT64_C(0x7462b101e2adb3ff), UINT64_C(0x996ec340bf52ea07), UINT64_C(0xf0aa2a872333e60c), - UINT64_C(0x222884f9c4632341), UINT64_C(0x32b5289d94dac82e), UINT64_C(0x7cdd99055bd35f17), UINT64_C(0x92d3d262aefe21bc), - UINT64_C(0xc6c1b1029eb0dd4c), UINT64_C(0x28f046ec80f3c975), 
UINT64_C(0xc1f0c2d9745c5cb7), UINT64_C(0x92ada28cf6f7fe0b), - UINT64_C(0xdfb215a8df753a03), UINT64_C(0x942ecdad535f962d), UINT64_C(0x7d739b8c0b7a1669), UINT64_C(0xee95286e88be8510), - UINT64_C(0x4ae71aa9d3c3d36f), UINT64_C(0x2bd6d5d12452cc38), UINT64_C(0x16fa1504fbedf267), UINT64_C(0x4b835f8377f3937d), - UINT64_C(0x0004374053160cb7), UINT64_C(0xe44a676c90906fe8), UINT64_C(0x2389c459f53fbdcd), UINT64_C(0x4a7031455481da9e), - UINT64_C(0xb72c293d969a40cc), UINT64_C(0xd9b72ee09dde404d), UINT64_C(0xa31f4f98c5aabc97), UINT64_C(0x56f240ad0aea491c), - UINT64_C(0x86264ebf858d67bf), UINT64_C(0x93fd3b332948fd87), UINT64_C(0x79899120e2d72215), UINT64_C(0x36dedea1a614643e), - UINT64_C(0x1c5e947b88cba0f6), UINT64_C(0x20ec77907c771a4f), UINT64_C(0x587a65fe2c8f5487), UINT64_C(0x9b5431d881ff3b4a), - UINT64_C(0x8f55b2fd967902d7), UINT64_C(0xebd59a640fee9b7e), UINT64_C(0xd5a77b39543d5bef), UINT64_C(0x5dbf440d204f5d0f), - UINT64_C(0x4e22065f53ba213e), UINT64_C(0x4611a2d169ad5a0b), UINT64_C(0x41ea9888cb5be7d1), UINT64_C(0xf8a661f2359be997), - UINT64_C(0xde83a9e3a6562631), UINT64_C(0xd66dedc223dad775), UINT64_C(0x162e54732874a52a), UINT64_C(0xf6d91b1963c23d56), - UINT64_C(0x56d3c9a025a95772), UINT64_C(0x92ddff0a1caeb05c), UINT64_C(0x6cbeb9f263443bd7), UINT64_C(0xb4ad540e1b11894b), - UINT64_C(0xcfa573f2f78d8b29), UINT64_C(0xad477ed16d45543f), UINT64_C(0x0d0283973ed3423a), UINT64_C(0x5307f93f3654f284), - UINT64_C(0xbc9b362f504b145b), UINT64_C(0x5661193dc5bcb5ff), UINT64_C(0x151c9b1c7c0f246a), UINT64_C(0xad25cfcfd5e399d2), - UINT64_C(0xc5855adf08226db2), UINT64_C(0x5a027c03c078be13), UINT64_C(0xc2465bfb0dc5b99c), UINT64_C(0x8aaa55a9eca79b60), - UINT64_C(0x797a7c2608c23d9e), UINT64_C(0x692b8d7da8c7f748), UINT64_C(0xc23c7b1ab3e883e1), UINT64_C(0xe1ebb866f32ac6cf), - UINT64_C(0xca6be5075b5046f9), UINT64_C(0x3105a0555f6a3bac), UINT64_C(0x525b7cc4839ea6c5), UINT64_C(0xce1dd2aad7e83cf1), - UINT64_C(0xb4a9105674d79be6), UINT64_C(0x667eb8384834f7db), UINT64_C(0xb200a7a30f789150), 
UINT64_C(0x4ba4d2c780055821), - UINT64_C(0xb48a01ad5f7474c6), UINT64_C(0x3310ba4a1e25aab8), UINT64_C(0x64379d2408fd5735), UINT64_C(0xf11e9788704e5e0d), - UINT64_C(0xe9866ab0a8e90f4e), UINT64_C(0xaa344ffe50f7a934), UINT64_C(0xcce37a15b3870924), UINT64_C(0xe22135597a867f1c), - UINT64_C(0x8770a58d7fe57f99), UINT64_C(0xcafbbc8d2024bcbc), UINT64_C(0x2307e7f0fcdb1909), UINT64_C(0xdd016550b9ed2b2a), - UINT64_C(0xd0bcf0e9dee7df90), UINT64_C(0xe82d2e7daeab325c), UINT64_C(0x721a2aba71709aa7), UINT64_C(0x38cfabc260602614), - UINT64_C(0x3099ccb02b73b4c8), UINT64_C(0x00250ce48fd67df0), UINT64_C(0xcace64d8984b19cf), UINT64_C(0xee305dcbae8615ca), - UINT64_C(0xd187da55485b86ef), UINT64_C(0xebea32b2455e6486), UINT64_C(0x77cb912fa927d5c5), UINT64_C(0x911002ac8b62cbd8), - UINT64_C(0x70730c24c32c5870), UINT64_C(0x0a7cb6f89e988a83), UINT64_C(0x6b5e00839b7db787), UINT64_C(0xecae9f4cfd9ce924), - UINT64_C(0xae09926b714019a5), UINT64_C(0xbc1b2c59bc5ce769), UINT64_C(0x592756761e90349f), UINT64_C(0x95c9a69a21936de3), - UINT64_C(0x192b2119ee48eb9a), UINT64_C(0xcd8d11ebcd8a71c2), UINT64_C(0x34de8d4cad3151d6), UINT64_C(0x0fc4f3baf540eb1c), - UINT64_C(0x88bd85e02b2ec0e2), UINT64_C(0x5b65423e815dafb6), UINT64_C(0x66ec6fadd29f273e), UINT64_C(0xc3622fbc1f1c7bd0), - UINT64_C(0x50cc102827ff1acf), UINT64_C(0xe73cab705018a55f), UINT64_C(0xcd552b588a227f38), UINT64_C(0xc462735f28a9c597), - UINT64_C(0x3e3ccb00a16906e1), UINT64_C(0x79bdf5d7e7dfa593), UINT64_C(0xb333b6942d5db3a9), UINT64_C(0x3566edd901f25f20), - UINT64_C(0x8c5fe3e063253c7b), UINT64_C(0x9f0aa4160fb652ee), UINT64_C(0x2361d9bca2c92f43), UINT64_C(0x2d6a0339fe1de8ee), - UINT64_C(0x389b1bd9476b0470), UINT64_C(0xd7fa2522f0da451e), UINT64_C(0x43e6a01d67c62b2d), UINT64_C(0x5bdc15971dc0d5b3), - UINT64_C(0x38a0a80acbadf021), UINT64_C(0x2c66125ec66e1fad), UINT64_C(0xb58f61bb53b6a9ff), UINT64_C(0x492142919b2d61d6), - UINT64_C(0xd905263cc927ebd9), UINT64_C(0xca15f966e2279122), UINT64_C(0xf9dc67f8101119c9), UINT64_C(0x7f6755699c23d8c9), - 
UINT64_C(0x26146d38a23b0bdf), UINT64_C(0x0166c70bc773d9aa), UINT64_C(0x5b3317113904ec75), UINT64_C(0x5d3c4311b21e44d1), - UINT64_C(0x479c13c75df8cf18), UINT64_C(0x75a880dd38a8a4ff), UINT64_C(0xdf378e2eb432708d), UINT64_C(0xca1cb0f76b1c5f04), - UINT64_C(0x06c76e876516eb46), UINT64_C(0x965c10e60ec202ad), UINT64_C(0x67b18e2140e0aad3), UINT64_C(0x203ca38572b212b8), - UINT64_C(0x72adad835dd333c6), UINT64_C(0xdd02aa349680a96a), UINT64_C(0x69ab0df01d4b3eab), UINT64_C(0xfebfd83a2c43afd1), - UINT64_C(0x0dcd90c392b9fae4), UINT64_C(0x8a87b8033e4cd8cc), UINT64_C(0x3902150c36e99880), UINT64_C(0xb5b655e071474ebc), - UINT64_C(0x6c2dc9eeaffbd8d8), UINT64_C(0x3cf62bfa4986f0fe), UINT64_C(0xa68eaf0719a9afbc), UINT64_C(0xde1f4e9a4b190aef), - UINT64_C(0x7fbc9e8538999e56), UINT64_C(0xf6d5e9db2208a40c), UINT64_C(0x93b13abaddf4554c), UINT64_C(0xd8b5e4ad9911629f), - UINT64_C(0x6fdb9d7376488e52), UINT64_C(0xee604a7ce20d75ad), UINT64_C(0x94ec4abbaa9c2c1d), UINT64_C(0xdbd148c4fcd05ec1), - UINT64_C(0x0865c7c3b380a005), UINT64_C(0xa6da59a56992f211), UINT64_C(0x2eb1dc9f941c83ef), UINT64_C(0x3bf5ccf06910fae7), - UINT64_C(0x23a70e117e1f29f0), UINT64_C(0x4273791acbf6c4e5), UINT64_C(0x338414ec6b5e5d60), UINT64_C(0xa5873517e3d057d9), - UINT64_C(0xea88400a890764f6), UINT64_C(0xc0569d573ca5364f), UINT64_C(0x4c3fc02fc93316e0), UINT64_C(0x76597f718657e577), - UINT64_C(0x17052b8440c7d824), UINT64_C(0x9a7ec0a30be21a00), UINT64_C(0xab0453ac2173dac9), UINT64_C(0xb6f3706820512809), - UINT64_C(0xef44f0b07d46180a), UINT64_C(0x5e9aa12e99509a72), UINT64_C(0x6231337efc0182ca), UINT64_C(0x0963321a419da89b), - UINT64_C(0xfda3e7ad51f82b5e), UINT64_C(0x1ab8790c2f5bf1a3), UINT64_C(0x9ef177b8a59f28c0), UINT64_C(0x27d1c87da66c1652), - UINT64_C(0x1bd6bdf27c49d109), UINT64_C(0xc151e2a66994d599), UINT64_C(0x5e1b8d826b8c12a9), UINT64_C(0x39f41d57213261b5), - UINT64_C(0x16a57bd0bc78aada), UINT64_C(0x0127e7f9699b55c7), UINT64_C(0xd79eccc9f9d703be), UINT64_C(0xb41b81c61ba66d7d), - UINT64_C(0xcf8b79dcb95dce93), 
UINT64_C(0x5ca102a7743a6e0d), UINT64_C(0xf422a0c3a2ad7b28), UINT64_C(0x4a9137b4a0f03724), - UINT64_C(0x907dcf6425c829c2), UINT64_C(0x15551fd4432261fb), UINT64_C(0xa057dfbd55ef436c), UINT64_C(0x8b2541b9e0e0fa7e), - UINT64_C(0x7262166dcdf4b67e), UINT64_C(0xcf6533e5c608aaeb), UINT64_C(0xd6763d3967359786), UINT64_C(0x1f6b0228d257c676), - UINT64_C(0xc268c1064d2b458a), UINT64_C(0x6d8b2f6e75d2b613), UINT64_C(0xfaaf5adc43d72807), UINT64_C(0xb6376765e344f9f8), - UINT64_C(0xa8e18dd16a4bd501), UINT64_C(0xa71aa12a8ec11351), UINT64_C(0x1daaf130b537ebe0), UINT64_C(0x2e8aa415959d5d8f), - UINT64_C(0x2813ff3a3e5cbcfb), UINT64_C(0xf0fdd1d6d16a7c23), UINT64_C(0xbf2b55d2ecf0ee55), UINT64_C(0xbd4e9bec299381d0), - UINT64_C(0xac8827ab807eb180), UINT64_C(0x8514d75ac3b43b0b), UINT64_C(0xc9b5c78e45fb38a8), UINT64_C(0x4b66e6e7b797cd8f), - UINT64_C(0x1a482ffa6870a2d3), UINT64_C(0x98f55f701d4bf919), UINT64_C(0x7c0fda20e7e26ef8), UINT64_C(0x6ef795976fca3b54), - UINT64_C(0x79801cd422fa95cd), UINT64_C(0xce8a72301dbbe230), UINT64_C(0x5e79f4c925bdd0e0), UINT64_C(0x5729e93c99cc12b3), - UINT64_C(0x76d022747522392a), UINT64_C(0xb9d7652e917a6bc4), UINT64_C(0xc2978462dfa9551b), UINT64_C(0xac081b4a7528b0ce), - UINT64_C(0x5b7799fe02443b33), UINT64_C(0x6676e5687742e76a), UINT64_C(0x3e9836e33caf452b), UINT64_C(0x96ff93e427173943), - UINT64_C(0x30fa2f987359e0f6), UINT64_C(0xfaa730326c478363), UINT64_C(0x2bb0560d8986947e), UINT64_C(0x9f7c01d35aefc68f), - UINT64_C(0x6b81189bd90a0e45), UINT64_C(0xd592d2ad2df04128), UINT64_C(0xbcd0e0fe02816ec6), UINT64_C(0x1d6d84e5c1f8df0f), - UINT64_C(0xc4b55a73da2f8713), UINT64_C(0xdbd6510e7ad24d26), UINT64_C(0x7e3452b770e259bd), UINT64_C(0xd5fe716f2c3ee835), - UINT64_C(0x63a6d74ef78acd1d), UINT64_C(0x3bd673b27d5aa140), UINT64_C(0xe394f3a2a4f6d465), UINT64_C(0xf02f642cda7fee7e), - UINT64_C(0xe17ee2617b3d366a), UINT64_C(0x41cdb92402dce780), UINT64_C(0x4e5c54024fd18f6b), UINT64_C(0x6f45dd1c7c5a3f12), - UINT64_C(0xf6fd2b3f9ccda563), UINT64_C(0xe7628d358d971e26), 
UINT64_C(0x4dabc984370ed105), UINT64_C(0xec05f7d5c53cb70b), - UINT64_C(0xf48eccbc216dcf71), UINT64_C(0x8a571d0cb256f131), UINT64_C(0x4c05466392e32549), UINT64_C(0x91d3f9324ef03c3e), - UINT64_C(0xec0591069697e868), UINT64_C(0xa77da4079db8ffd8), UINT64_C(0x287335de3951784f), UINT64_C(0xe7afb90b4adbbf33), - UINT64_C(0x96e785b0c621dbbf), UINT64_C(0xc7f54753a5e1d81b), UINT64_C(0x4a3a42229fc7491e), UINT64_C(0xc9560ea788a62881), - UINT64_C(0xe34b9ee97b5bef12), UINT64_C(0xfae309a9fbff0656), UINT64_C(0xbc23f738a0bf4c58), UINT64_C(0xc6dd1ed9a7a706de), - UINT64_C(0x3473045c7f760007), UINT64_C(0x89b5f0a2e0ace69b), UINT64_C(0x7433c584785f3321), UINT64_C(0xa38220fab7357fc0), - UINT64_C(0x04e1d70ec8db6456), UINT64_C(0xa86065368c31fd72), UINT64_C(0x926cee3a66885fb3), UINT64_C(0xc09c39dbdb8240bc), - UINT64_C(0x1ee291407a9ac9db), UINT64_C(0xa6120818b86fd032), UINT64_C(0xa4c3a1cbf6a6666f), UINT64_C(0xb34ce856697db755), - UINT64_C(0xe3ef1a7123649d75), UINT64_C(0x814ea4e8549f30bc), UINT64_C(0xc8c12f327c1ee0a3), UINT64_C(0xc4ad0d22dbe77043), - UINT64_C(0x608451fb3ab06a00), UINT64_C(0x2e1141be52867cb9), UINT64_C(0x04b92abd9485965f), UINT64_C(0xcf91f012eb16b951), - UINT64_C(0xacc0a45db481b3b3), UINT64_C(0x523f65d99013b4d9), UINT64_C(0xf333b8f8613fae1f), UINT64_C(0x8b651a304f1c80b0), - UINT64_C(0xa91ecd6f061480d2), UINT64_C(0xbd01125685871081), UINT64_C(0x9933950983b6d41e), UINT64_C(0x1f4130fd7912c3e6), - UINT64_C(0x333230fc9385a4ba), UINT64_C(0x9d2d764680fb1581), UINT64_C(0x277e6bb16761eabf), UINT64_C(0x1829af028f40b602), - UINT64_C(0x9783144e64561566), UINT64_C(0x410d30cd66cb4e92), UINT64_C(0xce0e0df02a7ac717), UINT64_C(0xdbfc28dabb65c1e2), - UINT64_C(0x5a83f419f0610b35), UINT64_C(0xb0706efb6f56176b), UINT64_C(0x684148ee29c2a3d6), UINT64_C(0xc47213009755db33), - UINT64_C(0x2600f460fbea3831), UINT64_C(0x7037ec48a50dc3ec), UINT64_C(0xa761879a39764433), UINT64_C(0xcfd6983de3381424), - UINT64_C(0xfdc2524f5d605fc4), UINT64_C(0xbe84a33131a412c9), UINT64_C(0x1bd73706e51699b5), 
UINT64_C(0x7aea62c60dffb5ab), - UINT64_C(0x010fec687da2bbf4), UINT64_C(0x56aa74a28e54f75c), UINT64_C(0xba52dd2bb4019afe), UINT64_C(0x6ae298d992a98093), - UINT64_C(0xdbfc6eddb2348c70), UINT64_C(0xeab81b5b034b7836), UINT64_C(0x692b0fc00c8986ba), UINT64_C(0x02adf5476f927b39), - UINT64_C(0x0173c9bb282a94e7), UINT64_C(0x1e617773e554c877), UINT64_C(0x241d5db92d0aa39e), UINT64_C(0x902c43c4be589249), - UINT64_C(0x0b817ad8f9617273), UINT64_C(0x43508b7fb53d5d1f), UINT64_C(0xaf1d845886eeb50c), UINT64_C(0xc645d0758b0a08f2), - UINT64_C(0x3d1339390783be12), UINT64_C(0x376e4919f2fc41c9), UINT64_C(0x392c5bb8475370e6), UINT64_C(0x5e891f54eec6c015), - UINT64_C(0x16a12880b9ac0923), UINT64_C(0x6437af0453c57f36), UINT64_C(0x8dd1ec0ee82c5835), UINT64_C(0xc4738296f5085ef5), - UINT64_C(0x68c5d2b2d2d06381), UINT64_C(0x8a4627fb8fbef8df), UINT64_C(0x9d56ea18dd2590b3), UINT64_C(0x8dbdd1fd0ca96586), - UINT64_C(0x9c17bd827cc151ab), UINT64_C(0xdddb70eb24c36775), UINT64_C(0xb56277dfd02a9c4d), UINT64_C(0x5a8388d255264a83), - UINT64_C(0xcb7207a0b0155fa4), UINT64_C(0x2bbc2967864dd11a), UINT64_C(0x19fb91190adfc85a), UINT64_C(0xed562d76a7e244c3), - UINT64_C(0xf5438c5585588610), UINT64_C(0xbc16ff713cde2e48), UINT64_C(0x42248c858cf837cb), UINT64_C(0x59c8eeb9769cf08a), - UINT64_C(0x0f5260cc1dc624b7), UINT64_C(0x6b880672b5ebfdd5), UINT64_C(0x2e6d6cf57e3365cf), UINT64_C(0xe994b274628cdb20), - UINT64_C(0x939e00fbb43765d8), UINT64_C(0x093150ef5c7cd883), UINT64_C(0x8ae15f57f13b42f1), UINT64_C(0x3af5014a74f18355), - UINT64_C(0x7e1a2d0c860bcd23), UINT64_C(0x796312eee1445e38), UINT64_C(0x1cbde8ef8bdfee3d), UINT64_C(0x207592ed0910de04), - UINT64_C(0x150e839a79142012), UINT64_C(0xb920f5ff40de84a6), UINT64_C(0x0c05b146a932213b), UINT64_C(0x7406c434e2d92546), - UINT64_C(0x19376004d1fc67aa), UINT64_C(0x82f3677fcf0dd552), UINT64_C(0xd9daf63e3aa745a9), UINT64_C(0x8e1e09d0a9676fdf), - UINT64_C(0x2cb86571c0289958), UINT64_C(0x4c4c12eb3a97b760), UINT64_C(0x1e3468d9bf56d00c), UINT64_C(0x11f90498f14cb4a4), - 
UINT64_C(0x251664b4422a7c58), UINT64_C(0xad10e44d41c2b7c5), UINT64_C(0x663cf17121b6d221), UINT64_C(0x3fe40cdc49c541b8), - UINT64_C(0xb1b1a8b2a941f9c7), UINT64_C(0x83ffae6e34d4eb78), UINT64_C(0xa4564673c6728fbf), UINT64_C(0xe1499f6bd812a4b9), - UINT64_C(0xfb5507a915ed36a3), UINT64_C(0xe055a829c62de53c), UINT64_C(0x1ea06fc53acba653), UINT64_C(0xce0f8c15fd8f2258), - UINT64_C(0x7dd42e43e5ef6f4b), UINT64_C(0x0c55aecd7e1adc10), UINT64_C(0xc31b0e4d3a4e8b1c), UINT64_C(0x1205469d91599780), - UINT64_C(0xbba5d6df94390b83), UINT64_C(0xc97925cae2f17697), UINT64_C(0x3b98f3dc9e15ea08), UINT64_C(0x878203758954cd36), - UINT64_C(0x818deaef5ba91f77), UINT64_C(0x6f8f1786214acb89), UINT64_C(0x26c5c2162849ece8), UINT64_C(0xaf1c297b73471dd3), - UINT64_C(0x415c497c9fa7e936), UINT64_C(0xc1804e923aa3cce6), UINT64_C(0xdd7ca8ffb78dc68c), UINT64_C(0x5b912445ed7ba89a), - UINT64_C(0x95dec0af89a1f157), UINT64_C(0x7041c032d1fa5266), UINT64_C(0xc569835beabc20df), UINT64_C(0xcc662c0dbb7baaef), - UINT64_C(0x20d5d2c1383ff75c), UINT64_C(0x7efdaae3e1c4eaaf), UINT64_C(0x3575fad9533be200), UINT64_C(0xfb0fb500836d48dd), - UINT64_C(0xd211a5090e6d53e2), UINT64_C(0x34afe4050a01467c), UINT64_C(0x63457fe7bfe187c3), UINT64_C(0xc3ee000cb474d925), - UINT64_C(0x4fd32cbbb8326e22), UINT64_C(0xc2abcd1fc9bf14c2), UINT64_C(0xf34b534e55f28258), UINT64_C(0x094ff2a11972ddec), - UINT64_C(0x9744b26f181926a9), UINT64_C(0xa7fe6a0982135b29), UINT64_C(0x0f8d9e7a0de7d61b), UINT64_C(0x4bcd12d1b5d3d8a6), - UINT64_C(0x706e34dbac81bd39), UINT64_C(0xefea01605e9304c6), UINT64_C(0xee3bb6d1e510efe1), UINT64_C(0x84a094db3f4620f8), - UINT64_C(0xf1752fc679d6aeb3), UINT64_C(0x54921e5d6949a43f), UINT64_C(0xd3616f81f2ff8c55), UINT64_C(0x8bd9584eb62232bd), - UINT64_C(0xa990035eef6e7b13), UINT64_C(0xd4c56de5c11dcdda), UINT64_C(0x8048c23ec8bd072b), UINT64_C(0x407539904d984e51), - UINT64_C(0xeaf5a1d46eb3779b), UINT64_C(0x4b06e5769362f357), UINT64_C(0x931f75e21bc0d143), UINT64_C(0x9369439b81c92fc4), - UINT64_C(0x059fccc0d4afbb45), 
UINT64_C(0xd072671b3c927118), UINT64_C(0x61b6803f95c41115), UINT64_C(0xacb4b2c4381da3f5), - UINT64_C(0xd73bf897ee871c72), UINT64_C(0x241c9d52c953d3c0), UINT64_C(0x083c079e704d7b96), UINT64_C(0x8c431ee43e5171a5), - UINT64_C(0x66079596998b96b6), UINT64_C(0x041ea35d207b478e), UINT64_C(0xbe698683cf7b258e), UINT64_C(0x5457365cf6cbc5bb), - UINT64_C(0xc166c3ef7006b02d), UINT64_C(0x27789ff1e5365132), UINT64_C(0xae4a02397d308867), UINT64_C(0x0388704d03d7b613), - UINT64_C(0xf5c9d782d3fd58e3), UINT64_C(0xb51c3fe53965624e), UINT64_C(0xf785b86e7fe0adec), UINT64_C(0x19f72a9ef3a215e8), - UINT64_C(0x19db58361e6633d9), UINT64_C(0xf1fe7a08693d64ab), UINT64_C(0x07c3310adc3bbf03), UINT64_C(0x742e87d333077816), - UINT64_C(0xe817529af0f04970), UINT64_C(0xe7f343c941a044ff), UINT64_C(0xf9693fb4f37b4d2c), UINT64_C(0xb99da4a0b6ccb1ed), - UINT64_C(0x4eef654d39c7f631), UINT64_C(0xd06badd9354befc8), UINT64_C(0x3dea38b48a4fb6cf), UINT64_C(0xf6551a2de11ec63d), - UINT64_C(0xf0dd7ca2d08731e5), UINT64_C(0xfbbac6e989684aff), UINT64_C(0xe2b65b698f6ea652), UINT64_C(0x679e2fc32595fb51), - UINT64_C(0x6547fdc240571414), UINT64_C(0x6809f663de2d0466), UINT64_C(0x6c6b7a0a40a5e48f), UINT64_C(0xe5f43660d891606e), - UINT64_C(0xa44f283a5a5c10fd), UINT64_C(0x95635b53a60083be), UINT64_C(0x7e0f003a2698a45c), UINT64_C(0x2fd0eb2a3cb4db79), - UINT64_C(0x7416380640ad33c7), UINT64_C(0x988de04a8bfe794b), UINT64_C(0x6d00569ebd6839ff), UINT64_C(0x22ddd7d3d0efa384), - UINT64_C(0x20f9c1ae73b1a651), UINT64_C(0x32386da97bb626af), UINT64_C(0x263c358b8e1975fe), UINT64_C(0x32bd1e4fdb3e7f7c), - UINT64_C(0x2ebb53af95ab07db), UINT64_C(0xeccc526f7e6aca61), UINT64_C(0x186fd1f3ad161e28), UINT64_C(0xf96dd58eca026372), - UINT64_C(0x0403c8572fee3bf3), UINT64_C(0x2598261d29b22e84), UINT64_C(0xa4027ffeed481ae0), UINT64_C(0xe2f690ddcdb0fdaf), - UINT64_C(0x95d11d0d60c528fd), UINT64_C(0x0cc242f0eeae1d6c), UINT64_C(0xfa3440087835377f), UINT64_C(0x3d8fad475b8139e4), - UINT64_C(0x8e92fce862d8a97e), UINT64_C(0xc53bc4cb5ed50eb4), 
UINT64_C(0xc8f91ece0194e8d4), UINT64_C(0xf78d7c6b5cff07e1), - UINT64_C(0x3163d8458b924665), UINT64_C(0xc2ae6dc185c739bf), UINT64_C(0x2943e3eae337c6c6), UINT64_C(0x96bd36f0da4a49f7), - UINT64_C(0x98753f33282f27bf), UINT64_C(0xd5c33455bf0f69fd), UINT64_C(0x78cc9f69e0286682), UINT64_C(0x0631fadc21ec437c), - UINT64_C(0x521c3db58b6b1170), UINT64_C(0x2333f0f70e46b5cf), UINT64_C(0x87be027b8d434ac6), UINT64_C(0xba4c26796c582e4c), - UINT64_C(0x35d52e4f85db73e4), UINT64_C(0x8ac3723b24e99436), UINT64_C(0x4a2b6ce4b7a97a02), UINT64_C(0xcb8017cc584b287d), - UINT64_C(0x1ca3610bc2f30e9f), UINT64_C(0xc1c2dafdd385b283), UINT64_C(0xa812778eceff9a2b), UINT64_C(0x91b8429959ef5359), - UINT64_C(0xa2750c665bcab7d2), UINT64_C(0x9212f5d704b5320b), UINT64_C(0xfa46bb7a213be57f), UINT64_C(0xd20cbd122dce6c1d), - UINT64_C(0x82868b5aee7a4776), UINT64_C(0xf49ec5ddf8cec096), UINT64_C(0xa4fc2bf71ac9dcc2), UINT64_C(0x9d8b8f462bd2f17b), - UINT64_C(0x452703fe91008332), UINT64_C(0x919a288ada854bef), UINT64_C(0x75d2b2eb0f4eeed7), UINT64_C(0xd64885293558a96f), - UINT64_C(0x098d7efb4f8d5b31), UINT64_C(0x7ee77eef93a3928e), UINT64_C(0xb28eebae28b63dc8), UINT64_C(0x0f01129fc90af970), - UINT64_C(0xf3d5b92900d45181), UINT64_C(0xb9d8a408ea6715c0), UINT64_C(0xe44424fb8ca9e22e), UINT64_C(0xd81135834c1aaf96), - UINT64_C(0x445b3d67398e888b), UINT64_C(0x0dad43784fe36cda), UINT64_C(0xe6d1bd75c5d81518), UINT64_C(0x662f0e924150c5cb), - UINT64_C(0x78179f80df6e0709), UINT64_C(0xdd8fc687a741289c), UINT64_C(0x710873d7f5ab060e), UINT64_C(0xa1961d2b538f497c), - UINT64_C(0xb36bbf75bc8b8761), UINT64_C(0x675c608353017307), UINT64_C(0xade6b1aa0ec59bbe), UINT64_C(0xc803a2c9426b3c5f), - UINT64_C(0x48a8210409b5ffac), UINT64_C(0xc3d58389ce5f3b13), UINT64_C(0xa23ceb0e71b08443), UINT64_C(0xd9d192cd9c5e9a05), - UINT64_C(0x20d9cd878b94147d), UINT64_C(0x22329c7695f6df46), UINT64_C(0xaebdcdc2c2cbc0d9), UINT64_C(0xe95ae3d514f6f94b), - UINT64_C(0x59152e1f5715e782), UINT64_C(0xb3280d75a8134f15), UINT64_C(0x5bce3379e1fcb7b4), 
UINT64_C(0x437d9c3238c4169f), - UINT64_C(0x77db7e5ebd5125bd), UINT64_C(0x0dd3aef40336d438), UINT64_C(0x4a496a56bac81428), UINT64_C(0x72a128c3875dc93d), - UINT64_C(0x8eb605e5bef1747d), UINT64_C(0x666d4546567a4eef), UINT64_C(0xad5ad003399d2296), UINT64_C(0x19c74366682b52a0), - UINT64_C(0xb3c35c5a0e259420), UINT64_C(0xf98340503eb93d6d), UINT64_C(0xa51985b0bb7f81e8), UINT64_C(0x2a21510c6c7ca42f), - UINT64_C(0x3c1ac0b52c230998), UINT64_C(0x4e1d572a2d77000b), UINT64_C(0x8dd3adff3bfdec71), UINT64_C(0xdfb3a4a23e43d035), - UINT64_C(0xe12f748421173e62), UINT64_C(0x2f356145d2f72758), UINT64_C(0x31c13682374c445c), UINT64_C(0x09240a1f409fab88), - UINT64_C(0xa346e2d2f72fd5e8), UINT64_C(0x2c5b53bfc05f9f77), UINT64_C(0x0a9f7ab218574f6e), UINT64_C(0xc3fcb9b977f0cceb), - UINT64_C(0xac26889eb86459b9), UINT64_C(0x1082f785bc3dac21), UINT64_C(0x3c8c337a4c67ef18), UINT64_C(0x118e48d0e8a66e02), - UINT64_C(0xb777cef85278f2dc), UINT64_C(0x12a268a3dcda05bc), UINT64_C(0x75f5f7d3fde0bd9e), UINT64_C(0x62f5f1650ec91670), - UINT64_C(0x81fcf9e3e1c3adec), UINT64_C(0xf0b5e35ace23349c), UINT64_C(0xde7d514d058e53a4), UINT64_C(0x52a625e5f06242c7), - UINT64_C(0x3cc1346eda6a430a), UINT64_C(0x165bd737e851f6a1), UINT64_C(0xe52c53d745f1b49a), UINT64_C(0x15513074f676fafc), - UINT64_C(0xcb8797dbb29e6710), UINT64_C(0x27b92c8190fd679d), UINT64_C(0x0b39384ac668b176), UINT64_C(0x11341e6d7adad0e9), - UINT64_C(0x491b5b5390b70f94), UINT64_C(0x1f5eccf586d03746), UINT64_C(0x6502ca945646feae), UINT64_C(0x3abb5466229ef7d8), - UINT64_C(0x535b4effbe0ce5f6), UINT64_C(0x6575eefef9e916f5), UINT64_C(0x77a76fbf3c76f2d7), UINT64_C(0x1cc63124152994a7), - UINT64_C(0x6e33f80e95d4323d), UINT64_C(0xd711791d9b2e1d65), UINT64_C(0x7c766cd52013ae49), UINT64_C(0x08bc15230d2ef477), - UINT64_C(0xb751fa3b942ab063), UINT64_C(0xfe99a8b170a11941), UINT64_C(0x731979294908218a), UINT64_C(0x32166899c12f3097), - UINT64_C(0x8318df8e3823dd3d), UINT64_C(0x940e81f0b4ece3d8), UINT64_C(0x81ea0f12130235ea), UINT64_C(0x36603dfef356d752), - 
UINT64_C(0x409eeb16b992d793), UINT64_C(0xf4c675cca09e229a), UINT64_C(0x0ef989d732dae818), UINT64_C(0x269b4385573ad2f6), - UINT64_C(0x53df04584157173c), UINT64_C(0x260c347bedc5ce82), UINT64_C(0xb9fbfba9b58c1b09), UINT64_C(0x20115df9d0693a14), - UINT64_C(0x8c0fb27588303369), UINT64_C(0x3a9450974a66eaaf), UINT64_C(0x805f0d515d715679), UINT64_C(0x10f4b52a09898972), - UINT64_C(0x20e9c3449e84718e), UINT64_C(0x9eed8745b4e234e2), UINT64_C(0x946c3083bf840def), UINT64_C(0xb18de02e626f7dd9), - UINT64_C(0x9e8b496b1d035ed8), UINT64_C(0x6ef3891e7c690f77), UINT64_C(0xd62269e5ad1c07f5), UINT64_C(0x7117ed7eddc2883e), - UINT64_C(0x260f1d08457dfcca), UINT64_C(0xe0759189d723da9d), UINT64_C(0xd6d40adb9c9f94d7), UINT64_C(0x7c47c4b4a670b77e), - UINT64_C(0xb2b5179563a2abe1), UINT64_C(0x62118cb60f121507), UINT64_C(0x22c3a4a74379ceb1), UINT64_C(0xd5904c844fbfed74), - UINT64_C(0xa0afa38c06d50d92), UINT64_C(0xd6223dbbcfcf73f4), UINT64_C(0xf19623e7ec6f83dd), UINT64_C(0xd08c12de2b6265f6), - UINT64_C(0xc487d5dc19489db6), UINT64_C(0x759283ffd06fc796), UINT64_C(0xd61a735ad1cd7ccc), UINT64_C(0x32084ba3ca8fa3ee), - UINT64_C(0x17530308a1204968), UINT64_C(0x80328582a1eb8d8f), UINT64_C(0xd4c873deec7fb3d7), UINT64_C(0x11c825cc4bc8b181), - UINT64_C(0x0137fa50576b21eb), UINT64_C(0xc5ea2f958a3ddb53), UINT64_C(0x6ae611d92b67c9bc), UINT64_C(0xb798b3e1f9c3a851), - UINT64_C(0x22a42679fa4b013f), UINT64_C(0x2071f22dae8de629), UINT64_C(0x3faa3a80e45cbca6), UINT64_C(0xb0418f45808009ec), - UINT64_C(0x446063013dd5a0f4), UINT64_C(0x932445b680ef71ec), UINT64_C(0x2bc9a2d9ab8e2662), UINT64_C(0x8ebd57fbc56a6154), - UINT64_C(0xa28f3d2264ad0f10), UINT64_C(0xffff84df76a10c15), UINT64_C(0xac5c9b0e78fbee81), UINT64_C(0xc1f08e08982b237c), - UINT64_C(0x5907b7fa41daa2b8), UINT64_C(0xbed3856320d9c3c2), UINT64_C(0x500a342c1902f015), UINT64_C(0x0c3a5d539c71b7d6), - UINT64_C(0xa706750b1c3e5604), UINT64_C(0x1543ab593a8c824c), UINT64_C(0xbdfd9d26f151d83c), UINT64_C(0x1603bb40537de208), - UINT64_C(0x1501b0ba802daa2d), 
UINT64_C(0xdcbcc803f3c11f3c), UINT64_C(0x2bb283a389ec2f35), UINT64_C(0x3a27513ef9d14bf4), - UINT64_C(0xcb7c4fd02a39d8af), UINT64_C(0xcc6f61a03488e43f), UINT64_C(0xfdddf2b5fd6c4b05), UINT64_C(0xa015987625b9755d), - UINT64_C(0x14c5a9b03c63b253), UINT64_C(0x413f7d2608bf939e), UINT64_C(0x8bdb68c7176407e5), UINT64_C(0x436de64d8a614c32), - UINT64_C(0xc2aca4b10ff0bf8e), UINT64_C(0x3b56cc9c1df797e4), UINT64_C(0xb1750cce6cca57bb), UINT64_C(0x8c80e2303509012a), - UINT64_C(0x7f25bae3c4fea8af), UINT64_C(0xecf8ed9dac1367b8), UINT64_C(0x1a49274e39668f4e), UINT64_C(0xca4a0ae881c7dc39) + UINT64_C(0x6fa74b1b15047628), UINT64_C(0xa2b5ee64e9e8f629), UINT64_C(0xd0937853bdd0edca), UINT64_C(0x4e9fb2b2b0a637a6), + UINT64_C(0x26ac5a8fac69497e), UINT64_C(0x51e127f0db14aa48), UINT64_C(0xea5b9f512d8d6a09), UINT64_C(0xf3af1406a87de6a9), + UINT64_C(0x3b36e2ed14818955), UINT64_C(0xb0ac19ef2dde986c), UINT64_C(0xd34ed04929f8f66d), UINT64_C(0xe99978cff2b324ea), + UINT64_C(0x4032cb3ecff8cb38), UINT64_C(0xfa52274072d86042), UINT64_C(0x27437346dec26105), UINT64_C(0xec1cbf04b76aec71), + UINT64_C(0x6dd57b3dac56cd39), UINT64_C(0x34e9021797e95aad), UINT64_C(0xdc8d3363540c5999), UINT64_C(0x773d283eeeabf4ab), + UINT64_C(0x373c522657461aaf), UINT64_C(0x154cfe0f497d7f78), UINT64_C(0x6d377183b5ca6550), UINT64_C(0x614da5f6055e904b), + UINT64_C(0xd77b66b34896f00e), UINT64_C(0x122538125d6adaef), UINT64_C(0x1021e161206d9091), UINT64_C(0x38407c4313aefdfa), + UINT64_C(0xd941cc5dafc66162), UINT64_C(0xfc2432a6ea885315), UINT64_C(0x5576dc02b68b10ed), UINT64_C(0xd8449f9d4ab139a2), + UINT64_C(0xd333cbcd49cbacba), UINT64_C(0x700d20430e06eeb8), UINT64_C(0xdeb34810d6d0320a), UINT64_C(0x6743363d6cc8ba68), + UINT64_C(0xbd183cb526e6e936), UINT64_C(0xee62bf5ee97de5ea), UINT64_C(0xf6b855e743e76853), UINT64_C(0x83ac16a35d132df9), + UINT64_C(0x2046f2c70c2130b1), UINT64_C(0xaadd5007102b5ee4), UINT64_C(0x8eedac842e63cdac), UINT64_C(0xba02956e43c18608), + UINT64_C(0xd2688af010adbeaf), UINT64_C(0x4aaa5295377c17be), 
UINT64_C(0x83792382ba198f10), UINT64_C(0x6fc42849961a25b6), + UINT64_C(0x3501677f06fb1311), UINT64_C(0x1e18b89705c224dd), UINT64_C(0xa0a0b8684aa2e12d), UINT64_C(0x30d19aac3d40898e), + UINT64_C(0x41dd335a29272e9b), UINT64_C(0x5c5d445a07426e3f), UINT64_C(0x6f13080e67946fdc), UINT64_C(0x3ddabae21609bf08), + UINT64_C(0x8e6146d3cde11ca5), UINT64_C(0x9eff76a4c39eacf4), UINT64_C(0x71c66d0a423a21b7), UINT64_C(0x68515c0b712bbc4f), + UINT64_C(0x5edd17cec412a735), UINT64_C(0xa444f487c96f896c), UINT64_C(0xc161d16d4e54041a), UINT64_C(0x3a2d84d3e09bafb9), + UINT64_C(0x63a406b157a5f2f1), UINT64_C(0x18292d6007f839ba), UINT64_C(0xcaac5789618f2aac), UINT64_C(0x6f516d95f749dd97), + UINT64_C(0xb5784409560e219f), UINT64_C(0x12f0f0d6fbdcb81c), UINT64_C(0x993d6c2a47089679), UINT64_C(0xcc9247b35870aebf), + UINT64_C(0xa1ca8eff8b1bca70), UINT64_C(0x7a1d015397e558cc), UINT64_C(0xc504a4d4815f8722), UINT64_C(0x3e44258e93472b26), + UINT64_C(0x11bd0578a36c8044), UINT64_C(0x84c7087603a0a6ea), UINT64_C(0x457d0c59e84c9ac8), UINT64_C(0x32129275ee63dd95), + UINT64_C(0x66269220e943024d), UINT64_C(0x197de12f9d6e5c72), UINT64_C(0x06fdd09a4d6157dd), UINT64_C(0xf8c1a8b51fe95716), + UINT64_C(0x41eeb6129149f6cf), UINT64_C(0x42f510887a61de1b), UINT64_C(0xf3d2aa6e4fe5949d), UINT64_C(0xc0799007b85373aa), + UINT64_C(0x81577b167de515c3), UINT64_C(0x01f424fc6b856270), UINT64_C(0xff6247ed0658caa8), UINT64_C(0x63ad005e620fe4bb), + UINT64_C(0xdb919b9f63c93174), UINT64_C(0x5693dbd6c76c7683), UINT64_C(0xdaa9b82e85e0355a), UINT64_C(0x424c5c4e5672fc73), + UINT64_C(0x9de3ca332ba818f1), UINT64_C(0xb28f375a58bc6c1e), UINT64_C(0xef0af1e6041b9cd4), UINT64_C(0x0418afb53ef5408f), + UINT64_C(0x9a37634585d3330a), UINT64_C(0x3ab5aec014b097cd), UINT64_C(0x384a0739a3ff7dc8), UINT64_C(0x0ff31c11226e5d5a), + UINT64_C(0x71070735f1c16bb4), UINT64_C(0xc4f78905f49a3840), UINT64_C(0x561f68d6a5f44a81), UINT64_C(0xb09bd8cd8d932357), + UINT64_C(0xf270b47652354fdb), UINT64_C(0x47d6ca7bba50c2c7), UINT64_C(0x2720590d7b2b7b54), 
UINT64_C(0xcaac35df08cab300), + UINT64_C(0xd05759dee169d9fd), UINT64_C(0xdb8d0d0403a6aafb), UINT64_C(0xcd3ab85684ba537c), UINT64_C(0xad69c4e5240c158f), + UINT64_C(0x65427c4ff3637db2), UINT64_C(0x085ecbbf903a45ae), UINT64_C(0xeafed57a94384c62), UINT64_C(0xc99972367cd21eba), + UINT64_C(0xc1e2cf52270b20eb), UINT64_C(0x825dad5142681653), UINT64_C(0x47e99edc5e141d94), UINT64_C(0x125813bc26e42e07), + UINT64_C(0x06f41d2441b172ca), UINT64_C(0x5e9e640ed911730e), UINT64_C(0x5900403342f0f362), UINT64_C(0x57a600d157ee9945), + UINT64_C(0xbcc5d702f02dc7e0), UINT64_C(0x8258cf5a1a6435ab), UINT64_C(0xdf885b6a0343a3e0), UINT64_C(0xadd74c04a503b09a), + UINT64_C(0x0ea210122eeef589), UINT64_C(0x5217fd50f3ecaf85), UINT64_C(0xd0c39849df6b4756), UINT64_C(0xf66d9e1c91bd0981), + UINT64_C(0x0f355b00f40e3e6b), UINT64_C(0xc01dabcd14518520), UINT64_C(0x58691b4fa9e7d327), UINT64_C(0x357616c77c22fffe), + UINT64_C(0xb9fbf8de2ed23303), UINT64_C(0x0195932bc205c466), UINT64_C(0xef0763590a08a50d), UINT64_C(0xf546866c0028a938), + UINT64_C(0x41cc8732eaad496a), UINT64_C(0xadc61f16374896c6), UINT64_C(0x5eb8f93f25ad0457), UINT64_C(0x240f00f5db3fae25), + UINT64_C(0xcc48503596dc01ef), UINT64_C(0x351baaa904a306d5), UINT64_C(0x7111179ae328bb19), UINT64_C(0x6789a31719d5d453), + UINT64_C(0xf5318492c9613de6), UINT64_C(0xa0e8c24f3f0da716), UINT64_C(0xac15d68d54401b9d), UINT64_C(0xadafb35cf63092ee), + UINT64_C(0xceb5f8d63c7fec4c), UINT64_C(0x1ae71929b980fc9d), UINT64_C(0x6efdc5693ef4ee2a), UINT64_C(0xbedd8334cade7855), + UINT64_C(0x06f1b768b476a249), UINT64_C(0x9e614bedf41dd639), UINT64_C(0x9eca9c6c9e389a5d), UINT64_C(0x76999bf01b912df2), + UINT64_C(0x04d52fb2ac70ab31), UINT64_C(0xe467ea8172f5d066), UINT64_C(0x356ed51bb0e094ae), UINT64_C(0xab2047c21b54d8ba), + UINT64_C(0x21dbbfa0a6157474), UINT64_C(0x7de36edec62f1997), UINT64_C(0x306ef59f5204a58c), UINT64_C(0x954135a769d5b72e), + UINT64_C(0x9d7774a0c2d29380), UINT64_C(0xc03acfd63ac6b88c), UINT64_C(0x9989d5ee565322e6), UINT64_C(0x19d1a58324bdd145), + 
UINT64_C(0xe74685383cc6b27c), UINT64_C(0xf9edffe1c4d81108), UINT64_C(0x94950b5b6247cb43), UINT64_C(0xe3fa8c6468d419eb), + UINT64_C(0x29981bd802f77ac5), UINT64_C(0x6cf1a6cab28c1c36), UINT64_C(0x1d34a382a5d48973), UINT64_C(0xcd1d5d546e5e4d3d), + UINT64_C(0x4ad78b4a37e52322), UINT64_C(0x24da17671ab463f2), UINT64_C(0x527504b7c7bc5537), UINT64_C(0x7ba1d92e1969b2b5), + UINT64_C(0x53a130812c49d64a), UINT64_C(0x503af48d9510f1d7), UINT64_C(0x719db8a348dee165), UINT64_C(0xa85e4fad1f343e67), + UINT64_C(0xdafc1fa9203d2d45), UINT64_C(0x7730f245c903a407), UINT64_C(0xb7c04e53f913aeae), UINT64_C(0x39ed817e1e039153), + UINT64_C(0xf415ea2b3efc7606), UINT64_C(0x15e3c53fe43f104d), UINT64_C(0x1b71e4d83ccba83c), UINT64_C(0xfe088f4c90812841), + UINT64_C(0x1ff8e2ee0a04b6ae), UINT64_C(0xf4f4a23612b9eed2), UINT64_C(0xc596a66051b8aca1), UINT64_C(0xbc898edd3370a8dd), + UINT64_C(0xce7638a7a2f9152e), UINT64_C(0xd99192635c0d5c92), UINT64_C(0x62038c87c094a1ff), UINT64_C(0xa73f1bcaac7343af), + UINT64_C(0x93c797804faa5ff3), UINT64_C(0x9da7407c705da1f0), UINT64_C(0xa52cde7d37fef9f0), UINT64_C(0xb93a7db97e3fa7ff), + UINT64_C(0x75ee91392c60fb6b), UINT64_C(0x4d7f8e3db9383ae0), UINT64_C(0xe0aec397d5290d06), UINT64_C(0x159a20f22d740d81), + UINT64_C(0x231416cff9a9b014), UINT64_C(0x71ed3a6e513b4795), UINT64_C(0x190b08ebcb87f3bc), UINT64_C(0x36bb0bcb0e8df593), + UINT64_C(0xc1e63cdc4d78dfb3), UINT64_C(0x36e2c57ba6799460), UINT64_C(0x280c0618b19f63dc), UINT64_C(0xca2b8e49d6c71d2d), + UINT64_C(0xc881e59705270f09), UINT64_C(0x26fdf0dbb5f2f451), UINT64_C(0xc6d1a3697ca86855), UINT64_C(0xd00755a203980eb5), + UINT64_C(0xa85962163dd7de95), UINT64_C(0x622b7a1d2531d00e), UINT64_C(0xb6c1cfba74436ef7), UINT64_C(0x9578891a720bf317), + UINT64_C(0x5e325058bd3a343a), UINT64_C(0x9a468a5a888a475f), UINT64_C(0xa57f0edb414a0589), UINT64_C(0xa044aef7ea680f8c), + UINT64_C(0x2036717cee9b991a), UINT64_C(0x3925631ec66cb8aa), UINT64_C(0xdcb6a5da6b2fc78f), UINT64_C(0x17a8cd724b7b5e26), + UINT64_C(0x1c704c6a48a2dae0), 
UINT64_C(0x87d8f6738a0c30bc), UINT64_C(0xd8580262a4801240), UINT64_C(0x5812cea521ffaeaf), + UINT64_C(0x21b6ff923871f14c), UINT64_C(0x922dbd45c2b307d1), UINT64_C(0x5c67ecbaace24d31), UINT64_C(0xb90f5e3acfaeff9b), + UINT64_C(0xea5aa9f2f14efeb1), UINT64_C(0x08003af95ab5ce92), UINT64_C(0x5a39361e05692622), UINT64_C(0xd4b8cddc309e44da), + UINT64_C(0xe20bfe5f0a1343d9), UINT64_C(0x13848357d100b2b3), UINT64_C(0x912a1b220fa678f5), UINT64_C(0x7631242b7f6d6365), + UINT64_C(0x5a9f9a3284d95674), UINT64_C(0x0d5b02c98afd4279), UINT64_C(0xede70dbc04a7a3d9), UINT64_C(0xadb3f72865ba580e), + UINT64_C(0xc4a3c11163562e90), UINT64_C(0x482e567c69b6b128), UINT64_C(0x38ec96bfcb4d965d), UINT64_C(0x923fe02a6b4bdabe), + UINT64_C(0x0ae0ca91a2be0579), UINT64_C(0x137401e7f2acf3e8), UINT64_C(0xfdad100e85bc5622), UINT64_C(0x9c07483343c8030f), + UINT64_C(0x71872f8555dbd0a8), UINT64_C(0x8de5873dbfa538e0), UINT64_C(0x2922d0d9a2d9eb02), UINT64_C(0x2744006cfc375d0c), + UINT64_C(0xa82c09537574f583), UINT64_C(0x2ab2d255e73f6f83), UINT64_C(0x6cc5f73b682b3701), UINT64_C(0x6e59fc51ee28845d), + UINT64_C(0xe536b381533cc4cf), UINT64_C(0xfd2ac9f30025e109), UINT64_C(0xc26cdfa60b8be153), UINT64_C(0x62da136e08f0f885), + UINT64_C(0xeb6a7a065b640357), UINT64_C(0x7462b101e2adb3ff), UINT64_C(0x996ec340bf52ea07), UINT64_C(0xf0aa2a872333e60c), + UINT64_C(0x222884f9c4632341), UINT64_C(0x32b5289d94dac82e), UINT64_C(0x7cdd99055bd35f17), UINT64_C(0x92d3d262aefe21bc), + UINT64_C(0xc6c1b1029eb0dd4c), UINT64_C(0x28f046ec80f3c975), UINT64_C(0xc1f0c2d9745c5cb7), UINT64_C(0x92ada28cf6f7fe0b), + UINT64_C(0xdfb215a8df753a03), UINT64_C(0x942ecdad535f962d), UINT64_C(0x7d739b8c0b7a1669), UINT64_C(0xee95286e88be8510), + UINT64_C(0x4ae71aa9d3c3d36f), UINT64_C(0x2bd6d5d12452cc38), UINT64_C(0x16fa1504fbedf267), UINT64_C(0x4b835f8377f3937d), + UINT64_C(0x0004374053160cb7), UINT64_C(0xe44a676c90906fe8), UINT64_C(0x2389c459f53fbdcd), UINT64_C(0x4a7031455481da9e), + UINT64_C(0xb72c293d969a40cc), UINT64_C(0xd9b72ee09dde404d), 
UINT64_C(0xa31f4f98c5aabc97), UINT64_C(0x56f240ad0aea491c), + UINT64_C(0x86264ebf858d67bf), UINT64_C(0x93fd3b332948fd87), UINT64_C(0x79899120e2d72215), UINT64_C(0x36dedea1a614643e), + UINT64_C(0x1c5e947b88cba0f6), UINT64_C(0x20ec77907c771a4f), UINT64_C(0x587a65fe2c8f5487), UINT64_C(0x9b5431d881ff3b4a), + UINT64_C(0x8f55b2fd967902d7), UINT64_C(0xebd59a640fee9b7e), UINT64_C(0xd5a77b39543d5bef), UINT64_C(0x5dbf440d204f5d0f), + UINT64_C(0x4e22065f53ba213e), UINT64_C(0x4611a2d169ad5a0b), UINT64_C(0x41ea9888cb5be7d1), UINT64_C(0xf8a661f2359be997), + UINT64_C(0xde83a9e3a6562631), UINT64_C(0xd66dedc223dad775), UINT64_C(0x162e54732874a52a), UINT64_C(0xf6d91b1963c23d56), + UINT64_C(0x56d3c9a025a95772), UINT64_C(0x92ddff0a1caeb05c), UINT64_C(0x6cbeb9f263443bd7), UINT64_C(0xb4ad540e1b11894b), + UINT64_C(0xcfa573f2f78d8b29), UINT64_C(0xad477ed16d45543f), UINT64_C(0x0d0283973ed3423a), UINT64_C(0x5307f93f3654f284), + UINT64_C(0xbc9b362f504b145b), UINT64_C(0x5661193dc5bcb5ff), UINT64_C(0x151c9b1c7c0f246a), UINT64_C(0xad25cfcfd5e399d2), + UINT64_C(0xc5855adf08226db2), UINT64_C(0x5a027c03c078be13), UINT64_C(0xc2465bfb0dc5b99c), UINT64_C(0x8aaa55a9eca79b60), + UINT64_C(0x797a7c2608c23d9e), UINT64_C(0x692b8d7da8c7f748), UINT64_C(0xc23c7b1ab3e883e1), UINT64_C(0xe1ebb866f32ac6cf), + UINT64_C(0xca6be5075b5046f9), UINT64_C(0x3105a0555f6a3bac), UINT64_C(0x525b7cc4839ea6c5), UINT64_C(0xce1dd2aad7e83cf1), + UINT64_C(0xb4a9105674d79be6), UINT64_C(0x667eb8384834f7db), UINT64_C(0xb200a7a30f789150), UINT64_C(0x4ba4d2c780055821), + UINT64_C(0xb48a01ad5f7474c6), UINT64_C(0x3310ba4a1e25aab8), UINT64_C(0x64379d2408fd5735), UINT64_C(0xf11e9788704e5e0d), + UINT64_C(0xe9866ab0a8e90f4e), UINT64_C(0xaa344ffe50f7a934), UINT64_C(0xcce37a15b3870924), UINT64_C(0xe22135597a867f1c), + UINT64_C(0x8770a58d7fe57f99), UINT64_C(0xcafbbc8d2024bcbc), UINT64_C(0x2307e7f0fcdb1909), UINT64_C(0xdd016550b9ed2b2a), + UINT64_C(0xd0bcf0e9dee7df90), UINT64_C(0xe82d2e7daeab325c), UINT64_C(0x721a2aba71709aa7), 
UINT64_C(0x38cfabc260602614), + UINT64_C(0x3099ccb02b73b4c8), UINT64_C(0x00250ce48fd67df0), UINT64_C(0xcace64d8984b19cf), UINT64_C(0xee305dcbae8615ca), + UINT64_C(0xd187da55485b86ef), UINT64_C(0xebea32b2455e6486), UINT64_C(0x77cb912fa927d5c5), UINT64_C(0x911002ac8b62cbd8), + UINT64_C(0x70730c24c32c5870), UINT64_C(0x0a7cb6f89e988a83), UINT64_C(0x6b5e00839b7db787), UINT64_C(0xecae9f4cfd9ce924), + UINT64_C(0xae09926b714019a5), UINT64_C(0xbc1b2c59bc5ce769), UINT64_C(0x592756761e90349f), UINT64_C(0x95c9a69a21936de3), + UINT64_C(0x192b2119ee48eb9a), UINT64_C(0xcd8d11ebcd8a71c2), UINT64_C(0x34de8d4cad3151d6), UINT64_C(0x0fc4f3baf540eb1c), + UINT64_C(0x88bd85e02b2ec0e2), UINT64_C(0x5b65423e815dafb6), UINT64_C(0x66ec6fadd29f273e), UINT64_C(0xc3622fbc1f1c7bd0), + UINT64_C(0x50cc102827ff1acf), UINT64_C(0xe73cab705018a55f), UINT64_C(0xcd552b588a227f38), UINT64_C(0xc462735f28a9c597), + UINT64_C(0x3e3ccb00a16906e1), UINT64_C(0x79bdf5d7e7dfa593), UINT64_C(0xb333b6942d5db3a9), UINT64_C(0x3566edd901f25f20), + UINT64_C(0x8c5fe3e063253c7b), UINT64_C(0x9f0aa4160fb652ee), UINT64_C(0x2361d9bca2c92f43), UINT64_C(0x2d6a0339fe1de8ee), + UINT64_C(0x389b1bd9476b0470), UINT64_C(0xd7fa2522f0da451e), UINT64_C(0x43e6a01d67c62b2d), UINT64_C(0x5bdc15971dc0d5b3), + UINT64_C(0x38a0a80acbadf021), UINT64_C(0x2c66125ec66e1fad), UINT64_C(0xb58f61bb53b6a9ff), UINT64_C(0x492142919b2d61d6), + UINT64_C(0xd905263cc927ebd9), UINT64_C(0xca15f966e2279122), UINT64_C(0xf9dc67f8101119c9), UINT64_C(0x7f6755699c23d8c9), + UINT64_C(0x26146d38a23b0bdf), UINT64_C(0x0166c70bc773d9aa), UINT64_C(0x5b3317113904ec75), UINT64_C(0x5d3c4311b21e44d1), + UINT64_C(0x479c13c75df8cf18), UINT64_C(0x75a880dd38a8a4ff), UINT64_C(0xdf378e2eb432708d), UINT64_C(0xca1cb0f76b1c5f04), + UINT64_C(0x06c76e876516eb46), UINT64_C(0x965c10e60ec202ad), UINT64_C(0x67b18e2140e0aad3), UINT64_C(0x203ca38572b212b8), + UINT64_C(0x72adad835dd333c6), UINT64_C(0xdd02aa349680a96a), UINT64_C(0x69ab0df01d4b3eab), UINT64_C(0xfebfd83a2c43afd1), + 
UINT64_C(0x0dcd90c392b9fae4), UINT64_C(0x8a87b8033e4cd8cc), UINT64_C(0x3902150c36e99880), UINT64_C(0xb5b655e071474ebc), + UINT64_C(0x6c2dc9eeaffbd8d8), UINT64_C(0x3cf62bfa4986f0fe), UINT64_C(0xa68eaf0719a9afbc), UINT64_C(0xde1f4e9a4b190aef), + UINT64_C(0x7fbc9e8538999e56), UINT64_C(0xf6d5e9db2208a40c), UINT64_C(0x93b13abaddf4554c), UINT64_C(0xd8b5e4ad9911629f), + UINT64_C(0x6fdb9d7376488e52), UINT64_C(0xee604a7ce20d75ad), UINT64_C(0x94ec4abbaa9c2c1d), UINT64_C(0xdbd148c4fcd05ec1), + UINT64_C(0x0865c7c3b380a005), UINT64_C(0xa6da59a56992f211), UINT64_C(0x2eb1dc9f941c83ef), UINT64_C(0x3bf5ccf06910fae7), + UINT64_C(0x23a70e117e1f29f0), UINT64_C(0x4273791acbf6c4e5), UINT64_C(0x338414ec6b5e5d60), UINT64_C(0xa5873517e3d057d9), + UINT64_C(0xea88400a890764f6), UINT64_C(0xc0569d573ca5364f), UINT64_C(0x4c3fc02fc93316e0), UINT64_C(0x76597f718657e577), + UINT64_C(0x17052b8440c7d824), UINT64_C(0x9a7ec0a30be21a00), UINT64_C(0xab0453ac2173dac9), UINT64_C(0xb6f3706820512809), + UINT64_C(0xef44f0b07d46180a), UINT64_C(0x5e9aa12e99509a72), UINT64_C(0x6231337efc0182ca), UINT64_C(0x0963321a419da89b), + UINT64_C(0xfda3e7ad51f82b5e), UINT64_C(0x1ab8790c2f5bf1a3), UINT64_C(0x9ef177b8a59f28c0), UINT64_C(0x27d1c87da66c1652), + UINT64_C(0x1bd6bdf27c49d109), UINT64_C(0xc151e2a66994d599), UINT64_C(0x5e1b8d826b8c12a9), UINT64_C(0x39f41d57213261b5), + UINT64_C(0x16a57bd0bc78aada), UINT64_C(0x0127e7f9699b55c7), UINT64_C(0xd79eccc9f9d703be), UINT64_C(0xb41b81c61ba66d7d), + UINT64_C(0xcf8b79dcb95dce93), UINT64_C(0x5ca102a7743a6e0d), UINT64_C(0xf422a0c3a2ad7b28), UINT64_C(0x4a9137b4a0f03724), + UINT64_C(0x907dcf6425c829c2), UINT64_C(0x15551fd4432261fb), UINT64_C(0xa057dfbd55ef436c), UINT64_C(0x8b2541b9e0e0fa7e), + UINT64_C(0x7262166dcdf4b67e), UINT64_C(0xcf6533e5c608aaeb), UINT64_C(0xd6763d3967359786), UINT64_C(0x1f6b0228d257c676), + UINT64_C(0xc268c1064d2b458a), UINT64_C(0x6d8b2f6e75d2b613), UINT64_C(0xfaaf5adc43d72807), UINT64_C(0xb6376765e344f9f8), + UINT64_C(0xa8e18dd16a4bd501), 
UINT64_C(0xa71aa12a8ec11351), UINT64_C(0x1daaf130b537ebe0), UINT64_C(0x2e8aa415959d5d8f), + UINT64_C(0x2813ff3a3e5cbcfb), UINT64_C(0xf0fdd1d6d16a7c23), UINT64_C(0xbf2b55d2ecf0ee55), UINT64_C(0xbd4e9bec299381d0), + UINT64_C(0xac8827ab807eb180), UINT64_C(0x8514d75ac3b43b0b), UINT64_C(0xc9b5c78e45fb38a8), UINT64_C(0x4b66e6e7b797cd8f), + UINT64_C(0x1a482ffa6870a2d3), UINT64_C(0x98f55f701d4bf919), UINT64_C(0x7c0fda20e7e26ef8), UINT64_C(0x6ef795976fca3b54), + UINT64_C(0x79801cd422fa95cd), UINT64_C(0xce8a72301dbbe230), UINT64_C(0x5e79f4c925bdd0e0), UINT64_C(0x5729e93c99cc12b3), + UINT64_C(0x76d022747522392a), UINT64_C(0xb9d7652e917a6bc4), UINT64_C(0xc2978462dfa9551b), UINT64_C(0xac081b4a7528b0ce), + UINT64_C(0x5b7799fe02443b33), UINT64_C(0x6676e5687742e76a), UINT64_C(0x3e9836e33caf452b), UINT64_C(0x96ff93e427173943), + UINT64_C(0x30fa2f987359e0f6), UINT64_C(0xfaa730326c478363), UINT64_C(0x2bb0560d8986947e), UINT64_C(0x9f7c01d35aefc68f), + UINT64_C(0x6b81189bd90a0e45), UINT64_C(0xd592d2ad2df04128), UINT64_C(0xbcd0e0fe02816ec6), UINT64_C(0x1d6d84e5c1f8df0f), + UINT64_C(0xc4b55a73da2f8713), UINT64_C(0xdbd6510e7ad24d26), UINT64_C(0x7e3452b770e259bd), UINT64_C(0xd5fe716f2c3ee835), + UINT64_C(0x63a6d74ef78acd1d), UINT64_C(0x3bd673b27d5aa140), UINT64_C(0xe394f3a2a4f6d465), UINT64_C(0xf02f642cda7fee7e), + UINT64_C(0xe17ee2617b3d366a), UINT64_C(0x41cdb92402dce780), UINT64_C(0x4e5c54024fd18f6b), UINT64_C(0x6f45dd1c7c5a3f12), + UINT64_C(0xf6fd2b3f9ccda563), UINT64_C(0xe7628d358d971e26), UINT64_C(0x4dabc984370ed105), UINT64_C(0xec05f7d5c53cb70b), + UINT64_C(0xf48eccbc216dcf71), UINT64_C(0x8a571d0cb256f131), UINT64_C(0x4c05466392e32549), UINT64_C(0x91d3f9324ef03c3e), + UINT64_C(0xec0591069697e868), UINT64_C(0xa77da4079db8ffd8), UINT64_C(0x287335de3951784f), UINT64_C(0xe7afb90b4adbbf33), + UINT64_C(0x96e785b0c621dbbf), UINT64_C(0xc7f54753a5e1d81b), UINT64_C(0x4a3a42229fc7491e), UINT64_C(0xc9560ea788a62881), + UINT64_C(0xe34b9ee97b5bef12), UINT64_C(0xfae309a9fbff0656), 
UINT64_C(0xbc23f738a0bf4c58), UINT64_C(0xc6dd1ed9a7a706de), + UINT64_C(0x3473045c7f760007), UINT64_C(0x89b5f0a2e0ace69b), UINT64_C(0x7433c584785f3321), UINT64_C(0xa38220fab7357fc0), + UINT64_C(0x04e1d70ec8db6456), UINT64_C(0xa86065368c31fd72), UINT64_C(0x926cee3a66885fb3), UINT64_C(0xc09c39dbdb8240bc), + UINT64_C(0x1ee291407a9ac9db), UINT64_C(0xa6120818b86fd032), UINT64_C(0xa4c3a1cbf6a6666f), UINT64_C(0xb34ce856697db755), + UINT64_C(0xe3ef1a7123649d75), UINT64_C(0x814ea4e8549f30bc), UINT64_C(0xc8c12f327c1ee0a3), UINT64_C(0xc4ad0d22dbe77043), + UINT64_C(0x608451fb3ab06a00), UINT64_C(0x2e1141be52867cb9), UINT64_C(0x04b92abd9485965f), UINT64_C(0xcf91f012eb16b951), + UINT64_C(0xacc0a45db481b3b3), UINT64_C(0x523f65d99013b4d9), UINT64_C(0xf333b8f8613fae1f), UINT64_C(0x8b651a304f1c80b0), + UINT64_C(0xa91ecd6f061480d2), UINT64_C(0xbd01125685871081), UINT64_C(0x9933950983b6d41e), UINT64_C(0x1f4130fd7912c3e6), + UINT64_C(0x333230fc9385a4ba), UINT64_C(0x9d2d764680fb1581), UINT64_C(0x277e6bb16761eabf), UINT64_C(0x1829af028f40b602), + UINT64_C(0x9783144e64561566), UINT64_C(0x410d30cd66cb4e92), UINT64_C(0xce0e0df02a7ac717), UINT64_C(0xdbfc28dabb65c1e2), + UINT64_C(0x5a83f419f0610b35), UINT64_C(0xb0706efb6f56176b), UINT64_C(0x684148ee29c2a3d6), UINT64_C(0xc47213009755db33), + UINT64_C(0x2600f460fbea3831), UINT64_C(0x7037ec48a50dc3ec), UINT64_C(0xa761879a39764433), UINT64_C(0xcfd6983de3381424), + UINT64_C(0xfdc2524f5d605fc4), UINT64_C(0xbe84a33131a412c9), UINT64_C(0x1bd73706e51699b5), UINT64_C(0x7aea62c60dffb5ab), + UINT64_C(0x010fec687da2bbf4), UINT64_C(0x56aa74a28e54f75c), UINT64_C(0xba52dd2bb4019afe), UINT64_C(0x6ae298d992a98093), + UINT64_C(0xdbfc6eddb2348c70), UINT64_C(0xeab81b5b034b7836), UINT64_C(0x692b0fc00c8986ba), UINT64_C(0x02adf5476f927b39), + UINT64_C(0x0173c9bb282a94e7), UINT64_C(0x1e617773e554c877), UINT64_C(0x241d5db92d0aa39e), UINT64_C(0x902c43c4be589249), + UINT64_C(0x0b817ad8f9617273), UINT64_C(0x43508b7fb53d5d1f), UINT64_C(0xaf1d845886eeb50c), 
UINT64_C(0xc645d0758b0a08f2), + UINT64_C(0x3d1339390783be12), UINT64_C(0x376e4919f2fc41c9), UINT64_C(0x392c5bb8475370e6), UINT64_C(0x5e891f54eec6c015), + UINT64_C(0x16a12880b9ac0923), UINT64_C(0x6437af0453c57f36), UINT64_C(0x8dd1ec0ee82c5835), UINT64_C(0xc4738296f5085ef5), + UINT64_C(0x68c5d2b2d2d06381), UINT64_C(0x8a4627fb8fbef8df), UINT64_C(0x9d56ea18dd2590b3), UINT64_C(0x8dbdd1fd0ca96586), + UINT64_C(0x9c17bd827cc151ab), UINT64_C(0xdddb70eb24c36775), UINT64_C(0xb56277dfd02a9c4d), UINT64_C(0x5a8388d255264a83), + UINT64_C(0xcb7207a0b0155fa4), UINT64_C(0x2bbc2967864dd11a), UINT64_C(0x19fb91190adfc85a), UINT64_C(0xed562d76a7e244c3), + UINT64_C(0xf5438c5585588610), UINT64_C(0xbc16ff713cde2e48), UINT64_C(0x42248c858cf837cb), UINT64_C(0x59c8eeb9769cf08a), + UINT64_C(0x0f5260cc1dc624b7), UINT64_C(0x6b880672b5ebfdd5), UINT64_C(0x2e6d6cf57e3365cf), UINT64_C(0xe994b274628cdb20), + UINT64_C(0x939e00fbb43765d8), UINT64_C(0x093150ef5c7cd883), UINT64_C(0x8ae15f57f13b42f1), UINT64_C(0x3af5014a74f18355), + UINT64_C(0x7e1a2d0c860bcd23), UINT64_C(0x796312eee1445e38), UINT64_C(0x1cbde8ef8bdfee3d), UINT64_C(0x207592ed0910de04), + UINT64_C(0x150e839a79142012), UINT64_C(0xb920f5ff40de84a6), UINT64_C(0x0c05b146a932213b), UINT64_C(0x7406c434e2d92546), + UINT64_C(0x19376004d1fc67aa), UINT64_C(0x82f3677fcf0dd552), UINT64_C(0xd9daf63e3aa745a9), UINT64_C(0x8e1e09d0a9676fdf), + UINT64_C(0x2cb86571c0289958), UINT64_C(0x4c4c12eb3a97b760), UINT64_C(0x1e3468d9bf56d00c), UINT64_C(0x11f90498f14cb4a4), + UINT64_C(0x251664b4422a7c58), UINT64_C(0xad10e44d41c2b7c5), UINT64_C(0x663cf17121b6d221), UINT64_C(0x3fe40cdc49c541b8), + UINT64_C(0xb1b1a8b2a941f9c7), UINT64_C(0x83ffae6e34d4eb78), UINT64_C(0xa4564673c6728fbf), UINT64_C(0xe1499f6bd812a4b9), + UINT64_C(0xfb5507a915ed36a3), UINT64_C(0xe055a829c62de53c), UINT64_C(0x1ea06fc53acba653), UINT64_C(0xce0f8c15fd8f2258), + UINT64_C(0x7dd42e43e5ef6f4b), UINT64_C(0x0c55aecd7e1adc10), UINT64_C(0xc31b0e4d3a4e8b1c), UINT64_C(0x1205469d91599780), + 
UINT64_C(0xbba5d6df94390b83), UINT64_C(0xc97925cae2f17697), UINT64_C(0x3b98f3dc9e15ea08), UINT64_C(0x878203758954cd36), + UINT64_C(0x818deaef5ba91f77), UINT64_C(0x6f8f1786214acb89), UINT64_C(0x26c5c2162849ece8), UINT64_C(0xaf1c297b73471dd3), + UINT64_C(0x415c497c9fa7e936), UINT64_C(0xc1804e923aa3cce6), UINT64_C(0xdd7ca8ffb78dc68c), UINT64_C(0x5b912445ed7ba89a), + UINT64_C(0x95dec0af89a1f157), UINT64_C(0x7041c032d1fa5266), UINT64_C(0xc569835beabc20df), UINT64_C(0xcc662c0dbb7baaef), + UINT64_C(0x20d5d2c1383ff75c), UINT64_C(0x7efdaae3e1c4eaaf), UINT64_C(0x3575fad9533be200), UINT64_C(0xfb0fb500836d48dd), + UINT64_C(0xd211a5090e6d53e2), UINT64_C(0x34afe4050a01467c), UINT64_C(0x63457fe7bfe187c3), UINT64_C(0xc3ee000cb474d925), + UINT64_C(0x4fd32cbbb8326e22), UINT64_C(0xc2abcd1fc9bf14c2), UINT64_C(0xf34b534e55f28258), UINT64_C(0x094ff2a11972ddec), + UINT64_C(0x9744b26f181926a9), UINT64_C(0xa7fe6a0982135b29), UINT64_C(0x0f8d9e7a0de7d61b), UINT64_C(0x4bcd12d1b5d3d8a6), + UINT64_C(0x706e34dbac81bd39), UINT64_C(0xefea01605e9304c6), UINT64_C(0xee3bb6d1e510efe1), UINT64_C(0x84a094db3f4620f8), + UINT64_C(0xf1752fc679d6aeb3), UINT64_C(0x54921e5d6949a43f), UINT64_C(0xd3616f81f2ff8c55), UINT64_C(0x8bd9584eb62232bd), + UINT64_C(0xa990035eef6e7b13), UINT64_C(0xd4c56de5c11dcdda), UINT64_C(0x8048c23ec8bd072b), UINT64_C(0x407539904d984e51), + UINT64_C(0xeaf5a1d46eb3779b), UINT64_C(0x4b06e5769362f357), UINT64_C(0x931f75e21bc0d143), UINT64_C(0x9369439b81c92fc4), + UINT64_C(0x059fccc0d4afbb45), UINT64_C(0xd072671b3c927118), UINT64_C(0x61b6803f95c41115), UINT64_C(0xacb4b2c4381da3f5), + UINT64_C(0xd73bf897ee871c72), UINT64_C(0x241c9d52c953d3c0), UINT64_C(0x083c079e704d7b96), UINT64_C(0x8c431ee43e5171a5), + UINT64_C(0x66079596998b96b6), UINT64_C(0x041ea35d207b478e), UINT64_C(0xbe698683cf7b258e), UINT64_C(0x5457365cf6cbc5bb), + UINT64_C(0xc166c3ef7006b02d), UINT64_C(0x27789ff1e5365132), UINT64_C(0xae4a02397d308867), UINT64_C(0x0388704d03d7b613), + UINT64_C(0xf5c9d782d3fd58e3), 
UINT64_C(0xb51c3fe53965624e), UINT64_C(0xf785b86e7fe0adec), UINT64_C(0x19f72a9ef3a215e8), + UINT64_C(0x19db58361e6633d9), UINT64_C(0xf1fe7a08693d64ab), UINT64_C(0x07c3310adc3bbf03), UINT64_C(0x742e87d333077816), + UINT64_C(0xe817529af0f04970), UINT64_C(0xe7f343c941a044ff), UINT64_C(0xf9693fb4f37b4d2c), UINT64_C(0xb99da4a0b6ccb1ed), + UINT64_C(0x4eef654d39c7f631), UINT64_C(0xd06badd9354befc8), UINT64_C(0x3dea38b48a4fb6cf), UINT64_C(0xf6551a2de11ec63d), + UINT64_C(0xf0dd7ca2d08731e5), UINT64_C(0xfbbac6e989684aff), UINT64_C(0xe2b65b698f6ea652), UINT64_C(0x679e2fc32595fb51), + UINT64_C(0x6547fdc240571414), UINT64_C(0x6809f663de2d0466), UINT64_C(0x6c6b7a0a40a5e48f), UINT64_C(0xe5f43660d891606e), + UINT64_C(0xa44f283a5a5c10fd), UINT64_C(0x95635b53a60083be), UINT64_C(0x7e0f003a2698a45c), UINT64_C(0x2fd0eb2a3cb4db79), + UINT64_C(0x7416380640ad33c7), UINT64_C(0x988de04a8bfe794b), UINT64_C(0x6d00569ebd6839ff), UINT64_C(0x22ddd7d3d0efa384), + UINT64_C(0x20f9c1ae73b1a651), UINT64_C(0x32386da97bb626af), UINT64_C(0x263c358b8e1975fe), UINT64_C(0x32bd1e4fdb3e7f7c), + UINT64_C(0x2ebb53af95ab07db), UINT64_C(0xeccc526f7e6aca61), UINT64_C(0x186fd1f3ad161e28), UINT64_C(0xf96dd58eca026372), + UINT64_C(0x0403c8572fee3bf3), UINT64_C(0x2598261d29b22e84), UINT64_C(0xa4027ffeed481ae0), UINT64_C(0xe2f690ddcdb0fdaf), + UINT64_C(0x95d11d0d60c528fd), UINT64_C(0x0cc242f0eeae1d6c), UINT64_C(0xfa3440087835377f), UINT64_C(0x3d8fad475b8139e4), + UINT64_C(0x8e92fce862d8a97e), UINT64_C(0xc53bc4cb5ed50eb4), UINT64_C(0xc8f91ece0194e8d4), UINT64_C(0xf78d7c6b5cff07e1), + UINT64_C(0x3163d8458b924665), UINT64_C(0xc2ae6dc185c739bf), UINT64_C(0x2943e3eae337c6c6), UINT64_C(0x96bd36f0da4a49f7), + UINT64_C(0x98753f33282f27bf), UINT64_C(0xd5c33455bf0f69fd), UINT64_C(0x78cc9f69e0286682), UINT64_C(0x0631fadc21ec437c), + UINT64_C(0x521c3db58b6b1170), UINT64_C(0x2333f0f70e46b5cf), UINT64_C(0x87be027b8d434ac6), UINT64_C(0xba4c26796c582e4c), + UINT64_C(0x35d52e4f85db73e4), UINT64_C(0x8ac3723b24e99436), 
UINT64_C(0x4a2b6ce4b7a97a02), UINT64_C(0xcb8017cc584b287d), + UINT64_C(0x1ca3610bc2f30e9f), UINT64_C(0xc1c2dafdd385b283), UINT64_C(0xa812778eceff9a2b), UINT64_C(0x91b8429959ef5359), + UINT64_C(0xa2750c665bcab7d2), UINT64_C(0x9212f5d704b5320b), UINT64_C(0xfa46bb7a213be57f), UINT64_C(0xd20cbd122dce6c1d), + UINT64_C(0x82868b5aee7a4776), UINT64_C(0xf49ec5ddf8cec096), UINT64_C(0xa4fc2bf71ac9dcc2), UINT64_C(0x9d8b8f462bd2f17b), + UINT64_C(0x452703fe91008332), UINT64_C(0x919a288ada854bef), UINT64_C(0x75d2b2eb0f4eeed7), UINT64_C(0xd64885293558a96f), + UINT64_C(0x098d7efb4f8d5b31), UINT64_C(0x7ee77eef93a3928e), UINT64_C(0xb28eebae28b63dc8), UINT64_C(0x0f01129fc90af970), + UINT64_C(0xf3d5b92900d45181), UINT64_C(0xb9d8a408ea6715c0), UINT64_C(0xe44424fb8ca9e22e), UINT64_C(0xd81135834c1aaf96), + UINT64_C(0x445b3d67398e888b), UINT64_C(0x0dad43784fe36cda), UINT64_C(0xe6d1bd75c5d81518), UINT64_C(0x662f0e924150c5cb), + UINT64_C(0x78179f80df6e0709), UINT64_C(0xdd8fc687a741289c), UINT64_C(0x710873d7f5ab060e), UINT64_C(0xa1961d2b538f497c), + UINT64_C(0xb36bbf75bc8b8761), UINT64_C(0x675c608353017307), UINT64_C(0xade6b1aa0ec59bbe), UINT64_C(0xc803a2c9426b3c5f), + UINT64_C(0x48a8210409b5ffac), UINT64_C(0xc3d58389ce5f3b13), UINT64_C(0xa23ceb0e71b08443), UINT64_C(0xd9d192cd9c5e9a05), + UINT64_C(0x20d9cd878b94147d), UINT64_C(0x22329c7695f6df46), UINT64_C(0xaebdcdc2c2cbc0d9), UINT64_C(0xe95ae3d514f6f94b), + UINT64_C(0x59152e1f5715e782), UINT64_C(0xb3280d75a8134f15), UINT64_C(0x5bce3379e1fcb7b4), UINT64_C(0x437d9c3238c4169f), + UINT64_C(0x77db7e5ebd5125bd), UINT64_C(0x0dd3aef40336d438), UINT64_C(0x4a496a56bac81428), UINT64_C(0x72a128c3875dc93d), + UINT64_C(0x8eb605e5bef1747d), UINT64_C(0x666d4546567a4eef), UINT64_C(0xad5ad003399d2296), UINT64_C(0x19c74366682b52a0), + UINT64_C(0xb3c35c5a0e259420), UINT64_C(0xf98340503eb93d6d), UINT64_C(0xa51985b0bb7f81e8), UINT64_C(0x2a21510c6c7ca42f), + UINT64_C(0x3c1ac0b52c230998), UINT64_C(0x4e1d572a2d77000b), UINT64_C(0x8dd3adff3bfdec71), 
UINT64_C(0xdfb3a4a23e43d035), + UINT64_C(0xe12f748421173e62), UINT64_C(0x2f356145d2f72758), UINT64_C(0x31c13682374c445c), UINT64_C(0x09240a1f409fab88), + UINT64_C(0xa346e2d2f72fd5e8), UINT64_C(0x2c5b53bfc05f9f77), UINT64_C(0x0a9f7ab218574f6e), UINT64_C(0xc3fcb9b977f0cceb), + UINT64_C(0xac26889eb86459b9), UINT64_C(0x1082f785bc3dac21), UINT64_C(0x3c8c337a4c67ef18), UINT64_C(0x118e48d0e8a66e02), + UINT64_C(0xb777cef85278f2dc), UINT64_C(0x12a268a3dcda05bc), UINT64_C(0x75f5f7d3fde0bd9e), UINT64_C(0x62f5f1650ec91670), + UINT64_C(0x81fcf9e3e1c3adec), UINT64_C(0xf0b5e35ace23349c), UINT64_C(0xde7d514d058e53a4), UINT64_C(0x52a625e5f06242c7), + UINT64_C(0x3cc1346eda6a430a), UINT64_C(0x165bd737e851f6a1), UINT64_C(0xe52c53d745f1b49a), UINT64_C(0x15513074f676fafc), + UINT64_C(0xcb8797dbb29e6710), UINT64_C(0x27b92c8190fd679d), UINT64_C(0x0b39384ac668b176), UINT64_C(0x11341e6d7adad0e9), + UINT64_C(0x491b5b5390b70f94), UINT64_C(0x1f5eccf586d03746), UINT64_C(0x6502ca945646feae), UINT64_C(0x3abb5466229ef7d8), + UINT64_C(0x535b4effbe0ce5f6), UINT64_C(0x6575eefef9e916f5), UINT64_C(0x77a76fbf3c76f2d7), UINT64_C(0x1cc63124152994a7), + UINT64_C(0x6e33f80e95d4323d), UINT64_C(0xd711791d9b2e1d65), UINT64_C(0x7c766cd52013ae49), UINT64_C(0x08bc15230d2ef477), + UINT64_C(0xb751fa3b942ab063), UINT64_C(0xfe99a8b170a11941), UINT64_C(0x731979294908218a), UINT64_C(0x32166899c12f3097), + UINT64_C(0x8318df8e3823dd3d), UINT64_C(0x940e81f0b4ece3d8), UINT64_C(0x81ea0f12130235ea), UINT64_C(0x36603dfef356d752), + UINT64_C(0x409eeb16b992d793), UINT64_C(0xf4c675cca09e229a), UINT64_C(0x0ef989d732dae818), UINT64_C(0x269b4385573ad2f6), + UINT64_C(0x53df04584157173c), UINT64_C(0x260c347bedc5ce82), UINT64_C(0xb9fbfba9b58c1b09), UINT64_C(0x20115df9d0693a14), + UINT64_C(0x8c0fb27588303369), UINT64_C(0x3a9450974a66eaaf), UINT64_C(0x805f0d515d715679), UINT64_C(0x10f4b52a09898972), + UINT64_C(0x20e9c3449e84718e), UINT64_C(0x9eed8745b4e234e2), UINT64_C(0x946c3083bf840def), UINT64_C(0xb18de02e626f7dd9), + 
UINT64_C(0x9e8b496b1d035ed8), UINT64_C(0x6ef3891e7c690f77), UINT64_C(0xd62269e5ad1c07f5), UINT64_C(0x7117ed7eddc2883e), + UINT64_C(0x260f1d08457dfcca), UINT64_C(0xe0759189d723da9d), UINT64_C(0xd6d40adb9c9f94d7), UINT64_C(0x7c47c4b4a670b77e), + UINT64_C(0xb2b5179563a2abe1), UINT64_C(0x62118cb60f121507), UINT64_C(0x22c3a4a74379ceb1), UINT64_C(0xd5904c844fbfed74), + UINT64_C(0xa0afa38c06d50d92), UINT64_C(0xd6223dbbcfcf73f4), UINT64_C(0xf19623e7ec6f83dd), UINT64_C(0xd08c12de2b6265f6), + UINT64_C(0xc487d5dc19489db6), UINT64_C(0x759283ffd06fc796), UINT64_C(0xd61a735ad1cd7ccc), UINT64_C(0x32084ba3ca8fa3ee), + UINT64_C(0x17530308a1204968), UINT64_C(0x80328582a1eb8d8f), UINT64_C(0xd4c873deec7fb3d7), UINT64_C(0x11c825cc4bc8b181), + UINT64_C(0x0137fa50576b21eb), UINT64_C(0xc5ea2f958a3ddb53), UINT64_C(0x6ae611d92b67c9bc), UINT64_C(0xb798b3e1f9c3a851), + UINT64_C(0x22a42679fa4b013f), UINT64_C(0x2071f22dae8de629), UINT64_C(0x3faa3a80e45cbca6), UINT64_C(0xb0418f45808009ec), + UINT64_C(0x446063013dd5a0f4), UINT64_C(0x932445b680ef71ec), UINT64_C(0x2bc9a2d9ab8e2662), UINT64_C(0x8ebd57fbc56a6154), + UINT64_C(0xa28f3d2264ad0f10), UINT64_C(0xffff84df76a10c15), UINT64_C(0xac5c9b0e78fbee81), UINT64_C(0xc1f08e08982b237c), + UINT64_C(0x5907b7fa41daa2b8), UINT64_C(0xbed3856320d9c3c2), UINT64_C(0x500a342c1902f015), UINT64_C(0x0c3a5d539c71b7d6), + UINT64_C(0xa706750b1c3e5604), UINT64_C(0x1543ab593a8c824c), UINT64_C(0xbdfd9d26f151d83c), UINT64_C(0x1603bb40537de208), + UINT64_C(0x1501b0ba802daa2d), UINT64_C(0xdcbcc803f3c11f3c), UINT64_C(0x2bb283a389ec2f35), UINT64_C(0x3a27513ef9d14bf4), + UINT64_C(0xcb7c4fd02a39d8af), UINT64_C(0xcc6f61a03488e43f), UINT64_C(0xfdddf2b5fd6c4b05), UINT64_C(0xa015987625b9755d), + UINT64_C(0x14c5a9b03c63b253), UINT64_C(0x413f7d2608bf939e), UINT64_C(0x8bdb68c7176407e5), UINT64_C(0x436de64d8a614c32), + UINT64_C(0xc2aca4b10ff0bf8e), UINT64_C(0x3b56cc9c1df797e4), UINT64_C(0xb1750cce6cca57bb), UINT64_C(0x8c80e2303509012a), + UINT64_C(0x7f25bae3c4fea8af), 
UINT64_C(0xecf8ed9dac1367b8), UINT64_C(0x1a49274e39668f4e), UINT64_C(0xca4a0ae881c7dc39) }; -static const int STATE = 32; -static const uint64_t MASK = UINT64_C(0xffffffffffffff); +static const int STATE = 32; +static const uint64_t MASK = UINT64_C(0xffffffffffffff); //-------- // State mix function -static FORCE_INLINE uint8_t beam_ROTR8(uint8_t v, int n) { +static FORCE_INLINE uint8_t beam_ROTR8( uint8_t v, int n ) { n = n & 7; - if (n) - v = (v >> n) | (v << (8-n)); + if (n) { + v = (v >> n) | (v << (8 - n)); + } return v; } -static FORCE_INLINE uint64_t beam_ROTR64(uint64_t v, int n) { +static FORCE_INLINE uint64_t beam_ROTR64( uint64_t v, int n ) { n = n & 63; - if (n) + if (n) { v = ROTR64(v, n); + } return v; } +static FORCE_INLINE void mix( uint64_t * state, const uint32_t A ) { + const uint32_t B = A + 1; + const uint32_t iv = state[A ] & 1023; + const uint64_t M = T [iv]; -static FORCE_INLINE void mix(uint64_t * state, const uint32_t A) { - const uint32_t B = A+1; - const uint32_t iv = state[A] & 1023; - const uint64_t M = T[iv]; - state[B] += M + state[A]; + state[B] += state[A] + M; - state[A] ^= state[B]; - state[B] ^= state[A]; - state[A] ^= state[B]; + state[A] ^= state[B]; + state[B] ^= state[A]; + state[A] ^= state[B]; - state[B] = beam_ROTR64(state[B], state[A]); + state[B] = beam_ROTR64(state[B], state[A]); } //--------- // Hash round function -template < bool bswap > -static FORCE_INLINE void round(uint64_t * const state, const uint8_t * m8, uint32_t len) { +template +static FORCE_INLINE void round( uint64_t * const state, const uint8_t * m8, uint32_t len ) { uint8_t * const state8 = (uint8_t *)state; - uint32_t index = 0; - uint32_t sindex = 0; + uint32_t index = 0; + uint32_t sindex = 0; for (uint32_t Len = len >> 3; index < Len; index++) { - uint64_t blk = GET_U64(m8, index*8); - state[sindex] += beam_ROTR64(blk + index + 1, - state[sindex] + index + 1); + uint64_t blk = GET_U64(m8, index * 8); + state[sindex] += beam_ROTR64(blk + index + 1, 
state[sindex] + index + 1); if (sindex == 1) { mix(state, 0); } else if (sindex == 3) { @@ -345,13 +346,12 @@ static FORCE_INLINE void round(uint64_t * const state, const uint8_t * m8, uint3 mix(state, 0); index <<= 3; - sindex = index&31; - for( ; index < len; index++) { - const uint32_t ssindex = bswap ? (sindex^7) : sindex; - state8[ssindex] += beam_ROTR8(m8[index] + index + 1, - state8[ssindex] + index + 1); + sindex = index & 31; + for (; index < len; index++) { + const uint32_t ssindex = bswap ? (sindex ^ 7) : sindex; + state8[ssindex] += beam_ROTR8(m8[index] + index + 1, state8[ssindex] + index + 1); // state+[0,1,2] - mix(state, index%3); + mix(state, index % 3); if (sindex >= 31) { sindex = -1; } @@ -365,26 +365,26 @@ static FORCE_INLINE void round(uint64_t * const state, const uint8_t * m8, uint3 //--------- // main hash function -template < bool bswap > -static void beamsplitter_64(const void * in, const size_t len, const seed_t seed, void * out) { - const uint8_t * key8Arr = (uint8_t *)in; - uint32_t seedbuf[2] = {0}; +template +static void beamsplitter_64( const void * in, const size_t len, const seed_t seed, void * out ) { + const uint8_t * key8Arr = (uint8_t *)in; + uint32_t seedbuf[2] = { 0 }; if (len >= UINT32_C(0xffffffff)) { return; } // the cali number from the Matrix (1999) uint32_t seed32 = seed; if (!bswap) { - seedbuf[0] = 0xc5550690; + seedbuf[0] = 0xc5550690; seedbuf[0] -= seed32; - seedbuf[1] = ~(1 - seed32); + seedbuf[1] = ~(1 - seed32); } else { - seedbuf[1] = 0xc5550690; + seedbuf[1] = 0xc5550690; seedbuf[1] -= seed32; - seedbuf[0] = ~(1 - seed32); + seedbuf[0] = ~(1 - seed32); } - uint64_t state[STATE/8]; + uint64_t state[STATE / 8]; // nothing up my sleeve state[0] = UINT64_C(0x123456789abcdef0); state[1] = UINT64_C(0x0fedcba987654321); @@ -396,50 +396,50 @@ static void beamsplitter_64(const void * in, const size_t len, const seed_t seed round(state, key8Arr, (uint32_t)len); round(state, key8Arr, (uint32_t)len); round(state, key8Arr, 
(uint32_t)len); - round(state, (uint8_t *)seedbuf, 8); - round(state, (uint8_t *)seedbuf, 8); + round(state, (uint8_t *)seedbuf, 8 ); + round(state, (uint8_t *)seedbuf, 8 ); round(state, key8Arr, (uint32_t)len); round(state, key8Arr, (uint32_t)len); round(state, key8Arr, (uint32_t)len); /* - //printf("state = %#018" PRIx64 " %#018" PRIx64 " %#018" PRIx64 " %#018" PRIx64 "\n", - // state[0], state[1], state[2], state[3] ); - */ + * //printf("state = %#018" PRIx64 " %#018" PRIx64 " %#018" PRIx64 " %#018" PRIx64 "\n", + * // state[0], state[1], state[2], state[3] ); + */ - //printf("state = %#018" PRIx64 " %#018" PRIx64 "\n", + // printf("state = %#018" PRIx64 " %#018" PRIx64 "\n", // state[0], state[1] ); - uint64_t h[2] = {0}; + uint64_t h[2] = { 0 }; // The new combination step - h[0] = state[2]; - h[1] = state[3]; + h[0] = state[2]; + h[1] = state[3]; - h[0] += h[1]; + h[0] += h [1]; PUT_U64(h[0], (uint8_t *)out, 0); } REGISTER_FAMILY(beamsplitter, - $.src_url = "https://github.com/crisdosyago/beamsplitter", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/crisdosyago/beamsplitter", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); // Yes, this has no bad seeds! See note at the top near "thread_local". 
REGISTER_HASH(beamsplitter, - $.desc = "A possibly universal hash made with a 10x64 s-box", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE_VARIABLE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x1BDF358B, - $.verification_BE = 0x4791907E, - $.hashfn_native = beamsplitter_64, - $.hashfn_bswap = beamsplitter_64, - $.badseeds = {} -); + $.desc = "A possibly universal hash made with a 10x64 s-box", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE_VARIABLE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x1BDF358B, + $.verification_BE = 0x4791907E, + $.hashfn_native = beamsplitter_64, + $.hashfn_bswap = beamsplitter_64, + $.badseeds = {} + ); diff --git a/hashes/blake2.cpp b/hashes/blake2.cpp index e778cdcf..54fa96f2 100644 --- a/hashes/blake2.cpp +++ b/hashes/blake2.cpp @@ -30,110 +30,107 @@ #include "Platform.h" #include "Hashlib.h" -static const uint64_t blake2b_IV[8] = -{ - UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), - UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), - UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), - UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) +static const uint64_t blake2b_IV [ 8] = { + UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), + UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), + UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), + UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) }; -static const uint32_t blake2s_IV[8] = -{ - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +static const uint32_t blake2s_IV [ 8] = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; -static const uint8_t blake2_sigma[12][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } +static const uint8_t blake2_sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } }; typedef struct blake2b_context_ { - uint64_t h[8]; - uint64_t t[2]; - uint64_t f[2]; - uint8_t buf[128]; - size_t buflen; + uint64_t h[8]; + uint64_t t[2]; + uint64_t f[2]; + uint8_t buf[128]; + size_t buflen; } blake2b_context; typedef struct blake2s_context_ { - uint32_t h[8]; - uint32_t t[2]; - uint32_t f[2]; - uint8_t buf[64]; - size_t buflen; + uint32_t h[8]; + uint32_t t[2]; + uint32_t f[2]; + uint8_t buf[64]; + size_t buflen; } blake2s_context; // This layout is explicitly little-endian struct blake2_params_prefix { - uint8_t 
digest_length; /* 1 */ - uint8_t key_length; /* 2 */ - uint8_t fanout; /* 3 */ - uint8_t depth; /* 4 */ - uint32_t zero; /* 8 */ + uint8_t digest_length; /* 1 */ + uint8_t key_length; /* 2 */ + uint8_t fanout; /* 3 */ + uint8_t depth; /* 4 */ + uint32_t zero; /* 8 */ }; -template < typename T > -NEVER_INLINE static void blake2_Init(T * ctx, unsigned hashbits, uint64_t seed) { - const uint32_t seedlo = seed & 0xFFFFFFFF; - const uint32_t seedhi = (seed >> 32) & 0xFFFFFFFF; +template +NEVER_INLINE static void blake2_Init( T * ctx, unsigned hashbits, uint64_t seed ) { + const uint32_t seedlo = seed & 0xFFFFFFFF; + const uint32_t seedhi = (seed >> 32) & 0xFFFFFFFF; + + memset(ctx , 0, sizeof(*ctx) ); + for (int i = 0; i < 8; i++) { + if (sizeof(ctx->h[0]) == 8) { + ctx->h[i] = blake2b_IV[i]; + } else { + ctx->h[i] = blake2s_IV[i]; + } + } - memset(ctx, 0, sizeof(*ctx)); - for (int i = 0; i < 8; i++) { + struct blake2_params_prefix params; + memset(¶ms, 0, sizeof(params)); + params.digest_length = hashbits / 8; + params.fanout = 1; + params.depth = 1; if (sizeof(ctx->h[0]) == 8) { - ctx->h[i] = blake2b_IV[i]; + ctx->h[0] ^= isLE() ? + GET_U64((const uint8_t *)(¶ms), 0) : + GET_U64((const uint8_t *)(¶ms), 0); } else { - ctx->h[i] = blake2s_IV[i]; + ctx->h[0] ^= isLE() ? + GET_U32((const uint8_t *)(¶ms), 0) : + GET_U32((const uint8_t *)(¶ms), 0); } - } - - struct blake2_params_prefix params; - memset(¶ms, 0, sizeof(params)); - params.digest_length = hashbits/8; - params.fanout = 1; - params.depth = 1; - if (sizeof(ctx->h[0]) == 8) { - ctx->h[0] ^= isLE() ? - GET_U64((const uint8_t *)(¶ms), 0) : - GET_U64 ((const uint8_t *)(¶ms), 0); - } else { - ctx->h[0] ^= isLE() ? 
- GET_U32((const uint8_t *)(¶ms), 0) : - GET_U32 ((const uint8_t *)(¶ms), 0); - } - - // Legacy homegrown BLAKE2 seeding for SMHasher3 - ctx->h[0] ^= seedlo; - ctx->h[1] ^= seedhi; + + // Legacy homegrown BLAKE2 seeding for SMHasher3 + ctx->h[0] ^= seedlo; + ctx->h[1] ^= seedhi; } -template < typename T > +template static int blake2_is_lastblock( const T * ctx ) { - return ctx->f[0] != 0; + return ctx->f[0] != 0; } -template < typename T > +template static void blake2_set_lastblock( T * ctx ) { ctx->f[0] = 0; ctx->f[0]--; } -template < typename T > +template static void blake2_increment_counter( T * ctx, const uint64_t inc ) { - ctx->t[0] += inc; - ctx->t[1] += ( ctx->t[0] < inc ); + ctx->t[0] += inc; + ctx->t[1] += (ctx->t[0] < inc); } // @@ -144,272 +141,272 @@ static void blake2_increment_counter( T * ctx, const uint64_t inc ) { // static void blake2_compress(T * ctx, const uint8_t * in) { // } #if defined(HAVE_SSE_2) -#include "Intrinsics.h" -#include "blake2/compress-sse2-plus.h" + #include "Intrinsics.h" + #include "blake2/compress-sse2-plus.h" #else -#include "blake2/compress-portable.h" + #include "blake2/compress-portable.h" #endif -template < bool bswap, typename T > -static void blake2_Update(T * ctx, const uint8_t * in, size_t inlen) { - const uint64_t BLOCKBYTES = sizeof(ctx->buf); - - if ( inlen > 0 ) { - size_t left = ctx->buflen; - size_t fill = BLOCKBYTES - left; - if ( inlen > fill ) { - ctx->buflen = 0; - memcpy( ctx->buf + left, in, fill ); /* Fill buffer */ - blake2_increment_counter(ctx, BLOCKBYTES ); - blake2_compress(ctx, ctx->buf ); /* Compress */ - in += fill; inlen -= fill; - while(inlen > BLOCKBYTES) { - blake2_increment_counter(ctx, BLOCKBYTES); - blake2_compress(ctx,in); - in += BLOCKBYTES; - inlen -= BLOCKBYTES; - } +template +static void blake2_Update( T * ctx, const uint8_t * in, size_t inlen ) { + const uint64_t BLOCKBYTES = sizeof(ctx->buf); + + if (inlen > 0) { + size_t left = ctx->buflen; + size_t fill = BLOCKBYTES - left; + if 
(inlen > fill) { + ctx->buflen = 0; + memcpy(ctx->buf + left, in, fill); /* Fill buffer */ + blake2_increment_counter(ctx, BLOCKBYTES); + blake2_compress(ctx, ctx->buf); /* Compress */ + in += fill; inlen -= fill; + while (inlen > BLOCKBYTES) { + blake2_increment_counter(ctx, BLOCKBYTES); + blake2_compress(ctx, in); + in += BLOCKBYTES; + inlen -= BLOCKBYTES; + } + } + memcpy(ctx->buf + ctx->buflen, in, inlen); + ctx->buflen += inlen; } - memcpy( ctx->buf + ctx->buflen, in, inlen ); - ctx->buflen += inlen; - } } -template < bool bswap, typename T > -static void blake2_Finalize(T * ctx) { - const uint64_t BLOCKBYTES = sizeof(ctx->buf); +template +static void blake2_Finalize( T * ctx ) { + const uint64_t BLOCKBYTES = sizeof(ctx->buf); - if (blake2_is_lastblock(ctx)) { - return; - } + if (blake2_is_lastblock(ctx)) { + return; + } - blake2_increment_counter( ctx, ctx->buflen ); - blake2_set_lastblock( ctx ); - memset( ctx->buf + ctx->buflen, 0, BLOCKBYTES - ctx->buflen ); /* Padding */ - blake2_compress( ctx, ctx->buf ); + blake2_increment_counter(ctx, ctx->buflen); + blake2_set_lastblock(ctx); + memset(ctx->buf + ctx->buflen, 0, BLOCKBYTES - ctx->buflen); /* Padding */ + blake2_compress(ctx, ctx->buf); } -template < uint32_t hashbits, uint32_t outbits, bool bswap > -static void BLAKE2B(const void * in, const size_t len, const seed_t seed, void * out) { - blake2b_context ctx; +template +static void BLAKE2B( const void * in, const size_t len, const seed_t seed, void * out ) { + blake2b_context ctx; - blake2_Init(&ctx, hashbits, (uint64_t)seed); - blake2_Update(&ctx, (const uint8_t *)in, len); - blake2_Finalize(&ctx); + blake2_Init(&ctx, hashbits, (uint64_t)seed); + blake2_Update(&ctx, (const uint8_t *)in, len); + blake2_Finalize(&ctx); - uint8_t buf[32]; - for (int i = 0; i < 4; ++i ) { - PUT_U64(ctx.h[i], buf, i*8); - } - memcpy(out, buf, (outbits >= 256) ? 
32 : (outbits+7)/8); + uint8_t buf[32]; + for (int i = 0; i < 4; ++i) { + PUT_U64(ctx.h[i], buf, i * 8); + } + memcpy(out, buf, (outbits >= 256) ? 32 : (outbits + 7) / 8); } -template < uint32_t hashbits, uint32_t outbits, bool bswap > -static void BLAKE2S(const void * in, const size_t len, const seed_t seed, void * out) { - blake2s_context ctx; +template +static void BLAKE2S( const void * in, const size_t len, const seed_t seed, void * out ) { + blake2s_context ctx; - blake2_Init(&ctx, hashbits, (uint64_t)seed); - blake2_Update(&ctx, (const uint8_t *)in, len); - blake2_Finalize(&ctx); + blake2_Init(&ctx, hashbits, (uint64_t)seed); + blake2_Update(&ctx, (const uint8_t *)in, len); + blake2_Finalize(&ctx); - uint8_t buf[32]; - for (int i = 0; i < 8; ++i ) { - PUT_U32(ctx.h[i], buf, i*4); - } - memcpy(out, buf, (outbits >= 256) ? 32 : (outbits+7)/8); + uint8_t buf[32]; + for (int i = 0; i < 8; ++i) { + PUT_U32(ctx.h[i], buf, i * 4); + } + memcpy(out, buf, (outbits >= 256) ? 32 : (outbits + 7) / 8); } REGISTER_FAMILY(blake2, - $.src_url = "https://github.com/BLAKE2/BLAKE2", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/BLAKE2/BLAKE2", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(blake2b_256, - $.desc = "BLAKE 2b, 256-bit digest", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 256, - $.verification_LE = 0xC9D8D995, - $.verification_BE = 0xCDB3E566, - $.hashfn_native = BLAKE2B<256,256,false>, - $.hashfn_bswap = BLAKE2B<256,256,true> -); + $.desc = "BLAKE 2b, 256-bit digest", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits 
= 256, + $.verification_LE = 0xC9D8D995, + $.verification_BE = 0xCDB3E566, + $.hashfn_native = BLAKE2B<256, 256, false>, + $.hashfn_bswap = BLAKE2B<256, 256, true> + ); REGISTER_HASH(blake2b_224, - $.desc = "BLAKE 2b, 224-bit digest", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 224, - $.verification_LE = 0x101A62A4, - $.verification_BE = 0x77BE80ED, - $.hashfn_native = BLAKE2B<224,224,false>, - $.hashfn_bswap = BLAKE2B<224,224,true> -); + $.desc = "BLAKE 2b, 224-bit digest", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 224, + $.verification_LE = 0x101A62A4, + $.verification_BE = 0x77BE80ED, + $.hashfn_native = BLAKE2B<224, 224, false>, + $.hashfn_bswap = BLAKE2B<224, 224, true> + ); REGISTER_HASH(blake2b_160, - $.desc = "BLAKE 2b, 160-bit digest", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 160, - $.verification_LE = 0x28ADDA30, - $.verification_BE = 0xFF79839E, - $.hashfn_native = BLAKE2B<160,160,false>, - $.hashfn_bswap = BLAKE2B<160,160,true> -); + $.desc = "BLAKE 2b, 160-bit digest", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 160, + $.verification_LE = 0x28ADDA30, + $.verification_BE = 0xFF79839E, + $.hashfn_native = BLAKE2B<160, 160, false>, + 
$.hashfn_bswap = BLAKE2B<160, 160, true> + ); REGISTER_HASH(blake2b_128, - $.desc = "BLAKE 2b, 128-bit digest", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 128, - $.verification_LE = 0x7DC97611, - $.verification_BE = 0xDD6695FD, - $.hashfn_native = BLAKE2B<128,128,false>, - $.hashfn_bswap = BLAKE2B<128,128,true> -); + $.desc = "BLAKE 2b, 128-bit digest", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 128, + $.verification_LE = 0x7DC97611, + $.verification_BE = 0xDD6695FD, + $.hashfn_native = BLAKE2B<128, 128, false>, + $.hashfn_bswap = BLAKE2B<128, 128, true> + ); REGISTER_HASH(blake2b_256__64, - $.desc = "BLAKE 2b, 256-bit digest, bits 0-63", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 64, - $.verification_LE = 0xCF4F7EC3, - $.verification_BE = 0x0EB38190, - $.hashfn_native = BLAKE2B<256,64,false>, - $.hashfn_bswap = BLAKE2B<256,64,true> -); + $.desc = "BLAKE 2b, 256-bit digest, bits 0-63", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 64, + $.verification_LE = 0xCF4F7EC3, + $.verification_BE = 0x0EB38190, + $.hashfn_native = BLAKE2B<256, 64, false>, + $.hashfn_bswap = BLAKE2B<256, 64, true> + ); REGISTER_HASH(blake2s_256, - $.desc = "BLAKE 2s, 256-bit digest", 
- $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 256, - $.verification_LE = 0x841D6354, - $.verification_BE = 0x9F85F5C2, - $.hashfn_native = BLAKE2S<256,256,false>, - $.hashfn_bswap = BLAKE2S<256,256,true> -); + $.desc = "BLAKE 2s, 256-bit digest", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 256, + $.verification_LE = 0x841D6354, + $.verification_BE = 0x9F85F5C2, + $.hashfn_native = BLAKE2S<256, 256, false>, + $.hashfn_bswap = BLAKE2S<256, 256, true> + ); REGISTER_HASH(blake2s_224, - $.desc = "BLAKE 2s, 224-bit digest", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 224, - $.verification_LE = 0x19B36D2C, - $.verification_BE = 0xBD261F10, - $.hashfn_native = BLAKE2S<224,224,false>, - $.hashfn_bswap = BLAKE2S<224,224,true> -); + $.desc = "BLAKE 2s, 224-bit digest", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 224, + $.verification_LE = 0x19B36D2C, + $.verification_BE = 0xBD261F10, + $.hashfn_native = BLAKE2S<224, 224, false>, + $.hashfn_bswap = BLAKE2S<224, 224, true> + ); REGISTER_HASH(blake2s_160, - $.desc = "BLAKE 2s, 160-bit digest", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - 
FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 160, - $.verification_LE = 0xD50FF144, - $.verification_BE = 0xF9579BEA, - $.hashfn_native = BLAKE2S<160,160,false>, - $.hashfn_bswap = BLAKE2S<160,160,true> -); + $.desc = "BLAKE 2s, 160-bit digest", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 160, + $.verification_LE = 0xD50FF144, + $.verification_BE = 0xF9579BEA, + $.hashfn_native = BLAKE2S<160, 160, false>, + $.hashfn_bswap = BLAKE2S<160, 160, true> + ); REGISTER_HASH(blake2s_128, - $.desc = "BLAKE 2s, 128-bit digest", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 128, - $.verification_LE = 0xE8D8FCDF, - $.verification_BE = 0x9C786057, - $.hashfn_native = BLAKE2S<128,128,false>, - $.hashfn_bswap = BLAKE2S<128,128,true> -); + $.desc = "BLAKE 2s, 128-bit digest", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 128, + $.verification_LE = 0xE8D8FCDF, + $.verification_BE = 0x9C786057, + $.hashfn_native = BLAKE2S<128, 128, false>, + $.hashfn_bswap = BLAKE2S<128, 128, true> + ); REGISTER_HASH(blake2s_256__64, - $.desc = "BLAKE 2s, 256-bit digest, bits 0-63", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - 
FLAG_IMPL_VERY_SLOW, - $.bits = 64, - $.verification_LE = 0x53000BB2, - $.verification_BE = 0x901DDE1D, - $.hashfn_native = BLAKE2S<256,64,false>, - $.hashfn_bswap = BLAKE2S<256,64,true> -); + $.desc = "BLAKE 2s, 256-bit digest, bits 0-63", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 64, + $.verification_LE = 0x53000BB2, + $.verification_BE = 0x901DDE1D, + $.hashfn_native = BLAKE2S<256, 64, false>, + $.hashfn_bswap = BLAKE2S<256, 64, true> + ); diff --git a/hashes/blake2/compress-portable.h b/hashes/blake2/compress-portable.h index 0d64a865..89c16fd5 100644 --- a/hashes/blake2/compress-portable.h +++ b/hashes/blake2/compress-portable.h @@ -1,13 +1,13 @@ -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2_sigma[r][2*i+0]]; \ - d = ROTR64(d ^ a, 32); \ - c = c + d; \ - b = ROTR64(b ^ c, 24); \ - a = a + b + m[blake2_sigma[r][2*i+1]]; \ - d = ROTR64(d ^ a, 16); \ - c = c + d; \ - b = ROTR64(b ^ c, 63); \ +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2_sigma[r][2*i+0]]; \ + d = ROTR64(d ^ a, 32); \ + c = c + d; \ + b = ROTR64(b ^ c, 24); \ + a = a + b + m[blake2_sigma[r][2*i+1]]; \ + d = ROTR64(d ^ a, 16); \ + c = c + d; \ + b = ROTR64(b ^ c, 63); \ } while(0) #define ROUND(r) \ @@ -22,100 +22,99 @@ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ } while(0) -template < bool bswap > -static void blake2_compress(blake2b_context * ctx, const uint8_t * in) { - uint64_t m[16]; - uint64_t v[16]; - size_t i; - - for( i = 0; i < 16; ++i ) { - m[i] = GET_U64(in, i * sizeof(m[i])); - } - - for( i = 0; i < 8; ++i ) { - v[i] = ctx->h[i]; - } - - v[ 8] = blake2b_IV[0]; - v[ 9] = blake2b_IV[1]; - v[10] = blake2b_IV[2]; - v[11] = blake2b_IV[3]; - v[12] = blake2b_IV[4] ^ ctx->t[0]; - v[13] = blake2b_IV[5] ^ ctx->t[1]; - v[14] = blake2b_IV[6] ^ ctx->f[0]; - v[15] = blake2b_IV[7] ^ 
ctx->f[1]; - - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - ROUND( 10 ); - ROUND( 11 ); - - for( i = 0; i < 8; ++i ) { - ctx->h[i] = ctx->h[i] ^ v[i] ^ v[i + 8]; - } +template +static void blake2_compress( blake2b_context * ctx, const uint8_t * in ) { + uint64_t m[16]; + uint64_t v[16]; + size_t i; + + for (i = 0; i < 16; ++i) { + m[i] = GET_U64(in, i * sizeof(m[i])); + } + + for (i = 0; i < 8; ++i) { + v[i] = ctx->h[i]; + } + + v[ 8] = blake2b_IV[0]; + v[ 9] = blake2b_IV[1]; + v[10] = blake2b_IV[2]; + v[11] = blake2b_IV[3]; + v[12] = blake2b_IV[4] ^ ctx->t[0]; + v[13] = blake2b_IV[5] ^ ctx->t[1]; + v[14] = blake2b_IV[6] ^ ctx->f[0]; + v[15] = blake2b_IV[7] ^ ctx->f[1]; + + ROUND( 0); + ROUND( 1); + ROUND( 2); + ROUND( 3); + ROUND( 4); + ROUND( 5); + ROUND( 6); + ROUND( 7); + ROUND( 8); + ROUND( 9); + ROUND(10); + ROUND(11); + + for (i = 0; i < 8; ++i) { + ctx->h[i] = ctx->h[i] ^ v[i] ^ v[i + 8]; + } } #undef G -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2_sigma[r][2*i+0]]; \ - d = ROTR32(d ^ a, 16); \ - c = c + d; \ - b = ROTR32(b ^ c, 12); \ - a = a + b + m[blake2_sigma[r][2*i+1]]; \ - d = ROTR32(d ^ a, 8); \ - c = c + d; \ - b = ROTR32(b ^ c, 7); \ +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2_sigma[r][2*i+0]]; \ + d = ROTR32(d ^ a, 16); \ + c = c + d; \ + b = ROTR32(b ^ c, 12); \ + a = a + b + m[blake2_sigma[r][2*i+1]]; \ + d = ROTR32(d ^ a, 8); \ + c = c + d; \ + b = ROTR32(b ^ c, 7); \ } while(0) -template < bool bswap > -static void blake2_compress(blake2s_context * ctx, const uint8_t * in) { - uint32_t m[16]; - uint32_t v[16]; - size_t i; - - for( i = 0; i < 16; ++i ) { - m[i] = GET_U32(in, i * sizeof(m[i])); - } - - for( i = 0; i < 8; ++i ) { - v[i] = ctx->h[i]; - } - - v[ 8] = blake2s_IV[0]; - v[ 9] = blake2s_IV[1]; - v[10] = blake2s_IV[2]; - v[11] = blake2s_IV[3]; - v[12] = blake2s_IV[4] ^ ctx->t[0]; - v[13] = blake2s_IV[5] ^ ctx->t[1]; 
- v[14] = blake2s_IV[6] ^ ctx->f[0]; - v[15] = blake2s_IV[7] ^ ctx->f[1]; - - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - - for( i = 0; i < 8; ++i ) { - ctx->h[i] = ctx->h[i] ^ v[i] ^ v[i + 8]; - } +template +static void blake2_compress( blake2s_context * ctx, const uint8_t * in ) { + uint32_t m[16]; + uint32_t v[16]; + size_t i; + + for (i = 0; i < 16; ++i) { + m[i] = GET_U32(in, i * sizeof(m[i])); + } + + for (i = 0; i < 8; ++i) { + v[i] = ctx->h[i]; + } + + v[ 8] = blake2s_IV[0]; + v[ 9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; + v[11] = blake2s_IV[3]; + v[12] = blake2s_IV[4] ^ ctx->t[0]; + v[13] = blake2s_IV[5] ^ ctx->t[1]; + v[14] = blake2s_IV[6] ^ ctx->f[0]; + v[15] = blake2s_IV[7] ^ ctx->f[1]; + + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + + for (i = 0; i < 8; ++i) { + ctx->h[i] = ctx->h[i] ^ v[i] ^ v[i + 8]; + } } #undef G #undef ROUND - diff --git a/hashes/blake2/compress-sse2-plus.h b/hashes/blake2/compress-sse2-plus.h index 05b9c0a5..2cd1d3d1 100644 --- a/hashes/blake2/compress-sse2-plus.h +++ b/hashes/blake2/compress-sse2-plus.h @@ -5,285 +5,285 @@ // It is generally assumed that supporting a later/higher instruction // set includes support for previous/lower instruction sets. 
-#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) ) -#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) +#define LOADU(p) _mm_loadu_si128((const __m128i *)(p)) +#define STOREU(p, r) _mm_storeu_si128((__m128i *)(p), r) //----------------------------------------------------------------------------- // BLAKE2b code #if defined(HAVE_SSE_4_1) -#define LOAD_MSG_0_1(b0, b1) b0 = _mm_unpacklo_epi64(m0, m1); b1 = _mm_unpacklo_epi64(m2, m3); -#define LOAD_MSG_0_2(b0, b1) b0 = _mm_unpackhi_epi64(m0, m1); b1 = _mm_unpackhi_epi64(m2, m3); -#define LOAD_MSG_0_3(b0, b1) b0 = _mm_unpacklo_epi64(m4, m5); b1 = _mm_unpacklo_epi64(m6, m7); -#define LOAD_MSG_0_4(b0, b1) b0 = _mm_unpackhi_epi64(m4, m5); b1 = _mm_unpackhi_epi64(m6, m7); -#define LOAD_MSG_1_1(b0, b1) b0 = _mm_unpacklo_epi64(m7, m2); b1 = _mm_unpackhi_epi64(m4, m6); -#define LOAD_MSG_1_2(b0, b1) b0 = _mm_unpacklo_epi64(m5, m4); b1 = _mm_alignr_epi8(m3, m7, 8); -#define LOAD_MSG_1_3(b0, b1) b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); b1 = _mm_unpackhi_epi64(m5, m2); -#define LOAD_MSG_1_4(b0, b1) b0 = _mm_unpacklo_epi64(m6, m1); b1 = _mm_unpackhi_epi64(m3, m1); -#define LOAD_MSG_2_1(b0, b1) b0 = _mm_alignr_epi8(m6, m5, 8); b1 = _mm_unpackhi_epi64(m2, m7); -#define LOAD_MSG_2_2(b0, b1) b0 = _mm_unpacklo_epi64(m4, m0); b1 = _mm_blend_epi16(m1, m6, 0xF0); -#define LOAD_MSG_2_3(b0, b1) b0 = _mm_blend_epi16(m5, m1, 0xF0); b1 = _mm_unpackhi_epi64(m3, m4); -#define LOAD_MSG_2_4(b0, b1) b0 = _mm_unpacklo_epi64(m7, m3); b1 = _mm_alignr_epi8(m2, m0, 8); -#define LOAD_MSG_3_1(b0, b1) b0 = _mm_unpackhi_epi64(m3, m1); b1 = _mm_unpackhi_epi64(m6, m5); -#define LOAD_MSG_3_2(b0, b1) b0 = _mm_unpackhi_epi64(m4, m0); b1 = _mm_unpacklo_epi64(m6, m7); -#define LOAD_MSG_3_3(b0, b1) b0 = _mm_blend_epi16(m1, m2, 0xF0); b1 = _mm_blend_epi16(m2, m7, 0xF0); -#define LOAD_MSG_3_4(b0, b1) b0 = _mm_unpacklo_epi64(m3, m5); b1 = _mm_unpacklo_epi64(m0, m4); -#define LOAD_MSG_4_1(b0, b1) b0 = _mm_unpackhi_epi64(m4, m2); b1 = 
_mm_unpacklo_epi64(m1, m5); -#define LOAD_MSG_4_2(b0, b1) b0 = _mm_blend_epi16(m0, m3, 0xF0); b1 = _mm_blend_epi16(m2, m7, 0xF0); -#define LOAD_MSG_4_3(b0, b1) b0 = _mm_blend_epi16(m7, m5, 0xF0); b1 = _mm_blend_epi16(m3, m1, 0xF0); -#define LOAD_MSG_4_4(b0, b1) b0 = _mm_alignr_epi8(m6, m0, 8); b1 = _mm_blend_epi16(m4, m6, 0xF0); -#define LOAD_MSG_5_1(b0, b1) b0 = _mm_unpacklo_epi64(m1, m3); b1 = _mm_unpacklo_epi64(m0, m4); -#define LOAD_MSG_5_2(b0, b1) b0 = _mm_unpacklo_epi64(m6, m5); b1 = _mm_unpackhi_epi64(m5, m1); -#define LOAD_MSG_5_3(b0, b1) b0 = _mm_blend_epi16(m2, m3, 0xF0); b1 = _mm_unpackhi_epi64(m7, m0); -#define LOAD_MSG_5_4(b0, b1) b0 = _mm_unpackhi_epi64(m6, m2); b1 = _mm_blend_epi16(m7, m4, 0xF0); -#define LOAD_MSG_6_1(b0, b1) b0 = _mm_blend_epi16(m6, m0, 0xF0); b1 = _mm_unpacklo_epi64(m7, m2); -#define LOAD_MSG_6_2(b0, b1) b0 = _mm_unpackhi_epi64(m2, m7); b1 = _mm_alignr_epi8(m5, m6, 8); -#define LOAD_MSG_6_3(b0, b1) b0 = _mm_unpacklo_epi64(m0, m3); b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); -#define LOAD_MSG_6_4(b0, b1) b0 = _mm_unpackhi_epi64(m3, m1); b1 = _mm_blend_epi16(m1, m5, 0xF0); -#define LOAD_MSG_7_1(b0, b1) b0 = _mm_unpackhi_epi64(m6, m3); b1 = _mm_blend_epi16(m6, m1, 0xF0); -#define LOAD_MSG_7_2(b0, b1) b0 = _mm_alignr_epi8(m7, m5, 8); b1 = _mm_unpackhi_epi64(m0, m4); -#define LOAD_MSG_7_3(b0, b1) b0 = _mm_unpackhi_epi64(m2, m7); b1 = _mm_unpacklo_epi64(m4, m1); -#define LOAD_MSG_7_4(b0, b1) b0 = _mm_unpacklo_epi64(m0, m2); b1 = _mm_unpacklo_epi64(m3, m5); -#define LOAD_MSG_8_1(b0, b1) b0 = _mm_unpacklo_epi64(m3, m7); b1 = _mm_alignr_epi8(m0, m5, 8); -#define LOAD_MSG_8_2(b0, b1) b0 = _mm_unpackhi_epi64(m7, m4); b1 = _mm_alignr_epi8(m4, m1, 8); -#define LOAD_MSG_8_3(b0, b1) b0 = m6; b1 = _mm_alignr_epi8(m5, m0, 8); -#define LOAD_MSG_8_4(b0, b1) b0 = _mm_blend_epi16(m1, m3, 0xF0); b1 = m2; -#define LOAD_MSG_9_1(b0, b1) b0 = _mm_unpacklo_epi64(m5, m4); b1 = _mm_unpackhi_epi64(m3, m0); -#define LOAD_MSG_9_2(b0, b1) b0 = 
_mm_unpacklo_epi64(m1, m2); b1 = _mm_blend_epi16(m3, m2, 0xF0); -#define LOAD_MSG_9_3(b0, b1) b0 = _mm_unpackhi_epi64(m7, m4); b1 = _mm_unpackhi_epi64(m1, m6); -#define LOAD_MSG_9_4(b0, b1) b0 = _mm_alignr_epi8(m7, m5, 8); b1 = _mm_unpacklo_epi64(m6, m0); -#define LOAD_MSG_10_1(b0, b1) b0 = _mm_unpacklo_epi64(m0, m1); b1 = _mm_unpacklo_epi64(m2, m3); -#define LOAD_MSG_10_2(b0, b1) b0 = _mm_unpackhi_epi64(m0, m1); b1 = _mm_unpackhi_epi64(m2, m3); -#define LOAD_MSG_10_3(b0, b1) b0 = _mm_unpacklo_epi64(m4, m5); b1 = _mm_unpacklo_epi64(m6, m7); -#define LOAD_MSG_10_4(b0, b1) b0 = _mm_unpackhi_epi64(m4, m5); b1 = _mm_unpackhi_epi64(m6, m7); -#define LOAD_MSG_11_1(b0, b1) b0 = _mm_unpacklo_epi64(m7, m2); b1 = _mm_unpackhi_epi64(m4, m6); -#define LOAD_MSG_11_2(b0, b1) b0 = _mm_unpacklo_epi64(m5, m4); b1 = _mm_alignr_epi8(m3, m7, 8); -#define LOAD_MSG_11_3(b0, b1) b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); b1 = _mm_unpackhi_epi64(m5, m2); -#define LOAD_MSG_11_4(b0, b1) b0 = _mm_unpacklo_epi64(m6, m1); b1 = _mm_unpackhi_epi64(m3, m1); + #define LOAD_MSG_0_1(b0, b1) b0 = _mm_unpacklo_epi64(m0, m1); b1 = _mm_unpacklo_epi64(m2, m3); + #define LOAD_MSG_0_2(b0, b1) b0 = _mm_unpackhi_epi64(m0, m1); b1 = _mm_unpackhi_epi64(m2, m3); + #define LOAD_MSG_0_3(b0, b1) b0 = _mm_unpacklo_epi64(m4, m5); b1 = _mm_unpacklo_epi64(m6, m7); + #define LOAD_MSG_0_4(b0, b1) b0 = _mm_unpackhi_epi64(m4, m5); b1 = _mm_unpackhi_epi64(m6, m7); + #define LOAD_MSG_1_1(b0, b1) b0 = _mm_unpacklo_epi64(m7, m2); b1 = _mm_unpackhi_epi64(m4, m6); + #define LOAD_MSG_1_2(b0, b1) b0 = _mm_unpacklo_epi64(m5, m4); b1 = _mm_alignr_epi8(m3, m7, 8); + #define LOAD_MSG_1_3(b0, b1) b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); b1 = _mm_unpackhi_epi64(m5, m2); + #define LOAD_MSG_1_4(b0, b1) b0 = _mm_unpacklo_epi64(m6, m1); b1 = _mm_unpackhi_epi64(m3, m1); + #define LOAD_MSG_2_1(b0, b1) b0 = _mm_alignr_epi8(m6, m5, 8); b1 = _mm_unpackhi_epi64(m2, m7); + #define LOAD_MSG_2_2(b0, b1) b0 = 
_mm_unpacklo_epi64(m4, m0); b1 = _mm_blend_epi16(m1, m6, 0xF0); + #define LOAD_MSG_2_3(b0, b1) b0 = _mm_blend_epi16(m5, m1, 0xF0); b1 = _mm_unpackhi_epi64(m3, m4); + #define LOAD_MSG_2_4(b0, b1) b0 = _mm_unpacklo_epi64(m7, m3); b1 = _mm_alignr_epi8(m2, m0, 8); + #define LOAD_MSG_3_1(b0, b1) b0 = _mm_unpackhi_epi64(m3, m1); b1 = _mm_unpackhi_epi64(m6, m5); + #define LOAD_MSG_3_2(b0, b1) b0 = _mm_unpackhi_epi64(m4, m0); b1 = _mm_unpacklo_epi64(m6, m7); + #define LOAD_MSG_3_3(b0, b1) b0 = _mm_blend_epi16(m1, m2, 0xF0); b1 = _mm_blend_epi16(m2, m7, 0xF0); + #define LOAD_MSG_3_4(b0, b1) b0 = _mm_unpacklo_epi64(m3, m5); b1 = _mm_unpacklo_epi64(m0, m4); + #define LOAD_MSG_4_1(b0, b1) b0 = _mm_unpackhi_epi64(m4, m2); b1 = _mm_unpacklo_epi64(m1, m5); + #define LOAD_MSG_4_2(b0, b1) b0 = _mm_blend_epi16(m0, m3, 0xF0); b1 = _mm_blend_epi16(m2, m7, 0xF0); + #define LOAD_MSG_4_3(b0, b1) b0 = _mm_blend_epi16(m7, m5, 0xF0); b1 = _mm_blend_epi16(m3, m1, 0xF0); + #define LOAD_MSG_4_4(b0, b1) b0 = _mm_alignr_epi8(m6, m0, 8); b1 = _mm_blend_epi16(m4, m6, 0xF0); + #define LOAD_MSG_5_1(b0, b1) b0 = _mm_unpacklo_epi64(m1, m3); b1 = _mm_unpacklo_epi64(m0, m4); + #define LOAD_MSG_5_2(b0, b1) b0 = _mm_unpacklo_epi64(m6, m5); b1 = _mm_unpackhi_epi64(m5, m1); + #define LOAD_MSG_5_3(b0, b1) b0 = _mm_blend_epi16(m2, m3, 0xF0); b1 = _mm_unpackhi_epi64(m7, m0); + #define LOAD_MSG_5_4(b0, b1) b0 = _mm_unpackhi_epi64(m6, m2); b1 = _mm_blend_epi16(m7, m4, 0xF0); + #define LOAD_MSG_6_1(b0, b1) b0 = _mm_blend_epi16(m6, m0, 0xF0); b1 = _mm_unpacklo_epi64(m7, m2); + #define LOAD_MSG_6_2(b0, b1) b0 = _mm_unpackhi_epi64(m2, m7); b1 = _mm_alignr_epi8(m5, m6, 8); + #define LOAD_MSG_6_3(b0, b1) b0 = _mm_unpacklo_epi64(m0, m3); b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); + #define LOAD_MSG_6_4(b0, b1) b0 = _mm_unpackhi_epi64(m3, m1); b1 = _mm_blend_epi16(m1, m5, 0xF0); + #define LOAD_MSG_7_1(b0, b1) b0 = _mm_unpackhi_epi64(m6, m3); b1 = _mm_blend_epi16(m6, m1, 0xF0); + #define LOAD_MSG_7_2(b0, b1) b0 
= _mm_alignr_epi8(m7, m5, 8); b1 = _mm_unpackhi_epi64(m0, m4); + #define LOAD_MSG_7_3(b0, b1) b0 = _mm_unpackhi_epi64(m2, m7); b1 = _mm_unpacklo_epi64(m4, m1); + #define LOAD_MSG_7_4(b0, b1) b0 = _mm_unpacklo_epi64(m0, m2); b1 = _mm_unpacklo_epi64(m3, m5); + #define LOAD_MSG_8_1(b0, b1) b0 = _mm_unpacklo_epi64(m3, m7); b1 = _mm_alignr_epi8(m0, m5, 8); + #define LOAD_MSG_8_2(b0, b1) b0 = _mm_unpackhi_epi64(m7, m4); b1 = _mm_alignr_epi8(m4, m1, 8); + #define LOAD_MSG_8_3(b0, b1) b0 = m6; b1 = _mm_alignr_epi8(m5, m0, 8); + #define LOAD_MSG_8_4(b0, b1) b0 = _mm_blend_epi16(m1, m3, 0xF0); b1 = m2; + #define LOAD_MSG_9_1(b0, b1) b0 = _mm_unpacklo_epi64(m5, m4); b1 = _mm_unpackhi_epi64(m3, m0); + #define LOAD_MSG_9_2(b0, b1) b0 = _mm_unpacklo_epi64(m1, m2); b1 = _mm_blend_epi16(m3, m2, 0xF0); + #define LOAD_MSG_9_3(b0, b1) b0 = _mm_unpackhi_epi64(m7, m4); b1 = _mm_unpackhi_epi64(m1, m6); + #define LOAD_MSG_9_4(b0, b1) b0 = _mm_alignr_epi8(m7, m5, 8); b1 = _mm_unpacklo_epi64(m6, m0); + #define LOAD_MSG_10_1(b0, b1) b0 = _mm_unpacklo_epi64(m0, m1); b1 = _mm_unpacklo_epi64(m2, m3); + #define LOAD_MSG_10_2(b0, b1) b0 = _mm_unpackhi_epi64(m0, m1); b1 = _mm_unpackhi_epi64(m2, m3); + #define LOAD_MSG_10_3(b0, b1) b0 = _mm_unpacklo_epi64(m4, m5); b1 = _mm_unpacklo_epi64(m6, m7); + #define LOAD_MSG_10_4(b0, b1) b0 = _mm_unpackhi_epi64(m4, m5); b1 = _mm_unpackhi_epi64(m6, m7); + #define LOAD_MSG_11_1(b0, b1) b0 = _mm_unpacklo_epi64(m7, m2); b1 = _mm_unpackhi_epi64(m4, m6); + #define LOAD_MSG_11_2(b0, b1) b0 = _mm_unpacklo_epi64(m5, m4); b1 = _mm_alignr_epi8(m3, m7, 8); + #define LOAD_MSG_11_3(b0, b1) b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); b1 = _mm_unpackhi_epi64(m5, m2); + #define LOAD_MSG_11_4(b0, b1) b0 = _mm_unpacklo_epi64(m6, m1); b1 = _mm_unpackhi_epi64(m3, m1); #else -#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) -#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) -#define 
LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) -#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) -#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) -#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) -#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) -#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) -#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5) -#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2) -#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7) -#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1) -#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13) -#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12) -#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4) -#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0) -#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2) -#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4) -#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6) -#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8) -#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0) -#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11) -#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15) -#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14) -#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, 
m12); b1 = _mm_set_epi64x(m4, m14) -#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13) -#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9) -#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2) -#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12) -#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1) -#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8) -#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6) -#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11) -#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3) -#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1) -#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4) -#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7) -#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6) -#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3) -#define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12) -#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) -#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) -#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) -#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) -#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) -#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) -#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) -#define 
LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) + #define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2 , m0 ); b1 = _mm_set_epi64x(m6, m4) + #define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3 , m1 ); b1 = _mm_set_epi64x(m7, m5) + #define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8 ); b1 = _mm_set_epi64x(m14, m12) + #define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9 ); b1 = _mm_set_epi64x(m15, m13) + #define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4 , m14); b1 = _mm_set_epi64x(m13, m9) + #define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8 , m10); b1 = _mm_set_epi64x(m6, m15) + #define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0 , m1 ); b1 = _mm_set_epi64x(m5, m11) + #define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2 , m12); b1 = _mm_set_epi64x(m3, m7) + #define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5) + #define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0 , m8 ); b1 = _mm_set_epi64x(m13, m2) + #define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3 , m10); b1 = _mm_set_epi64x(m9, m7) + #define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6 , m14); b1 = _mm_set_epi64x(m4, m1) + #define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3 , m7 ); b1 = _mm_set_epi64x(m11, m13) + #define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1 , m9 ); b1 = _mm_set_epi64x(m14, m12) + #define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5 , m2 ); b1 = _mm_set_epi64x(m15, m4) + #define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6 ); b1 = _mm_set_epi64x(m8, m0) + #define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5 , m9 ); b1 = _mm_set_epi64x(m10, m2) + #define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7 , m0 ); b1 = _mm_set_epi64x(m15, m4) + #define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6) + #define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1 ); b1 = _mm_set_epi64x(m13, m8) + #define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6 , m2 ); b1 = _mm_set_epi64x(m8, m0) + #define 
LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11) + #define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7 , m4 ); b1 = _mm_set_epi64x(m1, m15) + #define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5 , m13); b1 = _mm_set_epi64x(m9, m14) + #define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1 , m12); b1 = _mm_set_epi64x(m4, m14) + #define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5 ); b1 = _mm_set_epi64x(m10, m13) + #define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6 , m0 ); b1 = _mm_set_epi64x(m8, m9) + #define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3 , m7 ); b1 = _mm_set_epi64x(m11, m2) + #define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7 , m13); b1 = _mm_set_epi64x(m3, m12) + #define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1) + #define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5 ); b1 = _mm_set_epi64x(m2, m8) + #define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4 , m0 ); b1 = _mm_set_epi64x(m10, m6) + #define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6 ); b1 = _mm_set_epi64x(m0, m11) + #define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9 , m15); b1 = _mm_set_epi64x(m8, m3) + #define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1) + #define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7 , m2 ); b1 = _mm_set_epi64x(m5, m4) + #define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8 , m10); b1 = _mm_set_epi64x(m1, m7) + #define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4 , m2 ); b1 = _mm_set_epi64x(m5, m6) + #define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9 , m15); b1 = _mm_set_epi64x(m13, m3) + #define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12) + #define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2 , m0 ); b1 = _mm_set_epi64x(m6, m4) + #define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3 , m1 ); b1 = _mm_set_epi64x(m7, m5) + #define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8 ); b1 = _mm_set_epi64x(m14, m12) + #define 
LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9 ); b1 = _mm_set_epi64x(m15, m13) + #define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4 , m14); b1 = _mm_set_epi64x(m13, m9) + #define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8 , m10); b1 = _mm_set_epi64x(m6, m15) + #define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0 , m1 ); b1 = _mm_set_epi64x(m5, m11) + #define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2 , m12); b1 = _mm_set_epi64x(m3, m7) #endif #if defined(HAVE_SSSE_3) && !defined(HAVE_XOP) -#define _mm_roti_epi64(x, c) \ - (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ - : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ - : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ - : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) #elif !defined(HAVE_SSSE_3) && !defined(HAVE_XOP) -#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-(c)) )) + #define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c)))) #endif #define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ - \ - row4l = _mm_xor_si128(row4l, row1l); \ - row4h = _mm_xor_si128(row4h, row1h); \ - \ - row4l = _mm_roti_epi64(row4l, -32); \ - row4h = _mm_roti_epi64(row4h, -32); \ - \ - row3l = _mm_add_epi64(row3l, row4l); \ - row3h = _mm_add_epi64(row3h, row4h); \ - \ - row2l = _mm_xor_si128(row2l, row3l); \ - row2h = _mm_xor_si128(row2h, row3h); \ - \ - row2l = _mm_roti_epi64(row2l, -24); \ - row2h = 
_mm_roti_epi64(row2h, -24); \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -24); \ + row2h = _mm_roti_epi64(row2h, -24); \ #define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ - \ - row4l = _mm_xor_si128(row4l, row1l); \ - row4h = _mm_xor_si128(row4h, row1h); \ - \ - row4l = _mm_roti_epi64(row4l, -16); \ - row4h = _mm_roti_epi64(row4h, -16); \ - \ - row3l = _mm_add_epi64(row3l, row4l); \ - row3h = _mm_add_epi64(row3h, row4h); \ - \ - row2l = _mm_xor_si128(row2l, row3l); \ - row2h = _mm_xor_si128(row2h, row3h); \ - \ - row2l = _mm_roti_epi64(row2l, -63); \ - row2h = _mm_roti_epi64(row2h, -63); \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -63); \ + row2h = _mm_roti_epi64(row2h, -63); \ #if defined(HAVE_SSSE_3) #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ - t0 = _mm_alignr_epi8(row2h, row2l, 8); \ - t1 = _mm_alignr_epi8(row2l, row2h, 8); \ - row2l = t0; \ - row2h = t1; \ - \ - t0 = row3l; \ - row3l = row3h; \ - row3h = 
t0; \ - \ - t0 = _mm_alignr_epi8(row4h, row4l, 8); \ - t1 = _mm_alignr_epi8(row4l, row4h, 8); \ - row4l = t1; \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ row4h = t0; #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ - t0 = _mm_alignr_epi8(row2l, row2h, 8); \ - t1 = _mm_alignr_epi8(row2h, row2l, 8); \ - row2l = t0; \ - row2h = t1; \ - \ - t0 = row3l; \ - row3l = row3h; \ - row3h = t0; \ - \ - t0 = _mm_alignr_epi8(row4l, row4h, 8); \ - t1 = _mm_alignr_epi8(row4h, row4l, 8); \ - row4l = t1; \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ row4h = t0; #else -#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ - t0 = row4l;\ - t1 = row2l;\ - row4l = row3l;\ - row3l = row3h;\ - row3h = row4l;\ - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \ - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \ +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row4l; \ + t1 = row2l; \ + row4l = row3l; \ + row3l = row3h; \ + row3h = row4l; \ + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \ + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \ row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \ row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)) #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ - t0 = row3l;\ - row3l = row3h;\ - row3h = t0;\ - t0 = row2l;\ - t1 = row4l;\ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + t0 = row2l; \ + t1 
= row4l; \ row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \ - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \ + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \ row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \ row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)) #endif -#define ROUND(r) \ - LOAD_MSG_ ##r ##_1(b0, b1); \ - G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - LOAD_MSG_ ##r ##_2(b0, b1); \ - G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_2(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ - LOAD_MSG_ ##r ##_3(b0, b1); \ - G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - LOAD_MSG_ ##r ##_4(b0, b1); \ - G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_3(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_4(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); -template < bool bswap > -static void blake2_compress(blake2b_context * ctx, const uint8_t * in) { - __m128i row1l, row1h; - __m128i row2l, row2h; - __m128i row3l, row3h; - __m128i row4l, row4h; - __m128i b0, b1; - __m128i t0, t1; - - const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 ); - const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 ); - - const __m128i m0 = bswap ? mm_bswap64(LOADU(in + 00)) : LOADU( in + 00 ); - const __m128i m1 = bswap ? mm_bswap64(LOADU(in + 16)) : LOADU( in + 16 ); - const __m128i m2 = bswap ? mm_bswap64(LOADU(in + 32)) : LOADU( in + 32 ); - const __m128i m3 = bswap ? 
mm_bswap64(LOADU(in + 48)) : LOADU( in + 48 ); - const __m128i m4 = bswap ? mm_bswap64(LOADU(in + 64)) : LOADU( in + 64 ); - const __m128i m5 = bswap ? mm_bswap64(LOADU(in + 80)) : LOADU( in + 80 ); - const __m128i m6 = bswap ? mm_bswap64(LOADU(in + 96)) : LOADU( in + 96 ); - const __m128i m7 = bswap ? mm_bswap64(LOADU(in + 112)) : LOADU( in + 112 ); - - row1l = LOADU( &(ctx->h[0]) ); - row1h = LOADU( &(ctx->h[2]) ); - row2l = LOADU( &(ctx->h[4]) ); - row2h = LOADU( &(ctx->h[6]) ); - row3l = LOADU( &blake2b_IV[0] ); - row3h = LOADU( &blake2b_IV[2] ); - row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &(ctx->t[0]) ) ); - row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &(ctx->f[0]) ) ); - - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - ROUND( 10 ); - ROUND( 11 ); - - row1l = _mm_xor_si128( row3l, row1l ); - row1h = _mm_xor_si128( row3h, row1h ); - STOREU( &(ctx->h[0]), _mm_xor_si128( LOADU( &(ctx->h[0]) ), row1l ) ); - STOREU( &(ctx->h[2]), _mm_xor_si128( LOADU( &(ctx->h[2]) ), row1h ) ); - row2l = _mm_xor_si128( row4l, row2l ); - row2h = _mm_xor_si128( row4h, row2h ); - STOREU( &(ctx->h[4]), _mm_xor_si128( LOADU( &(ctx->h[4]) ), row2l ) ); - STOREU( &(ctx->h[6]), _mm_xor_si128( LOADU( &(ctx->h[6]) ), row2h ) ); +template +static void blake2_compress( blake2b_context * ctx, const uint8_t * in ) { + __m128i row1l, row1h; + __m128i row2l, row2h; + __m128i row3l, row3h; + __m128i row4l, row4h; + __m128i b0, b1; + __m128i t0, t1; + + const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); + + const __m128i m0 = bswap ? mm_bswap64(LOADU(in + 00)) : LOADU(in + 00); + const __m128i m1 = bswap ? mm_bswap64(LOADU(in + 16)) : LOADU(in + 16); + const __m128i m2 = bswap ? mm_bswap64(LOADU(in + 32)) : LOADU(in + 32); + const __m128i m3 = bswap ? 
mm_bswap64(LOADU(in + 48)) : LOADU(in + 48); + const __m128i m4 = bswap ? mm_bswap64(LOADU(in + 64)) : LOADU(in + 64); + const __m128i m5 = bswap ? mm_bswap64(LOADU(in + 80)) : LOADU(in + 80); + const __m128i m6 = bswap ? mm_bswap64(LOADU(in + 96)) : LOADU(in + 96); + const __m128i m7 = bswap ? mm_bswap64(LOADU(in + 112)) : LOADU(in + 112); + + row1l = LOADU(&(ctx->h [0])); + row1h = LOADU(&(ctx->h [2])); + row2l = LOADU(&(ctx->h [4])); + row2h = LOADU(&(ctx->h [6])); + row3l = LOADU(&blake2b_IV[0] ); + row3h = LOADU(&blake2b_IV[2] ); + row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&(ctx->t[0]))); + row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&(ctx->f[0]))); + + ROUND( 0); + ROUND( 1); + ROUND( 2); + ROUND( 3); + ROUND( 4); + ROUND( 5); + ROUND( 6); + ROUND( 7); + ROUND( 8); + ROUND( 9); + ROUND(10); + ROUND(11); + + row1l = _mm_xor_si128(row3l, row1l); + row1h = _mm_xor_si128(row3h, row1h); + STOREU(&(ctx->h[0]), _mm_xor_si128(LOADU(&(ctx->h[0])), row1l)); + STOREU(&(ctx->h[2]), _mm_xor_si128(LOADU(&(ctx->h[2])), row1h)); + row2l = _mm_xor_si128(row4l, row2l); + row2h = _mm_xor_si128(row4h, row2h); + STOREU(&(ctx->h[4]), _mm_xor_si128(LOADU(&(ctx->h[4])), row2l)); + STOREU(&(ctx->h[6]), _mm_xor_si128(LOADU(&(ctx->h[6])), row2h)); } #undef G1 @@ -348,12 +348,12 @@ static void blake2_compress(blake2b_context * ctx, const uint8_t * in) { #if defined(HAVE_XOP) -#define TOB(x) ((x)*4*0x01010101 + 0x03020100) /* ..or not TOB */ + #define TOB(x) ((x) * 4 * 0x01010101 + 0x03020100) /* ..or not TOB */ -#define LOAD_MSG_0_1(buf) buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) ); -#define LOAD_MSG_0_2(buf) buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) ); -#define LOAD_MSG_0_3(buf) buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(4),TOB(2),TOB(0),TOB(6)) ); -#define LOAD_MSG_0_4(buf) buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(5),TOB(3),TOB(1),TOB(7)) ); + #define LOAD_MSG_0_1(buf) buf = _mm_perm_epi8(m0, m1, 
_mm_set_epi32(TOB(6), TOB(4), TOB(2), TOB(0))); + #define LOAD_MSG_0_2(buf) buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(7), TOB(5), TOB(3), TOB(1))); + #define LOAD_MSG_0_3(buf) buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(4), TOB(2), TOB(0), TOB(6))); + #define LOAD_MSG_0_4(buf) buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(5), TOB(3), TOB(1), TOB(7))); #define LOAD_MSG_1_1(buf) t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(5),TOB(0),TOB(0)) ); \ buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) ); #define LOAD_MSG_1_2(buf) t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(2),TOB(0),TOB(4),TOB(6)) ); \ @@ -371,7 +371,7 @@ static void blake2_compress(blake2b_context * ctx, const uint8_t * in) { #define LOAD_MSG_2_4(buf) t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(1),TOB(6),TOB(0)) ); \ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(2),TOB(1),TOB(6),TOB(3)) ); #define LOAD_MSG_3_1(buf) t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(3),TOB(7)) ); \ - t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); \ + t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); \ buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(1),TOB(0)) ); #define LOAD_MSG_3_2(buf) t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(1),TOB(5)) ); \ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(1),TOB(0)) ); @@ -384,7 +384,7 @@ static void blake2_compress(blake2b_context * ctx, const uint8_t * in) { #define LOAD_MSG_4_2(buf) t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(7),TOB(0)) ); \ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); #define LOAD_MSG_4_3(buf) t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(6),TOB(0),TOB(0)) ); \ - t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); \ + t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); \ buf = _mm_perm_epi8(t0, m3, 
_mm_set_epi32(TOB(2),TOB(1),TOB(6),TOB(3)) ); #define LOAD_MSG_4_4(buf) t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(4),TOB(0),TOB(1)) ); \ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(2),TOB(4),TOB(0),TOB(5)) ); @@ -409,21 +409,21 @@ static void blake2_compress(blake2b_context * ctx, const uint8_t * in) { #define LOAD_MSG_7_2(buf) t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(5),TOB(1),TOB(0),TOB(7)) ); \ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) ); #define LOAD_MSG_7_3(buf) t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(2),TOB(0),TOB(0),TOB(5)) ); \ - t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(0)) ); \ + t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(0)) ); \ buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(2),TOB(7),TOB(0),TOB(3)) ); #define LOAD_MSG_7_4(buf) t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(6),TOB(4),TOB(0)) ); \ buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(2),TOB(1),TOB(0),TOB(6)) ); #define LOAD_MSG_8_1(buf) t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \ - t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); \ + t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); \ buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) ); #define LOAD_MSG_8_2(buf) t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(4),TOB(3),TOB(5),TOB(0)) ); \ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(7)) ); #define LOAD_MSG_8_3(buf) t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(6),TOB(1),TOB(0),TOB(0)) ); \ buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(2),TOB(5),TOB(4),TOB(3)) ); -#define LOAD_MSG_8_4(buf) buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(7),TOB(2),TOB(5)) ); + #define LOAD_MSG_8_4(buf) buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4), TOB(7), TOB(2), TOB(5))); #define LOAD_MSG_9_1(buf) t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(7),TOB(0),TOB(0)) ); \ buf = 
_mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(4),TOB(6)) ); -#define LOAD_MSG_9_2(buf) buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(6),TOB(4),TOB(2)) ); + #define LOAD_MSG_9_2(buf) buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5), TOB(6), TOB(4), TOB(2))); #define LOAD_MSG_9_3(buf) t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(3),TOB(5),TOB(0)) ); \ buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(2),TOB(1),TOB(7),TOB(5)) ); #define LOAD_MSG_9_4(buf) t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(7)) ); \ @@ -431,270 +431,270 @@ static void blake2_compress(blake2b_context * ctx, const uint8_t * in) { #elif defined(HAVE_SSE_4_1) -#define LOAD_MSG_0_1(buf) buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0))); -#define LOAD_MSG_0_2(buf) buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1))); + #define LOAD_MSG_0_1(buf) buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2, 0, 2, 0))); + #define LOAD_MSG_0_2(buf) buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3, 1, 3, 1))); #define LOAD_MSG_0_3(buf) t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(3,2,0,1)); \ - t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,1,3,2)); \ + t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,1,3,2)); \ buf = _mm_blend_epi16(t0, t1, 0xC3); -#define LOAD_MSG_0_4(buf) t0 = _mm_blend_epi16(t0, t1, 0x3C); \ +#define LOAD_MSG_0_4(buf) t0 = _mm_blend_epi16(t0, t1, 0x3C); \ buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(2,3,0,1)); #define LOAD_MSG_1_1(buf) t0 = _mm_blend_epi16(m1, m2, 0x0C); \ - t1 = _mm_slli_si128(m3, 4); \ - t2 = _mm_blend_epi16(t0, t1, 0xF0); \ + t1 = _mm_slli_si128(m3, 4); \ + t2 = _mm_blend_epi16(t0, t1, 0xF0); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); #define LOAD_MSG_1_2(buf) t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); \ - t1 = _mm_blend_epi16(m1,m3,0xC0); \ - t2 = _mm_blend_epi16(t0, t1, 0xF0); \ + t1 = _mm_blend_epi16(m1,m3,0xC0); \ + t2 = _mm_blend_epi16(t0, t1, 0xF0); \ buf = _mm_shuffle_epi32(t2, 
_MM_SHUFFLE(2,3,0,1)); #define LOAD_MSG_1_3(buf) t0 = _mm_slli_si128(m1, 4); \ - t1 = _mm_blend_epi16(m2, t0, 0x30); \ - t2 = _mm_blend_epi16(m0, t1, 0xF0); \ + t1 = _mm_blend_epi16(m2, t0, 0x30); \ + t2 = _mm_blend_epi16(m0, t1, 0xF0); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,0,1,2)); #define LOAD_MSG_1_4(buf) t0 = _mm_unpackhi_epi32(m0,m1); \ - t1 = _mm_slli_si128(m3, 4); \ - t2 = _mm_blend_epi16(t0, t1, 0x0C); \ + t1 = _mm_slli_si128(m3, 4); \ + t2 = _mm_blend_epi16(t0, t1, 0x0C); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,0,1,2)); -#define LOAD_MSG_2_1(buf) t0 = _mm_unpackhi_epi32(m2,m3); \ - t1 = _mm_blend_epi16(m3,m1,0x0C); \ - t2 = _mm_blend_epi16(t0, t1, 0x0F); \ +#define LOAD_MSG_2_1(buf) t0 = _mm_unpackhi_epi32(m2,m3); \ + t1 = _mm_blend_epi16(m3,m1,0x0C); \ + t2 = _mm_blend_epi16(t0, t1, 0x0F); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); -#define LOAD_MSG_2_2(buf) t0 = _mm_unpacklo_epi32(m2,m0); \ - t1 = _mm_blend_epi16(t0, m0, 0xF0); \ - t2 = _mm_slli_si128(m3, 8); \ +#define LOAD_MSG_2_2(buf) t0 = _mm_unpacklo_epi32(m2,m0); \ + t1 = _mm_blend_epi16(t0, m0, 0xF0); \ + t2 = _mm_slli_si128(m3, 8); \ buf = _mm_blend_epi16(t1, t2, 0xC0); -#define LOAD_MSG_2_3(buf) t0 = _mm_blend_epi16(m0, m2, 0x3C); \ - t1 = _mm_srli_si128(m1, 12); \ - t2 = _mm_blend_epi16(t0,t1,0x03); \ +#define LOAD_MSG_2_3(buf) t0 = _mm_blend_epi16(m0, m2, 0x3C); \ + t1 = _mm_srli_si128(m1, 12); \ + t2 = _mm_blend_epi16(t0,t1,0x03); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,3,2,1)); -#define LOAD_MSG_2_4(buf) t0 = _mm_slli_si128(m3, 4); \ - t1 = _mm_blend_epi16(m0, m1, 0x33); \ - t2 = _mm_blend_epi16(t1, t0, 0xC0); \ +#define LOAD_MSG_2_4(buf) t0 = _mm_slli_si128(m3, 4); \ + t1 = _mm_blend_epi16(m0, m1, 0x33); \ + t2 = _mm_blend_epi16(t1, t0, 0xC0); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0)); -#define LOAD_MSG_3_1(buf) t0 = _mm_unpackhi_epi32(m0,m1); \ - t1 = _mm_unpackhi_epi32(t0, m2); \ - t2 = _mm_blend_epi16(t1, m3, 0x0C); \ +#define LOAD_MSG_3_1(buf) t0 = 
_mm_unpackhi_epi32(m0,m1); \ + t1 = _mm_unpackhi_epi32(t0, m2); \ + t2 = _mm_blend_epi16(t1, m3, 0x0C); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); -#define LOAD_MSG_3_2(buf) t0 = _mm_slli_si128(m2, 8); \ - t1 = _mm_blend_epi16(m3,m0,0x0C); \ - t2 = _mm_blend_epi16(t1, t0, 0xC0); \ +#define LOAD_MSG_3_2(buf) t0 = _mm_slli_si128(m2, 8); \ + t1 = _mm_blend_epi16(m3,m0,0x0C); \ + t2 = _mm_blend_epi16(t1, t0, 0xC0); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); -#define LOAD_MSG_3_3(buf) t0 = _mm_blend_epi16(m0,m1,0x0F); \ - t1 = _mm_blend_epi16(t0, m3, 0xC0); \ +#define LOAD_MSG_3_3(buf) t0 = _mm_blend_epi16(m0,m1,0x0F); \ + t1 = _mm_blend_epi16(t0, m3, 0xC0); \ buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0,1,2,3)); -#define LOAD_MSG_3_4(buf) t0 = _mm_alignr_epi8(m0, m1, 4); \ +#define LOAD_MSG_3_4(buf) t0 = _mm_alignr_epi8(m0, m1, 4); \ buf = _mm_blend_epi16(t0, m2, 0x33); #define LOAD_MSG_4_1(buf) t0 = _mm_unpacklo_epi64(m1,m2); \ - t1 = _mm_unpackhi_epi64(m0,m2); \ - t2 = _mm_blend_epi16(t0,t1,0x33); \ + t1 = _mm_unpackhi_epi64(m0,m2); \ + t2 = _mm_blend_epi16(t0,t1,0x33); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); -#define LOAD_MSG_4_2(buf) t0 = _mm_unpackhi_epi64(m1,m3); \ - t1 = _mm_unpacklo_epi64(m0,m1); \ +#define LOAD_MSG_4_2(buf) t0 = _mm_unpackhi_epi64(m1,m3); \ + t1 = _mm_unpacklo_epi64(m0,m1); \ buf = _mm_blend_epi16(t0,t1,0x33); #define LOAD_MSG_4_3(buf) t0 = _mm_unpackhi_epi64(m3,m1); \ - t1 = _mm_unpackhi_epi64(m2,m0); \ - t2 = _mm_blend_epi16(t1,t0,0x33); \ + t1 = _mm_unpackhi_epi64(m2,m0); \ + t2 = _mm_blend_epi16(t1,t0,0x33); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); -#define LOAD_MSG_4_4(buf) t0 = _mm_blend_epi16(m0,m2,0x03); \ - t1 = _mm_slli_si128(t0, 8); \ - t2 = _mm_blend_epi16(t1,m3,0x0F); \ +#define LOAD_MSG_4_4(buf) t0 = _mm_blend_epi16(m0,m2,0x03); \ + t1 = _mm_slli_si128(t0, 8); \ + t2 = _mm_blend_epi16(t1,m3,0x0F); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,3,1)); -#define LOAD_MSG_5_1(buf) t0 = 
_mm_unpackhi_epi32(m0,m1); \ - t1 = _mm_unpacklo_epi32(m0,m2); \ +#define LOAD_MSG_5_1(buf) t0 = _mm_unpackhi_epi32(m0,m1); \ + t1 = _mm_unpacklo_epi32(m0,m2); \ buf = _mm_unpacklo_epi64(t0,t1); -#define LOAD_MSG_5_2(buf) t0 = _mm_srli_si128(m2, 4); \ - t1 = _mm_blend_epi16(m0,m3,0x03); \ +#define LOAD_MSG_5_2(buf) t0 = _mm_srli_si128(m2, 4); \ + t1 = _mm_blend_epi16(m0,m3,0x03); \ buf = _mm_blend_epi16(t1,t0,0x3C); #define LOAD_MSG_5_3(buf) t0 = _mm_blend_epi16(m1,m0,0x0C); \ - t1 = _mm_srli_si128(m3, 4); \ - t2 = _mm_blend_epi16(t0,t1,0x30); \ + t1 = _mm_srli_si128(m3, 4); \ + t2 = _mm_blend_epi16(t0,t1,0x30); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); #define LOAD_MSG_5_4(buf) t0 = _mm_unpacklo_epi64(m2,m1); \ - t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(2,0,1,0)); \ - t2 = _mm_srli_si128(t0, 4); \ + t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(2,0,1,0)); \ + t2 = _mm_srli_si128(t0, 4); \ buf = _mm_blend_epi16(t1,t2,0x33); #define LOAD_MSG_6_1(buf) t0 = _mm_slli_si128(m1, 12); \ - t1 = _mm_blend_epi16(m0,m3,0x33); \ + t1 = _mm_blend_epi16(m0,m3,0x33); \ buf = _mm_blend_epi16(t1,t0,0xC0); #define LOAD_MSG_6_2(buf) t0 = _mm_blend_epi16(m3,m2,0x30); \ - t1 = _mm_srli_si128(m1, 4); \ - t2 = _mm_blend_epi16(t0,t1,0x03); \ + t1 = _mm_srli_si128(m1, 4); \ + t2 = _mm_blend_epi16(t0,t1,0x03); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0)); -#define LOAD_MSG_6_3(buf) t0 = _mm_unpacklo_epi64(m0,m2); \ - t1 = _mm_srli_si128(m1, 4); \ - t2 = _mm_blend_epi16(t0,t1,0x0C); \ +#define LOAD_MSG_6_3(buf) t0 = _mm_unpacklo_epi64(m0,m2); \ + t1 = _mm_srli_si128(m1, 4); \ + t2 = _mm_blend_epi16(t0,t1,0x0C); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); #define LOAD_MSG_6_4(buf) t0 = _mm_unpackhi_epi32(m1,m2); \ - t1 = _mm_unpackhi_epi64(m0,t0); \ + t1 = _mm_unpackhi_epi64(m0,t0); \ buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0,1,2,3)); -#define LOAD_MSG_7_1(buf) t0 = _mm_unpackhi_epi32(m0,m1); \ - t1 = _mm_blend_epi16(t0,m3,0x0F); \ +#define LOAD_MSG_7_1(buf) t0 = 
_mm_unpackhi_epi32(m0,m1); \ + t1 = _mm_blend_epi16(t0,m3,0x0F); \ buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1)); #define LOAD_MSG_7_2(buf) t0 = _mm_blend_epi16(m2,m3,0x30); \ - t1 = _mm_srli_si128(m0,4); \ - t2 = _mm_blend_epi16(t0,t1,0x03); \ + t1 = _mm_srli_si128(m0,4); \ + t2 = _mm_blend_epi16(t0,t1,0x03); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3)); #define LOAD_MSG_7_3(buf) t0 = _mm_unpackhi_epi64(m0,m3); \ - t1 = _mm_unpacklo_epi64(m1,m2); \ - t2 = _mm_blend_epi16(t0,t1,0x3C); \ + t1 = _mm_unpacklo_epi64(m1,m2); \ + t2 = _mm_blend_epi16(t0,t1,0x3C); \ buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(2,3,1,0)); -#define LOAD_MSG_7_4(buf) t0 = _mm_unpacklo_epi32(m0,m1); \ - t1 = _mm_unpackhi_epi32(m1,m2); \ - t2 = _mm_unpacklo_epi64(t0,t1); \ +#define LOAD_MSG_7_4(buf) t0 = _mm_unpacklo_epi32(m0,m1); \ + t1 = _mm_unpackhi_epi32(m1,m2); \ + t2 = _mm_unpacklo_epi64(t0,t1); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); -#define LOAD_MSG_8_1(buf) t0 = _mm_unpackhi_epi32(m1,m3); \ - t1 = _mm_unpacklo_epi64(t0,m0); \ - t2 = _mm_blend_epi16(t1,m2,0xC0); \ +#define LOAD_MSG_8_1(buf) t0 = _mm_unpackhi_epi32(m1,m3); \ + t1 = _mm_unpacklo_epi64(t0,m0); \ + t2 = _mm_blend_epi16(t1,m2,0xC0); \ buf = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2)); -#define LOAD_MSG_8_2(buf) t0 = _mm_unpackhi_epi32(m0,m3); \ - t1 = _mm_blend_epi16(m2,t0,0xF0); \ +#define LOAD_MSG_8_2(buf) t0 = _mm_unpackhi_epi32(m0,m3); \ + t1 = _mm_blend_epi16(m2,t0,0xF0); \ buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3)); -#define LOAD_MSG_8_3(buf) t0 = _mm_unpacklo_epi64(m0,m3); \ - t1 = _mm_srli_si128(m2,8); \ - t2 = _mm_blend_epi16(t0,t1,0x03); \ +#define LOAD_MSG_8_3(buf) t0 = _mm_unpacklo_epi64(m0,m3); \ + t1 = _mm_srli_si128(m2,8); \ + t2 = _mm_blend_epi16(t0,t1,0x03); \ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,3,2,0)); #define LOAD_MSG_8_4(buf) t0 = _mm_blend_epi16(m1,m0,0x30); \ buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(0,3,2,1)); -#define LOAD_MSG_9_1(buf) t0 = _mm_blend_epi16(m0,m2,0x03); \ 
- t1 = _mm_blend_epi16(m1,m2,0x30); \ - t2 = _mm_blend_epi16(t1,t0,0x0F); \ +#define LOAD_MSG_9_1(buf) t0 = _mm_blend_epi16(m0,m2,0x03); \ + t1 = _mm_blend_epi16(m1,m2,0x30); \ + t2 = _mm_blend_epi16(t1,t0,0x0F); \ buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2)); -#define LOAD_MSG_9_2(buf) t0 = _mm_slli_si128(m0,4); \ - t1 = _mm_blend_epi16(m1,t0,0xC0); \ +#define LOAD_MSG_9_2(buf) t0 = _mm_slli_si128(m0,4); \ + t1 = _mm_blend_epi16(m1,t0,0xC0); \ buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3)); -#define LOAD_MSG_9_3(buf) t0 = _mm_unpackhi_epi32(m0,m3); \ - t1 = _mm_unpacklo_epi32(m2,m3); \ - t2 = _mm_unpackhi_epi64(t0,t1); \ +#define LOAD_MSG_9_3(buf) t0 = _mm_unpackhi_epi32(m0,m3); \ + t1 = _mm_unpacklo_epi32(m2,m3); \ + t2 = _mm_unpackhi_epi64(t0,t1); \ buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,1,3)); -#define LOAD_MSG_9_4(buf) t0 = _mm_blend_epi16(m3,m2,0xC0); \ - t1 = _mm_unpacklo_epi32(m0,m3); \ - t2 = _mm_blend_epi16(t0,t1,0x0F); \ +#define LOAD_MSG_9_4(buf) t0 = _mm_blend_epi16(m3,m2,0xC0); \ + t1 = _mm_unpacklo_epi32(m0,m3); \ + t2 = _mm_blend_epi16(t0,t1,0x0F); \ buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,2,3,0)); #else -#define LOAD_MSG_0_1(buf) buf = _mm_set_epi32(m6,m4,m2,m0) -#define LOAD_MSG_0_2(buf) buf = _mm_set_epi32(m7,m5,m3,m1) -#define LOAD_MSG_0_3(buf) buf = _mm_set_epi32(m12,m10,m8,m14) -#define LOAD_MSG_0_4(buf) buf = _mm_set_epi32(m13,m11,m9,m15) -#define LOAD_MSG_1_1(buf) buf = _mm_set_epi32(m13,m9,m4,m14) -#define LOAD_MSG_1_2(buf) buf = _mm_set_epi32(m6,m15,m8,m10) -#define LOAD_MSG_1_3(buf) buf = _mm_set_epi32(m11,m0,m1,m5) -#define LOAD_MSG_1_4(buf) buf = _mm_set_epi32(m7,m2,m12,m3) -#define LOAD_MSG_2_1(buf) buf = _mm_set_epi32(m15,m5,m12,m11) -#define LOAD_MSG_2_2(buf) buf = _mm_set_epi32(m13,m2,m0,m8) -#define LOAD_MSG_2_3(buf) buf = _mm_set_epi32(m7,m3,m10,m9) -#define LOAD_MSG_2_4(buf) buf = _mm_set_epi32(m1,m6,m14,m4) -#define LOAD_MSG_3_1(buf) buf = _mm_set_epi32(m11,m13,m3,m7) -#define LOAD_MSG_3_2(buf) buf = 
_mm_set_epi32(m14,m12,m1,m9) -#define LOAD_MSG_3_3(buf) buf = _mm_set_epi32(m4,m5,m2,m15) -#define LOAD_MSG_3_4(buf) buf = _mm_set_epi32(m0,m10,m6,m8) -#define LOAD_MSG_4_1(buf) buf = _mm_set_epi32(m10,m2,m5,m9) -#define LOAD_MSG_4_2(buf) buf = _mm_set_epi32(m15,m4,m7,m0) -#define LOAD_MSG_4_3(buf) buf = _mm_set_epi32(m6,m11,m14,m3) -#define LOAD_MSG_4_4(buf) buf = _mm_set_epi32(m8,m12,m1,m13) -#define LOAD_MSG_5_1(buf) buf = _mm_set_epi32(m8,m0,m6,m2) -#define LOAD_MSG_5_2(buf) buf = _mm_set_epi32(m3,m11,m10,m12) -#define LOAD_MSG_5_3(buf) buf = _mm_set_epi32(m15,m7,m4,m1) -#define LOAD_MSG_5_4(buf) buf = _mm_set_epi32(m14,m5,m13,m9) -#define LOAD_MSG_6_1(buf) buf = _mm_set_epi32(m4,m14,m1,m12) -#define LOAD_MSG_6_2(buf) buf = _mm_set_epi32(m10,m13,m15,m5) -#define LOAD_MSG_6_3(buf) buf = _mm_set_epi32(m9,m6,m0,m8) -#define LOAD_MSG_6_4(buf) buf = _mm_set_epi32(m2,m3,m7,m11) -#define LOAD_MSG_7_1(buf) buf = _mm_set_epi32(m3,m12,m7,m13) -#define LOAD_MSG_7_2(buf) buf = _mm_set_epi32(m9,m1,m14,m11) -#define LOAD_MSG_7_3(buf) buf = _mm_set_epi32(m8,m15,m5,m2) -#define LOAD_MSG_7_4(buf) buf = _mm_set_epi32(m6,m4,m0,m10) -#define LOAD_MSG_8_1(buf) buf = _mm_set_epi32(m0,m11,m14,m6) -#define LOAD_MSG_8_2(buf) buf = _mm_set_epi32(m8,m3,m9,m15) -#define LOAD_MSG_8_3(buf) buf = _mm_set_epi32(m1,m13,m12,m10) -#define LOAD_MSG_8_4(buf) buf = _mm_set_epi32(m4,m7,m2,m5) -#define LOAD_MSG_9_1(buf) buf = _mm_set_epi32(m1,m7,m8,m10) -#define LOAD_MSG_9_2(buf) buf = _mm_set_epi32(m5,m6,m4,m2) -#define LOAD_MSG_9_3(buf) buf = _mm_set_epi32(m3,m9,m15,m13) -#define LOAD_MSG_9_4(buf) buf = _mm_set_epi32(m12,m14,m11,m0) + #define LOAD_MSG_0_1(buf) buf = _mm_set_epi32(m6 , m4 , m2 , m0 ) + #define LOAD_MSG_0_2(buf) buf = _mm_set_epi32(m7 , m5 , m3 , m1 ) + #define LOAD_MSG_0_3(buf) buf = _mm_set_epi32(m12, m10, m8 , m14) + #define LOAD_MSG_0_4(buf) buf = _mm_set_epi32(m13, m11, m9 , m15) + #define LOAD_MSG_1_1(buf) buf = _mm_set_epi32(m13, m9 , m4 , m14) + #define LOAD_MSG_1_2(buf) buf 
= _mm_set_epi32(m6 , m15, m8 , m10) + #define LOAD_MSG_1_3(buf) buf = _mm_set_epi32(m11, m0 , m1 , m5 ) + #define LOAD_MSG_1_4(buf) buf = _mm_set_epi32(m7 , m2 , m12, m3 ) + #define LOAD_MSG_2_1(buf) buf = _mm_set_epi32(m15, m5 , m12, m11) + #define LOAD_MSG_2_2(buf) buf = _mm_set_epi32(m13, m2 , m0 , m8 ) + #define LOAD_MSG_2_3(buf) buf = _mm_set_epi32(m7 , m3 , m10, m9 ) + #define LOAD_MSG_2_4(buf) buf = _mm_set_epi32(m1 , m6 , m14, m4 ) + #define LOAD_MSG_3_1(buf) buf = _mm_set_epi32(m11, m13, m3 , m7 ) + #define LOAD_MSG_3_2(buf) buf = _mm_set_epi32(m14, m12, m1 , m9 ) + #define LOAD_MSG_3_3(buf) buf = _mm_set_epi32(m4 , m5 , m2 , m15) + #define LOAD_MSG_3_4(buf) buf = _mm_set_epi32(m0 , m10, m6 , m8 ) + #define LOAD_MSG_4_1(buf) buf = _mm_set_epi32(m10, m2 , m5 , m9 ) + #define LOAD_MSG_4_2(buf) buf = _mm_set_epi32(m15, m4 , m7 , m0 ) + #define LOAD_MSG_4_3(buf) buf = _mm_set_epi32(m6 , m11, m14, m3 ) + #define LOAD_MSG_4_4(buf) buf = _mm_set_epi32(m8 , m12, m1 , m13) + #define LOAD_MSG_5_1(buf) buf = _mm_set_epi32(m8 , m0 , m6 , m2 ) + #define LOAD_MSG_5_2(buf) buf = _mm_set_epi32(m3 , m11, m10, m12) + #define LOAD_MSG_5_3(buf) buf = _mm_set_epi32(m15, m7 , m4 , m1 ) + #define LOAD_MSG_5_4(buf) buf = _mm_set_epi32(m14, m5 , m13, m9 ) + #define LOAD_MSG_6_1(buf) buf = _mm_set_epi32(m4 , m14, m1 , m12) + #define LOAD_MSG_6_2(buf) buf = _mm_set_epi32(m10, m13, m15, m5 ) + #define LOAD_MSG_6_3(buf) buf = _mm_set_epi32(m9 , m6 , m0 , m8 ) + #define LOAD_MSG_6_4(buf) buf = _mm_set_epi32(m2 , m3 , m7 , m11) + #define LOAD_MSG_7_1(buf) buf = _mm_set_epi32(m3 , m12, m7 , m13) + #define LOAD_MSG_7_2(buf) buf = _mm_set_epi32(m9 , m1 , m14, m11) + #define LOAD_MSG_7_3(buf) buf = _mm_set_epi32(m8 , m15, m5 , m2 ) + #define LOAD_MSG_7_4(buf) buf = _mm_set_epi32(m6 , m4 , m0 , m10) + #define LOAD_MSG_8_1(buf) buf = _mm_set_epi32(m0 , m11, m14, m6 ) + #define LOAD_MSG_8_2(buf) buf = _mm_set_epi32(m8 , m3 , m9 , m15) + #define LOAD_MSG_8_3(buf) buf = _mm_set_epi32(m1 , m13, 
m12, m10) + #define LOAD_MSG_8_4(buf) buf = _mm_set_epi32(m4 , m7 , m2 , m5 ) + #define LOAD_MSG_9_1(buf) buf = _mm_set_epi32(m1 , m7 , m8 , m10) + #define LOAD_MSG_9_2(buf) buf = _mm_set_epi32(m5 , m6 , m4 , m2 ) + #define LOAD_MSG_9_3(buf) buf = _mm_set_epi32(m3 , m9 , m15, m13) + #define LOAD_MSG_9_4(buf) buf = _mm_set_epi32(m12, m14, m11, m0 ) #endif #if defined(HAVE_SSSE_3) && !defined(HAVE_XOP) -#define _mm_roti_epi32(r, c) ( \ - (8==-(c)) ? _mm_shuffle_epi8(r,r8) \ +#define _mm_roti_epi32(r, c) ( \ + (8==-(c)) ? _mm_shuffle_epi8(r,r8) \ : (16==-(c)) ? _mm_shuffle_epi8(r,r16) \ : _mm_xor_si128(_mm_srli_epi32( (r), -(c) ),_mm_slli_epi32( (r), 32-(-(c)) )) ) #elif !defined(HAVE_SSSE_3) && !defined(HAVE_XOP) -#define _mm_roti_epi32(r, c) _mm_xor_si128(_mm_srli_epi32( (r), -(c) ),_mm_slli_epi32( (r), 32-(-(c)) )) + #define _mm_roti_epi32(r, c) _mm_xor_si128(_mm_srli_epi32((r), -(c)), _mm_slli_epi32((r), 32 - (-(c)))) #endif -#define G1(row1,row2,row3,row4,buf) \ +#define G1(row1,row2,row3,row4,buf) \ row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \ - row4 = _mm_xor_si128( row4, row1 ); \ - row4 = _mm_roti_epi32(row4, -16); \ - row3 = _mm_add_epi32( row3, row4 ); \ - row2 = _mm_xor_si128( row2, row3 ); \ + row4 = _mm_xor_si128( row4, row1 ); \ + row4 = _mm_roti_epi32(row4, -16); \ + row3 = _mm_add_epi32( row3, row4 ); \ + row2 = _mm_xor_si128( row2, row3 ); \ row2 = _mm_roti_epi32(row2, -12); -#define G2(row1,row2,row3,row4,buf) \ +#define G2(row1,row2,row3,row4,buf) \ row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \ - row4 = _mm_xor_si128( row4, row1 ); \ - row4 = _mm_roti_epi32(row4, -8); \ - row3 = _mm_add_epi32( row3, row4 ); \ - row2 = _mm_xor_si128( row2, row3 ); \ + row4 = _mm_xor_si128( row4, row1 ); \ + row4 = _mm_roti_epi32(row4, -8); \ + row3 = _mm_add_epi32( row3, row4 ); \ + row2 = _mm_xor_si128( row2, row3 ); \ row2 = _mm_roti_epi32(row2, -7); -#define DIAGONALIZE(row1,row2,row3,row4) \ +#define DIAGONALIZE(row1,row2,row3,row4) \ 
row1 = _mm_shuffle_epi32( row1, _MM_SHUFFLE(2,1,0,3) ); \ row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(1,0,3,2) ); \ row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(0,3,2,1) ); -#define UNDIAGONALIZE(row1,row2,row3,row4) \ +#define UNDIAGONALIZE(row1,row2,row3,row4) \ row1 = _mm_shuffle_epi32( row1, _MM_SHUFFLE(0,3,2,1) ); \ row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(1,0,3,2) ); \ row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(2,1,0,3) ); -#define ROUND(r) \ - LOAD_MSG_ ##r ##_1(buf1); \ - G1(row1,row2,row3,row4,buf1); \ - LOAD_MSG_ ##r ##_2(buf2); \ - G2(row1,row2,row3,row4,buf2); \ - DIAGONALIZE(row1,row2,row3,row4); \ - LOAD_MSG_ ##r ##_3(buf3); \ - G1(row1,row2,row3,row4,buf3); \ - LOAD_MSG_ ##r ##_4(buf4); \ - G2(row1,row2,row3,row4,buf4); \ +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(buf1); \ + G1(row1,row2,row3,row4,buf1); \ + LOAD_MSG_ ##r ##_2(buf2); \ + G2(row1,row2,row3,row4,buf2); \ + DIAGONALIZE(row1,row2,row3,row4); \ + LOAD_MSG_ ##r ##_3(buf3); \ + G1(row1,row2,row3,row4,buf3); \ + LOAD_MSG_ ##r ##_4(buf4); \ + G2(row1,row2,row3,row4,buf4); \ UNDIAGONALIZE(row1,row2,row3,row4); \ -template < bool bswap > -static void blake2_compress(blake2s_context * ctx, const uint8_t * in) { - __m128i row1, row2, row3, row4; - __m128i buf1, buf2, buf3, buf4; - __m128i t0, t1, t2; - __m128i ff0, ff1; - - const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 ); - const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 ); - - const __m128i m0 = bswap ? mm_bswap32(LOADU(in + 00)) : LOADU( in + 00 ); - const __m128i m1 = bswap ? mm_bswap32(LOADU(in + 16)) : LOADU( in + 16 ); - const __m128i m2 = bswap ? mm_bswap32(LOADU(in + 32)) : LOADU( in + 32 ); - const __m128i m3 = bswap ? 
mm_bswap32(LOADU(in + 48)) : LOADU( in + 48 ); - - row1 = ff0 = LOADU( &ctx->h[0] ); - row2 = ff1 = LOADU( &ctx->h[4] ); - row3 = _mm_loadu_si128( (__m128i const *)&blake2s_IV[0] ); - row4 = _mm_xor_si128( _mm_loadu_si128( (__m128i const *)&blake2s_IV[4] ), LOADU( &ctx->t[0] ) ); - - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - - STOREU( &ctx->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) ); - STOREU( &ctx->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) ); +template +static void blake2_compress( blake2s_context * ctx, const uint8_t * in ) { + __m128i row1, row2, row3, row4; + __m128i buf1, buf2, buf3, buf4; + __m128i t0, t1, t2; + __m128i ff0, ff1; + + const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1); + const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + + const __m128i m0 = bswap ? mm_bswap32(LOADU(in + 00)) : LOADU(in + 00); + const __m128i m1 = bswap ? mm_bswap32(LOADU(in + 16)) : LOADU(in + 16); + const __m128i m2 = bswap ? mm_bswap32(LOADU(in + 32)) : LOADU(in + 32); + const __m128i m3 = bswap ? 
mm_bswap32(LOADU(in + 48)) : LOADU(in + 48); + + row1 = ff0 = LOADU(&ctx->h[0]); + row2 = ff1 = LOADU(&ctx->h[4]); + row3 = _mm_loadu_si128((__m128i const *)&blake2s_IV[0]); + row4 = _mm_xor_si128(_mm_loadu_si128((__m128i const *)&blake2s_IV[4]), LOADU(&ctx->t[0])); + + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + + STOREU(&ctx->h[0], _mm_xor_si128(ff0, _mm_xor_si128(row1, row3))); + STOREU(&ctx->h[4], _mm_xor_si128(ff1, _mm_xor_si128(row2, row4))); } #undef G1 diff --git a/hashes/blake3.cpp b/hashes/blake3.cpp index ac9b2658..29484843 100644 --- a/hashes/blake3.cpp +++ b/hashes/blake3.cpp @@ -30,29 +30,31 @@ #include "Platform.h" #include "Hashlib.h" -static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, - 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, - 0x1F83D9ABUL, 0x5BE0CD19UL}; +static const uint32_t IV [8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, + 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, + 0x1F83D9ABUL, 0x5BE0CD19UL +}; static const uint8_t MSG_SCHEDULE[7][16] = { - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, - {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, - {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, - {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, - {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, - {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, }; // internal flags enum blake3_flags { - CHUNK_START = 1 << 0, - CHUNK_END = 1 << 1, - 
PARENT = 1 << 2, - ROOT = 1 << 3, - KEYED_HASH = 1 << 4, - DERIVE_KEY_CONTEXT = 1 << 5, - DERIVE_KEY_MATERIAL = 1 << 6, + CHUNK_START = 1 << 0, + CHUNK_END = 1 << 1, + PARENT = 1 << 2, + ROOT = 1 << 3, + KEYED_HASH = 1 << 4, + DERIVE_KEY_CONTEXT = 1 << 5, + DERIVE_KEY_MATERIAL = 1 << 6, }; #define BLAKE3_KEY_LEN 32 @@ -61,235 +63,231 @@ enum blake3_flags { #define BLAKE3_CHUNK_LEN 1024 #define BLAKE3_MAX_DEPTH 54 -static FORCE_INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } +static FORCE_INLINE uint32_t counter_low( uint64_t counter ) { return (uint32_t)counter; } -static FORCE_INLINE uint32_t counter_high(uint64_t counter) { - return (uint32_t)(counter >> 32); +static FORCE_INLINE uint32_t counter_high( uint64_t counter ) { + return (uint32_t)(counter >> 32); } -static FORCE_INLINE uint64_t round_down_to_power_of_2(uint64_t x) { - return 1ULL << (63 ^ clz8(x | 1)); +static FORCE_INLINE uint64_t round_down_to_power_of_2( uint64_t x ) { + return 1ULL << (63 ^ clz8(x | 1)); } -static FORCE_INLINE size_t left_len(size_t content_len) { - // Subtract 1 to reserve at least one byte for the right side. content_len - // should always be greater than BLAKE3_CHUNK_LEN. - size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; - return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; +static FORCE_INLINE size_t left_len( size_t content_len ) { + // Subtract 1 to reserve at least one byte for the right side. content_len + // should always be greater than BLAKE3_CHUNK_LEN. 
+ size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + + return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; } -static FORCE_INLINE void store32(void *dst, uint32_t w) { - uint8_t *p = (uint8_t *)dst; - p[0] = (uint8_t)(w >> 0); - p[1] = (uint8_t)(w >> 8); - p[2] = (uint8_t)(w >> 16); - p[3] = (uint8_t)(w >> 24); +static FORCE_INLINE void store32( void * dst, uint32_t w ) { + uint8_t * p = (uint8_t *)dst; + + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); } -static FORCE_INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { - store32(&bytes_out[0 * 4], cv_words[0]); - store32(&bytes_out[1 * 4], cv_words[1]); - store32(&bytes_out[2 * 4], cv_words[2]); - store32(&bytes_out[3 * 4], cv_words[3]); - store32(&bytes_out[4 * 4], cv_words[4]); - store32(&bytes_out[5 * 4], cv_words[5]); - store32(&bytes_out[6 * 4], cv_words[6]); - store32(&bytes_out[7 * 4], cv_words[7]); +static FORCE_INLINE void store_cv_words( uint8_t bytes_out[32], uint32_t cv_words[8] ) { + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); } typedef struct { - uint32_t cv[8]; - uint64_t chunk_counter; - uint8_t buf[BLAKE3_BLOCK_LEN]; - uint8_t buf_len; - uint8_t blocks_compressed; - uint8_t flags; + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; } blake3_chunk_state; typedef struct { - uint32_t key[8]; - blake3_chunk_state chunk; - uint8_t cv_stack_len; - uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; + uint32_t key[8]; + blake3_chunk_state chunk; + uint8_t cv_stack_len; + uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 
1) * BLAKE3_OUT_LEN]; } blake3_hasher; typedef struct { - uint32_t input_cv[8]; - uint64_t counter; - uint8_t block[BLAKE3_BLOCK_LEN]; - uint8_t block_len; - uint8_t flags; + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; } output_t; -static void blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags); -static void blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); - -static FORCE_INLINE void chunk_state_init(blake3_chunk_state * self, const uint32_t key[8], - uint8_t flags) { - memcpy(self->cv, key, BLAKE3_KEY_LEN); - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - self->chunk_counter = 0; - self->buf_len = 0; - self->blocks_compressed = 0; - self->flags = flags; +static void blake3_compress_in_place( uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags ); +static void blake3_compress_xof( const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64] ); + +static FORCE_INLINE void chunk_state_init( blake3_chunk_state * self, const uint32_t key[8], uint8_t flags ) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->chunk_counter = 0; + self->buf_len = 0; + self->blocks_compressed = 0; + self->flags = flags; } -static FORCE_INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], - uint64_t chunk_counter) { - memcpy(self->cv, key, BLAKE3_KEY_LEN); - self->chunk_counter = chunk_counter; - self->blocks_compressed = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - self->buf_len = 0; +static FORCE_INLINE void chunk_state_reset( blake3_chunk_state * self, const uint32_t key[8], uint64_t chunk_counter ) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + 
self->chunk_counter = chunk_counter; + self->blocks_compressed = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; } -static FORCE_INLINE output_t make_output(const uint32_t input_cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - output_t ret; - memcpy(ret.input_cv, input_cv, 32); - memcpy(ret.block, block, BLAKE3_BLOCK_LEN); - ret.block_len = block_len; - ret.counter = counter; - ret.flags = flags; - return ret; +static FORCE_INLINE output_t make_output( const uint32_t input_cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags ) { + output_t ret; + + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block , block , BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return ret; } -static FORCE_INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { - if (self->blocks_compressed == 0) { - return CHUNK_START; - } else { - return 0; - } +static FORCE_INLINE uint8_t chunk_state_maybe_start_flag( const blake3_chunk_state * self ) { + if (self->blocks_compressed == 0) { + return CHUNK_START; + } else { + return 0; + } } -static FORCE_INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, - const uint8_t *input, size_t input_len) { - size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); - if (take > input_len) { - take = input_len; - } - uint8_t *dest = self->buf + ((size_t)self->buf_len); - memcpy(dest, input, take); - self->buf_len += (uint8_t)take; - return take; +static FORCE_INLINE size_t chunk_state_fill_buf( blake3_chunk_state * self, const uint8_t * input, size_t input_len ) { + size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); + + if (take > input_len) { + take = input_len; + } + uint8_t * dest = self->buf + ((size_t)self->buf_len); + memcpy(dest, input, take); + self->buf_len += (uint8_t)take; + return take; } -static FORCE_INLINE output_t 
chunk_state_output(const blake3_chunk_state *self) { - uint8_t block_flags = - self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; - return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, - block_flags); +static FORCE_INLINE output_t chunk_state_output( const blake3_chunk_state * self ) { + uint8_t block_flags = + self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; + + return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, block_flags); } -static FORCE_INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], - const uint32_t key[8], uint8_t flags) { - return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); +static FORCE_INLINE output_t parent_output( const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags ) { + return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); } -static FORCE_INLINE size_t chunk_state_len(const blake3_chunk_state *self) { - return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + - ((size_t)self->buf_len); +static FORCE_INLINE size_t chunk_state_len( const blake3_chunk_state * self ) { + return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + + ((size_t)self->buf_len); } -static FORCE_INLINE void output_root_bytes(const output_t * self, uint8_t * out, size_t out_len) { - uint64_t output_block_counter = 0; - size_t offset_within_block = 0; - uint8_t wide_buf[64]; - while (out_len > 0) { - blake3_compress_xof(self->input_cv, self->block, self->block_len, - output_block_counter, self->flags | ROOT, wide_buf); - size_t available_bytes = 64 - offset_within_block; - size_t memcpy_len; - if (out_len > available_bytes) { - memcpy_len = available_bytes; - } else { - memcpy_len = out_len; +static FORCE_INLINE void output_root_bytes( const output_t * self, uint8_t * out, size_t out_len ) { + uint64_t output_block_counter = 0; + size_t offset_within_block = 0; + uint8_t wide_buf[64]; + + while (out_len > 0) { + 
blake3_compress_xof(self->input_cv, self->block, self->block_len, + output_block_counter, self->flags | ROOT, wide_buf); + size_t available_bytes = 64 - offset_within_block; + size_t memcpy_len; + if (out_len > available_bytes) { + memcpy_len = available_bytes; + } else { + memcpy_len = out_len; + } + memcpy(out, wide_buf + offset_within_block, memcpy_len); + out += memcpy_len; + out_len -= memcpy_len; + output_block_counter += 1; + offset_within_block = 0; } - memcpy(out, wide_buf + offset_within_block, memcpy_len); - out += memcpy_len; - out_len -= memcpy_len; - output_block_counter += 1; - offset_within_block = 0; - } } -static FORCE_INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { - uint32_t cv_words[8]; - memcpy(cv_words, self->input_cv, 32); - blake3_compress_in_place(cv_words, self->block, self->block_len, - self->counter, self->flags); - store_cv_words(cv, cv_words); +static FORCE_INLINE void output_chaining_value( const output_t * self, uint8_t cv[32] ) { + uint32_t cv_words[8]; + + memcpy(cv_words, self->input_cv, 32); + blake3_compress_in_place(cv_words, self->block, self->block_len, self->counter, self->flags); + store_cv_words(cv, cv_words); } -static FORCE_INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { - size_t post_merge_stack_len = (size_t)popcount8(total_len); - while (self->cv_stack_len > post_merge_stack_len) { - uint8_t *parent_node = - &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; - output_t output = parent_output(parent_node, self->key, self->chunk.flags); - output_chaining_value(&output, parent_node); - self->cv_stack_len -= 1; - } +static FORCE_INLINE void hasher_merge_cv_stack( blake3_hasher * self, uint64_t total_len ) { + size_t post_merge_stack_len = (size_t)popcount8(total_len); + + while (self->cv_stack_len > post_merge_stack_len) { + uint8_t * parent_node = + &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = parent_output(parent_node, 
self->key, self->chunk.flags); + output_chaining_value(&output, parent_node); + self->cv_stack_len -= 1; + } } -static FORCE_INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], - uint64_t chunk_counter) { - hasher_merge_cv_stack(self, chunk_counter); - memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, - BLAKE3_OUT_LEN); - self->cv_stack_len += 1; +static FORCE_INLINE void hasher_push_cv( blake3_hasher * self, + uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter ) { + hasher_merge_cv_stack(self, chunk_counter); + memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN); + self->cv_stack_len += 1; } -static FORCE_INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, - size_t input_len) { - if (self->buf_len > 0) { +static FORCE_INLINE void chunk_state_update( blake3_chunk_state * self, const uint8_t * input, size_t input_len ) { + if (self->buf_len > 0) { + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + blake3_compress_in_place(self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + self->buf_len = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; + input += take; input_len -= take; - if (input_len > 0) { - blake3_compress_in_place( - self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, - self->flags | chunk_state_maybe_start_flag(self)); - self->blocks_compressed += 1; - self->buf_len = 0; - memset(self->buf, 0, BLAKE3_BLOCK_LEN); - } - } - 
- while (input_len > BLAKE3_BLOCK_LEN) { - blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, - self->chunk_counter, - self->flags | chunk_state_maybe_start_flag(self)); - self->blocks_compressed += 1; - input += BLAKE3_BLOCK_LEN; - input_len -= BLAKE3_BLOCK_LEN; - } - - size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; - input_len -= take; } -static void blake3_hasher_init(blake3_hasher * self) { - memcpy(self->key, IV, BLAKE3_KEY_LEN); - chunk_state_init(&self->chunk, IV, 0); - self->cv_stack_len = 0; +static void blake3_hasher_init( blake3_hasher * self ) { + memcpy(self->key, IV, BLAKE3_KEY_LEN); + chunk_state_init(&self->chunk, IV, 0); + self->cv_stack_len = 0; } // Home-grown SMHasher3 seeding -static void blake3_seed(blake3_hasher * hasher, uint64_t seed) { - const uint32_t seedlo = seed & 0xFFFFFFFF; - const uint32_t seedhi = (seed >> 32) & 0xFFFFFFFF; - - hasher->key[0] ^= seedlo; - hasher->chunk.cv[0] ^= seedlo; - hasher->key[1] ^= seedhi; - hasher->chunk.cv[1] ^= seedhi; +static void blake3_seed( blake3_hasher * hasher, uint64_t seed ) { + const uint32_t seedlo = seed & 0xFFFFFFFF; + const uint32_t seedhi = (seed >> 32) & 0xFFFFFFFF; + + hasher->key[0] ^= seedlo; + hasher->chunk.cv[0] ^= seedlo; + hasher->key[1] ^= seedhi; + hasher->chunk.cv[1] ^= seedhi; } // @@ -307,8 +305,8 @@ static void blake3_seed(blake3_hasher * hasher, uint64_t seed) { // // FORCE_INLINE void hash_one(const uint8_t *input, size_t blocks, // const uint32_t key[8], uint64_t counter, -// uint8_t flags, uint8_t flags_start, -// uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]); +// uint8_t flags, uint8_t flags_start, +// uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]); // // void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, // size_t blocks, const uint32_t key[8], @@ -322,343 +320,325 @@ static void blake3_seed(blake3_hasher * hasher, uint64_t seed) { // #define SIMD_DEGREE // #if defined(HAVE_SSE_4_1) -#include "Intrinsics.h" 
-#include "blake3/compress-sse41.h" + #include "Intrinsics.h" + #include "blake3/compress-sse41.h" #elif defined(HAVE_SSE_2) -#include "Intrinsics.h" -#include "blake3/compress-sse2.h" + #include "Intrinsics.h" + #include "blake3/compress-sse2.h" #else -#include "blake3/compress-portable.h" + #include "blake3/compress-portable.h" #endif -static FORCE_INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, - size_t num_chaining_values, - const uint32_t key[8], uint8_t flags, - uint8_t *out) { - const uint8_t *parents_array[SIMD_DEGREE_OR_2]; - size_t parents_array_len = 0; - while (num_chaining_values - (2 * parents_array_len) >= 2) { - parents_array[parents_array_len] = - &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; - parents_array_len += 1; - } - - blake3_hash_many(parents_array, parents_array_len, 1, key, - 0, // Parents always use counter 0. - false, flags | PARENT, - 0, // Parents have no start flags. - 0, // Parents have no end flags. - out); - - // If there's an odd child left over, it becomes an output. - if (num_chaining_values > 2 * parents_array_len) { - memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], - &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], - BLAKE3_OUT_LEN); - return parents_array_len + 1; - } else { - return parents_array_len; - } +static FORCE_INLINE size_t compress_parents_parallel( const uint8_t * child_chaining_values, size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, uint8_t * out ) { + const uint8_t * parents_array[SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + blake3_hash_many(parents_array, parents_array_len, 1, key, 0, // Parents always use counter 0. + false, flags | PARENT, 0, // Parents have no start flags. + 0, // Parents have no end flags. 
+ out); + + // If there's an odd child left over, it becomes an output. + if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], + BLAKE3_OUT_LEN); + return parents_array_len + 1; + } else { + return parents_array_len; + } } -static FORCE_INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, - const uint32_t key[8], - uint64_t chunk_counter, uint8_t flags, - uint8_t *out) { - const uint8_t *chunks_array[SIMD_DEGREE]; - size_t input_position = 0; - size_t chunks_array_len = 0; - while (input_len - input_position >= BLAKE3_CHUNK_LEN) { - chunks_array[chunks_array_len] = &input[input_position]; - input_position += BLAKE3_CHUNK_LEN; - chunks_array_len += 1; - } - - blake3_hash_many(chunks_array, chunks_array_len, - BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, - true, flags, CHUNK_START, CHUNK_END, out); - - // Hash the remaining partial chunk, if there is one. Note that the empty - // chunk (meaning the empty message) is a different codepath. 
- if (input_len > input_position) { - uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; - blake3_chunk_state chunk_state; - chunk_state_init(&chunk_state, key, flags); - chunk_state.chunk_counter = counter; - chunk_state_update(&chunk_state, &input[input_position], - input_len - input_position); - output_t output = chunk_state_output(&chunk_state); - output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); - return chunks_array_len + 1; - } else { - return chunks_array_len; - } +static FORCE_INLINE size_t compress_chunks_parallel( const uint8_t * input, size_t input_len, + const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t * out ) { + const uint8_t * chunks_array[SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + blake3_hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, + key, chunk_counter, true, flags, CHUNK_START, CHUNK_END, out); + + // Hash the remaining partial chunk, if there is one. Note that the empty + // chunk (meaning the empty message) is a different codepath. 
+ if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(&chunk_state, &input[input_position], input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); + return chunks_array_len + 1; + } else { + return chunks_array_len; + } } -static size_t blake3_compress_subtree_wide(const uint8_t *input, - size_t input_len, - const uint32_t key[8], - uint64_t chunk_counter, - uint8_t flags, uint8_t *out) { - // Note that the single chunk case does *not* bump the SIMD degree up to 2 - // when it is 1. If this implementation adds multi-threading in the future, - // this gives us the option of multi-threading even the 2-chunk case, which - // can help performance on smaller platforms. - if (input_len <= SIMD_DEGREE * BLAKE3_CHUNK_LEN) { - return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, - out); - } - - // With more than simd_degree chunks, we need to recurse. Start by dividing - // the input into left and right subtrees. (Note that this is only optimal - // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree - // of 3 or something, we'll need a more complicated strategy.) - size_t left_input_len = left_len(input_len); - size_t right_input_len = input_len - left_input_len; - const uint8_t *right_input = &input[left_input_len]; - uint64_t right_chunk_counter = - chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); - - uint8_t cv_array[2 * SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t degree = SIMD_DEGREE; - if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { - // The special case: We always use a degree of at least two, to make - // sure there are two outputs. Except, as noted above, at the chunk - // level, where we allow degree=1. 
(Note that the 1-chunk-input case is - // a different codepath.) - degree = 2; - } - uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; - - // Recurse! If this implementation adds multi-threading support in the - // future, this is where it will go. - size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, - chunk_counter, flags, cv_array); - size_t right_n = blake3_compress_subtree_wide( - right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); - - // The special case again. If simd_degree=1, then we'll have left_n=1 and - // right_n=1. Rather than compressing them into a single output, return - // them directly, to make sure we always have at least two outputs. - if (left_n == 1) { - memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); - return 2; - } +static size_t blake3_compress_subtree_wide( const uint8_t * input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t * out ) { + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. If this implementation adds multi-threading in the future, + // this gives us the option of multi-threading even the 2-chunk case, which + // can help performance on smaller platforms. + if (input_len <= SIMD_DEGREE * BLAKE3_CHUNK_LEN) { + return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) 
+ size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t * right_input = &input[left_input_len]; + uint64_t right_chunk_counter = + chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + uint8_t cv_array[2 * SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = SIMD_DEGREE; + if ((left_input_len > BLAKE3_CHUNK_LEN) && (degree == 1)) { + // The special case: We always use a degree of at least two, to make + // sure there are two outputs. Except, as noted above, at the chunk + // level, where we allow degree=1. (Note that the 1-chunk-input case is + // a different codepath.) + degree = 2; + } + uint8_t * right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + // Recurse! If this implementation adds multi-threading support in the + // future, this is where it will go. + size_t left_n = blake3_compress_subtree_wide(input , left_input_len , key, chunk_counter, flags, cv_array); + size_t right_n = blake3_compress_subtree_wide(right_input, right_input_len, + key, right_chunk_counter, flags, right_cvs); + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return 2; + } - // Otherwise, do one layer of parent node compression. - size_t num_chaining_values = left_n + right_n; - return compress_parents_parallel(cv_array, num_chaining_values, key, flags, - out); + // Otherwise, do one layer of parent node compression. 
+ size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(cv_array, num_chaining_values, key, flags, out); } -static FORCE_INLINE void compress_subtree_to_parent_node( - const uint8_t *input, size_t input_len, const uint32_t key[8], - uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { - uint8_t cv_array[SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; - size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, - chunk_counter, flags, cv_array); - // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, - // compress_subtree_wide() returns more than 2 chaining values. Condense - // them into 2 by forming parent nodes repeatedly. - uint8_t out_array[SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; - // The second half of this loop condition is always true, and we just - // asserted it above. But GCC can't tell that it's always true, and if NDEBUG - // is set on platforms where SIMD_DEGREE_OR_2 == 2, GCC emits spurious - // warnings here. GCC 8.5 is particularly sensitive, so if you're changing - // this code, test it against that version. - while (num_cvs > 2 && num_cvs <= SIMD_DEGREE_OR_2) { - num_cvs = - compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); - memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); - } - memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +static FORCE_INLINE void compress_subtree_to_parent_node( const uint8_t * input, size_t input_len, + const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN] ) { + uint8_t cv_array[SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array); + // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. 
+ uint8_t out_array[SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + + // The second half of this loop condition is always true, and we just + // asserted it above. But GCC can't tell that it's always true, and if NDEBUG + // is set on platforms where SIMD_DEGREE_OR_2 == 2, GCC emits spurious + // warnings here. GCC 8.5 is particularly sensitive, so if you're changing + // this code, test it against that version. + while (num_cvs > 2 && num_cvs <= SIMD_DEGREE_OR_2) { + num_cvs = + compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); } -static void blake3_hasher_update(blake3_hasher *self, const void *input, - size_t input_len) { - // Explicitly checking for zero avoids causing UB by passing a null pointer - // to memcpy. This comes up in practice with things like: - // std::vector v; - // blake3_hasher_update(&hasher, v.data(), v.size()); - if (input_len == 0) { - return; - } - - const uint8_t *input_bytes = (const uint8_t *)input; - - // If we have some partial chunk bytes in the internal chunk_state, we need - // to finish that chunk first. - if (chunk_state_len(&self->chunk) > 0) { - size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); - if (take > input_len) { - take = input_len; +static void blake3_hasher_update( blake3_hasher * self, const void * input, size_t input_len ) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector v; + // blake3_hasher_update(&hasher, v.data(), v.size()); + if (input_len == 0) { + return; } - chunk_state_update(&self->chunk, input_bytes, take); - input_bytes += take; - input_len -= take; - // If we've filled the current chunk and there's more coming, finalize this - // chunk and proceed. In this case we know it's not the root. 
- if (input_len > 0) { - output_t output = chunk_state_output(&self->chunk); - uint8_t chunk_cv[32]; - output_chaining_value(&output, chunk_cv); - hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); - chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); - } else { - return; + + const uint8_t * input_bytes = (const uint8_t *)input; + + // If we have some partial chunk bytes in the internal chunk_state, we need + // to finish that chunk first. + if (chunk_state_len(&self->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(&self->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + // If we've filled the current chunk and there's more coming, finalize this + // chunk and proceed. In this case we know it's not the root. + if (input_len > 0) { + output_t output = chunk_state_output(&self->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(&output, chunk_cv); + hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); + chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); + } else { + return; + } } - } - - // Now the chunk_state is clear, and we have more input. If there's more than - // a single chunk (so, definitely not the root chunk), hash the largest whole - // subtree we can, with the full benefits of SIMD (and maybe in the future, - // multi-threading) parallelism. Two restrictions: - // - The subtree has to be a power-of-2 number of chunks. Only subtrees along - // the right edge can be incomplete, and we don't know where the right edge - // is going to be until we get to finalize(). - // - The subtree must evenly divide the total number of chunks up until this - // point (if total is not 0). If the current incomplete subtree is only - // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have - // to complete the current subtree first. 
- // Because we might need to break up the input to form powers of 2, or to - // evenly divide what we already have, this part runs in a loop. - while (input_len > BLAKE3_CHUNK_LEN) { - size_t subtree_len = round_down_to_power_of_2(input_len); - uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; - // Shrink the subtree_len until it evenly divides the count so far. We know - // that subtree_len itself is a power of 2, so we can use a bitmasking - // trick instead of an actual remainder operation. (Note that if the caller - // consistently passes power-of-2 inputs of the same size, as is hopefully - // typical, this loop condition will always fail, and subtree_len will - // always be the full length of the input.) - // - // An aside: We don't have to shrink subtree_len quite this much. For - // example, if count_so_far is 1, we could pass 2 chunks to - // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still - // get the right answer in the end, and we might get to use 2-way SIMD - // parallelism. The problem with this optimization, is that it gets us - // stuck always hashing 2 chunks. The total number of chunks will remain - // odd, and we'll never graduate to higher degrees of parallelism. See - // https://github.com/BLAKE3-team/BLAKE3/issues/69. - while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { - subtree_len /= 2; + + // Now the chunk_state is clear, and we have more input. If there's more than + // a single chunk (so, definitely not the root chunk), hash the largest whole + // subtree we can, with the full benefits of SIMD (and maybe in the future, + // multi-threading) parallelism. Two restrictions: + // - The subtree has to be a power-of-2 number of chunks. Only subtrees along + // the right edge can be incomplete, and we don't know where the right edge + // is going to be until we get to finalize(). + // - The subtree must evenly divide the total number of chunks up until this + // point (if total is not 0). 
If the current incomplete subtree is only + // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have + // to complete the current subtree first. + // Because we might need to break up the input to form powers of 2, or to + // evenly divide what we already have, this part runs in a loop. + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + // Shrink the subtree_len until it evenly divides the count so far. We know + // that subtree_len itself is a power of 2, so we can use a bitmasking + // trick instead of an actual remainder operation. (Note that if the caller + // consistently passes power-of-2 inputs of the same size, as is hopefully + // typical, this loop condition will always fail, and subtree_len will + // always be the full length of the input.) + // + // An aside: We don't have to shrink subtree_len quite this much. For + // example, if count_so_far is 1, we could pass 2 chunks to + // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still + // get the right answer in the end, and we might get to use 2-way SIMD + // parallelism. The problem with this optimization, is that it gets us + // stuck always hashing 2 chunks. The total number of chunks will remain + // odd, and we'll never graduate to higher degrees of parallelism. See + // https://github.com/BLAKE3-team/BLAKE3/issues/69. + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + // The shrunken subtree_len might now be 1 chunk long. If so, hash that one + // chunk by itself. Otherwise, compress the subtree into a pair of CVs. 
+ uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, self->key, self->chunk.flags); + chunk_state.chunk_counter = self->chunk.chunk_counter; + chunk_state_update(&chunk_state, input_bytes, subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(&output, cv); + hasher_push_cv(self, cv, chunk_state.chunk_counter); + } else { + // This is the high-performance happy path, though getting here depends + // on the caller giving us a long enough input. + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, + self->chunk.chunk_counter, self->chunk.flags, cv_pair); + hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); + hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], self->chunk.chunk_counter + (subtree_chunks / 2)); + } + self->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; } - // The shrunken subtree_len might now be 1 chunk long. If so, hash that one - // chunk by itself. Otherwise, compress the subtree into a pair of CVs. - uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; - if (subtree_len <= BLAKE3_CHUNK_LEN) { - blake3_chunk_state chunk_state; - chunk_state_init(&chunk_state, self->key, self->chunk.flags); - chunk_state.chunk_counter = self->chunk.chunk_counter; - chunk_state_update(&chunk_state, input_bytes, subtree_len); - output_t output = chunk_state_output(&chunk_state); - uint8_t cv[BLAKE3_OUT_LEN]; - output_chaining_value(&output, cv); - hasher_push_cv(self, cv, chunk_state.chunk_counter); - } else { - // This is the high-performance happy path, though getting here depends - // on the caller giving us a long enough input. 
- uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; - compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, - self->chunk.chunk_counter, - self->chunk.flags, cv_pair); - hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); - hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], - self->chunk.chunk_counter + (subtree_chunks / 2)); + + // If there's any remaining input less than a full chunk, add it to the chunk + // state. In that case, also do a final merge loop to make sure the subtree + // stack doesn't contain any unmerged pairs. The remaining input means we + // know these merges are non-root. This merge loop isn't strictly necessary + // here, because hasher_push_chunk_cv already does its own merge loop, but it + // simplifies blake3_hasher_finalize below. + if (input_len > 0) { + chunk_state_update(&self->chunk, input_bytes, input_len); + hasher_merge_cv_stack(self, self->chunk.chunk_counter); } - self->chunk.chunk_counter += subtree_chunks; - input_bytes += subtree_len; - input_len -= subtree_len; - } - - // If there's any remaining input less than a full chunk, add it to the chunk - // state. In that case, also do a final merge loop to make sure the subtree - // stack doesn't contain any unmerged pairs. The remaining input means we - // know these merges are non-root. This merge loop isn't strictly necessary - // here, because hasher_push_chunk_cv already does its own merge loop, but it - // simplifies blake3_hasher_finalize below. - if (input_len > 0) { - chunk_state_update(&self->chunk, input_bytes, input_len); - hasher_merge_cv_stack(self, self->chunk.chunk_counter); - } } -static void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, size_t out_len) { - // Explicitly checking for zero avoids causing UB by passing a null pointer - // to memcpy. 
This comes up in practice with things like: - // std::vector v; - // blake3_hasher_finalize(&hasher, v.data(), v.size()); - if (out_len == 0) { - return; - } - - // If the subtree stack is empty, then the current chunk is the root. - if (self->cv_stack_len == 0) { - output_t output = chunk_state_output(&self->chunk); +static void blake3_hasher_finalize( const blake3_hasher * self, uint8_t * out, size_t out_len ) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector v; + // blake3_hasher_finalize(&hasher, v.data(), v.size()); + if (out_len == 0) { + return; + } + + // If the subtree stack is empty, then the current chunk is the root. + if (self->cv_stack_len == 0) { + output_t output = chunk_state_output(&self->chunk); + output_root_bytes(&output, out, out_len); + return; + } + + // If there are any bytes in the chunk state, finalize that chunk + // and do a roll-up merge between that chunk hash and every subtree + // in the stack. In this case, the extra merge loop at the end of + // blake3_hasher_update guarantees that none of the subtrees in the + // stack need to be merged with each other first. Otherwise, if + // there are no bytes in the chunk state, then the top of the stack + // is a chunk hash, and we start the merge from that. + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&self->chunk) > 0) { + cvs_remaining = self->cv_stack_len; + output = chunk_state_output(&self->chunk); + } else { + // There are always at least 2 CVs in the stack in this case. 
+ cvs_remaining = self->cv_stack_len - 2; + output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, self->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(&output, &parent_block[32]); + output = parent_output(parent_block, self->key, self->chunk.flags); + } output_root_bytes(&output, out, out_len); - return; - } - - // If there are any bytes in the chunk state, finalize that chunk - // and do a roll-up merge between that chunk hash and every subtree - // in the stack. In this case, the extra merge loop at the end of - // blake3_hasher_update guarantees that none of the subtrees in the - // stack need to be merged with each other first. Otherwise, if - // there are no bytes in the chunk state, then the top of the stack - // is a chunk hash, and we start the merge from that. - output_t output; - size_t cvs_remaining; - if (chunk_state_len(&self->chunk) > 0) { - cvs_remaining = self->cv_stack_len; - output = chunk_state_output(&self->chunk); - } else { - // There are always at least 2 CVs in the stack in this case. 
- cvs_remaining = self->cv_stack_len - 2; - output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, - self->chunk.flags); - } - while (cvs_remaining > 0) { - cvs_remaining -= 1; - uint8_t parent_block[BLAKE3_BLOCK_LEN]; - memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); - output_chaining_value(&output, &parent_block[32]); - output = parent_output(parent_block, self->key, self->chunk.flags); - } - output_root_bytes(&output, out, out_len); } -template < uint32_t outbits, bool bswap > -static void BLAKE3(const void * in, const size_t len, const seed_t seed, void * out) { - blake3_hasher hasher; +template +static void BLAKE3( const void * in, const size_t len, const seed_t seed, void * out ) { + blake3_hasher hasher; - blake3_hasher_init(&hasher); - blake3_seed(&hasher, seed); - blake3_hasher_update(&hasher, in, len); - blake3_hasher_finalize(&hasher, (uint8_t *)out, (outbits >= 256) ? 32 : (outbits+7)/8); + blake3_hasher_init(&hasher); + blake3_seed(&hasher, seed); + blake3_hasher_update(&hasher, in, len); + blake3_hasher_finalize(&hasher, (uint8_t *)out, (outbits >= 256) ? 32 : (outbits + 7) / 8); } REGISTER_FAMILY(blake3, - $.src_url = "https://github.com/BLAKE3-team/BLAKE3", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/BLAKE3-team/BLAKE3", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); // The NO_SEED flag is not actually true, but need to replace // homegrown with real seeding. 
REGISTER_HASH(blake3, - $.desc = "BLAKE 3, 256-bit digest", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT , - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL , - $.bits = 256, - $.verification_LE = 0x50E4CD91, - $.verification_BE = 0x50E4CD91, - $.hashfn_native = BLAKE3<256,false>, - $.hashfn_bswap = BLAKE3<256,true> -); + $.desc = "BLAKE 3, 256-bit digest", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL, + $.bits = 256, + $.verification_LE = 0x50E4CD91, + $.verification_BE = 0x50E4CD91, + $.hashfn_native = BLAKE3<256, false>, + $.hashfn_bswap = BLAKE3<256, true> + ); diff --git a/hashes/blake3/compress-portable.h b/hashes/blake3/compress-portable.h index 351afe5e..2f620f22 100644 --- a/hashes/blake3/compress-portable.h +++ b/hashes/blake3/compress-portable.h @@ -1,162 +1,155 @@ #define SIMD_DEGREE_OR_2 2 #define SIMD_DEGREE 1 -static FORCE_INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, - uint32_t x, uint32_t y) { - state[a] = state[a] + state[b] + x; - state[d] = ROTR32(state[d] ^ state[a], 16); - state[c] = state[c] + state[d]; - state[b] = ROTR32(state[b] ^ state[c], 12); - state[a] = state[a] + state[b] + y; - state[d] = ROTR32(state[d] ^ state[a], 8); - state[c] = state[c] + state[d]; - state[b] = ROTR32(state[b] ^ state[c], 7); +static FORCE_INLINE void g( uint32_t * state, size_t a, size_t b, size_t c, size_t d, uint32_t x, uint32_t y ) { + state[a] = state[a] + state[b] + x; + state[d] = ROTR32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = ROTR32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = ROTR32(state[d] ^ state[a], 8); + state[c] = state[c] + 
state[d]; + state[b] = ROTR32(state[b] ^ state[c], 7); } -static FORCE_INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { - // Select the message schedule based on the round. - const uint8_t *schedule = MSG_SCHEDULE[round]; - - // Mix the columns. - g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); - g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); - g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); - g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); - - // Mix the rows. - g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); - g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); - g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); - g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +static FORCE_INLINE void round_fn( uint32_t state[16], const uint32_t * msg, size_t round ) { + // Select the message schedule based on the round. + const uint8_t * schedule = MSG_SCHEDULE[round]; + + // Mix the columns. + g(state, 0, 4, 8, 12, msg[schedule[0]] , msg[schedule[1]] ); + g(state, 1, 5, 9, 13, msg[schedule[2]] , msg[schedule[3]] ); + g(state, 2, 6, 10, 14, msg[schedule[4]] , msg[schedule[5]] ); + g(state, 3, 7, 11, 15, msg[schedule[6]] , msg[schedule[7]] ); + + // Mix the rows. 
+ g(state, 0, 5, 10, 15, msg[schedule[8]] , msg[schedule[9]] ); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); } -static FORCE_INLINE uint32_t load32(const void *src) { - const uint8_t *p = (const uint8_t *)src; - return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | - ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +static FORCE_INLINE uint32_t load32( const void * src ) { + const uint8_t * p = (const uint8_t *)src; + + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | + ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); } -static FORCE_INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - uint32_t block_words[16]; - block_words[0] = load32(block + 4 * 0); - block_words[1] = load32(block + 4 * 1); - block_words[2] = load32(block + 4 * 2); - block_words[3] = load32(block + 4 * 3); - block_words[4] = load32(block + 4 * 4); - block_words[5] = load32(block + 4 * 5); - block_words[6] = load32(block + 4 * 6); - block_words[7] = load32(block + 4 * 7); - block_words[8] = load32(block + 4 * 8); - block_words[9] = load32(block + 4 * 9); - block_words[10] = load32(block + 4 * 10); - block_words[11] = load32(block + 4 * 11); - block_words[12] = load32(block + 4 * 12); - block_words[13] = load32(block + 4 * 13); - block_words[14] = load32(block + 4 * 14); - block_words[15] = load32(block + 4 * 15); - - state[0] = cv[0]; - state[1] = cv[1]; - state[2] = cv[2]; - state[3] = cv[3]; - state[4] = cv[4]; - state[5] = cv[5]; - state[6] = cv[6]; - state[7] = cv[7]; - state[8] = IV[0]; - state[9] = IV[1]; - state[10] = IV[2]; - state[11] = IV[3]; - state[12] = counter_low(counter); - state[13] = counter_high(counter); - state[14] = (uint32_t)block_len; - state[15] = (uint32_t)flags; - - round_fn(state, 
&block_words[0], 0); - round_fn(state, &block_words[0], 1); - round_fn(state, &block_words[0], 2); - round_fn(state, &block_words[0], 3); - round_fn(state, &block_words[0], 4); - round_fn(state, &block_words[0], 5); - round_fn(state, &block_words[0], 6); +static FORCE_INLINE void compress_pre( uint32_t state[16], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags ) { + uint32_t block_words[16]; + + block_words[ 0] = load32(block + 4 * 0); + block_words[ 1] = load32(block + 4 * 1); + block_words[ 2] = load32(block + 4 * 2); + block_words[ 3] = load32(block + 4 * 3); + block_words[ 4] = load32(block + 4 * 4); + block_words[ 5] = load32(block + 4 * 5); + block_words[ 6] = load32(block + 4 * 6); + block_words[ 7] = load32(block + 4 * 7); + block_words[ 8] = load32(block + 4 * 8); + block_words[ 9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[ 0] = cv[0 ]; + state[ 1] = cv[1 ]; + state[ 2] = cv[2 ]; + state[ 3] = cv[3 ]; + state[ 4] = cv[4 ]; + state[ 5] = cv[5 ]; + state[ 6] = cv[6 ]; + state[ 7] = cv[7 ]; + state[ 8] = IV[0 ]; + state[ 9] = IV[1 ]; + state[10] = IV[2 ]; + state[11] = IV[3 ]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = (uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, &block_words[0], 1); + round_fn(state, &block_words[0], 2); + round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, &block_words[0], 5); + round_fn(state, &block_words[0], 6); } -static void blake3_compress_in_place(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - 
uint32_t state[16]; - compress_pre(state, cv, block, block_len, counter, flags); - cv[0] = state[0] ^ state[8]; - cv[1] = state[1] ^ state[9]; - cv[2] = state[2] ^ state[10]; - cv[3] = state[3] ^ state[11]; - cv[4] = state[4] ^ state[12]; - cv[5] = state[5] ^ state[13]; - cv[6] = state[6] ^ state[14]; - cv[7] = state[7] ^ state[15]; +static void blake3_compress_in_place( uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags ) { + uint32_t state[16]; + + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[ 8]; + cv[1] = state[1] ^ state[ 9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; } -static void blake3_compress_xof(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - uint32_t state[16]; - compress_pre(state, cv, block, block_len, counter, flags); - - store32(&out[0 * 4], state[0] ^ state[8]); - store32(&out[1 * 4], state[1] ^ state[9]); - store32(&out[2 * 4], state[2] ^ state[10]); - store32(&out[3 * 4], state[3] ^ state[11]); - store32(&out[4 * 4], state[4] ^ state[12]); - store32(&out[5 * 4], state[5] ^ state[13]); - store32(&out[6 * 4], state[6] ^ state[14]); - store32(&out[7 * 4], state[7] ^ state[15]); - store32(&out[8 * 4], state[8] ^ cv[0]); - store32(&out[9 * 4], state[9] ^ cv[1]); - store32(&out[10 * 4], state[10] ^ cv[2]); - store32(&out[11 * 4], state[11] ^ cv[3]); - store32(&out[12 * 4], state[12] ^ cv[4]); - store32(&out[13 * 4], state[13] ^ cv[5]); - store32(&out[14 * 4], state[14] ^ cv[6]); - store32(&out[15 * 4], state[15] ^ cv[7]); +static void blake3_compress_xof( const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64] ) { + uint32_t state[16]; + + 
compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4] , state[0] ^ state[8] ); + store32(&out[1 * 4] , state[1] ^ state[9] ); + store32(&out[2 * 4] , state[2] ^ state[10]); + store32(&out[3 * 4] , state[3] ^ state[11]); + store32(&out[4 * 4] , state[4] ^ state[12]); + store32(&out[5 * 4] , state[5] ^ state[13]); + store32(&out[6 * 4] , state[6] ^ state[14]); + store32(&out[7 * 4] , state[7] ^ state[15]); + store32(&out[8 * 4] , state[8] ^ cv[0] ); + store32(&out[9 * 4] , state[9] ^ cv[1] ); + store32(&out[10 * 4], state[10] ^ cv[2] ); + store32(&out[11 * 4], state[11] ^ cv[3] ); + store32(&out[12 * 4], state[12] ^ cv[4] ); + store32(&out[13 * 4], state[13] ^ cv[5] ); + store32(&out[14 * 4], state[14] ^ cv[6] ); + store32(&out[15 * 4], state[15] ^ cv[7] ); } -static FORCE_INLINE void hash_one(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; +static FORCE_INLINE void hash_one( const uint8_t * input, size_t blocks, const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN] ) { + uint32_t cv[8]; + + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; } - blake3_compress_in_place(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - store_cv_words(out, cv); + store_cv_words(out, cv); } -static void blake3_hash_many(const uint8_t *const *inputs, size_t 
num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs > 0) { - hash_one(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; +static void blake3_hash_many( const uint8_t * const * inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t * out ) { + while (num_inputs > 0) { + hash_one(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } } diff --git a/hashes/blake3/compress-sse2.h b/hashes/blake3/compress-sse2.h index b24cf79d..f2525c49 100644 --- a/hashes/blake3/compress-sse2.h +++ b/hashes/blake3/compress-sse2.h @@ -3,562 +3,548 @@ #define DEGREE 4 -#define _mm_shuffle_ps2(a, b, c) \ - (_mm_castps_si128( \ +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) -static FORCE_INLINE __m128i loadu(const uint8_t src[16]) { - return _mm_loadu_si128((const __m128i *)src); +static FORCE_INLINE __m128i loadu( const uint8_t src[16] ) { + return _mm_loadu_si128((const __m128i *)src); } -static FORCE_INLINE void storeu(__m128i src, uint8_t dest[16]) { - _mm_storeu_si128((__m128i *)dest, src); +static FORCE_INLINE void storeu( __m128i src, uint8_t dest[16] ) { + _mm_storeu_si128((__m128i *)dest, src); } -static FORCE_INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } +static FORCE_INLINE __m128i addv( __m128i a, __m128i b ) { return _mm_add_epi32(a, b); } // Note that clang-format doesn't like the name "xor" for some reason. 
-static FORCE_INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } +static FORCE_INLINE __m128i xorv( __m128i a, __m128i b ) { return _mm_xor_si128(a, b); } -static FORCE_INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } +static FORCE_INLINE __m128i set1( uint32_t x ) { return _mm_set1_epi32((int32_t)x); } -static FORCE_INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +static FORCE_INLINE __m128i set4( uint32_t a, uint32_t b, uint32_t c, uint32_t d ) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); } -static FORCE_INLINE __m128i rot16(__m128i x) { - return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1); +static FORCE_INLINE __m128i rot16( __m128i x ) { + return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1); } -static FORCE_INLINE __m128i rot12(__m128i x) { - return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); +static FORCE_INLINE __m128i rot12( __m128i x ) { + return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); } -static FORCE_INLINE __m128i rot8(__m128i x) { - return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8)); +static FORCE_INLINE __m128i rot8( __m128i x ) { + return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8)); } -static FORCE_INLINE __m128i rot7(__m128i x) { - return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); +static FORCE_INLINE __m128i rot7( __m128i x ) { + return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); } -static FORCE_INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot16(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot12(*row1); +static FORCE_INLINE void g1( __m128i * row0, __m128i * row1, __m128i * row2, __m128i * row3, __m128i m ) { + *row0 = addv(addv(*row0, 
m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot16(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot12(*row1); } -static FORCE_INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot8(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot7(*row1); +static FORCE_INLINE void g2( __m128i * row0, __m128i * row1, __m128i * row2, __m128i * row3, __m128i m ) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot8(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot7(*row1); } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. All the message loads below are adjusted to compensate for this. See // discussion at https://github.com/sneves/blake2-avx2/pull/4 -static FORCE_INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +static FORCE_INLINE void diagonalize( __m128i * row0, __m128i * row2, __m128i * row3 ) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); } -static FORCE_INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +static FORCE_INLINE void undiagonalize( __m128i * row0, __m128i * row2, __m128i * row3 ) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, 
_MM_SHUFFLE(2, 1, 0, 3)); } -static FORCE_INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) { - const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - __m128i mask = _mm_set1_epi16(imm8); - mask = _mm_and_si128(mask, bits); - mask = _mm_cmpeq_epi16(mask, bits); - return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); +static FORCE_INLINE __m128i blend_epi16( __m128i a, __m128i b, const int16_t imm8 ) { + const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + __m128i mask = _mm_set1_epi16(imm8); + + mask = _mm_and_si128(mask, bits); + mask = _mm_cmpeq_epi16(mask, bits); + return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); } -static FORCE_INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - rows[0] = loadu((uint8_t *)&cv[0]); - rows[1] = loadu((uint8_t *)&cv[4]); - rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); - rows[3] = set4(counter_low(counter), counter_high(counter), - (uint32_t)block_len, (uint32_t)flags); - - __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); - __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); - __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); - __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); - - __m128i t0, t1, t2, t3, tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. 
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 
0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], 
&rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); +static FORCE_INLINE void 
compress_pre( __m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags ) { + rows[0] = loadu((uint8_t *)&cv[0]); + rows[1] = loadu((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE( 2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE( 2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. 
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE( 0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE( 0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE( 0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE( 0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE( 0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE( 0, 0, 3, 3)); 
+ t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE( 0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE( 0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE( 0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE( 0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 1, 3, 2, 0)); + g1(&rows[0], 
&rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE( 0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE( 0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); } -static void blake3_compress_in_place(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); - storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); +static void blake3_compress_in_place( uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags ) { + __m128i rows[4]; + + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); } -static void blake3_compress_xof(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]) { - __m128i rows[4]; - 
compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), &out[0]); - storeu(xorv(rows[1], rows[3]), &out[16]); - storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); - storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); +static void blake3_compress_xof( const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64] ) { + __m128i rows[4]; + + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), &out[0] ); + storeu(xorv(rows[1], rows[3]), &out[16]); + storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); + storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); } -static FORCE_INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - 
v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], 
v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); +static FORCE_INLINE void round_fn( __m128i v[16], __m128i m[16], size_t r ) { + v[ 0] = addv(v[ 0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[ 1] = addv(v[ 1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[ 2] = addv(v[ 2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[ 3] = addv(v[ 3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[ 0] = addv(v[ 0], v[4]); + v[ 1] = addv(v[ 1], v[5]); + v[ 2] = addv(v[ 2], v[6]); + v[ 3] = addv(v[ 3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[ 8] = addv(v [ 8], v[12]); + v[ 9] = addv(v [ 9], v[13]); + v[10] = addv(v [10], v[14]); + v[11] = addv(v [11], v[15]); + v[ 4] = xorv(v [ 4], v[ 8]); + v[ 5] = xorv(v [ 5], v[ 9]); + v[ 6] = xorv(v [ 6], v[10]); + v[ 7] = xorv(v [ 7], v[11]); + v[ 4] = rot12(v[ 4]); + v[ 5] = rot12(v[ 5]); + v[ 6] = rot12(v[ 6]); + v[ 7] = rot12(v[ 7]); + v[ 0] = addv(v[ 0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[ 1] = addv(v[ 1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[ 2] = addv(v[ 2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[ 3] = addv(v[ 3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[ 0] = addv(v[ 0], v[4]); + v[ 1] = addv(v[ 1], v[5]); + v[ 2] = addv(v[ 2], v[6]); + v[ 3] = addv(v[ 3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[ 8] = addv(v[ 8], v[12]); + v[ 9] = addv(v[ 9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[ 4] = xorv(v[ 4], v[ 8]); + v[ 5] = xorv(v[ 5], v[ 9]); + v[ 6] = xorv(v[ 6], v[10]); + v[ 7] = xorv(v[ 7], v[11]); + v[ 4] = rot7(v[ 4]); + v[ 5] = rot7(v[ 5]); + v[ 6] = rot7(v[ 6]); + v[ 7] = 
rot7(v[ 7]); + + v[ 0] = addv(v[ 0], m[(size_t)MSG_SCHEDULE[r][ 8]]); + v[ 1] = addv(v[ 1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[ 2] = addv(v[ 2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[ 3] = addv(v[ 3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[ 0] = addv(v[ 0], v[5]); + v[ 1] = addv(v[ 1], v[6]); + v[ 2] = addv(v[ 2], v[7]); + v[ 3] = addv(v[ 3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v [10], v[15]); + v[11] = addv(v [11], v[12]); + v[ 8] = addv(v [ 8], v[13]); + v[ 9] = addv(v [ 9], v[14]); + v[ 5] = xorv(v [ 5], v[10]); + v[ 6] = xorv(v [ 6], v[11]); + v[ 7] = xorv(v [ 7], v[ 8]); + v[ 4] = xorv(v [ 4], v[ 9]); + v[ 5] = rot12(v[ 5]); + v[ 6] = rot12(v[ 6]); + v[ 7] = rot12(v[ 7]); + v[ 4] = rot12(v[ 4]); + v[ 0] = addv(v[ 0], m[(size_t)MSG_SCHEDULE[r][ 9]]); + v[ 1] = addv(v[ 1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[ 2] = addv(v[ 2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[ 3] = addv(v[ 3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[ 0] = addv(v[ 0], v[5]); + v[ 1] = addv(v[ 1], v[6]); + v[ 2] = addv(v[ 2], v[7]); + v[ 3] = addv(v[ 3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[ 8] = addv(v[ 8], v[13]); + v[ 9] = addv(v[ 9], v[14]); + v[ 5] = xorv(v[ 5], v[10]); + v[ 6] = xorv(v[ 6], v[11]); + v[ 7] = xorv(v[ 7], v[ 8]); + v[ 4] = xorv(v[ 4], v[ 9]); + v[ 5] = rot7(v[ 5]); + v[ 6] = rot7(v[ 6]); + v[ 7] = rot7(v[ 7]); + v[ 4] = rot7(v[ 4]); } -static FORCE_INLINE void transpose_vecs(__m128i vecs[DEGREE]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. 
Note that this doesn't split the vector into two lanes, as the - AVX2 counterparts do. - __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. - __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; +static FORCE_INLINE void transpose_vecs( __m128i vecs[DEGREE] ) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. 
+ __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; } -static FORCE_INLINE void transpose_msg_vecs(const uint8_t *const *inputs, - size_t block_offset, __m128i out[16]) { - out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); - out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); - out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); - out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); - out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); - out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); - out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); - out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); - out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); - out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); - out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); - out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); - out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); - out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); - out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); - out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); - for (size_t i = 0; i < 4; ++i) { - _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs(&out[0]); - transpose_vecs(&out[4]); - transpose_vecs(&out[8]); - transpose_vecs(&out[12]); +static FORCE_INLINE void transpose_msg_vecs( const uint8_t * const * inputs, size_t block_offset, __m128i out[16] ) { + out[ 0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[ 1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[ 2] = loadu(&inputs[2][block_offset + 0 * 
sizeof(__m128i)]); + out[ 3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[ 4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[ 5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[ 6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[ 7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[ 8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[ 9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0] ); + transpose_vecs(&out[4] ); + transpose_vecs(&out[8] ); + transpose_vecs(&out[12]); } -static FORCE_INLINE void load_counters(uint64_t counter, bool increment_counter, - __m128i *out_lo, __m128i *out_hi) { - const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); - const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); - const __m128i add1 = _mm_and_si128(mask, add0); - __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); - __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), - _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); - __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); - *out_lo = l; - *out_hi = h; +static FORCE_INLINE void load_counters( uint64_t counter, bool increment_counter, __m128i * out_lo, __m128i * out_hi ) { + const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); + const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); + const __m128i add1 = _mm_and_si128(mask, 
add0); + __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); + __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32( + 0x80000000)), _mm_xor_si128(l, _mm_set1_epi32(0x80000000))); + __m128i h = _mm_sub_epi32(_mm_set1_epi32( (int32_t)(counter >> 32)), carry); + + *out_lo = l; + *out_hi = h; } -static void blake3_hash4(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m128i h_vecs[8] = { - set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), - set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), - }; - __m128i counter_low_vec, counter_high_vec; - load_counters(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); - __m128i block_flags_vec = set1(block_flags); - __m128i msg_vecs[16]; - transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m128i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, +static void blake3_hash4( const uint8_t * const * inputs, size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t * out ) { + __m128i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), }; - round_fn(v, msg_vecs, 0); - round_fn(v, msg_vecs, 1); - round_fn(v, msg_vecs, 2); - round_fn(v, msg_vecs, 3); - round_fn(v, msg_vecs, 4); - round_fn(v, msg_vecs, 5); - round_fn(v, msg_vecs, 6); - h_vecs[0] = xorv(v[0], v[8]); - h_vecs[1] = 
xorv(v[1], v[9]); - h_vecs[2] = xorv(v[2], v[10]); - h_vecs[3] = xorv(v[3], v[11]); - h_vecs[4] = xorv(v[4], v[12]); - h_vecs[5] = xorv(v[5], v[13]); - h_vecs[6] = xorv(v[6], v[14]); - h_vecs[7] = xorv(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(&h_vecs[0]); - transpose_vecs(&h_vecs[4]); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); - storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); - storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); - storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); - storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); - storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); - storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); - storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); + __m128i counter_low_vec, counter_high_vec; + + load_counters(counter, increment_counter, &counter_low_vec, &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1(block_flags ); + __m128i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[ 8]); + h_vecs[1] = xorv(v[1], v[ 9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], 
v[15]); + + block_flags = flags; + } + + transpose_vecs(&h_vecs[0]); + transpose_vecs(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); } -static FORCE_INLINE void hash_one(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; +static FORCE_INLINE void hash_one( const uint8_t * input, size_t blocks, const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN] ) { + uint32_t cv[8]; + + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; } - blake3_compress_in_place(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, BLAKE3_OUT_LEN); + memcpy(out, cv, BLAKE3_OUT_LEN); } -static void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) 
{ - while (num_inputs >= DEGREE) { - blake3_hash4(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += DEGREE; +static void blake3_hash_many( const uint8_t * const * inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t * out ) { + while (num_inputs >= DEGREE) { + blake3_hash4(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; } - inputs += DEGREE; - num_inputs -= DEGREE; - out = &out[DEGREE * BLAKE3_OUT_LEN]; - } - while (num_inputs > 0) { - hash_one(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; + while (num_inputs > 0) { + hash_one(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } } diff --git a/hashes/blake3/compress-sse41.h b/hashes/blake3/compress-sse41.h index ae0ec61f..a379e5ae 100644 --- a/hashes/blake3/compress-sse41.h +++ b/hashes/blake3/compress-sse41.h @@ -3,556 +3,539 @@ #define DEGREE 4 -#define _mm_shuffle_ps2(a, b, c) \ - (_mm_castps_si128( \ +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) -static FORCE_INLINE __m128i loadu(const uint8_t src[16]) { - return _mm_loadu_si128((const __m128i *)src); +static FORCE_INLINE __m128i loadu( const uint8_t src[16] ) { + return _mm_loadu_si128((const __m128i *)src); } -static FORCE_INLINE void storeu(__m128i src, uint8_t dest[16]) { - _mm_storeu_si128((__m128i *)dest, src); +static FORCE_INLINE void storeu( __m128i 
src, uint8_t dest[16] ) { + _mm_storeu_si128((__m128i *)dest, src); } -static FORCE_INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } +static FORCE_INLINE __m128i addv( __m128i a, __m128i b ) { return _mm_add_epi32(a, b); } // Note that clang-format doesn't like the name "xor" for some reason. -static FORCE_INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } +static FORCE_INLINE __m128i xorv( __m128i a, __m128i b ) { return _mm_xor_si128(a, b); } -static FORCE_INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } +static FORCE_INLINE __m128i set1( uint32_t x ) { return _mm_set1_epi32((int32_t)x); } -static FORCE_INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +static FORCE_INLINE __m128i set4( uint32_t a, uint32_t b, uint32_t c, uint32_t d ) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); } -static FORCE_INLINE __m128i rot16(__m128i x) { - return _mm_shuffle_epi8( - x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); +static FORCE_INLINE __m128i rot16( __m128i x ) { + return _mm_shuffle_epi8(x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); } -static FORCE_INLINE __m128i rot12(__m128i x) { - return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); +static FORCE_INLINE __m128i rot12( __m128i x ) { + return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); } -static FORCE_INLINE __m128i rot8(__m128i x) { - return _mm_shuffle_epi8( - x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); +static FORCE_INLINE __m128i rot8( __m128i x ) { + return _mm_shuffle_epi8(x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); } -static FORCE_INLINE __m128i rot7(__m128i x) { - return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); +static FORCE_INLINE __m128i rot7( __m128i x ) { + return 
xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); } -static FORCE_INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot16(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot12(*row1); +static FORCE_INLINE void g1( __m128i * row0, __m128i * row1, __m128i * row2, __m128i * row3, __m128i m ) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot16(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot12(*row1); } -static FORCE_INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, - __m128i m) { - *row0 = addv(addv(*row0, m), *row1); - *row3 = xorv(*row3, *row0); - *row3 = rot8(*row3); - *row2 = addv(*row2, *row3); - *row1 = xorv(*row1, *row2); - *row1 = rot7(*row1); +static FORCE_INLINE void g2( __m128i * row0, __m128i * row1, __m128i * row2, __m128i * row3, __m128i m ) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot8(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot7(*row1); } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. All the message loads below are adjusted to compensate for this. 
See // discussion at https://github.com/sneves/blake2-avx2/pull/4 -static FORCE_INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +static FORCE_INLINE void diagonalize( __m128i * row0, __m128i * row2, __m128i * row3 ) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); } -static FORCE_INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +static FORCE_INLINE void undiagonalize( __m128i * row0, __m128i * row2, __m128i * row3 ) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); } -static FORCE_INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, uint8_t flags) { - rows[0] = loadu((uint8_t *)&cv[0]); - rows[1] = loadu((uint8_t *)&cv[4]); - rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); - rows[3] = set4(counter_low(counter), counter_high(counter), - (uint32_t)block_len, (uint32_t)flags); - - __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); - __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); - __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); - __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); - - __m128i t0, t1, t2, t3, tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. 
- t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, 
_MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - undiagonalize(&rows[0], &rows[2], &rows[3]); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); - t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); - diagonalize(&rows[0], &rows[2], &rows[3]); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); - 
undiagonalize(&rows[0], &rows[2], &rows[3]); +static FORCE_INLINE void compress_pre( __m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags ) { + rows[0] = loadu((uint8_t *)&cv[0]); + rows[1] = loadu((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE( 2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE( 2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. 
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE( 0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE( 0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE( 0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE( 0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE( 0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, 
_MM_SHUFFLE( 0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE( 0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE( 0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE( 0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE( 0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = 
_mm_shuffle_epi32(tt, _MM_SHUFFLE( 1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE( 0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE( 0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE( 0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); } -static void blake3_compress_in_place(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); - storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); +static void blake3_compress_in_place( uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags ) { + __m128i rows[4]; + + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); } -static void blake3_compress_xof(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t 
counter, - uint8_t flags, uint8_t out[64]) { - __m128i rows[4]; - compress_pre(rows, cv, block, block_len, counter, flags); - storeu(xorv(rows[0], rows[2]), &out[0]); - storeu(xorv(rows[1], rows[3]), &out[16]); - storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); - storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); +static void blake3_compress_xof( const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64] ) { + __m128i rows[4]; + + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), &out[0] ); + storeu(xorv(rows[1], rows[3]), &out[16]); + storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); + storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); } -static FORCE_INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); - v[0] = addv(v[0], v[4]); - v[1] = addv(v[1], v[5]); - v[2] = addv(v[2], 
v[6]); - v[3] = addv(v[3], v[7]); - v[12] = xorv(v[12], v[0]); - v[13] = xorv(v[13], v[1]); - v[14] = xorv(v[14], v[2]); - v[15] = xorv(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = addv(v[8], v[12]); - v[9] = addv(v[9], v[13]); - v[10] = addv(v[10], v[14]); - v[11] = addv(v[11], v[15]); - v[4] = xorv(v[4], v[8]); - v[5] = xorv(v[5], v[9]); - v[6] = xorv(v[6], v[10]); - v[7] = xorv(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); - v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); - v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); - v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); - v[0] = addv(v[0], v[5]); - v[1] = addv(v[1], v[6]); - v[2] = addv(v[2], v[7]); - v[3] = addv(v[3], v[4]); - v[15] = xorv(v[15], v[0]); - v[12] = xorv(v[12], v[1]); - v[13] = xorv(v[13], v[2]); - v[14] = xorv(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = addv(v[10], v[15]); - v[11] = addv(v[11], v[12]); - v[8] = 
addv(v[8], v[13]); - v[9] = addv(v[9], v[14]); - v[5] = xorv(v[5], v[10]); - v[6] = xorv(v[6], v[11]); - v[7] = xorv(v[7], v[8]); - v[4] = xorv(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); +static FORCE_INLINE void round_fn( __m128i v[16], __m128i m[16], size_t r ) { + v[ 0] = addv(v[ 0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[ 1] = addv(v[ 1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[ 2] = addv(v[ 2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[ 3] = addv(v[ 3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[ 0] = addv(v[ 0], v[4]); + v[ 1] = addv(v[ 1], v[5]); + v[ 2] = addv(v[ 2], v[6]); + v[ 3] = addv(v[ 3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[ 8] = addv(v [ 8], v[12]); + v[ 9] = addv(v [ 9], v[13]); + v[10] = addv(v [10], v[14]); + v[11] = addv(v [11], v[15]); + v[ 4] = xorv(v [ 4], v[ 8]); + v[ 5] = xorv(v [ 5], v[ 9]); + v[ 6] = xorv(v [ 6], v[10]); + v[ 7] = xorv(v [ 7], v[11]); + v[ 4] = rot12(v[ 4]); + v[ 5] = rot12(v[ 5]); + v[ 6] = rot12(v[ 6]); + v[ 7] = rot12(v[ 7]); + v[ 0] = addv(v[ 0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[ 1] = addv(v[ 1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[ 2] = addv(v[ 2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[ 3] = addv(v[ 3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[ 0] = addv(v[ 0], v[4]); + v[ 1] = addv(v[ 1], v[5]); + v[ 2] = addv(v[ 2], v[6]); + v[ 3] = addv(v[ 3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[ 8] = addv(v[ 8], v[12]); + v[ 9] = addv(v[ 9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[ 4] = xorv(v[ 4], v[ 8]); + v[ 5] = xorv(v[ 5], v[ 9]); + v[ 6] = xorv(v[ 6], v[10]); + v[ 7] = xorv(v[ 7], v[11]); + v[ 4] = 
rot7(v[ 4]); + v[ 5] = rot7(v[ 5]); + v[ 6] = rot7(v[ 6]); + v[ 7] = rot7(v[ 7]); + + v[ 0] = addv(v[ 0], m[(size_t)MSG_SCHEDULE[r][ 8]]); + v[ 1] = addv(v[ 1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[ 2] = addv(v[ 2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[ 3] = addv(v[ 3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[ 0] = addv(v[ 0], v[5]); + v[ 1] = addv(v[ 1], v[6]); + v[ 2] = addv(v[ 2], v[7]); + v[ 3] = addv(v[ 3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v [10], v[15]); + v[11] = addv(v [11], v[12]); + v[ 8] = addv(v [ 8], v[13]); + v[ 9] = addv(v [ 9], v[14]); + v[ 5] = xorv(v [ 5], v[10]); + v[ 6] = xorv(v [ 6], v[11]); + v[ 7] = xorv(v [ 7], v[ 8]); + v[ 4] = xorv(v [ 4], v[ 9]); + v[ 5] = rot12(v[ 5]); + v[ 6] = rot12(v[ 6]); + v[ 7] = rot12(v[ 7]); + v[ 4] = rot12(v[ 4]); + v[ 0] = addv(v[ 0], m[(size_t)MSG_SCHEDULE[r][ 9]]); + v[ 1] = addv(v[ 1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[ 2] = addv(v[ 2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[ 3] = addv(v[ 3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[ 0] = addv(v[ 0], v[5]); + v[ 1] = addv(v[ 1], v[6]); + v[ 2] = addv(v[ 2], v[7]); + v[ 3] = addv(v[ 3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[ 8] = addv(v[ 8], v[13]); + v[ 9] = addv(v[ 9], v[14]); + v[ 5] = xorv(v[ 5], v[10]); + v[ 6] = xorv(v[ 6], v[11]); + v[ 7] = xorv(v[ 7], v[ 8]); + v[ 4] = xorv(v[ 4], v[ 9]); + v[ 5] = rot7(v[ 5]); + v[ 6] = rot7(v[ 6]); + v[ 7] = rot7(v[ 7]); + v[ 4] = rot7(v[ 4]); } -static FORCE_INLINE void transpose_vecs(__m128i vecs[DEGREE]) { - // Interleave 32-bit lates. 
The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. - __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; +static FORCE_INLINE void transpose_vecs( __m128i vecs[DEGREE] ) { + // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. 
+ __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; } -static FORCE_INLINE void transpose_msg_vecs(const uint8_t *const *inputs, - size_t block_offset, __m128i out[16]) { - out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); - out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); - out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); - out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); - out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); - out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); - out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); - out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); - out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); - out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); - out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); - out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); - out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); - out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); - out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); - out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); - for (size_t i = 0; i < 4; ++i) { - _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); - } - transpose_vecs(&out[0]); - transpose_vecs(&out[4]); - transpose_vecs(&out[8]); - transpose_vecs(&out[12]); +static FORCE_INLINE void transpose_msg_vecs( const uint8_t * const * inputs, size_t block_offset, __m128i out[16] ) { + out[ 0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[ 1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[ 2] = loadu(&inputs[2][block_offset + 0 * 
sizeof(__m128i)]); + out[ 3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[ 4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[ 5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[ 6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[ 7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[ 8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[ 9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0] ); + transpose_vecs(&out[4] ); + transpose_vecs(&out[8] ); + transpose_vecs(&out[12]); } -static FORCE_INLINE void load_counters(uint64_t counter, bool increment_counter, - __m128i *out_lo, __m128i *out_hi) { - const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); - const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); - const __m128i add1 = _mm_and_si128(mask, add0); - __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); - __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), - _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); - __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); - *out_lo = l; - *out_hi = h; +static FORCE_INLINE void load_counters( uint64_t counter, bool increment_counter, __m128i * out_lo, __m128i * out_hi ) { + const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); + const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); + const __m128i add1 = _mm_and_si128(mask, 
add0); + __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); + __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32( + 0x80000000)), _mm_xor_si128(l, _mm_set1_epi32(0x80000000))); + __m128i h = _mm_sub_epi32(_mm_set1_epi32( (int32_t)(counter >> 32)), carry); + + *out_lo = l; + *out_hi = h; } -static void blake3_hash4(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - __m128i h_vecs[8] = { - set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), - set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), - }; - __m128i counter_low_vec, counter_high_vec; - load_counters(counter, increment_counter, &counter_low_vec, - &counter_high_vec); - uint8_t block_flags = flags | flags_start; - - for (size_t block = 0; block < blocks; block++) { - if (block + 1 == blocks) { - block_flags |= flags_end; - } - __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); - __m128i block_flags_vec = set1(block_flags); - __m128i msg_vecs[16]; - transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); - - __m128i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), - counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, +static void blake3_hash4( const uint8_t * const * inputs, size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t * out ) { + __m128i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), }; - round_fn(v, msg_vecs, 0); - round_fn(v, msg_vecs, 1); - round_fn(v, msg_vecs, 2); - round_fn(v, msg_vecs, 3); - round_fn(v, msg_vecs, 4); - round_fn(v, msg_vecs, 5); - round_fn(v, msg_vecs, 6); - h_vecs[0] = xorv(v[0], v[8]); - h_vecs[1] = 
xorv(v[1], v[9]); - h_vecs[2] = xorv(v[2], v[10]); - h_vecs[3] = xorv(v[3], v[11]); - h_vecs[4] = xorv(v[4], v[12]); - h_vecs[5] = xorv(v[5], v[13]); - h_vecs[6] = xorv(v[6], v[14]); - h_vecs[7] = xorv(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(&h_vecs[0]); - transpose_vecs(&h_vecs[4]); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); - storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); - storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); - storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); - storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); - storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); - storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); - storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); + __m128i counter_low_vec, counter_high_vec; + + load_counters(counter, increment_counter, &counter_low_vec, &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1(block_flags ); + __m128i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[ 8]); + h_vecs[1] = xorv(v[1], v[ 9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], 
v[15]); + + block_flags = flags; + } + + transpose_vecs(&h_vecs[0]); + transpose_vecs(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); } -static FORCE_INLINE void hash_one(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint32_t cv[8]; - memcpy(cv, key, BLAKE3_KEY_LEN); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; +static FORCE_INLINE void hash_one( const uint8_t * input, size_t blocks, const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN] ) { + uint32_t cv[8]; + + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; } - blake3_compress_in_place(cv, input, BLAKE3_BLOCK_LEN, counter, - block_flags); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, BLAKE3_OUT_LEN); + memcpy(out, cv, BLAKE3_OUT_LEN); } -static void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) 
{ - while (num_inputs >= DEGREE) { - blake3_hash4(inputs, blocks, key, counter, increment_counter, flags, - flags_start, flags_end, out); - if (increment_counter) { - counter += DEGREE; +static void blake3_hash_many( const uint8_t * const * inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t * out ) { + while (num_inputs >= DEGREE) { + blake3_hash4(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; } - inputs += DEGREE; - num_inputs -= DEGREE; - out = &out[DEGREE * BLAKE3_OUT_LEN]; - } - while (num_inputs > 0) { - hash_one(inputs[0], blocks, key, counter, flags, flags_start, - flags_end, out); - if (increment_counter) { - counter += 1; + while (num_inputs > 0) { + hash_one(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; } - inputs += 1; - num_inputs -= 1; - out = &out[BLAKE3_OUT_LEN]; - } } diff --git a/hashes/blockpearson.cpp b/hashes/blockpearson.cpp index c10a520b..41eb3b95 100644 --- a/hashes/blockpearson.cpp +++ b/hashes/blockpearson.cpp @@ -33,11 +33,11 @@ // David Stafford's Mix13 from http://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html // the author clarified via eMail that this of his work is released to the public domain -#define permute64(in) \ - in ^= (in >> 30); \ - in *= UINT64_C(0xbf58476d1ce4e5b9); \ - in ^= (in >> 27); \ - in *= UINT64_C(0x94d049bb133111eb); \ +#define permute64(in) \ + in ^= (in >> 30); \ + in *= UINT64_C(0xbf58476d1ce4e5b9); \ + in ^= (in >> 27); \ + in *= UINT64_C(0x94d049bb133111eb); \ in ^= (in >> 31) #define dec1(in) \ @@ -60,189 +60,189 @@ dec##part(hash##part); \ permute64(hash##part) 
-template < bool bswap > -static void blockpearson_hash_256(const void * in, const size_t org_len, const seed_t seed, void * out) { - const uint8_t * current = (const uint8_t *)in; - - uint64_t len = (uint64_t)org_len; - uint64_t hash1 = (uint64_t)seed; - - permute64(hash1); - - uint64_t hash2 = hash1; - uint64_t hash3 = hash1; - uint64_t hash4 = hash1; - - while (len > 7) { - hash_round(hash, GET_U64(current, 0), 1); - hash_round(hash, GET_U64(current, 0), 2); - hash_round(hash, GET_U64(current, 0), 3); - hash_round(hash, GET_U64(current, 0), 4); - - current += 8; - len -= 8; - } - - // handle the rest - hash1 = ~hash1; - hash2 = ~hash2; - hash3 = ~hash3; - hash4 = ~hash4; - - while(len) { - // byte-wise, no endianess - hash_round(hash, *current, 1); - hash_round(hash, *current, 2); - hash_round(hash, *current, 3); - hash_round(hash, *current, 4); - - current++; - len--; - } - - // digest length - hash1 = ~hash1; - hash2 = ~hash2; - hash3 = ~hash3; - hash4 = ~hash4; - - hash_round(hash, (uint64_t)org_len, 1); - hash_round(hash, (uint64_t)org_len, 2); - hash_round(hash, (uint64_t)org_len, 3); - hash_round(hash, (uint64_t)org_len, 4); - - PUT_U64(hash4, (uint8_t *)out, 0); - PUT_U64(hash3, (uint8_t *)out, 8); - PUT_U64(hash2, (uint8_t *)out, 16); - PUT_U64(hash1, (uint8_t *)out, 24); +template +static void blockpearson_hash_256( const void * in, const size_t org_len, const seed_t seed, void * out ) { + const uint8_t * current = (const uint8_t *)in; + + uint64_t len = (uint64_t )org_len; + uint64_t hash1 = (uint64_t )seed; + + permute64(hash1); + + uint64_t hash2 = hash1; + uint64_t hash3 = hash1; + uint64_t hash4 = hash1; + + while (len > 7) { + hash_round(hash, GET_U64(current, 0), 1); + hash_round(hash, GET_U64(current, 0), 2); + hash_round(hash, GET_U64(current, 0), 3); + hash_round(hash, GET_U64(current, 0), 4); + + current += 8; + len -= 8; + } + + // handle the rest + hash1 = ~hash1; + hash2 = ~hash2; + hash3 = ~hash3; + hash4 = ~hash4; + + while (len) { + // 
byte-wise, no endianess + hash_round(hash, *current, 1); + hash_round(hash, *current, 2); + hash_round(hash, *current, 3); + hash_round(hash, *current, 4); + + current++; + len--; + } + + // digest length + hash1 = ~hash1; + hash2 = ~hash2; + hash3 = ~hash3; + hash4 = ~hash4; + + hash_round(hash, (uint64_t)org_len, 1); + hash_round(hash, (uint64_t)org_len, 2); + hash_round(hash, (uint64_t)org_len, 3); + hash_round(hash, (uint64_t)org_len, 4); + + PUT_U64(hash4, (uint8_t *)out, 0); + PUT_U64(hash3, (uint8_t *)out, 8); + PUT_U64(hash2, (uint8_t *)out, 16); + PUT_U64(hash1, (uint8_t *)out, 24); } -template < bool bswap > -static void blockpearson_hash_128(const void * in, const size_t org_len, const seed_t seed, void * out) { - const uint8_t * current = (const uint8_t *)in; +template +static void blockpearson_hash_128( const void * in, const size_t org_len, const seed_t seed, void * out ) { + const uint8_t * current = (const uint8_t *)in; - uint64_t len = (uint64_t)org_len; - uint64_t hash1 = (uint64_t)seed; + uint64_t len = (uint64_t )org_len; + uint64_t hash1 = (uint64_t )seed; - permute64(hash1); + permute64(hash1); - uint64_t hash2 = hash1; + uint64_t hash2 = hash1; - while (len > 7) { - hash_round(hash, GET_U64(current, 0), 1); - hash_round(hash, GET_U64(current, 0), 2); + while (len > 7) { + hash_round(hash, GET_U64(current, 0), 1); + hash_round(hash, GET_U64(current, 0), 2); - current += 8; - len -= 8; - } + current += 8; + len -= 8; + } - // handle the rest - hash1 = ~hash1; - hash2 = ~hash2; + // handle the rest + hash1 = ~hash1; + hash2 = ~hash2; - while(len) { - // byte-wise, no endianess - hash_round(hash, *current, 1); - hash_round(hash, *current, 2); + while (len) { + // byte-wise, no endianess + hash_round(hash, *current, 1); + hash_round(hash, *current, 2); - current++; - len--; - } + current++; + len--; + } - // digest length - hash1 = ~hash1; - hash2 = ~hash2; + // digest length + hash1 = ~hash1; + hash2 = ~hash2; - hash_round(hash, 
(uint64_t)org_len, 1); - hash_round(hash, (uint64_t)org_len, 2); + hash_round(hash, (uint64_t)org_len, 1); + hash_round(hash, (uint64_t)org_len, 2); - PUT_U64(hash2, (uint8_t *)out, 0); - PUT_U64(hash1, (uint8_t *)out, 8); + PUT_U64(hash2, (uint8_t *)out, 0); + PUT_U64(hash1, (uint8_t *)out, 8); } -template < bool bswap > -static void blockpearson_hash_64(const void * in, const size_t org_len, const seed_t seed, void * out) { - const uint8_t * current = (const uint8_t *)in; +template +static void blockpearson_hash_64( const void * in, const size_t org_len, const seed_t seed, void * out ) { + const uint8_t * current = (const uint8_t *)in; - uint64_t len = (uint64_t)org_len; - uint64_t hash1 = (uint64_t)seed; + uint64_t len = (uint64_t )org_len; + uint64_t hash1 = (uint64_t )seed; - permute64(hash1); + permute64(hash1); - while (len > 7) { - hash_round(hash, GET_U64(current, 0), 1); + while (len > 7) { + hash_round(hash, GET_U64(current, 0), 1); - current += 8; - len -= 8; - } + current += 8; + len -= 8; + } - // handle the rest - hash1 = ~hash1; + // handle the rest + hash1 = ~hash1; - while(len) { - // byte-wise, no endianess - hash_round(hash, *current, 1); + while (len) { + // byte-wise, no endianess + hash_round(hash, *current, 1); - current++; - len--; - } + current++; + len--; + } - // digest length - hash1 = ~hash1; + // digest length + hash1 = ~hash1; - hash_round(hash, (uint64_t)org_len, 1); + hash_round(hash, (uint64_t)org_len, 1); - // Previous SMHasher implementation didn't byteswap this properly - PUT_U64(hash1, (uint8_t *)out, 0); + // Previous SMHasher implementation didn't byteswap this properly + PUT_U64(hash1, (uint8_t *)out, 0); } REGISTER_FAMILY(pearsonblock, - $.src_url = "https://github.com/Logan007/pearsonB", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/Logan007/pearsonB", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); REGISTER_HASH(PearsonBlock_64, - $.desc = "Pearson-inspired block hash, 64-bit 
state", - $.hash_flags = - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0x14C3D184, - $.verification_BE = 0x162C2D8A, - $.hashfn_native = blockpearson_hash_64, - $.hashfn_bswap = blockpearson_hash_64 -); + $.desc = "Pearson-inspired block hash, 64-bit state", + $.hash_flags = + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0x14C3D184, + $.verification_BE = 0x162C2D8A, + $.hashfn_native = blockpearson_hash_64, + $.hashfn_bswap = blockpearson_hash_64 + ); REGISTER_HASH(PearsonBlock_128, - $.desc = "Pearson-inspired block hash, 128-bit state", - $.hash_flags = - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 128, - $.verification_LE = 0x6BEFE6EA, - $.verification_BE = 0x00D61079, - $.hashfn_native = blockpearson_hash_128, - $.hashfn_bswap = blockpearson_hash_128 -); + $.desc = "Pearson-inspired block hash, 128-bit state", + $.hash_flags = + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 128, + $.verification_LE = 0x6BEFE6EA, + $.verification_BE = 0x00D61079, + $.hashfn_native = blockpearson_hash_128, + $.hashfn_bswap = blockpearson_hash_128 + ); REGISTER_HASH(PearsonBlock_256, - $.desc = "Pearson-inspired block hash, 256-bit state", - $.hash_flags = - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_SLOW | - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 256, - $.verification_LE = 0x999B3C19, - $.verification_BE = 0x92D43B4F, - $.hashfn_native = blockpearson_hash_256, - $.hashfn_bswap = blockpearson_hash_256 -); + $.desc = "Pearson-inspired block hash, 
256-bit state", + $.hash_flags = + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_SLOW | + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 256, + $.verification_LE = 0x999B3C19, + $.verification_BE = 0x92D43B4F, + $.hashfn_native = blockpearson_hash_256, + $.hashfn_bswap = blockpearson_hash_256 + ); diff --git a/hashes/chaskey.cpp b/hashes/chaskey.cpp index 4c922e27..2e05e739 100644 --- a/hashes/chaskey.cpp +++ b/hashes/chaskey.cpp @@ -15,78 +15,78 @@ #include "Hashlib.h" //------------------------------------------------------------ -#define ROUND(v) \ - do { \ - v[0] += v[1]; v[1]=ROTL32(v[1], 5); \ - v[1] ^= v[0]; v[0]=ROTL32(v[0],16); \ - v[2] += v[3]; v[3]=ROTL32(v[3], 8); v[3] ^= v[2]; \ - v[0] += v[3]; v[3]=ROTL32(v[3],13); v[3] ^= v[0]; \ - v[2] += v[1]; v[1]=ROTL32(v[1], 7); \ - v[1] ^= v[2]; v[2]=ROTL32(v[2],16); \ +#define ROUND(v) \ + do { \ + v[0] += v[1]; v[1]=ROTL32(v[1], 5); \ + v[1] ^= v[0]; v[0]=ROTL32(v[0],16); \ + v[2] += v[3]; v[3]=ROTL32(v[3], 8); v[3] ^= v[2]; \ + v[0] += v[3]; v[3]=ROTL32(v[3],13); v[3] ^= v[0]; \ + v[2] += v[1]; v[1]=ROTL32(v[1], 7); \ + v[1] ^= v[2]; v[2]=ROTL32(v[2],16); \ } while(0) -template < uint32_t rounds, uint32_t tagwords, bool bswap > -static void chaskey_impl(uint8_t * tag, const uint8_t * m, const size_t mlen, - const uint32_t k[4], const uint32_t k1[4], const uint32_t k2[4]) { - const uint8_t * end = m + (((mlen - 1) >> 4) << 4); /* pointer to last message block */ - - uint32_t v[4]; - - v[0] = k[0]; - v[1] = k[1]; - v[2] = k[2]; - v[3] = k[3]; - - if (mlen != 0) { - for (; m != end; m += 16) { - v[0] ^= GET_U32(m, 0); - v[1] ^= GET_U32(m, 4); - v[2] ^= GET_U32(m, 8); - v[3] ^= GET_U32(m, 12); - for (uint32_t i = 0; i < rounds; i++) { - ROUND(v); - } - } - } - - const size_t remain = mlen & 0xF; - const uint8_t * lastblock; - const uint32_t * lastkey; - uint8_t lb[16]; - - if ((mlen != 0) && (remain == 0)) { - lastkey = k1; - lastblock = m; - } 
else { - lastkey = k2; - memset(lb, 0, sizeof(lb)); - memcpy(lb, m, remain); - lb[remain] = 0x01; /* padding bit */ - lastblock = lb; - } - - v[0] ^= GET_U32(lastblock, 0); - v[1] ^= GET_U32(lastblock, 4); - v[2] ^= GET_U32(lastblock, 8); - v[3] ^= GET_U32(lastblock, 12); - - v[0] ^= lastkey[0]; - v[1] ^= lastkey[1]; - v[2] ^= lastkey[2]; - v[3] ^= lastkey[3]; - - for (uint32_t i = 0; i < rounds; i++) { - ROUND(v); - } - - v[0] ^= lastkey[0]; - v[1] ^= lastkey[1]; - v[2] ^= lastkey[2]; - v[3] ^= lastkey[3]; - - for (int i = 0; i < tagwords; i++) { - PUT_U32(v[i], tag, 4*i); - } +template +static void chaskey_impl( uint8_t * tag, const uint8_t * m, const size_t mlen, const uint32_t k[4], + const uint32_t k1[4], const uint32_t k2[4] ) { + const uint8_t * end = m + (((mlen - 1) >> 4) << 4); /* pointer to last message block */ + + uint32_t v[4]; + + v[0] = k[0]; + v[1] = k[1]; + v[2] = k[2]; + v[3] = k[3]; + + if (mlen != 0) { + for (; m != end; m += 16) { + v[0] ^= GET_U32(m, 0); + v[1] ^= GET_U32(m, 4); + v[2] ^= GET_U32(m, 8); + v[3] ^= GET_U32(m, 12); + for (uint32_t i = 0; i < rounds; i++) { + ROUND(v); + } + } + } + + const size_t remain = mlen & 0xF; + const uint8_t * lastblock; + const uint32_t * lastkey; + uint8_t lb[16]; + + if ((mlen != 0) && (remain == 0)) { + lastkey = k1; + lastblock = m; + } else { + lastkey = k2; + memset(lb, 0, sizeof(lb)); + memcpy(lb, m, remain); + lb[remain] = 0x01; /* padding bit */ + lastblock = lb; + } + + v[0] ^= GET_U32(lastblock, 0); + v[1] ^= GET_U32(lastblock, 4); + v[2] ^= GET_U32(lastblock, 8); + v[3] ^= GET_U32(lastblock, 12); + + v[0] ^= lastkey[0]; + v[1] ^= lastkey[1]; + v[2] ^= lastkey[2]; + v[3] ^= lastkey[3]; + + for (uint32_t i = 0; i < rounds; i++) { + ROUND(v); + } + + v[0] ^= lastkey[0]; + v[1] ^= lastkey[1]; + v[2] ^= lastkey[2]; + v[3] ^= lastkey[3]; + + for (int i = 0; i < tagwords; i++) { + PUT_U32(v[i], tag, 4 * i); + } } //------------------------------------------------------------ @@ -100,17 +100,16 @@ 
static const volatile uint32_t C[2] = { 0x00, 0x87 }; out[3] = (in[3] << 1) | (in[2] >> 31); \ } while(0) - -static void make_subkeys(uint32_t k1[4], uint32_t k2[4], const uint32_t k[4]) { - TIMESTWO(k1, k); - TIMESTWO(k2, k1); +static void make_subkeys( uint32_t k1[4], uint32_t k2[4], const uint32_t k[4] ) { + TIMESTWO(k1, k ); + TIMESTWO(k2, k1); } //------------------------------------------------------------ typedef struct { - uint32_t k[4]; - uint32_t k1[4]; - uint32_t k2[4]; + uint32_t k[4]; + uint32_t k1[4]; + uint32_t k2[4]; } keys_t; static thread_local keys_t chaskeys; @@ -128,8 +127,8 @@ static thread_local keys_t chaskeys; // the state space. ROUND() also has full diffusion after 3 rounds, so // this is two full diffusions. Finally, a 6-round permutation is the // smallest number where chaskey passes this SMHasher3 test suite. -static uintptr_t seed_subkeys(uint64_t seed) { - uint32_t seedlo = (uint32_t)(seed); +static uintptr_t seed_subkeys( uint64_t seed ) { + uint32_t seedlo = (uint32_t)(seed ); uint32_t seedhi = (uint32_t)(seed >> 32); chaskeys.k[0] = seedlo ^ 0xe5d2aff1; @@ -148,83 +147,83 @@ static uintptr_t seed_subkeys(uint64_t seed) { return (uintptr_t)(&chaskeys); } -template < uint32_t rounds, uint32_t tagwords, bool bswap > -static void chaskey(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void chaskey( const void * in, const size_t len, const seed_t seed, void * out ) { const keys_t * keys = (const keys_t *)(uintptr_t)seed; - chaskey_impl((uint8_t *)out, (const uint8_t *)in, - len, keys->k, keys->k1, keys->k2); + + chaskey_impl((uint8_t *)out, (const uint8_t *)in, len, keys->k, keys->k1, keys->k2); } //------------------------------------------------------------ // Test vectors from chaskey-12 reference implementation static const uint8_t vectors[64][8] = { - { 0xdd, 0x3e, 0x18, 0x49, 0xd6, 0x82, 0x45, 0x55 }, - { 0xed, 0x1d, 0xa8, 0x9e, 0xc9, 0x31, 0x79, 0xca }, - { 0x98, 0xfe, 0x20, 0xa3, 0x43, 
0xcd, 0x66, 0x6f }, - { 0xf6, 0xf4, 0x18, 0xac, 0xdd, 0x7d, 0x9f, 0xa1 }, - { 0x4c, 0xf0, 0x49, 0x60, 0x09, 0x99, 0x49, 0xf3 }, - { 0x75, 0xc8, 0x32, 0x52, 0x65, 0x3d, 0x3b, 0x57 }, - { 0x96, 0x4b, 0x04, 0x61, 0xfb, 0xe9, 0x22, 0x73 }, - { 0x14, 0x1f, 0xa0, 0x8b, 0xbf, 0x39, 0x96, 0x36 }, - { 0x41, 0x2d, 0x98, 0xed, 0x93, 0x6d, 0x4a, 0xb2 }, - { 0xfb, 0x0d, 0x98, 0xbc, 0x70, 0xe3, 0x05, 0xf9 }, - { 0x36, 0xf8, 0x8e, 0x1f, 0xda, 0x86, 0xc8, 0xab }, - { 0x4d, 0x1a, 0x18, 0x15, 0x86, 0x8a, 0x5a, 0xa8 }, - { 0x7a, 0x79, 0x12, 0xc1, 0x99, 0x9e, 0xae, 0x81 }, - { 0x9c, 0xa1, 0x11, 0x37, 0xb4, 0xa3, 0x46, 0x01 }, - { 0x79, 0x05, 0x14, 0x2f, 0x3b, 0xe7, 0x7e, 0x67 }, - { 0x6a, 0x3e, 0xe3, 0xd3, 0x5c, 0x04, 0x33, 0x97 }, - { 0xd1, 0x39, 0x70, 0xd7, 0xbe, 0x9b, 0x23, 0x50 }, - { 0x32, 0xac, 0xd9, 0x14, 0xbf, 0xda, 0x3b, 0xc8 }, - { 0x8a, 0x58, 0xd8, 0x16, 0xcb, 0x7a, 0x14, 0x83 }, - { 0x03, 0xf4, 0xd6, 0x66, 0x38, 0xef, 0xad, 0x8d }, - { 0xf9, 0x93, 0x22, 0x37, 0xff, 0x05, 0xe8, 0x31 }, - { 0xf5, 0xfe, 0xdb, 0x13, 0x48, 0x62, 0xb4, 0x71 }, - { 0x8b, 0xb5, 0x54, 0x86, 0xf3, 0x8d, 0x57, 0xea }, - { 0x8a, 0x3a, 0xcb, 0x94, 0xb5, 0xad, 0x59, 0x1c }, - { 0x7c, 0xe3, 0x70, 0x87, 0x23, 0xf7, 0x49, 0x5f }, - { 0xf4, 0x2f, 0x3d, 0x2f, 0x40, 0x57, 0x10, 0xc2 }, - { 0xb3, 0x93, 0x3a, 0x16, 0x7e, 0x56, 0x36, 0xac }, - { 0x89, 0x9a, 0x79, 0x45, 0x42, 0x3a, 0x5e, 0x1b }, - { 0x65, 0xe1, 0x2d, 0xf5, 0xa6, 0x95, 0xfa, 0xc8 }, - { 0xb8, 0x24, 0x49, 0xd8, 0xc8, 0xa0, 0x6a, 0xe9 }, - { 0xa8, 0x50, 0xdf, 0xba, 0xde, 0xfa, 0x42, 0x29 }, - { 0xfd, 0x42, 0xc3, 0x9d, 0x08, 0xab, 0x71, 0xa0 }, - { 0xb4, 0x65, 0xc2, 0x41, 0x26, 0x10, 0xbf, 0x84 }, - { 0x89, 0xc4, 0xa9, 0xdd, 0xb5, 0x3e, 0x69, 0x91 }, - { 0x5a, 0x9a, 0xf9, 0x1e, 0xb0, 0x95, 0xd3, 0x31 }, - { 0x8e, 0x54, 0x91, 0x4c, 0x15, 0x1e, 0x46, 0xb0 }, - { 0xfa, 0xb8, 0xab, 0x0b, 0x5b, 0xea, 0xae, 0xc6 }, - { 0x60, 0xad, 0x90, 0x6a, 0xcd, 0x06, 0xc8, 0x23 }, - { 0x6b, 0x1e, 0x6b, 0xc2, 0x42, 0x6d, 0xad, 0x17 }, - { 0x90, 0x32, 0x8f, 0xd2, 0x59, 
0x88, 0x9a, 0x8f }, - { 0xf0, 0xf7, 0x81, 0x5e, 0xe6, 0xf3, 0xd5, 0x16 }, - { 0x97, 0xe7, 0xe2, 0xce, 0xbe, 0xa8, 0x26, 0xb8 }, - { 0xb0, 0xfa, 0x18, 0x45, 0xf7, 0x2a, 0x76, 0xd6 }, - { 0xa4, 0x68, 0xbd, 0xfc, 0xdf, 0x0a, 0xa9, 0xc7 }, - { 0xda, 0x84, 0xe1, 0x13, 0x38, 0x38, 0x7d, 0xa7 }, - { 0xb3, 0x0d, 0x5e, 0xad, 0x8e, 0x39, 0xf2, 0xbc }, - { 0x17, 0x8a, 0x43, 0xd2, 0xa0, 0x08, 0x50, 0x3e }, - { 0x6d, 0xfa, 0xa7, 0x05, 0xa8, 0xa0, 0x6c, 0x70 }, - { 0xaa, 0x04, 0x7f, 0x07, 0xc5, 0xae, 0x8d, 0xb4 }, - { 0x30, 0x5b, 0xbb, 0x42, 0x0c, 0x5d, 0x5e, 0xcc }, - { 0x08, 0x32, 0x80, 0x31, 0x59, 0x75, 0x0f, 0x49 }, - { 0x90, 0x80, 0x25, 0x4f, 0xb7, 0x9b, 0xab, 0x1a }, - { 0x61, 0xc2, 0x85, 0xca, 0x24, 0x57, 0x74, 0xa4 }, - { 0x2a, 0xae, 0x03, 0x5c, 0xfb, 0x61, 0xf9, 0x7a }, - { 0xf5, 0x28, 0x90, 0x75, 0xc9, 0xab, 0x39, 0xe5 }, - { 0xe6, 0x5c, 0x42, 0x37, 0x32, 0xda, 0xe7, 0x95 }, - { 0x4b, 0x22, 0xcf, 0x0d, 0x9d, 0xa8, 0xde, 0x3d }, - { 0x26, 0x26, 0xea, 0x2f, 0xa1, 0xf9, 0xab, 0xcf }, - { 0xd1, 0xe1, 0x7e, 0x6e, 0xc4, 0xa8, 0x8d, 0xa6 }, - { 0x16, 0x57, 0x44, 0x28, 0x27, 0xff, 0x64, 0x0a }, - { 0xfd, 0x15, 0x5a, 0x40, 0xdf, 0x15, 0xf6, 0x30 }, - { 0xff, 0xeb, 0x59, 0x6f, 0x29, 0x9f, 0x58, 0xb2 }, - { 0xbe, 0x4e, 0xe4, 0xed, 0x39, 0x75, 0xdf, 0x87 }, - { 0xfc, 0x7f, 0x9d, 0xf7, 0x99, 0x1b, 0x87, 0xbc } + { 0xdd, 0x3e, 0x18, 0x49, 0xd6, 0x82, 0x45, 0x55 }, + { 0xed, 0x1d, 0xa8, 0x9e, 0xc9, 0x31, 0x79, 0xca }, + { 0x98, 0xfe, 0x20, 0xa3, 0x43, 0xcd, 0x66, 0x6f }, + { 0xf6, 0xf4, 0x18, 0xac, 0xdd, 0x7d, 0x9f, 0xa1 }, + { 0x4c, 0xf0, 0x49, 0x60, 0x09, 0x99, 0x49, 0xf3 }, + { 0x75, 0xc8, 0x32, 0x52, 0x65, 0x3d, 0x3b, 0x57 }, + { 0x96, 0x4b, 0x04, 0x61, 0xfb, 0xe9, 0x22, 0x73 }, + { 0x14, 0x1f, 0xa0, 0x8b, 0xbf, 0x39, 0x96, 0x36 }, + { 0x41, 0x2d, 0x98, 0xed, 0x93, 0x6d, 0x4a, 0xb2 }, + { 0xfb, 0x0d, 0x98, 0xbc, 0x70, 0xe3, 0x05, 0xf9 }, + { 0x36, 0xf8, 0x8e, 0x1f, 0xda, 0x86, 0xc8, 0xab }, + { 0x4d, 0x1a, 0x18, 0x15, 0x86, 0x8a, 0x5a, 0xa8 }, + { 0x7a, 0x79, 0x12, 0xc1, 0x99, 
0x9e, 0xae, 0x81 }, + { 0x9c, 0xa1, 0x11, 0x37, 0xb4, 0xa3, 0x46, 0x01 }, + { 0x79, 0x05, 0x14, 0x2f, 0x3b, 0xe7, 0x7e, 0x67 }, + { 0x6a, 0x3e, 0xe3, 0xd3, 0x5c, 0x04, 0x33, 0x97 }, + { 0xd1, 0x39, 0x70, 0xd7, 0xbe, 0x9b, 0x23, 0x50 }, + { 0x32, 0xac, 0xd9, 0x14, 0xbf, 0xda, 0x3b, 0xc8 }, + { 0x8a, 0x58, 0xd8, 0x16, 0xcb, 0x7a, 0x14, 0x83 }, + { 0x03, 0xf4, 0xd6, 0x66, 0x38, 0xef, 0xad, 0x8d }, + { 0xf9, 0x93, 0x22, 0x37, 0xff, 0x05, 0xe8, 0x31 }, + { 0xf5, 0xfe, 0xdb, 0x13, 0x48, 0x62, 0xb4, 0x71 }, + { 0x8b, 0xb5, 0x54, 0x86, 0xf3, 0x8d, 0x57, 0xea }, + { 0x8a, 0x3a, 0xcb, 0x94, 0xb5, 0xad, 0x59, 0x1c }, + { 0x7c, 0xe3, 0x70, 0x87, 0x23, 0xf7, 0x49, 0x5f }, + { 0xf4, 0x2f, 0x3d, 0x2f, 0x40, 0x57, 0x10, 0xc2 }, + { 0xb3, 0x93, 0x3a, 0x16, 0x7e, 0x56, 0x36, 0xac }, + { 0x89, 0x9a, 0x79, 0x45, 0x42, 0x3a, 0x5e, 0x1b }, + { 0x65, 0xe1, 0x2d, 0xf5, 0xa6, 0x95, 0xfa, 0xc8 }, + { 0xb8, 0x24, 0x49, 0xd8, 0xc8, 0xa0, 0x6a, 0xe9 }, + { 0xa8, 0x50, 0xdf, 0xba, 0xde, 0xfa, 0x42, 0x29 }, + { 0xfd, 0x42, 0xc3, 0x9d, 0x08, 0xab, 0x71, 0xa0 }, + { 0xb4, 0x65, 0xc2, 0x41, 0x26, 0x10, 0xbf, 0x84 }, + { 0x89, 0xc4, 0xa9, 0xdd, 0xb5, 0x3e, 0x69, 0x91 }, + { 0x5a, 0x9a, 0xf9, 0x1e, 0xb0, 0x95, 0xd3, 0x31 }, + { 0x8e, 0x54, 0x91, 0x4c, 0x15, 0x1e, 0x46, 0xb0 }, + { 0xfa, 0xb8, 0xab, 0x0b, 0x5b, 0xea, 0xae, 0xc6 }, + { 0x60, 0xad, 0x90, 0x6a, 0xcd, 0x06, 0xc8, 0x23 }, + { 0x6b, 0x1e, 0x6b, 0xc2, 0x42, 0x6d, 0xad, 0x17 }, + { 0x90, 0x32, 0x8f, 0xd2, 0x59, 0x88, 0x9a, 0x8f }, + { 0xf0, 0xf7, 0x81, 0x5e, 0xe6, 0xf3, 0xd5, 0x16 }, + { 0x97, 0xe7, 0xe2, 0xce, 0xbe, 0xa8, 0x26, 0xb8 }, + { 0xb0, 0xfa, 0x18, 0x45, 0xf7, 0x2a, 0x76, 0xd6 }, + { 0xa4, 0x68, 0xbd, 0xfc, 0xdf, 0x0a, 0xa9, 0xc7 }, + { 0xda, 0x84, 0xe1, 0x13, 0x38, 0x38, 0x7d, 0xa7 }, + { 0xb3, 0x0d, 0x5e, 0xad, 0x8e, 0x39, 0xf2, 0xbc }, + { 0x17, 0x8a, 0x43, 0xd2, 0xa0, 0x08, 0x50, 0x3e }, + { 0x6d, 0xfa, 0xa7, 0x05, 0xa8, 0xa0, 0x6c, 0x70 }, + { 0xaa, 0x04, 0x7f, 0x07, 0xc5, 0xae, 0x8d, 0xb4 }, + { 0x30, 0x5b, 0xbb, 0x42, 0x0c, 
0x5d, 0x5e, 0xcc }, + { 0x08, 0x32, 0x80, 0x31, 0x59, 0x75, 0x0f, 0x49 }, + { 0x90, 0x80, 0x25, 0x4f, 0xb7, 0x9b, 0xab, 0x1a }, + { 0x61, 0xc2, 0x85, 0xca, 0x24, 0x57, 0x74, 0xa4 }, + { 0x2a, 0xae, 0x03, 0x5c, 0xfb, 0x61, 0xf9, 0x7a }, + { 0xf5, 0x28, 0x90, 0x75, 0xc9, 0xab, 0x39, 0xe5 }, + { 0xe6, 0x5c, 0x42, 0x37, 0x32, 0xda, 0xe7, 0x95 }, + { 0x4b, 0x22, 0xcf, 0x0d, 0x9d, 0xa8, 0xde, 0x3d }, + { 0x26, 0x26, 0xea, 0x2f, 0xa1, 0xf9, 0xab, 0xcf }, + { 0xd1, 0xe1, 0x7e, 0x6e, 0xc4, 0xa8, 0x8d, 0xa6 }, + { 0x16, 0x57, 0x44, 0x28, 0x27, 0xff, 0x64, 0x0a }, + { 0xfd, 0x15, 0x5a, 0x40, 0xdf, 0x15, 0xf6, 0x30 }, + { 0xff, 0xeb, 0x59, 0x6f, 0x29, 0x9f, 0x58, 0xb2 }, + { 0xbe, 0x4e, 0xe4, 0xed, 0x39, 0x75, 0xdf, 0x87 }, + { 0xfc, 0x7f, 0x9d, 0xf7, 0x99, 0x1b, 0x87, 0xbc } }; -static bool chaskey_selftest(void) { +static bool chaskey_selftest( void ) { uint8_t tag[8]; uint8_t m[64]; @@ -232,14 +231,14 @@ static bool chaskey_selftest(void) { // As mentioned above, this sets the key to the vector // { 0x33221100, 0x77665544, 0xbbaa9988, 0xffeeddcc }. 
- seed_t s = seed_subkeys(0); + seed_t s = seed_subkeys(0); bool passed = true; for (int i = 0; i < 64; i++) { if (isLE()) { - chaskey<12,2,false>(m, i, s, tag); + chaskey<12, 2, false>(m, i, s, tag); } else { - chaskey<12,2,true>(m, i, s, tag); + chaskey<12, 2, true>(m, i, s, tag); } if (0 != memcmp(tag, vectors[i], 8)) { printf("Mismatch with len %d\n Expected:", i); @@ -256,129 +255,129 @@ static bool chaskey_selftest(void) { //------------------------------------------------------------ REGISTER_FAMILY(chaskey, - $.src_url = "http://mouha.be/chaskey/", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "http://mouha.be/chaskey/", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(chaskey_12__32, - $.desc = "Chaskey PRF (12 rounds, 32 bits)", - $.sort_order = 20, - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 32, - $.verification_LE = 0x672570CB, - $.verification_BE = 0x22B350D2, - $.initfn = chaskey_selftest, - $.seedfn = seed_subkeys, - $.hashfn_native = chaskey<12,1,false>, - $.hashfn_bswap = chaskey<12,1,true> -); + $.desc = "Chaskey PRF (12 rounds, 32 bits)", + $.sort_order = 20, + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 32, + $.verification_LE = 0x672570CB, + $.verification_BE = 0x22B350D2, + $.initfn = chaskey_selftest, + $.seedfn = seed_subkeys, + $.hashfn_native = chaskey<12, 1, false>, + $.hashfn_bswap = chaskey<12, 1, true> + ); REGISTER_HASH(chaskey_12__64, - $.desc = "Chaskey PRF (12 rounds, 64 bits)", - $.sort_order = 20, - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_SLOW | - 
FLAG_IMPL_ROTATE | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0x919290D6, - $.verification_BE = 0x5D0E8285, - $.initfn = chaskey_selftest, - $.seedfn = seed_subkeys, - $.hashfn_native = chaskey<12,2,false>, - $.hashfn_bswap = chaskey<12,2,true> -); + $.desc = "Chaskey PRF (12 rounds, 64 bits)", + $.sort_order = 20, + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0x919290D6, + $.verification_BE = 0x5D0E8285, + $.initfn = chaskey_selftest, + $.seedfn = seed_subkeys, + $.hashfn_native = chaskey<12, 2, false>, + $.hashfn_bswap = chaskey<12, 2, true> + ); REGISTER_HASH(chaskey_12, - $.desc = "Chaskey PRF (12 rounds, 128 bits)", - $.sort_order = 20, - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 128, - $.verification_LE = 0x1E983B23, - $.verification_BE = 0xB042962B, - $.initfn = chaskey_selftest, - $.seedfn = seed_subkeys, - $.hashfn_native = chaskey<12,4,false>, - $.hashfn_bswap = chaskey<12,4,true> -); + $.desc = "Chaskey PRF (12 rounds, 128 bits)", + $.sort_order = 20, + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 128, + $.verification_LE = 0x1E983B23, + $.verification_BE = 0xB042962B, + $.initfn = chaskey_selftest, + $.seedfn = seed_subkeys, + $.hashfn_native = chaskey<12, 4, false>, + $.hashfn_bswap = chaskey<12, 4, true> + ); REGISTER_HASH(chaskey_8__32, - $.desc = "Chaskey PRF (8 rounds, 32 bits)", - $.sort_order = 10, - $.hash_flags = - 
FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 32, - $.verification_LE = 0xA984B318, - $.verification_BE = 0x23FE2699, - $.initfn = chaskey_selftest, - $.seedfn = seed_subkeys, - $.hashfn_native = chaskey<8,1,false>, - $.hashfn_bswap = chaskey<8,1,true> -); + $.desc = "Chaskey PRF (8 rounds, 32 bits)", + $.sort_order = 10, + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 32, + $.verification_LE = 0xA984B318, + $.verification_BE = 0x23FE2699, + $.initfn = chaskey_selftest, + $.seedfn = seed_subkeys, + $.hashfn_native = chaskey<8, 1, false>, + $.hashfn_bswap = chaskey<8, 1, true> + ); REGISTER_HASH(chaskey_8__64, - $.desc = "Chaskey PRF (8 rounds, 64 bits)", - $.sort_order = 10, - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0x4DA0DD3A, - $.verification_BE = 0x87A85CD2, - $.initfn = chaskey_selftest, - $.seedfn = seed_subkeys, - $.hashfn_native = chaskey<8,2,false>, - $.hashfn_bswap = chaskey<8,2,true> -); + $.desc = "Chaskey PRF (8 rounds, 64 bits)", + $.sort_order = 10, + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0x4DA0DD3A, + $.verification_BE = 0x87A85CD2, + $.initfn = chaskey_selftest, + $.seedfn = seed_subkeys, + $.hashfn_native = chaskey<8, 2, false>, + $.hashfn_bswap = chaskey<8, 2, true> + ); REGISTER_HASH(chaskey_8, - $.desc = "Chaskey PRF (8 rounds, 128 bits)", - $.sort_order = 10, - 
$.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_NO_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 128, - $.verification_LE = 0x48B645E4, - $.verification_BE = 0xB84D00F9, - $.initfn = chaskey_selftest, - $.seedfn = seed_subkeys, - $.hashfn_native = chaskey<8,4,false>, - $.hashfn_bswap = chaskey<8,4,true> -); + $.desc = "Chaskey PRF (8 rounds, 128 bits)", + $.sort_order = 10, + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_NO_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 128, + $.verification_LE = 0x48B645E4, + $.verification_BE = 0xB84D00F9, + $.initfn = chaskey_selftest, + $.seedfn = seed_subkeys, + $.hashfn_native = chaskey<8, 4, false>, + $.hashfn_bswap = chaskey<8, 4, true> + ); diff --git a/hashes/cityhash.cpp b/hashes/cityhash.cpp index 682a1ac4..ef5ea564 100644 --- a/hashes/cityhash.cpp +++ b/hashes/cityhash.cpp @@ -29,43 +29,51 @@ #include "Hashlib.h" #if defined(HAVE_X86_64_CRC32C) -#include "Intrinsics.h" + #include "Intrinsics.h" #endif using namespace std; //------------------------------------------------------------ #if defined(HAVE_INT128) -static inline uint64_t Uint128Low64(const uint128_t x) { - return static_cast(x); + +static inline uint64_t Uint128Low64( const uint128_t x ) { + return static_cast(x); } -static inline uint64_t Uint128High64(const uint128_t x) { - return static_cast(x >> 64); + +static inline uint64_t Uint128High64( const uint128_t x ) { + return static_cast(x >> 64); } -static inline uint128_t Uint128(uint64_t lo, uint64_t hi) { - return lo + (((uint128_t)hi) << 64); + +static inline uint128_t Uint128( uint64_t lo, uint64_t hi ) { + return lo + (((uint128_t)hi) << 64); } + #else typedef std::pair uint128_t; -static inline uint64_t Uint128Low64(const uint128_t x) { + +static inline uint64_t Uint128Low64( const 
uint128_t x ) { return x.first; } -static inline uint64_t Uint128High64(const uint128_t x) { + +static inline uint64_t Uint128High64( const uint128_t x ) { return x.second; } -static inline uint128_t Uint128(uint64_t lo, uint64_t hi) { + +static inline uint128_t Uint128( uint64_t lo, uint64_t hi ) { return uint128_t(lo, hi); } + #endif //------------------------------------------------------------ -template < bool bswap > -static inline uint32_t Fetch32(const uint8_t * p) { +template +static inline uint32_t Fetch32( const uint8_t * p ) { return GET_U32(p, 0); } -template < bool bswap > -static inline uint64_t Fetch64(const uint8_t * p) { +template +static inline uint64_t Fetch64( const uint8_t * p ) { return GET_U64(p, 0); } @@ -83,387 +91,383 @@ static const uint32_t c2 = 0x1b873593; //------------------------------------------------------------ // Hash 128 input bits down to 64 bits of output. // This is intended to be a reasonably good hash function. -static inline uint64_t Hash128to64(const uint128_t & x) { - // Murmur-inspired hashing. - const uint64_t kMul = UINT64_C(0x9ddfea08eb382d69); - uint64_t a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; - a ^= (a >> 47); - uint64_t b = (Uint128High64(x) ^ a) * kMul; - b ^= (b >> 47); - b *= kMul; - return b; +static inline uint64_t Hash128to64( const uint128_t & x ) { + // Murmur-inspired hashing. + const uint64_t kMul = UINT64_C(0x9ddfea08eb382d69); + uint64_t a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; + + a ^= (a >> 47); + uint64_t b = (Uint128High64(x) ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return b; } // A 32-bit to 32-bit integer hash copied from Murmur3. -static uint32_t fmix(uint32_t h) { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; +static uint32_t fmix( uint32_t h ) { + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; } // Helper from Murmur3 for combining two 32-bit values. 
-static uint32_t Mur(uint32_t a, uint32_t h) { - a *= c1; - a = ROTR32(a, 17); - a *= c2; - h ^= a; - h = ROTR32(h, 19); - return h * 5 + 0xe6546b64; +static uint32_t Mur( uint32_t a, uint32_t h ) { + a *= c1; + a = ROTR32(a, 17); + a *= c2; + h ^= a; + h = ROTR32(h, 19); + return h * 5 + 0xe6546b64; } -static uint64_t ShiftMix(uint64_t val) { - return val ^ (val >> 47); +static uint64_t ShiftMix( uint64_t val ) { + return val ^ (val >> 47); } -static uint64_t HashLen16(uint64_t u, uint64_t v) { - return Hash128to64(Uint128(u, v)); +static uint64_t HashLen16( uint64_t u, uint64_t v ) { + return Hash128to64(Uint128(u, v)); } // Return a 16-byte hash for 48 bytes. Quick and dirty. // Callers do best to use "random-looking" values for a and b. -static pair WeakHashLen32WithSeeds( - uint64_t w, uint64_t x, uint64_t y, uint64_t z, uint64_t a, uint64_t b) { - a += w; - b = ROTR64(b + a + z, 21); - uint64_t c = a; - a += x; - a += y; - b += ROTR64(a, 44); - return make_pair(a + z, b + c); +static pair WeakHashLen32WithSeeds( uint64_t w, uint64_t x, + uint64_t y, uint64_t z, uint64_t a, uint64_t b ) { + a += w; + b = ROTR64(b + a + z, 21); + uint64_t c = a; + a += x; + a += y; + b += ROTR64(a , 44); + return make_pair(a + z, b + c); } // Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. 
-template < bool bswap > -static pair WeakHashLen32WithSeeds( - const uint8_t* s, uint64_t a, uint64_t b) { - return WeakHashLen32WithSeeds(Fetch64(s), - Fetch64(s + 8), - Fetch64(s + 16), - Fetch64(s + 24), - a, - b); +template +static pair WeakHashLen32WithSeeds( const uint8_t * s, uint64_t a, uint64_t b ) { + return WeakHashLen32WithSeeds(Fetch64(s), Fetch64( + s + 8), Fetch64(s + 16), Fetch64(s + 24), a, b); } #define PERMUTE3(a, b, c) do { std::swap(a, b); std::swap(a, c); } while (0) //------------------------------------------------------------ -static uint32_t Hash32Len0to4(const uint8_t *s, size_t len, uint32_t seed) { - uint32_t b = seed; - uint32_t c = 9; - for (int i = 0; i < len; i++) { - b = b * c1 + s[i]; - c ^= b; - } - return fmix(Mur(b, Mur(len, c))); -} - -template < bool bswap > -static uint32_t Hash32Len5to12(const uint8_t *s, size_t len, uint32_t seed) { - uint32_t a = len + seed, b = len * 5, c = 9, d = b; - a += Fetch32(s); - b += Fetch32(s + len - 4); - c += Fetch32(s + ((len >> 1) & 4)); - return fmix(Mur(c, Mur(b, Mur(a, d)))); -} - -template < bool bswap > -static uint32_t Hash32Len13to24(const uint8_t *s, size_t len, uint32_t seed) { - uint32_t a = Fetch32(s - 4 + (len >> 1)); - uint32_t b = Fetch32(s + 4); - uint32_t c = Fetch32(s + len - 8); - uint32_t d = Fetch32(s + (len >> 1)); - uint32_t e = Fetch32(s); - uint32_t f = Fetch32(s + len - 4); - uint32_t h = seed + len; - - return fmix(Mur(f, Mur(e, Mur(d, Mur(c, Mur(b, Mur(a, h))))))); -} - -template < bool bswap > -static uint32_t CityHash32WithSeed(const uint8_t *s, size_t len, uint32_t seed) { - if (len <= 24) { - return len <= 12 ? - (len <= 4 ? 
- Hash32Len0to4(s, len, seed) : - Hash32Len5to12(s, len, seed) ) : - Hash32Len13to24(s, len, seed); - } - - // len > 24 - uint32_t h = len + seed, g = c1 * len, f = g; - uint32_t a0 = ROTR32(Fetch32(s + len - 4) * c1, 17) * c2; - uint32_t a1 = ROTR32(Fetch32(s + len - 8) * c1, 17) * c2; - uint32_t a2 = ROTR32(Fetch32(s + len - 16) * c1, 17) * c2; - uint32_t a3 = ROTR32(Fetch32(s + len - 12) * c1, 17) * c2; - uint32_t a4 = ROTR32(Fetch32(s + len - 20) * c1, 17) * c2; - h ^= a0; - h = ROTR32(h, 19); - h = h * 5 + 0xe6546b64; - h ^= a2; - h = ROTR32(h, 19); - h = h * 5 + 0xe6546b64; - g ^= a1; - g = ROTR32(g, 19); - g = g * 5 + 0xe6546b64; - g ^= a3; - g = ROTR32(g, 19); - g = g * 5 + 0xe6546b64; - f += a4; - f = ROTR32(f, 19); - f = f * 5 + 0xe6546b64; - size_t iters = (len - 1) / 20; - do { - uint32_t a0 = ROTR32(Fetch32(s) * c1, 17) * c2; - uint32_t a1 = Fetch32(s + 4); - uint32_t a2 = ROTR32(Fetch32(s + 8) * c1, 17) * c2; - uint32_t a3 = ROTR32(Fetch32(s + 12) * c1, 17) * c2; - uint32_t a4 = Fetch32(s + 16); +static uint32_t Hash32Len0to4( const uint8_t * s, size_t len, uint32_t seed ) { + uint32_t b = seed; + uint32_t c = 9; + + for (int i = 0; i < len; i++) { + b = b * c1 + s[i]; + c ^= b; + } + return fmix(Mur(b, Mur(len, c))); +} + +template +static uint32_t Hash32Len5to12( const uint8_t * s, size_t len, uint32_t seed ) { + uint32_t a = len + seed, b = len * 5, c = 9, d = b; + + a += Fetch32(s); + b += Fetch32(s + len - 4); + c += Fetch32(s + ((len >> 1) & 4)); + return fmix(Mur(c, Mur(b, Mur(a, d)))); +} + +template +static uint32_t Hash32Len13to24( const uint8_t * s, size_t len, uint32_t seed ) { + uint32_t a = Fetch32(s - 4 + (len >> 1)); + uint32_t b = Fetch32(s + 4); + uint32_t c = Fetch32(s + len - 8); + uint32_t d = Fetch32(s + (len >> 1)); + uint32_t e = Fetch32(s); + uint32_t f = Fetch32(s + len - 4); + uint32_t h = seed + len; + + return fmix(Mur(f, Mur(e, Mur(d, Mur(c, Mur(b, Mur(a, h))))))); +} + +template +static uint32_t CityHash32WithSeed( const 
uint8_t * s, size_t len, uint32_t seed ) { + if (len <= 24) { + return len <= 12 ? + (len <= 4 ? + Hash32Len0to4(s, len, seed) : + Hash32Len5to12(s, len, seed)) : + Hash32Len13to24(s, len, seed); + } + + // len > 24 + uint32_t h = len + seed, g = c1 * len, f = g; + uint32_t a0 = ROTR32(Fetch32(s + len - 4) * c1, 17) * c2; + uint32_t a1 = ROTR32(Fetch32(s + len - 8) * c1, 17) * c2; + uint32_t a2 = ROTR32(Fetch32(s + len - 16) * c1, 17) * c2; + uint32_t a3 = ROTR32(Fetch32(s + len - 12) * c1, 17) * c2; + uint32_t a4 = ROTR32(Fetch32(s + len - 20) * c1, 17) * c2; h ^= a0; - h = ROTR32(h, 18); + h = ROTR32(h, 19); + h = h * 5 + 0xe6546b64; + h ^= a2; + h = ROTR32(h, 19); + h = h * 5 + 0xe6546b64; + g ^= a1; + g = ROTR32(g, 19); + g = g * 5 + 0xe6546b64; + g ^= a3; + g = ROTR32(g, 19); + g = g * 5 + 0xe6546b64; + f += a4; + f = ROTR32(f, 19); + f = f * 5 + 0xe6546b64; + size_t iters = (len - 1) / 20; + do { + uint32_t a0 = ROTR32(Fetch32(s) * c1, 17) * c2; + uint32_t a1 = Fetch32(s + 4); + uint32_t a2 = ROTR32(Fetch32(s + 8) * c1, 17) * c2; + uint32_t a3 = ROTR32(Fetch32(s + 12) * c1, 17) * c2; + uint32_t a4 = Fetch32(s + 16); + h ^= a0; + h = ROTR32(h, 18); + h = h * 5 + 0xe6546b64; + f += a1; + f = ROTR32(f, 19); + f = f * c1; + g += a2; + g = ROTR32(g, 18); + g = g * 5 + 0xe6546b64; + h ^= a3 + a1; + h = ROTR32(h, 19); + h = h * 5 + 0xe6546b64; + g ^= a4; + g = BSWAP(g) * 5; + h += a4 * 5; + h = BSWAP(h); + f += a0; + PERMUTE3(f, h, g); + s += 20; + } while (--iters != 0); + g = ROTR32(g , 11) * c1; + g = ROTR32(g , 17) * c1; + f = ROTR32(f , 11) * c1; + f = ROTR32(f , 17) * c1; + h = ROTR32(h + g, 19); h = h * 5 + 0xe6546b64; - f += a1; - f = ROTR32(f, 19); - f = f * c1; - g += a2; - g = ROTR32(g, 18); - g = g * 5 + 0xe6546b64; - h ^= a3 + a1; - h = ROTR32(h, 19); + h = ROTR32(h , 17) * c1; + h = ROTR32(h + f, 19); h = h * 5 + 0xe6546b64; - g ^= a4; - g = BSWAP(g) * 5; - h += a4 * 5; - h = BSWAP(h); - f += a0; - PERMUTE3(f, h, g); - s += 20; - } while (--iters != 
0); - g = ROTR32(g, 11) * c1; - g = ROTR32(g, 17) * c1; - f = ROTR32(f, 11) * c1; - f = ROTR32(f, 17) * c1; - h = ROTR32(h + g, 19); - h = h * 5 + 0xe6546b64; - h = ROTR32(h, 17) * c1; - h = ROTR32(h + f, 19); - h = h * 5 + 0xe6546b64; - h = ROTR32(h, 17) * c1; - return h; + h = ROTR32(h , 17) * c1; + return h; } //------------------------------------------------------------ -template < bool bswap > -static uint64_t HashLen0to16(const uint8_t *s, size_t len) { - if (len > 8) { - uint64_t a = Fetch64(s); - uint64_t b = Fetch64(s + len - 8); - return HashLen16(a, ROTR64(b + len, len)) ^ b; - } - if (len >= 4) { - uint64_t a = Fetch32(s); - return HashLen16(len + (a << 3), Fetch32(s + len - 4)); - } - if (len > 0) { - uint8_t a = s[0]; - uint8_t b = s[len >> 1]; - uint8_t c = s[len - 1]; - uint32_t y = static_cast(a) + (static_cast(b) << 8); - uint32_t z = len + (static_cast(c) << 2); - return ShiftMix(y * k2 ^ z * k3) * k2; - } - return k2; +template +static uint64_t HashLen0to16( const uint8_t * s, size_t len ) { + if (len > 8) { + uint64_t a = Fetch64(s); + uint64_t b = Fetch64(s + len - 8); + return HashLen16(a, ROTR64(b + len, len)) ^ b; + } + if (len >= 4) { + uint64_t a = Fetch32(s); + return HashLen16(len + (a << 3), Fetch32(s + len - 4)); + } + if (len > 0) { + uint8_t a = s[0]; + uint8_t b = s[len >> 1]; + uint8_t c = s[len - 1]; + uint32_t y = static_cast(a) + (static_cast(b) << 8); + uint32_t z = len + (static_cast(c) << 2); + return ShiftMix(y * k2 ^ z * k3) * k2; + } + return k2; } // This probably works well for 16-byte strings as well, but it may be overkill // in that case. 
-template < bool bswap > -static uint64_t HashLen17to32(const uint8_t *s, size_t len) { - uint64_t a = Fetch64(s) * k1; - uint64_t b = Fetch64(s + 8); - uint64_t c = Fetch64(s + len - 8) * k2; - uint64_t d = Fetch64(s + len - 16) * k0; - return HashLen16(ROTR64(a - b, 43) + ROTR64(c, 30) + d, - a + ROTR64(b ^ k3, 20) - c + len); +template +static uint64_t HashLen17to32( const uint8_t * s, size_t len ) { + uint64_t a = Fetch64(s ) * k1; + uint64_t b = Fetch64(s + 8); + uint64_t c = Fetch64(s + len - 8) * k2; + uint64_t d = Fetch64(s + len - 16) * k0; + + return HashLen16(ROTR64(a - b, 43) + ROTR64(c, 30) + d, a + ROTR64(b ^ k3, 20) - c + len); } // Return an 8-byte hash for 33 to 64 bytes. -template < bool bswap > -static uint64_t HashLen33to64(const uint8_t *s, size_t len) { - uint64_t z = Fetch64(s + 24); - uint64_t a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0; - uint64_t b = ROTR64(a + z, 52); - uint64_t c = ROTR64(a, 37); - a += Fetch64(s + 8); - c += ROTR64(a, 7); - a += Fetch64(s + 16); - uint64_t vf = a + z; - uint64_t vs = b + ROTR64(a, 31) + c; - a = Fetch64(s + 16) + Fetch64(s + len - 32); - z = Fetch64(s + len - 8); - b = ROTR64(a + z, 52); - c = ROTR64(a, 37); - a += Fetch64(s + len - 24); - c += ROTR64(a, 7); - a += Fetch64(s + len - 16); - uint64_t wf = a + z; - uint64_t ws = b + ROTR64(a, 31) + c; - uint64_t r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0); - return ShiftMix(r * k0 + vs) * k2; -} - -template < bool bswap > -static uint64_t CityHash64(const uint8_t *s, size_t len) { - if (len <= 32) { - if (len <= 16) { - return HashLen0to16(s, len); - } else { - return HashLen17to32(s, len); +template +static uint64_t HashLen33to64( const uint8_t * s, size_t len ) { + uint64_t z = Fetch64(s + 24); + uint64_t a = Fetch64(s ) + (len + Fetch64(s + len - 16)) * k0; + uint64_t b = ROTR64(a + z, 52); + uint64_t c = ROTR64(a , 37); + + a += Fetch64(s + 8); + c += ROTR64(a, 7); + a += Fetch64(s + 16 ); + uint64_t vf = a + z; + uint64_t vs = b + ROTR64(a, 
31) + c; + a = Fetch64(s + 16 ) + Fetch64(s + len - 32); + z = Fetch64(s + len - 8); + b = ROTR64(a + z, 52); + c = ROTR64(a , 37); + a += Fetch64(s + len - 24); + c += ROTR64(a, 7); + a += Fetch64(s + len - 16); + uint64_t wf = a + z; + uint64_t ws = b + ROTR64(a, 31) + c; + uint64_t r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0); + return ShiftMix(r * k0 + vs) * k2; +} + +template +static uint64_t CityHash64( const uint8_t * s, size_t len ) { + if (len <= 32) { + if (len <= 16) { + return HashLen0to16(s, len); + } else { + return HashLen17to32(s, len); + } + } else if (len <= 64) { + return HashLen33to64(s, len); } - } else if (len <= 64) { - return HashLen33to64(s, len); - } - - // For strings over 64 bytes we hash the end first, and then as we - // loop we keep 56 bytes of state: v, w, x, y, and z. - uint64_t x = Fetch64(s + len - 40); - uint64_t y = Fetch64(s + len - 16) + Fetch64(s + len - 56); - uint64_t z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24)); - pair v = WeakHashLen32WithSeeds(s + len - 64, len, z); - pair w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x); - x = x * k1 + Fetch64(s); - - // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. 
- len = (len - 1) & ~static_cast(63); - do { - x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * k1; - y = ROTR64(y + v.second + Fetch64(s + 48), 42) * k1; - x ^= w.second; - y += v.first + Fetch64(s + 40); - z = ROTR64(z + w.first, 33) * k1; - v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); - w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); - std::swap(z, x); - s += 64; - len -= 64; - } while (len != 0); - return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z, - HashLen16(v.second, w.second) + x); -} - -template < bool bswap > -static uint64_t CityHash64WithSeeds(const uint8_t *s, size_t len, - uint64_t seed0, uint64_t seed1) { + + // For strings over 64 bytes we hash the end first, and then as we + // loop we keep 56 bytes of state: v, w, x, y, and z. + uint64_t x = Fetch64(s + len - 40); + uint64_t y = Fetch64(s + len - 16) + Fetch64(s + len - 56); + uint64_t z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24)); + pair v = WeakHashLen32WithSeeds(s + len - 64, len , z); + pair w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x); + x = x * k1 + Fetch64(s); + + // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. 
+ len = (len - 1) & ~static_cast(63); + do { + x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * k1; + y = ROTR64(y + v.second + Fetch64 (s + 48), 42) * k1; + x ^= w.second; + y += v.first + Fetch64(s + 40); + z = ROTR64(z + w.first, 33) * k1; + v = WeakHashLen32WithSeeds(s , v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second , y + Fetch64(s + 16)); + std::swap(z, x); + s += 64; + len -= 64; + } while (len != 0); + return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z, HashLen16(v.second, w.second) + x); +} + +template +static uint64_t CityHash64WithSeeds( const uint8_t * s, size_t len, uint64_t seed0, uint64_t seed1 ) { return HashLen16(CityHash64(s, len) - seed0, seed1); } -template < bool bswap > -static uint64_t CityHash64WithSeed(const uint8_t *s, size_t len, uint64_t seed) { +template +static uint64_t CityHash64WithSeed( const uint8_t * s, size_t len, uint64_t seed ) { return CityHash64WithSeeds(s, len, k2, seed); } //------------------------------------------------------------ -template < bool bswap > -static uint128_t CityMurmur(const uint8_t *s, size_t len, uint128_t seed) { - uint64_t a = Uint128Low64(seed); - uint64_t b = Uint128High64(seed); - uint64_t c = 0; - uint64_t d = 0; - signed long l = len - 16; - if (l <= 0) { // len <= 16 - a = ShiftMix(a * k1) * k1; - c = b * k1 + HashLen0to16(s, len); - d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c)); - } else { // len > 16 - c = HashLen16(Fetch64(s + len - 8) + k1, a); - d = HashLen16(b + len, c + Fetch64(s + len - 16)); - a += d; +template +static uint128_t CityMurmur( const uint8_t * s, size_t len, uint128_t seed ) { + uint64_t a = Uint128Low64(seed); + uint64_t b = Uint128High64(seed); + uint64_t c = 0; + uint64_t d = 0; + signed long l = len - 16; + + if (l <= 0) { // len <= 16 + a = ShiftMix(a * k1) * k1; + c = b * k1 + HashLen0to16(s, len); + d = ShiftMix(a + (len >= 8 ? 
Fetch64(s) : c)); + } else { // len > 16 + c = HashLen16(Fetch64(s + len - 8) + k1, a ); + d = HashLen16(b + len, c + Fetch64(s + len - 16)); + a += d; + do { + a ^= ShiftMix(Fetch64(s) * k1) * k1; + a *= k1; + b ^= a; + c ^= ShiftMix(Fetch64(s + 8) * k1) * k1; + c *= k1; + d ^= c; + s += 16; + l -= 16; + } while (l > 0); + } + a = HashLen16(a, c); + b = HashLen16(d, b); + return Uint128(a ^ b, HashLen16(b, a)); +} + +template +static uint128_t CityHash128WithSeed( const uint8_t * s, size_t len, uint128_t seed ) { + if (len < 128) { + return CityMurmur(s, len, seed); + } + + // We expect len >= 128 to be the common case. Keep 56 bytes of state: + // v, w, x, y, and z. + pair v, w; + uint64_t x = Uint128Low64(seed); + uint64_t y = Uint128High64(seed); + uint64_t z = len * k1; + v.first = ROTR64(y ^ k1 , 49) * k1 + Fetch64(s); + v.second = ROTR64(v.first, 42) * k1 + Fetch64(s + 8); + w.first = ROTR64(y + z , 35) * k1 + x; + w.second = ROTR64(x + Fetch64(s + 88), 53) * k1; + + // This is the same inner loop as CityHash64(), manually unrolled. do { - a ^= ShiftMix(Fetch64(s) * k1) * k1; - a *= k1; - b ^= a; - c ^= ShiftMix(Fetch64(s + 8) * k1) * k1; - c *= k1; - d ^= c; - s += 16; - l -= 16; - } while (l > 0); - } - a = HashLen16(a, c); - b = HashLen16(d, b); - return Uint128(a ^ b, HashLen16(b, a)); -} - -template < bool bswap > -static uint128_t CityHash128WithSeed(const uint8_t *s, size_t len, uint128_t seed) { - if (len < 128) { - return CityMurmur(s, len, seed); - } - - // We expect len >= 128 to be the common case. Keep 56 bytes of state: - // v, w, x, y, and z. - pair v, w; - uint64_t x = Uint128Low64(seed); - uint64_t y = Uint128High64(seed); - uint64_t z = len * k1; - v.first = ROTR64(y ^ k1, 49) * k1 + Fetch64(s); - v.second = ROTR64(v.first, 42) * k1 + Fetch64(s + 8); - w.first = ROTR64(y + z, 35) * k1 + x; - w.second = ROTR64(x + Fetch64(s + 88), 53) * k1; - - // This is the same inner loop as CityHash64(), manually unrolled. 
- do { - x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * k1; - y = ROTR64(y + v.second + Fetch64(s + 48), 42) * k1; - x ^= w.second; - y += v.first + Fetch64(s + 40); - z = ROTR64(z + w.first, 33) * k1; - v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); - w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); - std::swap(z, x); - s += 64; - x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * k1; - y = ROTR64(y + v.second + Fetch64(s + 48), 42) * k1; - x ^= w.second; - y += v.first + Fetch64(s + 40); - z = ROTR64(z + w.first, 33) * k1; - v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); - w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); - std::swap(z, x); - s += 64; - len -= 128; - } while (likely(len >= 128)); - x += ROTR64(v.first + z, 49) * k0; - z += ROTR64(w.first, 37) * k0; - // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. - for (size_t tail_done = 0; tail_done < len; ) { - tail_done += 32; - y = ROTR64(x + y, 42) * k0 + v.second; - w.first += Fetch64(s + len - tail_done + 16); - x = x * k0 + w.first; - z += w.second + Fetch64(s + len - tail_done); - w.second += v.first; - v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second); - } - // At this point our 56 bytes of state should contain more than - // enough information for a strong 128-bit hash. We use two - // different 56-byte-to-8-byte hashes to get a 16-byte final result. 
- x = HashLen16(x, v.first); - y = HashLen16(y + z, w.first); - return Uint128(HashLen16(x + v.second, w.second) + y, - HashLen16(x + w.second, y + v.second)); -} - -template < bool bswap > -static uint128_t CityHash128(const char *s, size_t len) { + x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * k1; + y = ROTR64(y + v.second + Fetch64 (s + 48), 42) * k1; + x ^= w.second; + y += v.first + Fetch64(s + 40); + z = ROTR64(z + w.first, 33) * k1; + v = WeakHashLen32WithSeeds(s , v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second , y + Fetch64(s + 16)); + std::swap(z, x); + s += 64; + x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * k1; + y = ROTR64(y + v.second + Fetch64 (s + 48), 42) * k1; + x ^= w.second; + y += v.first + Fetch64(s + 40); + z = ROTR64(z + w.first, 33) * k1; + v = WeakHashLen32WithSeeds(s , v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second , y + Fetch64(s + 16)); + std::swap(z, x); + s += 64; + len -= 128; + } while (likely(len >= 128)); + x += ROTR64(v.first + z, 49) * k0; + z += ROTR64(w.first , 37) * k0; + // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. + for (size_t tail_done = 0; tail_done < len;) { + tail_done += 32; + y = ROTR64(x + y, 42) * k0 + v.second; + w.first += Fetch64 (s + len - tail_done + 16); + x = x * k0 + w.first; + z += w.second + Fetch64 (s + len - tail_done); + w.second += v.first; + v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second); + } + // At this point our 56 bytes of state should contain more than + // enough information for a strong 128-bit hash. We use two + // different 56-byte-to-8-byte hashes to get a 16-byte final result. 
+ x = HashLen16(x , v.first); + y = HashLen16(y + z, w.first); + return Uint128(HashLen16(x + v.second, w.second) + y, HashLen16(x + w.second, y + v.second)); +} + +template +static uint128_t CityHash128( const char * s, size_t len ) { if (len >= 16) { - return CityHash128WithSeed( - s + 16, len - 16, Uint128(Fetch64(s) ^ k3, Fetch64(s + 8))); + return CityHash128WithSeed(s + 16, len - 16, Uint128(Fetch64(s) ^ k3, Fetch64(s + 8))); } else if (len >= 8) { - return CityHash128WithSeed( - NULL, 0, Uint128(Fetch64(s) ^ (len * k0), Fetch64(s + len - 8) ^ k1)); + return CityHash128WithSeed(NULL, 0, Uint128(Fetch64( + s) ^ (len * k0), Fetch64(s + len - 8) ^ k1)); } else { return CityHash128WithSeed(s, len, Uint128(k0, k1)); } @@ -473,160 +477,167 @@ static uint128_t CityHash128(const char *s, size_t len) { #if defined(HAVE_X86_64_CRC32C) // Requires len >= 240. -template < bool bswap > -static void CityHashCrc256Long(const uint8_t *s, size_t len, - uint64_t seed, uint64_t *result) { - uint64_t a = Fetch64(s + 56) + k0; - uint64_t b = Fetch64(s + 96) + k0; - uint64_t c = result[0] = HashLen16(b, len); - uint64_t d = result[1] = Fetch64(s + 120) * k0 + len; - uint64_t e = Fetch64(s + 184) + seed; - uint64_t f = seed; - uint64_t g = 0; - uint64_t h = 0; - uint64_t i = 0; - uint64_t j = 0; - uint64_t t = c + d; - - // 240 bytes of input per iter. 
- size_t iters = len / 240; - len -= iters * 240; - do { - -#define CHUNK(multiplier, z) \ - { \ - uint64_t old_a = a; \ - a = ROTR64(b, 41 ^ z) * multiplier + Fetch64(s); \ - b = ROTR64(c, 27 ^ z) * multiplier + Fetch64(s + 8); \ - c = ROTR64(d, 41 ^ z) * multiplier + Fetch64(s + 16); \ - d = ROTR64(e, 33 ^ z) * multiplier + Fetch64(s + 24); \ - e = ROTR64(t, 25 ^ z) * multiplier + Fetch64(s + 32); \ - t = old_a; \ - } \ - f = _mm_crc32_u64(f, a); \ - g = _mm_crc32_u64(g, b); \ - h = _mm_crc32_u64(h, c); \ - i = _mm_crc32_u64(i, d); \ - j = _mm_crc32_u64(j, e); \ +template +static void CityHashCrc256Long( const uint8_t * s, size_t len, uint64_t seed, uint64_t * result ) { + uint64_t a = Fetch64(s + 56) + k0; + uint64_t b = Fetch64(s + 96) + k0; + uint64_t c = HashLen16(b, len); + uint64_t d = Fetch64(s + 120) * k0 + len; + uint64_t e = Fetch64(s + 184) + seed; + uint64_t f = seed; + uint64_t g = 0; + uint64_t h = 0; + uint64_t i = 0; + uint64_t j = 0; + uint64_t t = c + d; + + result[0] = c; + result[1] = d; + + // 240 bytes of input per iter. 
+ size_t iters = len / 240; + len -= iters * 240; + do { +#define CHUNK(multiplier, z) \ + { \ + uint64_t old_a = a; \ + a = ROTR64(b, 41 ^ z) * multiplier + Fetch64(s); \ + b = ROTR64(c, 27 ^ z) * multiplier + Fetch64(s + 8); \ + c = ROTR64(d, 41 ^ z) * multiplier + Fetch64(s + 16); \ + d = ROTR64(e, 33 ^ z) * multiplier + Fetch64(s + 24); \ + e = ROTR64(t, 25 ^ z) * multiplier + Fetch64(s + 32); \ + t = old_a; \ + } \ + f = _mm_crc32_u64(f, a); \ + g = _mm_crc32_u64(g, b); \ + h = _mm_crc32_u64(h, c); \ + i = _mm_crc32_u64(i, d); \ + j = _mm_crc32_u64(j, e); \ s += 40 - CHUNK(1, 1); CHUNK(k0, 0); - CHUNK(1, 1); CHUNK(k0, 0); - CHUNK(1, 1); CHUNK(k0, 0); - } while (--iters > 0); - - while (len >= 40) { - CHUNK(k0, 0); - len -= 40; - } - if (len > 0) { - s = s + len - 40; - CHUNK(k0, 0); - } - j += i << 32; - a = HashLen16(a, j); - h += g << 32; - b += h; - c = HashLen16(c, f) + i; - d = HashLen16(d, e + result[0]); - j += e; - i += HashLen16(h, t); - e = HashLen16(a, d) + j; - f = HashLen16(b, c) + a; - g = HashLen16(j, i) + c; - result[0] = e + f + g + h; - a = ShiftMix((a + g) * k0) * k0 + b; - result[1] += a + result[0]; - a = ShiftMix(a * k0) * k0 + c; - result[2] = a + result[1]; - a = ShiftMix((a + e) * k0) * k0; - result[3] = a + result[2]; + CHUNK(1, 1); CHUNK(k0, 0); + CHUNK(1, 1); CHUNK(k0, 0); + CHUNK(1, 1); CHUNK(k0, 0); + } while (--iters > 0); + + while (len >= 40) { + CHUNK(k0, 0); + len -= 40; + } + if (len > 0) { + s = s + len - 40; + CHUNK(k0, 0); + } + j += i << 32; + a = HashLen16(a, j); + h += g << 32; + b += h; + c = HashLen16(c, f) + i; + d = HashLen16(d, e + result[0]); + j += e; + i += HashLen16(h, t); + e = HashLen16(a, d) + j; + f = HashLen16(b, c) + a; + g = HashLen16(j, i) + c; + + // + result[0] = e + f + g + h; + a = ShiftMix((a + g) * k0) * k0 + b; + result[1] += a + result[0 ]; + a = ShiftMix(a * k0) * k0 + c; + result[2] = a + result[1 ]; + a = ShiftMix((a + e) * k0) * k0; + result[3] = a + result[2 ]; } // Requires len < 240. 
-template < bool bswap > -static void CityHashCrc256Short(const uint8_t *s, size_t len, uint64_t *result) { - uint8_t buf[240]; - memcpy(buf, s, len); - memset(buf + len, 0, 240 - len); - CityHashCrc256Long(buf, 240, ~static_cast(len), result); +template +static void CityHashCrc256Short( const uint8_t * s, size_t len, uint64_t * result ) { + uint8_t buf[240]; + + memcpy(buf, s, len); + memset(buf + len, 0, 240 - len); + CityHashCrc256Long(buf, 240, ~static_cast(len), result); } -template < bool bswap > -static void CityHashCrc256(const uint8_t *s, size_t len, uint64_t *result) { - if (likely(len >= 240)) { - CityHashCrc256Long(s, len, 0, result); - } else { - CityHashCrc256Short(s, len, result); - } +template +static void CityHashCrc256( const uint8_t * s, size_t len, uint64_t * result ) { + if (likely(len >= 240)) { + CityHashCrc256Long(s, len, 0, result); + } else { + CityHashCrc256Short(s, len, result); + } } // Requires len < 240. // Unofficial homegrown seeding for SMHasher3 -template < bool bswap > -static void CityHashCrc256ShortWithSeed(const uint8_t *s, size_t len, uint64_t seed, uint64_t *result) { - uint8_t buf[240]; - memcpy(buf, s, len); - memset(buf + len, 0, 240 - len); - CityHashCrc256Long(buf, 240, seed ^ ~static_cast(len), result); +template +static void CityHashCrc256ShortWithSeed( const uint8_t * s, size_t len, uint64_t seed, uint64_t * result ) { + uint8_t buf[240]; + + memcpy(buf, s, len); + memset(buf + len, 0, 240 - len); + CityHashCrc256Long(buf, 240, seed ^ ~static_cast(len), result); } // Unofficial -template < bool bswap > -static void CityHashCrc256WithSeed(const uint8_t *s, size_t len, uint64_t seed, uint64_t *result) { - if (likely(len >= 240)) { - CityHashCrc256Long(s, len, seed, result); - } else { - CityHashCrc256ShortWithSeed(s, len, seed, result); - } -} - -template < bool bswap > -static uint128_t CityHashCrc128WithSeed(const uint8_t *s, size_t len, uint128_t seed) { - if (len <= 900) { - return CityHash128WithSeed(s, len, 
seed); - } else { - uint64_t result[4]; - CityHashCrc256(s, len, result); - uint64_t u = Uint128High64(seed) + result[0]; - uint64_t v = Uint128Low64(seed) + result[1]; - return Uint128(HashLen16(u, v + result[2]), - HashLen16(ROTR64(v, 32), u * k0 + result[3])); - } -} - -template < bool bswap > -static uint128_t CityHashCrc128(const uint8_t *s, size_t len) { - if (len <= 900) { - return CityHash128(s, len); - } else { - uint64_t result[4]; - CityHashCrc256(s, len, result); - return Uint128(result[2], result[3]); - } +template +static void CityHashCrc256WithSeed( const uint8_t * s, size_t len, uint64_t seed, uint64_t * result ) { + if (likely(len >= 240)) { + CityHashCrc256Long(s, len, seed, result); + } else { + CityHashCrc256ShortWithSeed(s, len, seed, result); + } +} + +template +static uint128_t CityHashCrc128WithSeed( const uint8_t * s, size_t len, uint128_t seed ) { + if (len <= 900) { + return CityHash128WithSeed(s, len, seed); + } else { + uint64_t result[4]; + CityHashCrc256(s, len, result); + uint64_t u = Uint128High64(seed) + result[0]; + uint64_t v = Uint128Low64(seed) + result[1]; + return Uint128(HashLen16(u, v + result[2]), HashLen16(ROTR64(v, 32), u * k0 + result[3])); + } +} + +template +static uint128_t CityHashCrc128( const uint8_t * s, size_t len ) { + if (len <= 900) { + return CityHash128(s, len); + } else { + uint64_t result[4]; + CityHashCrc256(s, len, result); + return Uint128(result[2], result[3]); + } } #endif //------------------------------------------------------------ -template < bool bswap > -static void City32(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void City32( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h; + h = CityHash32WithSeed((const uint8_t *)in, len, (uint32_t)seed); PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void City64(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void City64( 
const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h; + h = CityHash64WithSeed((const uint8_t *)in, len, (uint64_t)seed); PUT_U64(h, (uint8_t *)out, 0); } -template < bool bswap, uint32_t seedmode > -static void City128(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void City128( const void * in, const size_t len, const seed_t seed, void * out ) { uint128_t seed128; - switch(seedmode) { + + switch (seedmode) { case 1: seed128 = Uint128((uint64_t)seed, 0); break; case 2: seed128 = Uint128(0, (uint64_t)seed); break; case 3: seed128 = Uint128((uint64_t)seed, (uint64_t)seed); break; @@ -635,16 +646,17 @@ static void City128(const void * in, const size_t len, const seed_t seed, void * uint128_t h; h = CityHash128WithSeed((const uint8_t *)in, len, seed128); - PUT_U64(Uint128Low64(h), (uint8_t *)out, 0); + PUT_U64(Uint128Low64(h) , (uint8_t *)out, 0); PUT_U64(Uint128High64(h), (uint8_t *)out, 8); } // This version is slightly different than the one in Farmhash, so it // is tested also. 
-template < bool bswap, uint32_t seedmode > -static void CityMurmur_128(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void CityMurmur_128( const void * in, const size_t len, const seed_t seed, void * out ) { uint128_t seed128; - switch(seedmode) { + + switch (seedmode) { case 1: seed128 = Uint128((uint64_t)seed, 0); break; case 2: seed128 = Uint128(0, (uint64_t)seed); break; case 3: seed128 = Uint128((uint64_t)seed, (uint64_t)seed); break; @@ -653,16 +665,17 @@ static void CityMurmur_128(const void * in, const size_t len, const seed_t seed, uint128_t h; h = CityMurmur((const uint8_t *)in, len, seed128); - PUT_U64(Uint128Low64(h), (uint8_t *)out, 0); + PUT_U64(Uint128Low64(h) , (uint8_t *)out, 0); PUT_U64(Uint128High64(h), (uint8_t *)out, 8); } #if defined(HAVE_X86_64_CRC32C) -template < bool bswap, uint32_t seedmode > -static void CityCrc128(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void CityCrc128( const void * in, const size_t len, const seed_t seed, void * out ) { uint128_t seed128; - switch(seedmode) { + + switch (seedmode) { case 1: seed128 = Uint128((uint64_t)seed, 0); break; case 2: seed128 = Uint128(0, (uint64_t)seed); break; case 3: seed128 = Uint128((uint64_t)seed, (uint64_t)seed); break; @@ -671,13 +684,14 @@ static void CityCrc128(const void * in, const size_t len, const seed_t seed, voi uint128_t h; h = CityHashCrc128WithSeed((const uint8_t *)in, len, seed128); - PUT_U64(Uint128Low64(h), (uint8_t *)out, 0); + PUT_U64(Uint128Low64(h) , (uint8_t *)out, 0); PUT_U64(Uint128High64(h), (uint8_t *)out, 8); } -template < bool bswap > -static void CityCrc256(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void CityCrc256( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t result[4]; + CityHashCrc256WithSeed((const uint8_t *)in, len, (uint64_t)seed, result); PUT_U64(result[0], (uint8_t *)out, 0); PUT_U64(result[1], 
(uint8_t *)out, 8); @@ -689,192 +703,192 @@ static void CityCrc256(const void * in, const size_t len, const seed_t seed, voi //------------------------------------------------------------ REGISTER_FAMILY(cityhash, - $.src_url = "https://github.com/google/cityhash", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/google/cityhash", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(CityHash_32, - $.desc = "Google CityHash32WithSeed", - $.hash_flags = - FLAG_HASH_SMALL_SEED , - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 32, - $.verification_LE = 0x5C28AD62, - $.verification_BE = 0x79F1F814, - $.hashfn_native = City32, - $.hashfn_bswap = City32 -); + $.desc = "Google CityHash32WithSeed", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x5C28AD62, + $.verification_BE = 0x79F1F814, + $.hashfn_native = City32, + $.hashfn_bswap = City32 + ); REGISTER_HASH(CityHash_64, - $.desc = "Google CityHash64WithSeed", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 64, - $.verification_LE = 0x25A20825, - $.verification_BE = 0x5698D8C4, - $.hashfn_native = City64, - $.hashfn_bswap = City64 -); + $.desc = "Google CityHash64WithSeed", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x25A20825, + $.verification_BE = 0x5698D8C4, + $.hashfn_native = City64, + $.hashfn_bswap = City64 + ); REGISTER_HASH(CityHash_128__seed1, - $.desc = "Google CityHash128WithSeed (seeded low 64 bits)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.verification_LE = 0x6531F54E, - $.verification_BE = 0x595FC28D, - 
$.hashfn_native = City128, - $.hashfn_bswap = City128 -); + $.desc = "Google CityHash128WithSeed (seeded low 64 bits)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x6531F54E, + $.verification_BE = 0x595FC28D, + $.hashfn_native = City128, + $.hashfn_bswap = City128 + ); REGISTER_HASH(CityHash_128__seed2, - $.desc = "Google CityHash128WithSeed (seeded high 64 bits)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.verification_LE = 0x33E4ECD1, - $.verification_BE = 0xE7A9C3FD, - $.hashfn_native = City128, - $.hashfn_bswap = City128 -); + $.desc = "Google CityHash128WithSeed (seeded high 64 bits)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x33E4ECD1, + $.verification_BE = 0xE7A9C3FD, + $.hashfn_native = City128, + $.hashfn_bswap = City128 + ); REGISTER_HASH(CityHash_128__seed3, - $.desc = "Google CityHash128WithSeed (seeded low+high 64 bits)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.verification_LE = 0x1C03D5B9, - $.verification_BE = 0xCE532972, - $.hashfn_native = City128, - $.hashfn_bswap = City128 -); + $.desc = "Google CityHash128WithSeed (seeded low+high 64 bits)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x1C03D5B9, + $.verification_BE = 0xCE532972, + $.hashfn_native = City128, + $.hashfn_bswap = City128 + ); REGISTER_HASH(CityMurmur__seed1, - $.desc = "CityMurmur (seeded low 64 bits)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.verification_LE = 0x47EE6507, - 
$.verification_BE = 0x646575E0, - $.hashfn_native = CityMurmur_128, - $.hashfn_bswap = CityMurmur_128 -); + $.desc = "CityMurmur (seeded low 64 bits)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x47EE6507, + $.verification_BE = 0x646575E0, + $.hashfn_native = CityMurmur_128, + $.hashfn_bswap = CityMurmur_128 + ); REGISTER_HASH(CityMurmur__seed2, - $.desc = "CityMurmur (seeded high 64 bits)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.verification_LE = 0xAD2F2840, - $.verification_BE = 0x9677E1F6, - $.hashfn_native = CityMurmur_128, - $.hashfn_bswap = CityMurmur_128 -); + $.desc = "CityMurmur (seeded high 64 bits)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0xAD2F2840, + $.verification_BE = 0x9677E1F6, + $.hashfn_native = CityMurmur_128, + $.hashfn_bswap = CityMurmur_128 + ); REGISTER_HASH(CityMurmur__seed3, - $.desc = "CityMurmur (seeded low+high 64 bits)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.verification_LE = 0xE0FECCA8, - $.verification_BE = 0x2DA46BE3, - $.hashfn_native = CityMurmur_128, - $.hashfn_bswap = CityMurmur_128 -); + $.desc = "CityMurmur (seeded low+high 64 bits)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0xE0FECCA8, + $.verification_BE = 0x2DA46BE3, + $.hashfn_native = CityMurmur_128, + $.hashfn_bswap = CityMurmur_128 + ); #if defined(HAVE_X86_64_CRC32C) REGISTER_HASH(CityHashCrc_128__seed1, - $.desc = "Google CityHashCrc128WithSeed (seeded low 64 bits)", - $.hash_flags = - FLAG_HASH_CRC_BASED , - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 
| - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.verification_LE = 0xD4389C97, - $.verification_BE = 0x561D03B3 , - $.hashfn_native = CityCrc128, - $.hashfn_bswap = CityCrc128 -); + $.desc = "Google CityHashCrc128WithSeed (seeded low 64 bits)", + $.hash_flags = + FLAG_HASH_CRC_BASED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0xD4389C97, + $.verification_BE = 0x561D03B3, + $.hashfn_native = CityCrc128, + $.hashfn_bswap = CityCrc128 + ); REGISTER_HASH(CityHashCrc_128__seed2, - $.desc = "Google CityHashCrc128WithSeed (seeded high 64 bits)", - $.hash_flags = - FLAG_HASH_CRC_BASED , - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.verification_LE = 0xD627AF5F, - $.verification_BE = 0x45FB4A4B, - $.hashfn_native = CityCrc128, - $.hashfn_bswap = CityCrc128 -); + $.desc = "Google CityHashCrc128WithSeed (seeded high 64 bits)", + $.hash_flags = + FLAG_HASH_CRC_BASED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0xD627AF5F, + $.verification_BE = 0x45FB4A4B, + $.hashfn_native = CityCrc128, + $.hashfn_bswap = CityCrc128 + ); REGISTER_HASH(CityHashCrc_128__seed3, - $.desc = "Google CityHashCrc128WithSeed (seeded low+high 64 bits)", - $.hash_flags = - FLAG_HASH_CRC_BASED , - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.verification_LE = 0x1DA45069, - $.verification_BE = 0x9AFFB28F, - $.hashfn_native = CityCrc128, - $.hashfn_bswap = CityCrc128 -); + $.desc = "Google CityHashCrc128WithSeed (seeded low+high 64 bits)", + $.hash_flags = + FLAG_HASH_CRC_BASED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x1DA45069, + $.verification_BE = 0x9AFFB28F, + $.hashfn_native = CityCrc128, + 
$.hashfn_bswap = CityCrc128 + ); REGISTER_HASH(CityHashCrc_256, - $.desc = "Google CityHashCrc256 (with modified seeding)", - $.hash_flags = - FLAG_HASH_NO_SEED | - FLAG_HASH_CRC_BASED , - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_SLOW | - FLAG_IMPL_LICENSE_MIT , - $.bits = 256, - $.verification_LE = 0x4A282558, - $.verification_BE = 0xB95D3E15, - $.hashfn_native = CityCrc256, - $.hashfn_bswap = CityCrc256 -); + $.desc = "Google CityHashCrc256 (with modified seeding)", + $.hash_flags = + FLAG_HASH_NO_SEED | + FLAG_HASH_CRC_BASED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_SLOW | + FLAG_IMPL_LICENSE_MIT, + $.bits = 256, + $.verification_LE = 0x4A282558, + $.verification_BE = 0xB95D3E15, + $.hashfn_native = CityCrc256, + $.hashfn_bswap = CityCrc256 + ); #endif diff --git a/hashes/clhash.cpp b/hashes/clhash.cpp index 1ef9d6fb..549e52da 100644 --- a/hashes/clhash.cpp +++ b/hashes/clhash.cpp @@ -26,8 +26,8 @@ #if defined(HAVE_X86_64_CLMUL) -#include "Intrinsics.h" -#include + #include "Intrinsics.h" + #include /* * CLHash is a very fast hashing function that uses the @@ -40,7 +40,8 @@ * * Template option: if you define BITMIX during compilation, extra * work is done to pass smhasher's avalanche test succesfully. - **/ + * + */ //------------------------------------------------------------ // xoshift RNG for turning uint seeds into random bytes. @@ -48,22 +49,24 @@ // Keys for scalar xorshift128. Must be non-zero. These are modified // by xorshift128plus. 
typedef struct xorshift128plus_key_s { - uint64_t part1; - uint64_t part2; + uint64_t part1; + uint64_t part2; } xorshift128plus_key_t; -static uint64_t xorshift128plus(xorshift128plus_key_t * key) { - uint64_t s1 = key->part1; +static uint64_t xorshift128plus( xorshift128plus_key_t * key ) { + uint64_t s1 = key->part1; const uint64_t s0 = key->part2; + key->part1 = s0; - s1 ^= s1 << 23; // a + s1 ^= s1 << 23; // a key->part2 = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c return key->part2 + s0; } // key must be aligned to 16 bytes! -static void get_random_key_for_clhash(uint64_t seed1, uint64_t seed2, size_t keycnt, uint64_t * key) { +static void get_random_key_for_clhash( uint64_t seed1, uint64_t seed2, size_t keycnt, uint64_t * key ) { xorshift128plus_key_t k; + k.part1 = seed1; k.part2 = seed2; @@ -78,35 +81,36 @@ static void get_random_key_for_clhash(uint64_t seed1, uint64_t seed2, size_t key //------------------------------------------------------------ enum { - CLHASH_64BITWORDS_CHUNK_SIZE = 128, - CLHASH_64BITWORDS_EXTRA = 6, + CLHASH_64BITWORDS_CHUNK_SIZE = 128, + CLHASH_64BITWORDS_EXTRA = 6, RANDOM_64BITWORDS_NEEDED_FOR_CLHASH = CLHASH_64BITWORDS_CHUNK_SIZE + CLHASH_64BITWORDS_EXTRA, }; // static_assert((CLHASH_64BITWORDS_CHUNK_SIZE % 4) == 0) alignas(16) static uint64_t clhash_random[RANDOM_64BITWORDS_NEEDED_FOR_CLHASH]; -static bool clhash_init(void) { +static bool clhash_init( void ) { // Constants taken from SMHasher, for compatibility - get_random_key_for_clhash(UINT64_C(0xb3816f6a2c68e530), 711, - RANDOM_64BITWORDS_NEEDED_FOR_CLHASH, clhash_random); + get_random_key_for_clhash(UINT64_C(0xb3816f6a2c68e530), 711, RANDOM_64BITWORDS_NEEDED_FOR_CLHASH, clhash_random); return true; } //------------------------------------------------------------ // computes a << 1 -static inline __m128i leftshift1(__m128i a) { - const int x = 1; - __m128i u64shift = _mm_slli_epi64(a,x); - __m128i topbits = _mm_slli_si128(_mm_srli_epi64(a,64 - x),sizeof(uint64_t)); +static 
inline __m128i leftshift1( __m128i a ) { + const int x = 1; + __m128i u64shift = _mm_slli_epi64(a, x); + __m128i topbits = _mm_slli_si128(_mm_srli_epi64(a, 64 - x), sizeof(uint64_t)); + return _mm_or_si128(u64shift, topbits); } // computes a << 2 -static inline __m128i leftshift2(__m128i a) { - const int x = 2; - __m128i u64shift = _mm_slli_epi64(a,x); - __m128i topbits = _mm_slli_si128(_mm_srli_epi64(a,64 - x),sizeof(uint64_t)); +static inline __m128i leftshift2( __m128i a ) { + const int x = 2; + __m128i u64shift = _mm_slli_epi64(a, x); + __m128i topbits = _mm_slli_si128(_mm_srli_epi64(a, 64 - x), sizeof(uint64_t)); + return _mm_or_si128(u64shift, topbits); } @@ -121,7 +125,7 @@ static inline __m128i leftshift2(__m128i a) { // Precondition: given that Ahigh|Alow represents a 254-bit value // (two highest bits of Ahigh must be zero) ////////////////// -static inline __m128i lazymod127(__m128i Alow, __m128i Ahigh) { +static inline __m128i lazymod127( __m128i Alow, __m128i Ahigh ) { /////////////////////////////////////////////////// // CHECKING THE PRECONDITION: // Important: we are assuming that the two highest bits of Ahigh @@ -136,184 +140,189 @@ static inline __m128i lazymod127(__m128i Alow, __m128i Ahigh) { // credit for simplified implementation : Jan Wassenberg __m128i shift1 = leftshift1(Ahigh); __m128i shift2 = leftshift2(Ahigh); - __m128i final = _mm_xor_si128(_mm_xor_si128(Alow, shift1),shift2); + __m128i final = _mm_xor_si128(_mm_xor_si128(Alow, shift1), shift2); + return final; } // multiplication with lazy reduction // assumes that the two highest bits of the 256-bit multiplication are zeros // returns a lazy reduction -static inline __m128i mul128by128to128_lazymod127( __m128i A, __m128i B) { - __m128i Amix1 = _mm_clmulepi64_si128(A,B,0x01); - __m128i Amix2 = _mm_clmulepi64_si128(A,B,0x10); - __m128i Alow = _mm_clmulepi64_si128(A,B,0x00); - __m128i Ahigh = _mm_clmulepi64_si128(A,B,0x11); - __m128i Amix = _mm_xor_si128(Amix1,Amix2); - Amix1 = 
_mm_slli_si128(Amix,8); - Amix2 = _mm_srli_si128(Amix,8); - Alow = _mm_xor_si128(Alow,Amix1); - Ahigh = _mm_xor_si128(Ahigh,Amix2); +static inline __m128i mul128by128to128_lazymod127( __m128i A, __m128i B ) { + __m128i Amix1 = _mm_clmulepi64_si128(A, B, 0x01); + __m128i Amix2 = _mm_clmulepi64_si128(A, B, 0x10); + __m128i Alow = _mm_clmulepi64_si128(A, B, 0x00); + __m128i Ahigh = _mm_clmulepi64_si128(A, B, 0x11); + __m128i Amix = _mm_xor_si128(Amix1, Amix2); + + Amix1 = _mm_slli_si128(Amix, 8); + Amix2 = _mm_srli_si128(Amix, 8); + Alow = _mm_xor_si128(Alow , Amix1); + Ahigh = _mm_xor_si128(Ahigh, Amix2); return lazymod127(Alow, Ahigh); } // multiply the length and the some key, no modulo -static __m128i lazyLengthHash(uint64_t keylength, uint64_t length) { - const __m128i lengthvector = _mm_set_epi64x(keylength,length); - const __m128i clprod1 = _mm_clmulepi64_si128( lengthvector, lengthvector, 0x10); +static __m128i lazyLengthHash( uint64_t keylength, uint64_t length ) { + const __m128i lengthvector = _mm_set_epi64x(keylength, length); + const __m128i clprod1 = _mm_clmulepi64_si128(lengthvector, lengthvector, 0x10); + return clprod1; } // modulo reduction to 64-bit value. The high 64 bits contain garbage, // see precompReduction64 -static inline __m128i precompReduction64_si128( __m128i A) { - //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. 
(64,4,3,1,0) - const __m128i C = _mm_cvtsi64_si128((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); - __m128i Q2 = _mm_clmulepi64_si128( A, C, 0x01); - __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8( - 0, 27, 54, 45, - 108, 119, 90, 65, - (uint8_t)216, (uint8_t)195, (uint8_t)238, (uint8_t)245, - (uint8_t)180, (uint8_t)175, (uint8_t)130, (uint8_t)153) , - _mm_srli_si128(Q2,8)); - __m128i Q4 = _mm_xor_si128(Q2,A); - const __m128i final = _mm_xor_si128(Q3,Q4); +static inline __m128i precompReduction64_si128( __m128i A ) { + // const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0) + const __m128i C = _mm_cvtsi64_si128((1U << 4) + (1U << 3) + (1U << 1) + (1U << 0)); + __m128i Q2 = _mm_clmulepi64_si128(A, C, 0x01); + __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8(0, 27, 54, 45, 108, 119, 90, 65, (uint8_t)216, (uint8_t)195, + (uint8_t)238, (uint8_t)245, (uint8_t)180, (uint8_t)175, (uint8_t)130, (uint8_t)153), _mm_srli_si128(Q2, 8)); + __m128i Q4 = _mm_xor_si128(Q2, A); + const __m128i final = _mm_xor_si128(Q3, Q4); + return final; /// WARNING: HIGH 64 BITS CONTAIN GARBAGE } -static inline uint64_t precompReduction64( __m128i A) { +static inline uint64_t precompReduction64( __m128i A ) { return _mm_cvtsi128_si64(precompReduction64_si128(A)); } // hashing the bits in value using the keys key1 and key2 (only the // first 64 bits of key2 are used). This is basically (a xor k1) * (b // xor k2) mod p with length component. 
-static uint64_t simple128to64hashwithlength(const __m128i value, const __m128i key, uint64_t keylength, uint64_t length) { - const __m128i add = _mm_xor_si128 (value,key); - const __m128i clprod1 = _mm_clmulepi64_si128( add, add, 0x10); - const __m128i total = _mm_xor_si128 (clprod1,lazyLengthHash(keylength, length)); +static uint64_t simple128to64hashwithlength( const __m128i value, const __m128i key, + uint64_t keylength, uint64_t length ) { + const __m128i add = _mm_xor_si128(value, key); + const __m128i clprod1 = _mm_clmulepi64_si128(add, add, 0x10); + const __m128i total = _mm_xor_si128(clprod1, lazyLengthHash(keylength, length)); + return precompReduction64(total); } // we expect length to have value 128 or, at least, to be divisible by 4. -template < bool bswap > -static __m128i clmulhalfscalarproductwithoutreduction(const __m128i * randomsource, - const uint64_t * string, const size_t length) { +template +static __m128i clmulhalfscalarproductwithoutreduction( const __m128i * randomsource, + const uint64_t * string, const size_t length ) { const uint64_t * const endstring = string + length; __m128i acc = _mm_setzero_si128(); + // we expect length = 128 for (; string + 3 < endstring; randomsource += 2, string += 4) { - const __m128i temp1 = _mm_load_si128( randomsource); - const __m128i temp2 = _mm_lddqu_si128((const __m128i *) string); - const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; - const __m128i add1 = _mm_xor_si128(temp1, temp3); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - const __m128i temp12 = _mm_load_si128(randomsource + 1); - const __m128i temp22 = _mm_lddqu_si128((const __m128i *) (string + 2)); - const __m128i temp32 = bswap ? mm_bswap64(temp22) : temp22; - const __m128i add12 = _mm_xor_si128(temp12, temp32); + const __m128i temp1 = _mm_load_si128(randomsource); + const __m128i temp2 = _mm_lddqu_si128((const __m128i *)string); + const __m128i temp3 = bswap ? 
mm_bswap64(temp2) : temp2; + const __m128i add1 = _mm_xor_si128(temp1, temp3); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1 , acc); + const __m128i temp12 = _mm_load_si128(randomsource + 1); + const __m128i temp22 = _mm_lddqu_si128((const __m128i *)(string + 2)); + const __m128i temp32 = bswap ? mm_bswap64(temp22) : temp22; + const __m128i add12 = _mm_xor_si128(temp12, temp32); const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); acc = _mm_xor_si128(clprod12, acc); } return acc; } -template < bool bswap > -static __m128i clmulhalfscalarproductwithtailwithoutreduction(const __m128i * randomsource, - const uint64_t * string, const size_t length) { +template +static __m128i clmulhalfscalarproductwithtailwithoutreduction( const __m128i * randomsource, + const uint64_t * string, const size_t length ) { const uint64_t * const endstring = string + length; __m128i acc = _mm_setzero_si128(); + for (; string + 3 < endstring; randomsource += 2, string += 4) { - const __m128i temp1 = _mm_load_si128(randomsource); - const __m128i temp2 = _mm_lddqu_si128((const __m128i *) string); - const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; - const __m128i add1 = _mm_xor_si128(temp1, temp3); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - const __m128i temp12 = _mm_load_si128(randomsource+1); - const __m128i temp22 = _mm_lddqu_si128((const __m128i *) (string + 2)); - const __m128i temp32 = bswap ? mm_bswap64(temp22) : temp22; - const __m128i add12 = _mm_xor_si128(temp12, temp32); + const __m128i temp1 = _mm_load_si128(randomsource); + const __m128i temp2 = _mm_lddqu_si128((const __m128i *)string); + const __m128i temp3 = bswap ? 
mm_bswap64(temp2) : temp2; + const __m128i add1 = _mm_xor_si128(temp1, temp3); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1 , acc); + const __m128i temp12 = _mm_load_si128(randomsource + 1); + const __m128i temp22 = _mm_lddqu_si128((const __m128i *)(string + 2)); + const __m128i temp32 = bswap ? mm_bswap64(temp22) : temp22; + const __m128i add12 = _mm_xor_si128(temp12, temp32); const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); acc = _mm_xor_si128(clprod12, acc); } if (string + 1 < endstring) { - const __m128i temp1 = _mm_load_si128(randomsource); - const __m128i temp2 = _mm_lddqu_si128((const __m128i *) string); - const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; - const __m128i add1 = _mm_xor_si128(temp1, temp3); + const __m128i temp1 = _mm_load_si128(randomsource); + const __m128i temp2 = _mm_lddqu_si128((const __m128i *)string); + const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; + const __m128i add1 = _mm_xor_si128(temp1, temp3); const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); + acc = _mm_xor_si128(clprod1, acc); randomsource += 1; - string += 2; + string += 2; } if (string < endstring) { - const __m128i temp1 = _mm_load_si128(randomsource); - const __m128i temp2 = _mm_loadl_epi64((const __m128i *)string); - const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; - const __m128i add1 = _mm_xor_si128(temp1, temp3); + const __m128i temp1 = _mm_load_si128(randomsource); + const __m128i temp2 = _mm_loadl_epi64((const __m128i *)string); + const __m128i temp3 = bswap ? 
mm_bswap64(temp2) : temp2; + const __m128i add1 = _mm_xor_si128(temp1, temp3); const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); acc = _mm_xor_si128(clprod1, acc); } return acc; } -template < bool bswap > -static __m128i clmulhalfscalarproductwithtailwithoutreductionWithExtraWord(const __m128i * randomsource, - const uint64_t * string, const size_t length, const uint64_t extraword) { +template +static __m128i clmulhalfscalarproductwithtailwithoutreductionWithExtraWord( const __m128i * randomsource, + const uint64_t * string, const size_t length, const uint64_t extraword ) { const uint64_t * const endstring = string + length; __m128i acc = _mm_setzero_si128(); + for (; string + 3 < endstring; randomsource += 2, string += 4) { - const __m128i temp1 = _mm_load_si128(randomsource); - const __m128i temp2 = _mm_lddqu_si128((const __m128i *) string); - const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; - const __m128i add1 = _mm_xor_si128(temp1, temp3); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - const __m128i temp12 = _mm_load_si128(randomsource+1); - const __m128i temp22 = _mm_lddqu_si128((const __m128i *) (string + 2)); - const __m128i temp32 = bswap ? mm_bswap64(temp22) : temp22; - const __m128i add12 = _mm_xor_si128(temp12, temp32); + const __m128i temp1 = _mm_load_si128(randomsource); + const __m128i temp2 = _mm_lddqu_si128((const __m128i *)string); + const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; + const __m128i add1 = _mm_xor_si128(temp1, temp3); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1 , acc); + const __m128i temp12 = _mm_load_si128(randomsource + 1); + const __m128i temp22 = _mm_lddqu_si128((const __m128i *)(string + 2)); + const __m128i temp32 = bswap ? 
mm_bswap64(temp22) : temp22; + const __m128i add12 = _mm_xor_si128(temp12, temp32); const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); acc = _mm_xor_si128(clprod12, acc); } if (string + 1 < endstring) { - const __m128i temp1 = _mm_load_si128(randomsource); - const __m128i temp2 = _mm_lddqu_si128((const __m128i *) string); - const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; - const __m128i add1 = _mm_xor_si128(temp1, temp3); + const __m128i temp1 = _mm_load_si128(randomsource); + const __m128i temp2 = _mm_lddqu_si128((const __m128i *)string); + const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; + const __m128i add1 = _mm_xor_si128(temp1, temp3); const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); + acc = _mm_xor_si128(clprod1, acc); randomsource += 1; - string += 2; + string += 2; } // we have to append an extra 1 if (string < endstring) { - const __m128i temp1 = _mm_load_si128(randomsource); - const __m128i temp2 = _mm_set_epi64x(extraword,GET_U64((const uint8_t *)string, 0)); - const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; - const __m128i add1 = _mm_xor_si128(temp1, temp3); + const __m128i temp1 = _mm_load_si128(randomsource); + const __m128i temp2 = _mm_set_epi64x(extraword, GET_U64((const uint8_t *)string, 0)); + const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; + const __m128i add1 = _mm_xor_si128(temp1, temp3); const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); acc = _mm_xor_si128(clprod1, acc); } else { - const __m128i temp1 = _mm_load_si128(randomsource); - const __m128i temp2 = _mm_loadl_epi64((const __m128i *)&extraword); - const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; - const __m128i add1 = _mm_xor_si128(temp1, temp3); + const __m128i temp1 = _mm_load_si128(randomsource); + const __m128i temp2 = _mm_loadl_epi64((const __m128i *)&extraword); + const __m128i temp3 = bswap ? 
mm_bswap64(temp2) : temp2; + const __m128i add1 = _mm_xor_si128(temp1, temp3); const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x01); acc = _mm_xor_si128(clprod1, acc); } return acc; } -template < bool bswap > -static __m128i clmulhalfscalarproductOnlyExtraWord(const __m128i * randomsource, - const uint64_t extraword) { - const __m128i temp1 = _mm_load_si128(randomsource); - const __m128i temp2 = _mm_loadl_epi64((const __m128i *)&extraword); - const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; - const __m128i add1 = _mm_xor_si128(temp1, temp3); +template +static __m128i clmulhalfscalarproductOnlyExtraWord( const __m128i * randomsource, const uint64_t extraword ) { + const __m128i temp1 = _mm_load_si128(randomsource); + const __m128i temp2 = _mm_loadl_epi64((const __m128i *)&extraword); + const __m128i temp3 = bswap ? mm_bswap64(temp2) : temp2; + const __m128i add1 = _mm_xor_si128(temp1, temp3); const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x01); + return clprod1; } @@ -321,7 +330,7 @@ static __m128i clmulhalfscalarproductOnlyExtraWord(const __m128i * randomsource, // an invertible function used to mix the bits // borrowed directly from murmurhash //////// -static inline uint64_t fmix64 ( uint64_t k ) { +static inline uint64_t fmix64( uint64_t k ) { k ^= k >> 33; k *= UINT64_C(0xff51afd7ed558ccd); k ^= k >> 33; @@ -333,31 +342,37 @@ static inline uint64_t fmix64 ( uint64_t k ) { // there always remain an incomplete word that has 1,2, 3, 4, 5, 6, 7 // used bytes. we append 0s to it. The result is really a fancy 8-byte buffer, so // this routine does not care about byteswapping. -static inline uint64_t createLastWord(const size_t lengthbyte, const uint64_t * lastw) { +static inline uint64_t createLastWord( const size_t lengthbyte, const uint64_t * lastw ) { const int significantbytes = lengthbyte % sizeof(uint64_t); - uint64_t lastword = 0; - memcpy(&lastword,lastw,significantbytes); // could possibly be faster? 
+ uint64_t lastword = 0; + + memcpy(&lastword, lastw, significantbytes); // could possibly be faster? return lastword; } // The seeding here is homegrown for SMHasher3 -template < bool bitmix, bool bswap > -static uint64_t clhash(const void * random, const uint8_t * stringbyte, const size_t lengthbyte, const uint64_t seed) { - assert(((uintptr_t) random & 15) == 0);// we expect cache line alignment for the keys +template +static uint64_t clhash( const void * random, const uint8_t * stringbyte, + const size_t lengthbyte, const uint64_t seed ) { + assert(((uintptr_t)random & 15) == 0); // we expect cache line alignment for the keys // We process the data in chunks of 16 cache lines (m should be divisible by 4). const uint32_t m = CLHASH_64BITWORDS_CHUNK_SIZE; const uint32_t m128neededperblock = m / 2; // How many 128-bit words of random bits we use per block. + const uint64_t * string = (const uint64_t *)stringbyte; - const size_t length = lengthbyte / sizeof(uint64_t); // # of complete words - const size_t lengthinc = (lengthbyte + sizeof(uint64_t) - 1) / sizeof(uint64_t); // # of words, including partial ones - const __m128i * rs64 = (__m128i *)random; - const __m128i seed128 = lazyLengthHash(((const uint64_t *)(rs64 + m128neededperblock + 2))[1], seed); + + const size_t length = lengthbyte / sizeof(uint64_t); // # of complete words + const size_t lengthinc = (lengthbyte + sizeof(uint64_t) - 1) / sizeof(uint64_t); // # of words, including partial + // ones + + const __m128i * rs64 = (__m128i * )random; + const __m128i seed128 = lazyLengthHash(((const uint64_t *)(rs64 + m128neededperblock + 2))[1], seed); // to preserve alignment on cache lines for main loop, we pick random bits at the end - __m128i polyvalue = _mm_load_si128(rs64 + m128neededperblock); + __m128i polyvalue = _mm_load_si128(rs64 + m128neededperblock); // setting two highest bits to zero - polyvalue = _mm_and_si128(polyvalue,_mm_setr_epi32(0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0x3fffffff)); + polyvalue = 
_mm_and_si128(polyvalue, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x3fffffff)); // we should check that polyvalue is non-zero, though this is best done outside the function and highly unlikely // long strings // modified from length to lengthinc to address issue #3 raised by Eik List @@ -370,11 +385,11 @@ static uint64_t clhash(const void * random, const uint8_t * stringbyte, const si for (; t + m <= length; t += m) { // we compute something like // acc+= polyvalue * acc + h1 - acc = mul128by128to128_lazymod127(polyvalue,acc); - const __m128i h1 = clmulhalfscalarproductwithoutreduction(rs64, string + t, m); + acc = mul128by128to128_lazymod127(polyvalue, acc); + const __m128i h1 = clmulhalfscalarproductwithoutreduction(rs64, string + t, m); acc = _mm_xor_si128(acc, h1); } - const uint32_t remain = length - t; // number of completely filled words + const uint32_t remain = length - t; // number of completely filled words if (remain != 0) { // we compute something like @@ -382,12 +397,12 @@ static uint64_t clhash(const void * random, const uint8_t * stringbyte, const si acc = mul128by128to128_lazymod127(polyvalue, acc); if (lengthbyte % sizeof(uint64_t) == 0) { const __m128i h1 = - clmulhalfscalarproductwithtailwithoutreduction(rs64, string + t, remain); + clmulhalfscalarproductwithtailwithoutreduction(rs64, string + t, remain); acc = _mm_xor_si128(acc, h1); } else { const uint64_t lastword = createLastWord(lengthbyte, (string + length)); - const __m128i h1 = - clmulhalfscalarproductwithtailwithoutreductionWithExtraWord( + const __m128i h1 = + clmulhalfscalarproductwithtailwithoutreductionWithExtraWord( rs64, string + t, remain, lastword); acc = _mm_xor_si128(acc, h1); } @@ -395,24 +410,22 @@ static uint64_t clhash(const void * random, const uint8_t * stringbyte, const si // there are no completely filled words left, but there is one partial word. 
acc = mul128by128to128_lazymod127(polyvalue, acc); const uint64_t lastword = createLastWord(lengthbyte, (string + length)); - const __m128i h1 = clmulhalfscalarproductOnlyExtraWord(rs64, lastword); + const __m128i h1 = clmulhalfscalarproductOnlyExtraWord(rs64, lastword); acc = _mm_xor_si128(acc, h1); } - const __m128i finalkey = _mm_load_si128(rs64 + m128neededperblock + 1); + const __m128i finalkey = _mm_load_si128(rs64 + m128neededperblock + 1); const uint64_t keylength = ((const uint64_t *)(rs64 + m128neededperblock + 2))[0]; return simple128to64hashwithlength(acc, finalkey, keylength, (uint64_t)lengthbyte); - } else { // short strings __m128i acc; - if(lengthbyte % sizeof(uint64_t) == 0) { - acc = clmulhalfscalarproductwithtailwithoutreduction(rs64, string, length); + if (lengthbyte % sizeof(uint64_t) == 0) { + acc = clmulhalfscalarproductwithtailwithoutreduction (rs64, string, length); } else { const uint64_t lastword = createLastWord(lengthbyte, (string + length)); - acc = clmulhalfscalarproductwithtailwithoutreductionWithExtraWord( - rs64, string, length, lastword); + acc = clmulhalfscalarproductwithtailwithoutreductionWithExtraWord(rs64, string, length, lastword); } // Mix the seed in using a non-commuting operation with all the xors and clmuls. 
acc = _mm_sub_epi8(acc, seed128); @@ -424,15 +437,17 @@ static uint64_t clhash(const void * random, const uint8_t * stringbyte, const si } //------------------------------------------------------------ -template < bool bswap > -static void CLHash(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void CLHash( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = clhash(clhash_random, (const uint8_t *)in, len, (uint64_t)seed); + PUT_U64(h, (uint8_t *)out, 0); } -template < bool bswap > -static void CLHashNomix(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void CLHashNomix( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = clhash(clhash_random, (const uint8_t *)in, len, (uint64_t)seed); + PUT_U64(h, (uint8_t *)out, 0); } @@ -440,46 +455,46 @@ static void CLHashNomix(const void * in, const size_t len, const seed_t seed, vo //------------------------------------------------------------ REGISTER_FAMILY(clhash, - $.src_url = "https://github.com/lemire/clhash", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/lemire/clhash", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); #if defined(HAVE_X86_64_CLMUL) REGISTER_HASH(CLhash__bitmix, - $.desc = "Carryless multiplication hash, with -DBITMIX", - $.hash_flags = - FLAG_HASH_CLMUL_BASED | - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_NO_SEED | - FLAG_HASH_SYSTEM_SPECIFIC , - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_GPL3, - $.bits = 64, - $.verification_LE = 0x578865A5, - $.verification_BE = 0x0D2B93FA, - $.hashfn_native = CLHash, - $.hashfn_bswap = CLHash, - $.initfn = clhash_init -); + $.desc = "Carryless multiplication hash, with -DBITMIX", + $.hash_flags = + FLAG_HASH_CLMUL_BASED | + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_NO_SEED | + FLAG_HASH_SYSTEM_SPECIFIC, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_GPL3, + $.bits 
= 64, + $.verification_LE = 0x578865A5, + $.verification_BE = 0x0D2B93FA, + $.hashfn_native = CLHash, + $.hashfn_bswap = CLHash, + $.initfn = clhash_init + ); REGISTER_HASH(CLhash, - $.desc = "Carryless multiplication hash, without -DBITMIX", - $.hash_flags = - FLAG_HASH_CLMUL_BASED | - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_NO_SEED | - FLAG_HASH_SYSTEM_SPECIFIC , - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_GPL3, - $.bits = 64, - $.verification_LE = 0xDD8248E4, - $.verification_BE = 0x25DDBEC2, - $.hashfn_native = CLHashNomix, - $.hashfn_bswap = CLHashNomix, - $.initfn = clhash_init -); + $.desc = "Carryless multiplication hash, without -DBITMIX", + $.hash_flags = + FLAG_HASH_CLMUL_BASED | + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_NO_SEED | + FLAG_HASH_SYSTEM_SPECIFIC, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_GPL3, + $.bits = 64, + $.verification_LE = 0xDD8248E4, + $.verification_BE = 0x25DDBEC2, + $.hashfn_native = CLHashNomix, + $.hashfn_bswap = CLHashNomix, + $.initfn = clhash_init + ); #endif diff --git a/hashes/crap.cpp b/hashes/crap.cpp index 7ac675d4..fa198376 100644 --- a/hashes/crap.cpp +++ b/hashes/crap.cpp @@ -36,14 +36,14 @@ // https://web.archive.org/web/20150218011152/http://floodyberry.com/noncryptohashzoo/CrapWow.html // https://web.archive.org/web/20150218011033/http://floodyberry.com/noncryptohashzoo/CrapWow64.html -template < bool bswap > -static uint32_t Crap8_impl(const uint8_t * key, size_t len, uint32_t seed) { -#define c8fold( a, b, y, z ) { \ - p = (uint32_t)(a) * (uint64_t)(b); \ - y ^= (uint32_t)p; \ - z ^= (uint32_t)(p >> 32); \ +template +static uint32_t Crap8_impl( const uint8_t * key, size_t len, uint32_t seed ) { +#define c8fold( a, b, y, z ) { \ + p = (uint32_t)(a) * (uint64_t)(b); \ + y ^= (uint32_t)p; \ + z ^= (uint32_t)(p >> 32); \ } -#define c8mix( in ) { h *= m; c8fold( in, m, k, h ); } +#define c8mix(in) { h *= m; c8fold(in, m, k, h); } const uint32_t m = 0x83d2e73b, n = 0x97e1cc59; 
uint32_t h = (uint32_t)len + seed, k = n + (uint32_t)len; @@ -62,28 +62,29 @@ static uint32_t Crap8_impl(const uint8_t * key, size_t len, uint32_t seed) { if (isLE() ^ bswap) { c8mix(GET_U32(key, 0) & ((1 << (len * 8)) - 1)); } else { - c8mix(GET_U32(key, 0) >> (32 -(len * 8))); + c8mix(GET_U32(key, 0) >> (32 - (len * 8))); } } c8fold(h ^ k, n, k, k); return k; } + #undef c8mix #undef c8fold -template < bool bswap > -static uint32_t CrapWow_impl(const uint8_t * key, size_t len, uint32_t seed) { -#define cwfold( a, b, lo, hi) { \ - p = (uint32_t)(a) * (uint64_t)(b); \ - lo ^= (uint32_t)p; \ - hi ^= (uint32_t)(p >> 32); \ +template +static uint32_t CrapWow_impl( const uint8_t * key, size_t len, uint32_t seed ) { +#define cwfold( a, b, lo, hi) { \ + p = (uint32_t)(a) * (uint64_t)(b); \ + lo ^= (uint32_t)p; \ + hi ^= (uint32_t)(p >> 32); \ } -#define cwmixa( in ) { cwfold( in, m, k, h ); } -#define cwmixb( in ) { cwfold( in, n, h, k ); } +#define cwmixa(in) { cwfold(in, m, k, h); } +#define cwmixb(in) { cwfold(in, n, h, k); } - const uint32_t m = 0x57559429, n = 0x5052acdb; - uint32_t h = (uint32_t)len, k = (uint32_t)len + seed + n; - uint64_t p; + const uint32_t m = 0x57559429, n = 0x5052acdb; + uint32_t h = (uint32_t)len, k = (uint32_t)len + seed + n; + uint64_t p; while (len >= 8) { cwmixb(GET_U32(key, 0)); @@ -102,26 +103,27 @@ static uint32_t CrapWow_impl(const uint8_t * key, size_t len, uint32_t seed) { } } - cwmixb(h ^ (k + n)); - return k ^ h; + cwmixb(h ^ (k + n)); + return k ^ h; } + #undef cwmixb #undef cwmixa #undef cwfold -template < bool bswap > -static uint64_t CrapWow64_impl(const uint8_t * key, size_t len, uint64_t seed) { -#define cwfold(a, b, lo, hi) { \ - mult64_128(pl, ph, a, b); \ - lo ^= pl; \ - hi ^= ph; \ +template +static uint64_t CrapWow64_impl( const uint8_t * key, size_t len, uint64_t seed ) { +#define cwfold(a, b, lo, hi) { \ + mult64_128(pl, ph, a, b); \ + lo ^= pl; \ + hi ^= ph; \ } -#define cwmixa( in ) { cwfold( in, m, k, h ); } 
-#define cwmixb( in ) { cwfold( in, n, h, k ); } +#define cwmixa(in) { cwfold(in, m, k, h); } +#define cwmixb(in) { cwfold(in, n, h, k); } const uint64_t m = UINT64_C(0x95b47aa3355ba1a1), n = UINT64_C(0x8a970be7488fda55); - uint64_t h = (uint64_t)len, k = (uint64_t)len + seed + n; - uint64_t pl, ph; + uint64_t h = (uint64_t)len, k = (uint64_t)len + seed + n; + uint64_t pl, ph; while (len >= 16) { cwmixb(GET_U64(key, 0)); @@ -140,81 +142,85 @@ static uint64_t CrapWow64_impl(const uint8_t * key, size_t len, uint64_t seed) { } } - cwmixb(h ^ (k + n)); - return k ^ h; + cwmixb(h ^ (k + n)); + return k ^ h; } + #undef cwmixb #undef cwmixa #undef cwfold //------------------------------------------------------------ -template < bool bswap > -static void Crap8(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void Crap8( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = Crap8_impl((const uint8_t *)in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void CrapWow(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void CrapWow( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = CrapWow_impl((const uint8_t *)in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void CrapWow64(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void CrapWow64( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = CrapWow64_impl((const uint8_t *)in, len, (uint64_t)seed); + PUT_U64(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(crap, - $.src_url = "https://web.archive.org/web/20150218011033/http://floodyberry.com/noncryptohashzoo/", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = 
"https://web.archive.org/web/20150218011033/http://floodyberry.com/noncryptohashzoo/", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(Crap8, - $.desc = "Noncryptohashzoo's Crap8 hash", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x743E97A1, - $.verification_BE = 0xDFE06AD9, - $.hashfn_native = Crap8, - $.hashfn_bswap = Crap8 -); + $.desc = "Noncryptohashzoo's Crap8 hash", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x743E97A1, + $.verification_BE = 0xDFE06AD9, + $.hashfn_native = Crap8, + $.hashfn_bswap = Crap8 + ); REGISTER_HASH(CrapWow, - $.desc = "Noncryptohashzoo's CrapWow hash", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x49ECB015, - $.verification_BE = 0x4EF994DF, - $.hashfn_native = CrapWow, - $.hashfn_bswap = CrapWow -); + $.desc = "Noncryptohashzoo's CrapWow hash", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x49ECB015, + $.verification_BE = 0x4EF994DF, + $.hashfn_native = CrapWow, + $.hashfn_bswap = CrapWow + ); REGISTER_HASH(CrapWow_64, - $.desc = "Noncryptohashzoo's CrapWow64 hash", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x669D3A9B, - $.verification_BE = 0xCBB7690C, - $.hashfn_native = CrapWow64, - $.hashfn_bswap = CrapWow64 -); + $.desc = "Noncryptohashzoo's CrapWow64 hash", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | 
+ FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x669D3A9B, + $.verification_BE = 0xCBB7690C, + $.hashfn_native = CrapWow64, + $.hashfn_bswap = CrapWow64 + ); diff --git a/hashes/crc.cpp b/hashes/crc.cpp index 9fc8bb25..8d29688f 100644 --- a/hashes/crc.cpp +++ b/hashes/crc.cpp @@ -29,12 +29,12 @@ #include "Hashlib.h" typedef struct { - uint32_t crc32_long[4][256]; - uint32_t crc32_short[4][256]; + uint32_t crc32_long[4][256]; + uint32_t crc32_short[4][256]; } crc_hw_table; #if defined(HAVE_X86_64_CRC32C) -#include "Intrinsics.h" + #include "Intrinsics.h" // Fancy hardware version @@ -44,7 +44,7 @@ typedef struct { * mat must have at least as many entries as the power of two for most * significant one bit in vec. */ -static inline uint32_t gf2_matrix_times(uint32_t * mat, uint32_t vec) { +static inline uint32_t gf2_matrix_times( uint32_t * mat, uint32_t vec ) { uint32_t sum; sum = 0; @@ -60,7 +60,7 @@ static inline uint32_t gf2_matrix_times(uint32_t * mat, uint32_t vec) { * Multiply a matrix by itself over GF(2). Both mat and square must * have 32 rows. */ -static inline void gf2_matrix_square(uint32_t * square, uint32_t * mat) { +static inline void gf2_matrix_square( uint32_t * square, uint32_t * mat ) { for (int n = 0; n < 32; n++) { square[n] = gf2_matrix_times(mat, mat[n]); } @@ -74,33 +74,35 @@ static inline void gf2_matrix_square(uint32_t * square, uint32_t * mat) { * could be easily written for any len, but that is not needed for * this application. 
*/ -template < uint32_t polynomial > -static void crc32_zeros_op(uint32_t * even, size_t len) { +template +static void crc32_zeros_op( uint32_t * even, size_t len ) { uint32_t row; - uint32_t odd[32]; /* odd-power-of-two zeros operator */ + uint32_t odd[32]; /* odd-power-of-two zeros operator */ /* put operator for one zero bit in odd */ - odd[0] = polynomial; /* CRC-32 polynomial */ - row = 1; + odd[0] = polynomial; /* CRC-32 polynomial */ + row = 1; for (int n = 1; n < 32; n++) { odd[n] = row; - row <<= 1; + row <<= 1; } /* put operator for two zero bits in even */ - gf2_matrix_square(even, odd); + gf2_matrix_square(even, odd ); /* put operator for four zero bits in odd */ - gf2_matrix_square(odd, even); + gf2_matrix_square(odd , even); - /* first square will put the operator for one zero byte (eight zero bits), - in even -- next square puts operator for two zero bytes in odd, and so - on, until len has been rotated down to zero */ + /* + * first square will put the operator for one zero byte (eight zero bits), + * in even -- next square puts operator for two zero bytes in odd, and so + * on, until len has been rotated down to zero + */ do { - gf2_matrix_square(even, odd); + gf2_matrix_square(even, odd ); len >>= 1; if (len == 0) { return; } - gf2_matrix_square(odd, even); + gf2_matrix_square(odd , even); len >>= 1; } while (len); @@ -114,12 +116,12 @@ static void crc32_zeros_op(uint32_t * even, size_t len) { * Take a length and build four lookup tables for applying the zeros * operator for that length, byte-by-byte on the operand. 
*/ -static void crc32_zeros(uint32_t op[32], uint32_t zeros[][256]) { +static void crc32_zeros( uint32_t op[32], uint32_t zeros[][256] ) { uint32_t n; for (n = 0; n < 256; n++) { - zeros[0][n] = gf2_matrix_times(op, n); - zeros[1][n] = gf2_matrix_times(op, n << 8); + zeros[0][n] = gf2_matrix_times(op, n ); + zeros[1][n] = gf2_matrix_times(op, n << 8); zeros[2][n] = gf2_matrix_times(op, n << 16); zeros[3][n] = gf2_matrix_times(op, n << 24); } @@ -128,13 +130,14 @@ static void crc32_zeros(uint32_t op[32], uint32_t zeros[][256]) { // Block sizes for three-way parallel crc computation. // HW_LONGBLOCK_LEN and HW_SHORTBLOCK_LEN must both be // powers of two. -static const uint32_t HW_LONGBLOCK_LEN = 8192; +static const uint32_t HW_LONGBLOCK_LEN = 8192; static const uint32_t HW_SHORTBLOCK_LEN = 256; /* Initialize tables for shifting crcs. */ -template < uint32_t polynomial > -static void crc32_init_hw(crc_hw_table * tblp) { +template +static void crc32_init_hw( crc_hw_table * tblp ) { uint32_t op[32]; + crc32_zeros_op(op, HW_LONGBLOCK_LEN); crc32_zeros(op, tblp->crc32_long); @@ -143,16 +146,16 @@ static void crc32_init_hw(crc_hw_table * tblp) { } /* Apply the zeros operator table to crc. */ -static inline uint32_t crc32_shift(const uint32_t zeros[][256], uint32_t crc) { +static inline uint32_t crc32_shift( const uint32_t zeros[][256], uint32_t crc ) { return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^ zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24]; } /* Compute CRC-32C using the Intel hardware instruction. 
*/ -static uint32_t crc32c_hw(uint32_t crc, const crc_hw_table * tbl, const void * buf, size_t len) { +static uint32_t crc32c_hw( uint32_t crc, const crc_hw_table * tbl, const void * buf, size_t len ) { const uint8_t * next = (const uint8_t *)buf; const uint8_t * end; - uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */ + uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */ /* Pre-process the crc */ crc0 = crc ^ 0xffffffff; @@ -173,40 +176,40 @@ static uint32_t crc32c_hw(uint32_t crc, const crc_hw_table * tbl, const void * b * Bridge, and Ivy Bridge architectures, which have a throughput * of one crc per cycle, but a latency of three cycles. */ - while (len >= HW_LONGBLOCK_LEN*3) { + while (len >= HW_LONGBLOCK_LEN * 3) { crc1 = 0; crc2 = 0; - end = next + HW_LONGBLOCK_LEN; + end = next + HW_LONGBLOCK_LEN; do { - crc0 = _mm_crc32_u64(crc0, GET_U64(next, 0)); - crc1 = _mm_crc32_u64(crc1, GET_U64(next, HW_LONGBLOCK_LEN)); - crc2 = _mm_crc32_u64(crc2, GET_U64(next, HW_LONGBLOCK_LEN+HW_LONGBLOCK_LEN)); + crc0 = _mm_crc32_u64(crc0, GET_U64(next, 0)); + crc1 = _mm_crc32_u64(crc1, GET_U64(next, HW_LONGBLOCK_LEN)); + crc2 = _mm_crc32_u64(crc2, GET_U64(next, HW_LONGBLOCK_LEN + HW_LONGBLOCK_LEN)); next += 8; } while (next < end); - crc0 = crc32_shift(tbl->crc32_long, crc0) ^ crc1; - crc0 = crc32_shift(tbl->crc32_long, crc0) ^ crc2; - next += HW_LONGBLOCK_LEN*2; - len -= HW_LONGBLOCK_LEN*3; + crc0 = crc32_shift(tbl->crc32_long, crc0) ^ crc1; + crc0 = crc32_shift(tbl->crc32_long, crc0) ^ crc2; + next += HW_LONGBLOCK_LEN * 2; + len -= HW_LONGBLOCK_LEN * 3; } /* * Do the same thing, but now on HW_SHORTBLOCK_LEN*3 blocks for * the remaining data less than a HW_LONGBLOCK_LEN*3 block. 
*/ - while (len >= HW_SHORTBLOCK_LEN*3) { + while (len >= HW_SHORTBLOCK_LEN * 3) { crc1 = 0; crc2 = 0; - end = next + HW_SHORTBLOCK_LEN; + end = next + HW_SHORTBLOCK_LEN; do { - crc0 = _mm_crc32_u64(crc0, GET_U64(next, 0)); - crc1 = _mm_crc32_u64(crc1, GET_U64(next, HW_SHORTBLOCK_LEN)); - crc2 = _mm_crc32_u64(crc2, GET_U64(next, HW_SHORTBLOCK_LEN+HW_SHORTBLOCK_LEN)); + crc0 = _mm_crc32_u64(crc0, GET_U64(next, 0)); + crc1 = _mm_crc32_u64(crc1, GET_U64(next, HW_SHORTBLOCK_LEN)); + crc2 = _mm_crc32_u64(crc2, GET_U64(next, HW_SHORTBLOCK_LEN + HW_SHORTBLOCK_LEN)); next += 8; } while (next < end); - crc0 = crc32_shift(tbl->crc32_short, crc0) ^ crc1; - crc0 = crc32_shift(tbl->crc32_short, crc0) ^ crc2; - next += HW_SHORTBLOCK_LEN*2; - len -= HW_SHORTBLOCK_LEN*3; + crc0 = crc32_shift(tbl->crc32_short, crc0) ^ crc1; + crc0 = crc32_shift(tbl->crc32_short, crc0) ^ crc2; + next += HW_SHORTBLOCK_LEN * 2; + len -= HW_SHORTBLOCK_LEN * 3; } /* @@ -215,7 +218,7 @@ static uint32_t crc32c_hw(uint32_t crc, const crc_hw_table * tbl, const void * b */ end = next + (len - (len & 7)); while (next < end) { - crc0 = _mm_crc32_u64(crc0, GET_U64(next, 0)); + crc0 = _mm_crc32_u64(crc0, GET_U64(next, 0)); next += 8; } len &= 7; @@ -236,7 +239,7 @@ static uint32_t crc32c_hw(uint32_t crc, const crc_hw_table * tbl, const void * b typedef uint32_t crc_sw_table[16][256]; /* Construct table for software CRC-32 calculation. 
*/ -static void crc32_init_sw(const uint32_t POLY, crc_sw_table crc32_table) { +static void crc32_init_sw( const uint32_t POLY, crc_sw_table crc32_table ) { uint32_t n, crc, k; for (n = 0; n < 256; n++) { @@ -261,10 +264,10 @@ static void crc32_init_sw(const uint32_t POLY, crc_sw_table crc32_table) { } // Table-driven software version -template < bool bswap > -static uint32_t crc32_sw(uint32_t crci, const crc_sw_table crc32_table, const void * buf, size_t len) { +template +static uint32_t crc32_sw( uint32_t crci, const crc_sw_table crc32_table, const void * buf, size_t len ) { const uint8_t * next = (const uint8_t *)buf; - uint64_t crc; + uint64_t crc; crc = crci ^ 0xffffffff; @@ -274,46 +277,46 @@ static uint32_t crc32_sw(uint32_t crci, const crc_sw_table crc32_table, const vo } while (len >= 16) { uint64_t wd1, wd2; - wd1 = GET_U64(next, 0); - wd2 = GET_U64(next, 8); // byteswapping taken care of via table indexing! + wd1 = GET_U64(next, 0); + wd2 = GET_U64(next, 8); // byteswapping taken care of via table indexing! 
crc ^= wd1; if (bswap) { crc = - crc32_table[15][ crc & 0xff] ^ - crc32_table[14][(crc >> 8) & 0xff] ^ - crc32_table[13][(crc >> 16) & 0xff] ^ - crc32_table[12][(crc >> 24) & 0xff] ^ - crc32_table[11][(crc >> 32) & 0xff] ^ - crc32_table[10][(crc >> 40) & 0xff] ^ - crc32_table[ 9][(crc >> 48) & 0xff] ^ - crc32_table[ 8][ crc >> 56] ^ - crc32_table[ 0][ wd2 & 0xff] ^ - crc32_table[ 1][(wd2 >> 8) & 0xff] ^ - crc32_table[ 2][(wd2 >> 16) & 0xff] ^ - crc32_table[ 3][(wd2 >> 24) & 0xff] ^ - crc32_table[ 4][(wd2 >> 32) & 0xff] ^ - crc32_table[ 5][(wd2 >> 40) & 0xff] ^ - crc32_table[ 6][(wd2 >> 48) & 0xff] ^ - crc32_table[ 7][ wd2 >> 56] ; + crc32_table[15][crc & 0xff] ^ + crc32_table[14][(crc >> 8) & 0xff] ^ + crc32_table[13][(crc >> 16) & 0xff] ^ + crc32_table[12][(crc >> 24) & 0xff] ^ + crc32_table[11][(crc >> 32) & 0xff] ^ + crc32_table[10][(crc >> 40) & 0xff] ^ + crc32_table[ 9][(crc >> 48) & 0xff] ^ + crc32_table[ 8][crc >> 56] ^ + crc32_table[ 0][wd2 & 0xff] ^ + crc32_table[ 1][(wd2 >> 8) & 0xff] ^ + crc32_table[ 2][(wd2 >> 16) & 0xff] ^ + crc32_table[ 3][(wd2 >> 24) & 0xff] ^ + crc32_table[ 4][(wd2 >> 32) & 0xff] ^ + crc32_table[ 5][(wd2 >> 40) & 0xff] ^ + crc32_table[ 6][(wd2 >> 48) & 0xff] ^ + crc32_table[ 7][wd2 >> 56]; } else { crc = - crc32_table[15][ crc & 0xff] ^ - crc32_table[14][(crc >> 8) & 0xff] ^ - crc32_table[13][(crc >> 16) & 0xff] ^ - crc32_table[12][(crc >> 24) & 0xff] ^ - crc32_table[11][(crc >> 32) & 0xff] ^ - crc32_table[10][(crc >> 40) & 0xff] ^ - crc32_table[ 9][(crc >> 48) & 0xff] ^ - crc32_table[ 8][ crc >> 56] ^ - crc32_table[ 7][ wd2 & 0xff] ^ - crc32_table[ 6][(wd2 >> 8) & 0xff] ^ - crc32_table[ 5][(wd2 >> 16) & 0xff] ^ - crc32_table[ 4][(wd2 >> 24) & 0xff] ^ - crc32_table[ 3][(wd2 >> 32) & 0xff] ^ - crc32_table[ 2][(wd2 >> 40) & 0xff] ^ - crc32_table[ 1][(wd2 >> 48) & 0xff] ^ - crc32_table[ 0][ wd2 >> 56] ; + crc32_table[15][crc & 0xff] ^ + crc32_table[14][(crc >> 8) & 0xff] ^ + crc32_table[13][(crc >> 16) & 0xff] ^ + crc32_table[12][(crc 
>> 24) & 0xff] ^ + crc32_table[11][(crc >> 32) & 0xff] ^ + crc32_table[10][(crc >> 40) & 0xff] ^ + crc32_table[ 9][(crc >> 48) & 0xff] ^ + crc32_table[ 8][crc >> 56] ^ + crc32_table[ 7][wd2 & 0xff] ^ + crc32_table[ 6][(wd2 >> 8) & 0xff] ^ + crc32_table[ 5][(wd2 >> 16) & 0xff] ^ + crc32_table[ 4][(wd2 >> 24) & 0xff] ^ + crc32_table[ 3][(wd2 >> 32) & 0xff] ^ + crc32_table[ 2][(wd2 >> 40) & 0xff] ^ + crc32_table[ 1][(wd2 >> 48) & 0xff] ^ + crc32_table[ 0][wd2 >> 56]; } next += 16; len -= 16; @@ -335,17 +338,17 @@ static uint32_t crc32_sw(uint32_t crci, const crc_sw_table crc32_table, const vo /* * For now, only store 1 set of tables at a time. -*/ -static uint32_t table_poly; + */ +static uint32_t table_poly; static crc_hw_table hw_tables; static crc_sw_table sw_tables; -template < uint32_t polynomial > -static void CRC32(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void CRC32( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t crc = seed; + if (polynomial != table_poly) { - printf("CRC32 of poly %08x requested, but Init() was given %08x\n", - polynomial, table_poly); + printf("CRC32 of poly %08x requested, but Init() was given %08x\n", polynomial, table_poly); exit(1); } #if defined(HAVE_X86_64_CRC32C) @@ -363,8 +366,8 @@ static void CRC32(const void * in, const size_t len, const seed_t seed, void * o memcpy(out, &crc, 4); } -template < uint32_t polynomial > -static bool CRC32_init(void) { +template +static bool CRC32_init( void ) { table_poly = polynomial; #if defined(HAVE_X86_64_CRC32C) if (polynomial == POLY_CRC32C) { @@ -377,24 +380,24 @@ static bool CRC32_init(void) { } REGISTER_FAMILY(crc, - $.src_url = "https://github.com/baruch/crcbench/blob/master/crc-mark-adler.c", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/baruch/crcbench/blob/master/crc-mark-adler.c", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(CRC_32C, - $.desc = "CRC32-C 
(Castagnoli, 0x1EDC6F41 / 0x82F63B78)", - $.hash_flags = - FLAG_HASH_CRC_BASED | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_BSD, - $.bits = 32, - $.verification_LE = 0x6E6071BD, - $.verification_BE = 0x6E6071BD, - $.initfn = CRC32_init, - $.hashfn_native = CRC32, - $.hashfn_bswap = CRC32 -); + $.desc = "CRC32-C (Castagnoli, 0x1EDC6F41 / 0x82F63B78)", + $.hash_flags = + FLAG_HASH_CRC_BASED | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_BSD, + $.bits = 32, + $.verification_LE = 0x6E6071BD, + $.verification_BE = 0x6E6071BD, + $.initfn = CRC32_init, + $.hashfn_native = CRC32, + $.hashfn_bswap = CRC32 + ); diff --git a/hashes/discohash.cpp b/hashes/discohash.cpp index 5a77fad8..5bbae08e 100644 --- a/hashes/discohash.cpp +++ b/hashes/discohash.cpp @@ -27,36 +27,38 @@ #include "Platform.h" #include "Hashlib.h" -static const uint32_t STATE = 32; // Must be divisible by 8 -static const uint32_t STATE64 = STATE >> 3; -static const uint32_t STATEM = STATE-1; -static const uint32_t HSTATE64M = (STATE64 >> 1)-1; -static const uint32_t STATE64M = STATE64-1; -static const uint64_t P = UINT64_C(0xFFFFFFFFFFFFFFFF) - 58; -static const uint64_t Q = UINT64_C(13166748625691186689); +static const uint32_t STATE = 32; // Must be divisible by 8 +static const uint32_t STATE64 = STATE >> 3; +static const uint32_t STATEM = STATE - 1; +static const uint32_t HSTATE64M = (STATE64 >> 1) - 1; +static const uint32_t STATE64M = STATE64 - 1; +static const uint64_t P = UINT64_C( 0xFFFFFFFFFFFFFFFF) - 58; +static const uint64_t Q = UINT64_C(13166748625691186689); //-------- // State mix function -static FORCE_INLINE uint8_t ROTR8(uint8_t v, int n) { +static FORCE_INLINE uint8_t ROTR8( uint8_t v, int n ) { n = n & 7U; - if (n) - v = (v >> n) | (v << (8-n)); + if (n) { + v = (v >> n) | (v << (8 - 
n)); + } return v; } -static FORCE_INLINE void mix(uint64_t * ds, const uint32_t A) { - const uint32_t B = A+1; - ds[A] *= P; - ds[A] = ROTR64(ds[A], 23); - ds[A] *= Q; - //ds[A] = ROTR64(ds[A], 23); +static FORCE_INLINE void mix( uint64_t * ds, const uint32_t A ) { + const uint32_t B = A + 1; + + ds[A] *= P; + ds[A] = ROTR64(ds[A], 23); + ds[A] *= Q; + // ds[A] = ROTR64(ds[A], 23); - ds[B] ^= ds[A]; + ds[B] ^= ds[A]; - ds[B] *= P; - ds[B] = ROTR64(ds[B], 23); - ds[B] *= Q; - //ds[B] = ROTR64(ds[B], 23); + ds[B] *= P; + ds[B] = ROTR64(ds[B], 23); + ds[B] *= Q; + // ds[B] = ROTR64(ds[B], 23); } //--------- @@ -68,23 +70,23 @@ static FORCE_INLINE void mix(uint64_t * ds, const uint32_t A) { // // The oldver parameter "fixes" a possibly-unintentional behavior // change, details of which are below. -template < bool bswap, bool reread, bool oldver > -static FORCE_INLINE void round(uint64_t * ds, const uint8_t * m8, uint32_t len) { +template +static FORCE_INLINE void round( uint64_t * ds, const uint8_t * m8, uint32_t len ) { uint32_t index; - uint32_t sindex = 0; - uint32_t Len = len >> 3; - uint64_t counter = UINT64_C(0xfaccadaccad09997); - uint8_t counter8 = 137; + uint32_t sindex = 0; + uint32_t Len = len >> 3; + uint64_t counter = UINT64_C(0xfaccadaccad09997); + uint8_t counter8 = 137; - //#pragma omp parallel for - for(index = 0; index < Len; index++) { - uint64_t blk = GET_U64(m8, index*8); + // #pragma omp parallel for + for (index = 0; index < Len; index++) { + uint64_t blk = GET_U64(m8, index * 8); ds[sindex] += ROTR64(blk + index + counter + 1, 23); - if (reread) { blk = GET_U64(m8, index*8); } - counter += ~blk + 1; - if ( sindex == HSTATE64M ) { + if (reread) { blk = GET_U64(m8, index * 8); } + counter += ~blk + 1; + if (sindex == HSTATE64M) { mix(ds, 0); - } else if ( sindex == STATE64M ) { + } else if (sindex == STATE64M) { mix(ds, 2); sindex = -1; } @@ -103,14 +105,14 @@ static FORCE_INLINE void round(uint64_t * ds, const uint8_t * m8, uint32_t len) // are 
implemented here. Len = index << 3; if (oldver) { - sindex = Len&(STATEM); + sindex = Len & (STATEM); } else { - sindex = index&(STATEM); + sindex = index & (STATEM); } - //#pragma omp parallel for - for(index = Len; index < len; index++) { - ((uint8_t *)ds)[bswap ? (sindex^7) : sindex] += ROTR8(m8[index] + index + counter8 + 1, 23); + // #pragma omp parallel for + for (index = Len; index < len; index++) { + ((uint8_t *)ds)[bswap ? (sindex ^ 7) : sindex] += ROTR8(m8[index] + index + counter8 + 1, 23); // I also wonder if this was intended to be m8[index], to // mirror the primary 8-byte loop above... // @@ -119,8 +121,8 @@ static FORCE_INLINE void round(uint64_t * ds, const uint8_t * m8, uint32_t len) // of sindex is (len & ~7) if oldver == true, and (len >> 3) // if oldver == false. counter8 += ~m8[sindex] + 1; - mix(ds, index%STATE64M); - if ( sindex >= STATEM ) { + mix(ds, index % STATE64M); + if (sindex >= STATEM) { sindex = -1; } sindex++; @@ -134,31 +136,31 @@ static FORCE_INLINE void round(uint64_t * ds, const uint8_t * m8, uint32_t len) //--------- // main hash function -template < uint32_t hashsize, bool bswap, bool oldver > -static void BEBB4185(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void BEBB4185( const void * in, const size_t len, const seed_t seed, void * out ) { const uint8_t * key8Arr = (const uint8_t *)in; - uint8_t * out8 = (uint8_t *)out; - uint32_t seedbuf[4]; + uint8_t * out8 = (uint8_t * )out; + uint32_t seedbuf[4]; if (len >= UINT32_C(0xffffffff)) { return; } // the cali number from the Matrix (1999) uint32_t seed32 = seed; if (!bswap) { - seedbuf[0] = 0xc5550690; + seedbuf[0] = 0xc5550690; seedbuf[0] -= seed32; - seedbuf[1] = 1 + seed32; - seedbuf[2] = ~(1 - seed32); - seedbuf[3] = (1+seed32) * 0xf00dacca; + seedbuf[1] = 1 + seed32; + seedbuf[2] = ~ (1 - seed32); + seedbuf[3] = (1 + seed32) * 0xf00dacca; } else { - seedbuf[1] = 0xc5550690; + seedbuf[1] = 0xc5550690; seedbuf[1] -= seed32; - 
seedbuf[0] = 1 + seed32; - seedbuf[3] = ~(1 - seed32); - seedbuf[2] = (1+seed32) * 0xf00dacca; + seedbuf[0] = 1 + seed32; + seedbuf[3] = ~ (1 - seed32); + seedbuf[2] = (1 + seed32) * 0xf00dacca; } - uint64_t ds[STATE/8]; + uint64_t ds[STATE / 8]; // nothing up my sleeve ds[0] = UINT64_C(0x123456789abcdef0); ds[1] = UINT64_C(0x0fedcba987654321); @@ -170,27 +172,29 @@ static void BEBB4185(const void * in, const size_t len, const seed_t seed, void // variable. The mixing of the state with itself also doesn't need // bswap set, because the endianness of the data will naturally // always match the endianness of the ds[] values. - round(ds, key8Arr, (uint32_t)len); - round(ds, (uint8_t *)seedbuf, 16); - round(ds, (uint8_t *)ds, STATE); + round(ds, key8Arr , (uint32_t)len); + round(ds, (uint8_t *)seedbuf, 16 ); + round( ds, (uint8_t *)ds, STATE ); - /** - printf("ds = %#018" PRIx64 " %#018" PRIx64 " %#018" PRIx64 " %#018" PRIx64 "\n", - ds[0], ds[1], ds[2], ds[3] ); - **/ + /* + * + * printf("ds = %#018" PRIx64 " %#018" PRIx64 " %#018" PRIx64 " %#018" PRIx64 "\n", + * ds[0], ds[1], ds[2], ds[3] ); + * + */ - uint64_t h[STATE64] = {0}; + uint64_t h[STATE64] = { 0 }; - h[0] = ds[2]; - h[1] = ds[3]; + h[0] = ds[2]; + h[1] = ds[3]; - h[0] += h[1]; + h[0] += h [1]; if (hashsize == 128) { - h[2] = ds[0]; - h[3] = ds[1]; + h[2] = ds[0]; + h[3] = ds[1]; - h[2] += h[3]; + h[2] += h [3]; PUT_U64(h[2], out8, 8); } if (hashsize >= 64) { @@ -199,75 +203,75 @@ static void BEBB4185(const void * in, const size_t len, const seed_t seed, void } REGISTER_FAMILY(discohash, - $.src_url = "https://github.com/crisdosyago/discohash", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/crisdosyago/discohash", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); // Yes, none of these have any bad seeds! See note at the top near "thread_local". 
REGISTER_HASH(Discohash__old, - $.desc = "Discohash (aka BEBB4185) prior version", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_SLOW | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xBEBB4185, - $.verification_BE = 0x4B5579AD, - $.hashfn_native = BEBB4185<64, false, true>, - $.hashfn_bswap = BEBB4185<64, true, true>, - $.badseeds = {} -); + $.desc = "Discohash (aka BEBB4185) prior version", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_SLOW | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xBEBB4185, + $.verification_BE = 0x4B5579AD, + $.hashfn_native = BEBB4185<64, false, true>, + $.hashfn_bswap = BEBB4185<64, true, true>, + $.badseeds = {} + ); REGISTER_HASH(Discohash, - $.desc = "Discohash (aka BEBB4185)", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_SLOW | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xFBA72400, - $.verification_BE = 0x286DD52C, - $.hashfn_native = BEBB4185<64, false, false>, - $.hashfn_bswap = BEBB4185<64, true, false>, - $.badseeds = {} -); + $.desc = "Discohash (aka BEBB4185)", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_SLOW | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xFBA72400, + $.verification_BE = 0x286DD52C, + $.hashfn_native = BEBB4185<64, false, false>, + $.hashfn_bswap = BEBB4185<64, true, false>, + $.badseeds = {} + ); REGISTER_HASH(Discohash_128__old, - $.desc = "Discohash (aka BEBB4185) prior version", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_SLOW | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x000ED2A6, - $.verification_BE = 0x3110ECFA, - $.hashfn_native = BEBB4185<128, 
false, true>, - $.hashfn_bswap = BEBB4185<128, true, true>, - $.badseeds = {} -); + $.desc = "Discohash (aka BEBB4185) prior version", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_SLOW | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x000ED2A6, + $.verification_BE = 0x3110ECFA, + $.hashfn_native = BEBB4185<128, false, true>, + $.hashfn_bswap = BEBB4185<128, true, true>, + $.badseeds = {} + ); REGISTER_HASH(Discohash_128, - $.desc = "Discohash (aka BEBB4185)", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_SLOW | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x231868B1, - $.verification_BE = 0xEB4228F3, - $.hashfn_native = BEBB4185<128, false, false>, - $.hashfn_bswap = BEBB4185<128, true, false>, - $.badseeds = {} -); + $.desc = "Discohash (aka BEBB4185)", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_SLOW | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x231868B1, + $.verification_BE = 0xEB4228F3, + $.hashfn_native = BEBB4185<128, false, false>, + $.hashfn_bswap = BEBB4185<128, true, false>, + $.badseeds = {} + ); diff --git a/hashes/donothing.cpp b/hashes/donothing.cpp index e116c983..e02eddc7 100644 --- a/hashes/donothing.cpp +++ b/hashes/donothing.cpp @@ -28,14 +28,14 @@ #include "Platform.h" #include "Hashlib.h" -static void DoNothingHash(const void * in, const size_t len, const seed_t seed, void * out) { +static void DoNothingHash( const void * in, const size_t len, const seed_t seed, void * out ) { } -template < uint32_t hashlen, bool bswap > -static void DoNothingOAATHash(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void DoNothingOAATHash( const void * in, const size_t len, const seed_t seed, void * out ) { const uint8_t * data = (const uint8_t 
*)in; const uint8_t * const end = &data[len]; - uint32_t h = seed >> 32; + uint32_t h = seed >> 32; while (data < end) { h &= *data++; @@ -44,93 +44,93 @@ static void DoNothingOAATHash(const void * in, const size_t len, const seed_t se } REGISTER_FAMILY(donothing, - $.src_url = "https://github.com/rurban/smhasher/blob/master/Hashes.cpp", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/rurban/smhasher/blob/master/Hashes.cpp", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(donothing_32, - $.desc = "Do-Nothing function (measure call overhead)", - $.hash_flags = - FLAG_HASH_MOCK, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x0, - $.verification_BE = 0x0, - $.hashfn_native = DoNothingHash, - $.hashfn_bswap = DoNothingHash -); + $.desc = "Do-Nothing function (measure call overhead)", + $.hash_flags = + FLAG_HASH_MOCK, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x0, + $.verification_BE = 0x0, + $.hashfn_native = DoNothingHash, + $.hashfn_bswap = DoNothingHash + ); REGISTER_HASH(donothing_64, - $.desc = "Do-Nothing function (measure call overhead)", - $.hash_flags = - FLAG_HASH_MOCK, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x0, - $.verification_BE = 0x0, - $.hashfn_native = DoNothingHash, - $.hashfn_bswap = DoNothingHash -); + $.desc = "Do-Nothing function (measure call overhead)", + $.hash_flags = + FLAG_HASH_MOCK, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x0, + $.verification_BE = 0x0, + $.hashfn_native = DoNothingHash, + $.hashfn_bswap = DoNothingHash + ); REGISTER_HASH(donothing_128, - $.desc = "Do-Nothing function (measure call overhead)", - $.hash_flags = - FLAG_HASH_MOCK, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - 
$.verification_LE = 0x0, - $.verification_BE = 0x0, - $.hashfn_native = DoNothingHash, - $.hashfn_bswap = DoNothingHash -); + $.desc = "Do-Nothing function (measure call overhead)", + $.hash_flags = + FLAG_HASH_MOCK, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x0, + $.verification_BE = 0x0, + $.hashfn_native = DoNothingHash, + $.hashfn_bswap = DoNothingHash + ); REGISTER_HASH(donothingOAAT_32, - $.desc = "Do-Nothing OAAT function (measure call+OAAT overhead)", - $.hash_flags = - FLAG_HASH_MOCK, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x0, - $.verification_BE = 0x0, - $.hashfn_native = DoNothingOAATHash<32, false>, - $.hashfn_bswap = DoNothingOAATHash<32, true>, - $.sort_order = 10 -); + $.desc = "Do-Nothing OAAT function (measure call+OAAT overhead)", + $.hash_flags = + FLAG_HASH_MOCK, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x0, + $.verification_BE = 0x0, + $.hashfn_native = DoNothingOAATHash<32, false>, + $.hashfn_bswap = DoNothingOAATHash<32, true>, + $.sort_order = 10 + ); REGISTER_HASH(donothingOAAT_64, - $.desc = "Do-Nothing OAAT function (measure call+OAAT overhead)", - $.hash_flags = - FLAG_HASH_MOCK, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x0, - $.verification_BE = 0x0, - $.hashfn_native = DoNothingOAATHash<64, false>, - $.hashfn_bswap = DoNothingOAATHash<64, true>, - $.sort_order = 10 -); + $.desc = "Do-Nothing OAAT function (measure call+OAAT overhead)", + $.hash_flags = + FLAG_HASH_MOCK, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x0, + $.verification_BE = 0x0, + $.hashfn_native = DoNothingOAATHash<64, false>, + $.hashfn_bswap = DoNothingOAATHash<64, true>, + $.sort_order = 10 + ); REGISTER_HASH(donothingOAAT_128, - $.desc = 
"Do-Nothing OAAT function (measure call+OAAT overhead)", - $.hash_flags = - FLAG_HASH_MOCK, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x0, - $.verification_BE = 0x0, - $.hashfn_native = DoNothingOAATHash<128, false>, - $.hashfn_bswap = DoNothingOAATHash<128, true>, - $.sort_order = 10 -); + $.desc = "Do-Nothing OAAT function (measure call+OAAT overhead)", + $.hash_flags = + FLAG_HASH_MOCK, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x0, + $.verification_BE = 0x0, + $.hashfn_native = DoNothingOAATHash<128, false>, + $.hashfn_bswap = DoNothingOAATHash<128, true>, + $.sort_order = 10 + ); diff --git a/hashes/falcon_oaat.cpp b/hashes/falcon_oaat.cpp index 5399fe48..c4dce0bb 100644 --- a/hashes/falcon_oaat.cpp +++ b/hashes/falcon_oaat.cpp @@ -28,24 +28,26 @@ #include "Hashlib.h" //------------------------------------------------------------ -static uint32_t GoodOAAT_impl(const uint8_t * str, size_t len, uint32_t seed) { +static uint32_t GoodOAAT_impl( const uint8_t * str, size_t len, uint32_t seed ) { const uint8_t * const end = str + len; uint32_t h1 = seed ^ 0x3b00; uint32_t h2 = ROTL32(seed, 15); - for (;str != end; str++) { + for (; str != end; str++) { h1 += str[0]; h1 += h1 << 3; // h1 *= 9 h2 += h1; // the rest could be as in MicroOAAT: h1 = ROTL32(h1, 7) // but clang doesn't generate ROTL instruction then. - h2 = ROTL32(h2, 7); + h2 = ROTL32(h2, 7); h2 += h2 << 2; // h2 *= 5 } h1 ^= h2; - /* now h1 passes all collision checks, - * so it is suitable for hash-tables with prime numbers. */ + /* + * now h1 passes all collision checks, + * so it is suitable for hash-tables with prime numbers. + */ h1 += ROTL32(h2, 14); h2 ^= h1; h2 += ROTR32(h1, 6); h1 ^= h2; h1 += ROTL32(h2, 5); @@ -57,64 +59,67 @@ static uint32_t GoodOAAT_impl(const uint8_t * str, size_t len, uint32_t seed) { // MicroOAAT suitable for hash-tables using prime numbers. 
// It passes all collision checks. // Author: Sokolov Yura aka funny-falcon -static uint32_t MicroOAAT_impl(const uint8_t * str, size_t len, uint32_t seed) { - const uint8_t * const end = str + len; - uint32_t h1 = seed ^ 0x3b00; - uint32_t h2 = ROTL32(seed, 15); - for (;str != end; str++) { - h1 += str[0]; - h1 += h1 << 3; // h1 *= 9 - h2 -= h1; - // unfortunately, clang produces bad code here, - // cause it doesn't generate rotl instruction. - h1 = ROTL32(h1, 7); - } - return (h1 ^ h2); +static uint32_t MicroOAAT_impl( const uint8_t * str, size_t len, uint32_t seed ) { + const uint8_t * const end = str + len; + uint32_t h1 = seed ^ 0x3b00; + uint32_t h2 = ROTL32(seed, 15); + + for (; str != end; str++) { + h1 += str[0]; + h1 += h1 << 3; // h1 *= 9 + h2 -= h1; + // unfortunately, clang produces bad code here, + // cause it doesn't generate rotl instruction. + h1 = ROTL32(h1, 7); + } + return h1 ^ h2; } //------------------------------------------------------------ -template < bool bswap > -static void GoodOAAT(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void GoodOAAT( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = GoodOAAT_impl((const uint8_t *)in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void MicroOAAT(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void MicroOAAT( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = MicroOAAT_impl((const uint8_t *)in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(falcon_oaat, - $.src_url = "https://github.com/rurban/smhasher/commit/3931fd6f723f4fb2afab6ef9a628912220e90ce7", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/rurban/smhasher/commit/3931fd6f723f4fb2afab6ef9a628912220e90ce7", + $.src_status = 
HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(GoodOAAT, - $.desc = "GoodOAAT (Small non-multiplicative OAAT by funny-falcon)", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x7B14EEE5, - $.verification_BE = 0x1A834495, - $.hashfn_native = GoodOAAT, - $.hashfn_bswap = GoodOAAT -); + $.desc = "GoodOAAT (Small non-multiplicative OAAT by funny-falcon)", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x7B14EEE5, + $.verification_BE = 0x1A834495, + $.hashfn_native = GoodOAAT, + $.hashfn_bswap = GoodOAAT + ); REGISTER_HASH(MicroOAAT, - $.desc = "MicroOAAT (Small non-multiplicative OAAT by funny-falcon)", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x16F1BA97, - $.verification_BE = 0xDE58061B, - $.hashfn_native = MicroOAAT, - $.hashfn_bswap = MicroOAAT -); + $.desc = "MicroOAAT (Small non-multiplicative OAAT by funny-falcon)", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x16F1BA97, + $.verification_BE = 0xDE58061B, + $.hashfn_native = MicroOAAT, + $.hashfn_bswap = MicroOAAT + ); diff --git a/hashes/falkhash.cpp b/hashes/falkhash.cpp index eb3719df..ffb34613 100644 --- a/hashes/falkhash.cpp +++ b/hashes/falkhash.cpp @@ -58,123 +58,124 @@ #include "Hashlib.h" #if defined(HAVE_X86_64_AES) -#include "Intrinsics.h" - -template < uint32_t version, bool bswap > -static void falkhash(const void * in, const size_t olen, const seed_t seed64, void * out) { - const uint8_t * buf = (const uint8_t *)in; - uint64_t len = (uint64_t)olen; - __m128i hash, seed; - - // A chunk_size of 0x50 is ideal for AMD fam 15h platforms, which is - // what this was optimized and designed for. 
If you change this - // value, you have to manually add/remove instructions from the core - // loop below. This must be divisible by 16. - const uint64_t CHUNK_LEN = 80; - - if (version == 1) { - // Add the seed to the length. Place the length+seed for both the - // low and high 64-bits into our hash output. - seed = _mm_set_epi64x(len + ((uint64_t)seed64), len + ((uint64_t)seed64)); - } else { - // Create the 128-bit seed. Low 64-bits gets seed, high 64-bits gets - // seed + len + 1. The +1 ensures that both 64-bits values will never be - // the same (with the exception of a length of -1. If you have that much - // ram, send me some). - seed = _mm_set_epi64x(1 + len + ((uint64_t)seed64), (uint64_t)seed64); - } - - hash = seed; - - while (len > 0) { - __m128i piece[5]; - uint8_t tmp[CHUNK_LEN]; - - // If the data is smaller than one chunk, pad it with 0xff for v1, - // or zeroes for v2. - if (len < CHUNK_LEN) { - memcpy(tmp, buf, len); - if (version == 1) { - memset(tmp + len, 0xff, CHUNK_LEN - len); - } else { - memset(tmp + len, 0, CHUNK_LEN - len); - } - buf = tmp; - len = CHUNK_LEN; - } + #include "Intrinsics.h" - // Read 5 pieces from memory into xmms - piece[0] = _mm_loadu_si128((__m128i*)(buf + 0*0x10)); - piece[1] = _mm_loadu_si128((__m128i*)(buf + 1*0x10)); - piece[2] = _mm_loadu_si128((__m128i*)(buf + 2*0x10)); - piece[3] = _mm_loadu_si128((__m128i*)(buf + 3*0x10)); - piece[4] = _mm_loadu_si128((__m128i*)(buf + 4*0x10)); - - if (bswap) { - // Arbitrarily chose 64-bit chunks - piece[0] = mm_bswap64(piece[0]); - piece[1] = mm_bswap64(piece[1]); - piece[2] = mm_bswap64(piece[2]); - piece[3] = mm_bswap64(piece[3]); - piece[4] = mm_bswap64(piece[4]); - } +template +static void falkhash( const void * in, const size_t olen, const seed_t seed64, void * out ) { + const uint8_t * buf = (const uint8_t *)in; + uint64_t len = (uint64_t )olen; + __m128i hash, seed; - if (version == 2) { - // xor each piece against the seed - piece[0] = _mm_xor_si128(piece[0], seed); - 
piece[1] = _mm_xor_si128(piece[1], seed); - piece[2] = _mm_xor_si128(piece[2], seed); - piece[3] = _mm_xor_si128(piece[3], seed); - piece[4] = _mm_xor_si128(piece[4], seed); + // A chunk_size of 0x50 is ideal for AMD fam 15h platforms, which is + // what this was optimized and designed for. If you change this + // value, you have to manually add/remove instructions from the core + // loop below. This must be divisible by 16. + const uint64_t CHUNK_LEN = 80; + + if (version == 1) { + // Add the seed to the length. Place the length+seed for both the + // low and high 64-bits into our hash output. + seed = _mm_set_epi64x(len + ((uint64_t)seed64), len + ((uint64_t)seed64)); + } else { + // Create the 128-bit seed. Low 64-bits gets seed, high 64-bits gets + // seed + len + 1. The +1 ensures that both 64-bits values will never be + // the same (with the exception of a length of -1. If you have that much + // ram, send me some). + seed = _mm_set_epi64x(1 + len + ((uint64_t)seed64), (uint64_t)seed64); } - // Mix all pieces into xmm0 - piece[0] = _mm_aesenc_si128(piece[0], piece[1]); - piece[0] = _mm_aesenc_si128(piece[0], piece[2]); - piece[0] = _mm_aesenc_si128(piece[0], piece[3]); - piece[0] = _mm_aesenc_si128(piece[0], piece[4]); + hash = seed; + + while (len > 0) { + __m128i piece[5]; + uint8_t tmp[CHUNK_LEN]; + + // If the data is smaller than one chunk, pad it with 0xff for v1, + // or zeroes for v2. 
+ if (len < CHUNK_LEN) { + memcpy(tmp, buf, len); + if (version == 1) { + memset(tmp + len, 0xff, CHUNK_LEN - len); + } else { + memset(tmp + len, 0, CHUNK_LEN - len); + } + buf = tmp; + len = CHUNK_LEN; + } + + // Read 5 pieces from memory into xmms + piece[0] = _mm_loadu_si128((__m128i *)(buf + 0 * 0x10)); + piece[1] = _mm_loadu_si128((__m128i *)(buf + 1 * 0x10)); + piece[2] = _mm_loadu_si128((__m128i *)(buf + 2 * 0x10)); + piece[3] = _mm_loadu_si128((__m128i *)(buf + 3 * 0x10)); + piece[4] = _mm_loadu_si128((__m128i *)(buf + 4 * 0x10)); + + if (bswap) { + // Arbitrarily chose 64-bit chunks + piece[0] = mm_bswap64(piece[0]); + piece[1] = mm_bswap64(piece[1]); + piece[2] = mm_bswap64(piece[2]); + piece[3] = mm_bswap64(piece[3]); + piece[4] = mm_bswap64(piece[4]); + } + + if (version == 2) { + // xor each piece against the seed + piece[0] = _mm_xor_si128(piece[0], seed); + piece[1] = _mm_xor_si128(piece[1], seed); + piece[2] = _mm_xor_si128(piece[2], seed); + piece[3] = _mm_xor_si128(piece[3], seed); + piece[4] = _mm_xor_si128(piece[4], seed); + } + + // Mix all pieces into xmm0 + piece[0] = _mm_aesenc_si128(piece[0], piece[1]); + piece[0] = _mm_aesenc_si128(piece[0], piece[2]); + piece[0] = _mm_aesenc_si128(piece[0], piece[3]); + piece[0] = _mm_aesenc_si128(piece[0], piece[4]); + + if (version == 1) { + // Finalize xmm0 by mixing with itself + piece[0] = _mm_aesenc_si128(piece[0], piece[0]); + } else { + // Finalize piece[0] by aesencing against seed + piece[0] = _mm_aesenc_si128(piece[0], seed); + } + + // Mix in xmm0 to the hash + hash = _mm_aesenc_si128(hash, piece[0]); + + buf += CHUNK_LEN; + len -= CHUNK_LEN; + } if (version == 1) { - // Finalize xmm0 by mixing with itself - piece[0] = _mm_aesenc_si128(piece[0], piece[0]); + // Finalize the hash. This is required at least once to pass + // Combination 0x8000000 and Combination 0x0000001. Need more than 1 to + // pass the Seed tests. We do 4 because they're pretty much free. 
+ // Maybe we should actually use the seed better? Nah, more finalizing! + hash = _mm_aesenc_si128(hash, hash); + hash = _mm_aesenc_si128(hash, hash); + hash = _mm_aesenc_si128(hash, hash); + hash = _mm_aesenc_si128(hash, hash); } else { - // Finalize piece[0] by aesencing against seed - piece[0] = _mm_aesenc_si128(piece[0], seed); + // Finalize hash by aesencing against seed four times + hash = _mm_aesenc_si128(hash, seed); + hash = _mm_aesenc_si128(hash, seed); + hash = _mm_aesenc_si128(hash, seed); + hash = _mm_aesenc_si128(hash, seed); } - // Mix in xmm0 to the hash - hash = _mm_aesenc_si128(hash, piece[0]); - - buf += CHUNK_LEN; - len -= CHUNK_LEN; - } - - if (version == 1) { - // Finalize the hash. This is required at least once to pass - // Combination 0x8000000 and Combination 0x0000001. Need more than 1 to - // pass the Seed tests. We do 4 because they're pretty much free. - // Maybe we should actually use the seed better? Nah, more finalizing! - hash = _mm_aesenc_si128(hash, hash); - hash = _mm_aesenc_si128(hash, hash); - hash = _mm_aesenc_si128(hash, hash); - hash = _mm_aesenc_si128(hash, hash); - } else { - // Finalize hash by aesencing against seed four times - hash = _mm_aesenc_si128(hash, seed); - hash = _mm_aesenc_si128(hash, seed); - hash = _mm_aesenc_si128(hash, seed); - hash = _mm_aesenc_si128(hash, seed); - } - - // Write hash to memory - _mm_storeu_si128((__m128i*)out, hash); + // Write hash to memory + _mm_storeu_si128((__m128i *)out, hash); } + #endif REGISTER_FAMILY(falkhash, - $.src_url = "https://github.com/gamozolabs/falkhash", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/gamozolabs/falkhash", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); #if defined(HAVE_X86_64_AES) @@ -185,29 +186,29 @@ REGISTER_FAMILY(falkhash, // 1) For a hash len of 0, a hash result of 0 was forced, and // 2) The hash output was truncated to 64 bits. 
REGISTER_HASH(falkhash1, - $.desc = "Falkhash v1", - $.hash_flags = - FLAG_HASH_AES_BASED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0xAEF96E69, - $.verification_BE = 0xDAE2ECE4, - $.hashfn_native = falkhash<1,false>, - $.hashfn_bswap = falkhash<1,true> -); + $.desc = "Falkhash v1", + $.hash_flags = + FLAG_HASH_AES_BASED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0xAEF96E69, + $.verification_BE = 0xDAE2ECE4, + $.hashfn_native = falkhash<1, false>, + $.hashfn_bswap = falkhash<1, true> + ); REGISTER_HASH(falkhash2, - $.desc = "Falkhash v2", - $.hash_flags = - FLAG_HASH_AES_BASED, - $.impl_flags = - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 128, - $.verification_LE = 0x7FA15220, - $.verification_BE = 0x0A8285F2, - $.hashfn_native = falkhash<2,false>, - $.hashfn_bswap = falkhash<2,true> -); + $.desc = "Falkhash v2", + $.hash_flags = + FLAG_HASH_AES_BASED, + $.impl_flags = + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 128, + $.verification_LE = 0x7FA15220, + $.verification_BE = 0x0A8285F2, + $.hashfn_native = falkhash<2, false>, + $.hashfn_bswap = falkhash<2, true> + ); #endif diff --git a/hashes/farmhash.cpp b/hashes/farmhash.cpp index ff358f61..6ed0b455 100644 --- a/hashes/farmhash.cpp +++ b/hashes/farmhash.cpp @@ -29,51 +29,62 @@ #include #if defined(HAVE_SSE_4_1) || defined(HAVE_X86_64_CRC32C) || defined(HAVE_X86_64_AES) -#include "Intrinsics.h" -#define FARMHASH_USE_INTRIN + #include "Intrinsics.h" + #define FARMHASH_USE_INTRIN #endif using namespace std; //------------------------------------------------------------ #if defined(HAVE_INT128) -static inline uint64_t Uint128Low64(const uint128_t x) { - return static_cast(x); + +static inline uint64_t Uint128Low64( const uint128_t x ) { + return static_cast(x); } -static inline uint64_t Uint128High64(const uint128_t x) { - return static_cast(x >> 64); + +static inline uint64_t Uint128High64( const uint128_t x ) { + return static_cast(x >> 
64); } -static inline uint128_t Uint128(uint64_t lo, uint64_t hi) { - return lo + (((uint128_t)hi) << 64); + +static inline uint128_t Uint128( uint64_t lo, uint64_t hi ) { + return lo + (((uint128_t)hi) << 64); } + #else typedef std::pair uint128_t; -static inline uint64_t Uint128Low64(const uint128_t x) { return x.first; } -static inline uint64_t Uint128High64(const uint128_t x) { return x.second; } -static inline uint128_t Uint128(uint64_t lo, uint64_t hi) { return uint128_t(lo, hi); } + +static inline uint64_t Uint128Low64( const uint128_t x ) { return x.first; } + +static inline uint64_t Uint128High64( const uint128_t x ) { return x.second; } + +static inline uint128_t Uint128( uint64_t lo, uint64_t hi ) { return uint128_t(lo, hi); } + #endif //------------------------------------------------------------ -template < bool bswap > -static inline uint32_t Fetch32(const uint8_t * p) { +template +static inline uint32_t Fetch32( const uint8_t * p ) { return GET_U32(p, 0); } -template < bool bswap > -static inline uint64_t Fetch64(const uint8_t * p) { +template +static inline uint64_t Fetch64( const uint8_t * p ) { return GET_U64(p, 0); } #if defined(FARMHASH_USE_INTRIN) -template < bool bswap > -static inline __m128i Fetch128(const uint8_t * s) { - __m128i d = _mm_loadu_si128(reinterpret_cast(s)); + +template +static inline __m128i Fetch128( const uint8_t * s ) { + __m128i d = _mm_loadu_si128(reinterpret_cast(s)); + if (bswap) { - const __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); + const __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); d = _mm_shuffle_epi8(d, mask); } return d; } + #endif #undef PERMUTE3 @@ -81,25 +92,34 @@ static inline __m128i Fetch128(const uint8_t * s) { //------------------------------------------------------------ #if defined(FARMHASH_USE_INTRIN) + // Helpers for data-parallel operations (1x 128 bits or 2x 64 or 4x 32). 
-static inline __m128i Add64(__m128i x, __m128i y) { return _mm_add_epi64(x, y); } -static inline __m128i Add32(__m128i x, __m128i y) { return _mm_add_epi32(x, y); } -static inline __m128i Mul(__m128i x, __m128i y) { return _mm_mullo_epi32(x, y); } -static inline __m128i Mul5(__m128i x) { return Add32(x, _mm_slli_epi32(x, 2)); } +static inline __m128i Add64( __m128i x, __m128i y ) { return _mm_add_epi64(x, y); } -static inline __m128i Xor(__m128i x, __m128i y) { return _mm_xor_si128(x, y); } -static inline __m128i Or(__m128i x, __m128i y) { return _mm_or_si128(x, y); } +static inline __m128i Add32( __m128i x, __m128i y ) { return _mm_add_epi32(x, y); } -static inline __m128i RotateLeft(__m128i x, int c) { - return Or(_mm_slli_epi32(x, c), _mm_srli_epi32(x, 32 - c)); +static inline __m128i Mul( __m128i x, __m128i y ) { return _mm_mullo_epi32(x, y); } + +static inline __m128i Mul5( __m128i x ) { return Add32(x, _mm_slli_epi32(x, 2)); } + +static inline __m128i Xor( __m128i x, __m128i y ) { return _mm_xor_si128(x, y); } + +static inline __m128i Or( __m128i x, __m128i y ) { return _mm_or_si128(x, y); } + +static inline __m128i RotateLeft( __m128i x, int c ) { + return Or(_mm_slli_epi32(x, c), _mm_srli_epi32(x, 32 - c)); } -static inline __m128i Rol17(__m128i x) { return RotateLeft(x, 17); } -static inline __m128i Rol19(__m128i x) { return RotateLeft(x, 19); } -static inline __m128i Shuf(__m128i x, __m128i y) { return _mm_shuffle_epi8(y, x); } -static inline __m128i Shuffle0321(__m128i x) { - return _mm_shuffle_epi32(x, (0 << 6) + (3 << 4) + (2 << 2) + (1 << 0)); +static inline __m128i Rol17( __m128i x ) { return RotateLeft(x, 17); } + +static inline __m128i Rol19( __m128i x ) { return RotateLeft(x, 19); } + +static inline __m128i Shuf( __m128i x, __m128i y ) { return _mm_shuffle_epi8(y, x); } + +static inline __m128i Shuffle0321( __m128i x ) { + return _mm_shuffle_epi32(x, (0 << 6) + (3 << 4) + (2 << 2) + (1 << 0)); } + #endif 
//------------------------------------------------------------ @@ -117,405 +137,409 @@ static const uint32_t c2 = 0x1b873593; // A 32-bit to 32-bit integer hash copied from Murmur3. // mul -static inline uint32_t fmix(uint32_t h) { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; +static inline uint32_t fmix( uint32_t h ) { + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; } // Helper from Murmur3 for combining two 32-bit values. // mul -static inline uint32_t Mur(uint32_t a, uint32_t h) { - a *= c1; - a = ROTR32(a, 17); - a *= c2; - h ^= a; - h = ROTR32(h, 19); - return h * 5 + 0xe6546b64; +static inline uint32_t Mur( uint32_t a, uint32_t h ) { + a *= c1; + a = ROTR32(a, 17); + a *= c2; + h ^= a; + h = ROTR32(h, 19); + return h * 5 + 0xe6546b64; } - -static inline uint64_t ShiftMix(uint64_t val) { - return val ^ (val >> 47); +static inline uint64_t ShiftMix( uint64_t val ) { + return val ^ (val >> 47); } // Hash 128 input bits down to 64 bits of output. // This is intended to be a reasonably good hash function. // 64x64 -static inline uint64_t Hash128to64(uint128_t x) { - // Murmur-inspired hashing. - const uint64_t kMul = UINT64_C(0x9ddfea08eb382d69); - uint64_t a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; - a ^= (a >> 47); - uint64_t b = (Uint128High64(x) ^ a) * kMul; - b ^= (b >> 47); - b *= kMul; - return b; +static inline uint64_t Hash128to64( uint128_t x ) { + // Murmur-inspired hashing. 
+ const uint64_t kMul = UINT64_C(0x9ddfea08eb382d69); + uint64_t a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; + + a ^= (a >> 47); + uint64_t b = (Uint128High64(x) ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return b; } // 64x64 -static inline uint64_t HashLen16(uint64_t u, uint64_t v) { - return Hash128to64(Uint128(u, v)); +static inline uint64_t HashLen16( uint64_t u, uint64_t v ) { + return Hash128to64(Uint128(u, v)); } // 64x64 -static inline uint64_t HashLen16(uint64_t u, uint64_t v, uint64_t mul) { - // Murmur-inspired hashing. - uint64_t a = (u ^ v) * mul; - a ^= (a >> 47); - uint64_t b = (v ^ a) * mul; - b ^= (b >> 47); - b *= mul; - return b; +static inline uint64_t HashLen16( uint64_t u, uint64_t v, uint64_t mul ) { + // Murmur-inspired hashing. + uint64_t a = (u ^ v) * mul; + + a ^= (a >> 47); + uint64_t b = (v ^ a) * mul; + b ^= (b >> 47); + b *= mul; + return b; } // Return a 16-byte hash for 48 bytes. Quick and dirty. // Callers do best to use "random-looking" values for a and b. -static inline pair WeakHashLen32WithSeeds( - uint64_t w, uint64_t x, uint64_t y, uint64_t z, uint64_t a, uint64_t b) { - a += w; - b = ROTR64(b + a + z, 21); - uint64_t c = a; - a += x; - a += y; - b += ROTR64(a, 44); - return make_pair(a + z, b + c); +static inline pair WeakHashLen32WithSeeds( uint64_t w, + uint64_t x, uint64_t y, uint64_t z, uint64_t a, uint64_t b ) { + a += w; + b = ROTR64(b + a + z, 21); + uint64_t c = a; + a += x; + a += y; + b += ROTR64(a , 44); + return make_pair(a + z, b + c); } // Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. 
-template < bool bswap > -static inline pair WeakHashLen32WithSeeds( - const uint8_t* s, uint64_t a, uint64_t b) { - return WeakHashLen32WithSeeds(Fetch64(s), - Fetch64(s + 8), - Fetch64(s + 16), - Fetch64(s + 24), - a, - b); +template +static inline pair WeakHashLen32WithSeeds( const uint8_t * s, uint64_t a, uint64_t b ) { + return WeakHashLen32WithSeeds(Fetch64(s), Fetch64( + s + 8), Fetch64(s + 16), Fetch64(s + 24), a, b); } //------------------------------------------------------------ namespace farmhashna { - template < bool bswap > - static inline uint64_t HashLen0to16(const uint8_t *s, size_t len); - template < bool bswap > - static inline uint64_t HashLen17to32(const uint8_t *s, size_t len); - template < bool bswap > - static inline uint64_t HashLen33to64(const uint8_t *s, size_t len); - - template < bool bswap > - static uint64_t Hash64(const uint8_t *s, size_t len); - template < bool bswap > - static uint64_t Hash64WithSeeds(const uint8_t *s, size_t len, uint64_t seed0, uint64_t seed1); - template < bool bswap > - static uint64_t Hash64WithSeed(const uint8_t *s, size_t len, uint64_t seed); -} - -template < bool bswap > -static inline uint64_t farmhashna::HashLen0to16(const uint8_t *s, size_t len) { - if (len >= 8) { - uint64_t mul = k2 + len * 2; - uint64_t a = Fetch64(s) + k2; - uint64_t b = Fetch64(s + len - 8); - uint64_t c = ROTR64(b, 37) * mul + a; - uint64_t d = (ROTR64(a, 25) + b) * mul; - return HashLen16(c, d, mul); - } - if (len >= 4) { - uint64_t mul = k2 + len * 2; - uint64_t a = Fetch32(s); - return HashLen16(len + (a << 3), Fetch32(s + len - 4), mul); - } - if (len > 0) { - uint8_t a = s[0]; - uint8_t b = s[len >> 1]; - uint8_t c = s[len - 1]; - uint32_t y = static_cast(a) + (static_cast(b) << 8); - uint32_t z = len + (static_cast(c) << 2); - return ShiftMix(y * k2 ^ z * k0) * k2; - } - return k2; + template + static inline uint64_t HashLen0to16( const uint8_t * s, size_t len ); + + template + static inline uint64_t HashLen17to32( const 
uint8_t * s, size_t len ); + + template + static inline uint64_t HashLen33to64( const uint8_t * s, size_t len ); + + template + static uint64_t Hash64( const uint8_t * s, size_t len ); + + template + static uint64_t Hash64WithSeeds( const uint8_t * s, size_t len, uint64_t seed0, uint64_t seed1 ); + + template + static uint64_t Hash64WithSeed( const uint8_t * s, size_t len, uint64_t seed ); +} // namespace farmhashna + +template +static inline uint64_t farmhashna::HashLen0to16( const uint8_t * s, size_t len ) { + if (len >= 8) { + uint64_t mul = k2 + len * 2; + uint64_t a = Fetch64(s) + k2; + uint64_t b = Fetch64(s + len - 8); + uint64_t c = ROTR64(b, 37) * mul + a; + uint64_t d = (ROTR64(a, 25) + b) * mul; + return HashLen16(c, d, mul); + } + if (len >= 4) { + uint64_t mul = k2 + len * 2; + uint64_t a = Fetch32(s); + return HashLen16(len + (a << 3), Fetch32(s + len - 4), mul); + } + if (len > 0) { + uint8_t a = s[0]; + uint8_t b = s[len >> 1]; + uint8_t c = s[len - 1]; + uint32_t y = static_cast(a) + (static_cast(b) << 8); + uint32_t z = len + (static_cast(c) << 2); + return ShiftMix(y * k2 ^ z * k0) * k2; + } + return k2; } // This probably works well for 16-byte strings as well, but it may be overkill // in that case. 
-template < bool bswap > -static inline uint64_t farmhashna::HashLen17to32(const uint8_t *s, size_t len) { - uint64_t mul = k2 + len * 2; - uint64_t a = Fetch64(s) * k1; - uint64_t b = Fetch64(s + 8); - uint64_t c = Fetch64(s + len - 8) * mul; - uint64_t d = Fetch64(s + len - 16) * k2; - return HashLen16(ROTR64(a + b, 43) + ROTR64(c, 30) + d, - a + ROTR64(b + k2, 18) + c, mul); +template +static inline uint64_t farmhashna::HashLen17to32( const uint8_t * s, size_t len ) { + uint64_t mul = k2 + len * 2; + uint64_t a = Fetch64(s ) * k1; + uint64_t b = Fetch64(s + 8); + uint64_t c = Fetch64(s + len - 8) * mul; + uint64_t d = Fetch64(s + len - 16) * k2; + + return HashLen16(ROTR64(a + b, 43) + ROTR64(c, 30) + d, a + ROTR64(b + k2, 18) + c, mul); } // Return an 8-byte hash for 33 to 64 bytes. -template < bool bswap > -static inline uint64_t farmhashna::HashLen33to64(const uint8_t *s, size_t len) { - uint64_t mul = k2 + len * 2; - uint64_t a = Fetch64(s) * k2; - uint64_t b = Fetch64(s + 8); - uint64_t c = Fetch64(s + len - 8) * mul; - uint64_t d = Fetch64(s + len - 16) * k2; - uint64_t y = ROTR64(a + b, 43) + ROTR64(c, 30) + d; - uint64_t z = HashLen16(y, a + ROTR64(b + k2, 18) + c, mul); - uint64_t e = Fetch64(s + 16) * mul; - uint64_t f = Fetch64(s + 24); - uint64_t g = (y + Fetch64(s + len - 32)) * mul; - uint64_t h = (z + Fetch64(s + len - 24)) * mul; - return HashLen16(ROTR64(e + f, 43) + ROTR64(g, 30) + h, - e + ROTR64(f + a, 18) + g, mul); -} - -template < bool bswap > -static uint64_t farmhashna::Hash64(const uint8_t *s, size_t len) { - const uint64_t seed = 81; - if (len <= 32) { - if (len <= 16) { - return HashLen0to16(s, len); - } else { - return HashLen17to32(s, len); - } - } else if (len <= 64) { - return HashLen33to64(s, len); - } - - // For strings over 64 bytes we loop. I nternal state consists of - // 56 bytes: v, w, x, y, and z. 
- uint64_t x = seed; - uint64_t y = seed * k1 + 113; - uint64_t z = ShiftMix(y * k2 + 113) * k2; - pair v = make_pair(0, 0); - pair w = make_pair(0, 0); - x = x * k2 + Fetch64(s); - - // Set end so that after the loop we have 1 to 64 bytes left to process. - const uint8_t* end = s + ((len - 1) / 64) * 64; - const uint8_t* last64 = end + ((len - 1) & 63) - 63; - assert(s + len - 64 == last64); - do { - x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * k1; - y = ROTR64(y + v.second + Fetch64(s + 48), 42) * k1; - x ^= w.second; - y += v.first + Fetch64(s + 40); - z = ROTR64(z + w.first, 33) * k1; - v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); - w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); +template +static inline uint64_t farmhashna::HashLen33to64( const uint8_t * s, size_t len ) { + uint64_t mul = k2 + len * 2; + uint64_t a = Fetch64(s ) * k2; + uint64_t b = Fetch64(s + 8); + uint64_t c = Fetch64(s + len - 8) * mul; + uint64_t d = Fetch64(s + len - 16) * k2; + uint64_t y = ROTR64(a + b, 43) + ROTR64(c, 30) + d; + uint64_t z = HashLen16(y, a + ROTR64(b + k2, 18) + c, mul); + uint64_t e = Fetch64(s + 16) * mul; + uint64_t f = Fetch64(s + 24); + uint64_t g = (y + Fetch64(s + len - 32)) * mul; + uint64_t h = (z + Fetch64(s + len - 24)) * mul; + + return HashLen16(ROTR64(e + f, 43) + ROTR64(g, 30) + h, e + ROTR64(f + a, 18) + g, mul); +} + +template +static uint64_t farmhashna::Hash64( const uint8_t * s, size_t len ) { + const uint64_t seed = 81; + + if (len <= 32) { + if (len <= 16) { + return HashLen0to16(s, len); + } else { + return HashLen17to32(s, len); + } + } else if (len <= 64) { + return HashLen33to64(s, len); + } + + // For strings over 64 bytes we loop. I nternal state consists of + // 56 bytes: v, w, x, y, and z. 
+ uint64_t x = seed; + uint64_t y = seed * k1 + 113; + uint64_t z = ShiftMix(y * k2 + 113) * k2; + pair v = make_pair(0, 0); + pair w = make_pair(0, 0); + x = x * k2 + Fetch64(s); + + // Set end so that after the loop we have 1 to 64 bytes left to process. + const uint8_t * end = s + ((len - 1) / 64) * 64; + const uint8_t * last64 = end + ((len - 1) & 63) - 63; + assert(s + len - 64 == last64); + do { + x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * k1; + y = ROTR64(y + v.second + Fetch64 (s + 48), 42) * k1; + x ^= w.second; + y += v.first + Fetch64(s + 40); + z = ROTR64(z + w.first, 33) * k1; + v = WeakHashLen32WithSeeds(s , v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second , y + Fetch64(s + 16)); + std::swap(z, x); + s += 64; + } while (s != end); + uint64_t mul = k1 + ((z & 0xff) << 1); + // Make s point to the last 64 bytes of input. + s = last64; + w.first += ((len - 1) & 63); + v.first += w.first; + w.first += v.first; + x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * mul; + y = ROTR64(y + v.second + Fetch64 (s + 48), 42) * mul; + x ^= w.second * 9; + y += v.first * 9 + Fetch64(s + 40); + z = ROTR64(z + w.first, 33) * mul; + v = WeakHashLen32WithSeeds(s , v.second * mul, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second , y + Fetch64(s + 16)); std::swap(z, x); - s += 64; - } while (s != end); - uint64_t mul = k1 + ((z & 0xff) << 1); - // Make s point to the last 64 bytes of input. 
- s = last64; - w.first += ((len - 1) & 63); - v.first += w.first; - w.first += v.first; - x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * mul; - y = ROTR64(y + v.second + Fetch64(s + 48), 42) * mul; - x ^= w.second * 9; - y += v.first * 9 + Fetch64(s + 40); - z = ROTR64(z + w.first, 33) * mul; - v = WeakHashLen32WithSeeds(s, v.second * mul, x + w.first); - w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); - std::swap(z, x); - return HashLen16(HashLen16(v.first, w.first, mul) + ShiftMix(y) * k0 + z, - HashLen16(v.second, w.second, mul) + x, - mul); -} - -template < bool bswap > -static uint64_t farmhashna::Hash64WithSeeds(const uint8_t *s, size_t len, uint64_t seed0, uint64_t seed1) { + return HashLen16(HashLen16(v.first, w.first, mul) + ShiftMix(y) * k0 + z, + HashLen16(v.second, w.second, mul) + x, mul); +} + +template +static uint64_t farmhashna::Hash64WithSeeds( const uint8_t * s, size_t len, uint64_t seed0, uint64_t seed1 ) { return HashLen16(farmhashna::Hash64(s, len) - seed0, seed1); } -template < bool bswap > -static uint64_t farmhashna::Hash64WithSeed(const uint8_t *s, size_t len, uint64_t seed) { +template +static uint64_t farmhashna::Hash64WithSeed( const uint8_t * s, size_t len, uint64_t seed ) { return farmhashna::Hash64WithSeeds(s, len, k2, seed); } //------------------------------------------------------------ namespace farmhashuo { - static inline uint64_t H(uint64_t x, uint64_t y, uint64_t mul, int r); - - template < bool bswap > - static uint64_t Hash64(const uint8_t *s, size_t len); - template < bool bswap > - static uint64_t Hash64WithSeeds(const uint8_t *s, size_t len, uint64_t seed0, uint64_t seed1); - template < bool bswap > - static uint64_t Hash64WithSeed(const uint8_t *s, size_t len, uint64_t seed); -} - -static inline uint64_t farmhashuo::H(uint64_t x, uint64_t y, uint64_t mul, int r) { - uint64_t a = (x ^ y) * mul; - a ^= (a >> 47); - uint64_t b = (y ^ a) * mul; - return ROTR64(b, r) * mul; -} - -template < bool 
bswap > -static uint64_t farmhashuo::Hash64WithSeeds(const uint8_t *s, size_t len, - uint64_t seed0, uint64_t seed1) { - if (len <= 64) { - return farmhashna::Hash64WithSeeds(s, len, seed0, seed1); - } - - // For strings over 64 bytes we loop. Internal state consists of - // 64 bytes: u, v, w, x, y, and z. - uint64_t x = seed0; - uint64_t y = seed1 * k2 + 113; - uint64_t z = ShiftMix(y * k2) * k2; - pair v = make_pair(seed0, seed1); - pair w = make_pair(0, 0); - uint64_t u = x - z; - x *= k2; - uint64_t mul = k2 + (u & 0x82); - - // Set end so that after the loop we have 1 to 64 bytes left to process. - const uint8_t* end = s + ((len - 1) / 64) * 64; - const uint8_t* last64 = end + ((len - 1) & 63) - 63; - assert(s + len - 64 == last64); - do { - uint64_t a0 = Fetch64(s); - uint64_t a1 = Fetch64(s + 8); - uint64_t a2 = Fetch64(s + 16); - uint64_t a3 = Fetch64(s + 24); - uint64_t a4 = Fetch64(s + 32); - uint64_t a5 = Fetch64(s + 40); - uint64_t a6 = Fetch64(s + 48); - uint64_t a7 = Fetch64(s + 56); - x += a0 + a1; - y += a2; - z += a3; - v.first += a4; - v.second += a5 + a1; - w.first += a6; - w.second += a7; - - x = ROTR64(x, 26); - x *= 9; - y = ROTR64(y, 29); - z *= mul; - v.first = ROTR64(v.first, 33); - v.second = ROTR64(v.second, 30); - w.first ^= x; - w.first *= 9; - z = ROTR64(z, 32); - z += w.second; - w.second += z; - z *= 9; - std::swap(u, y); - - z += a0 + a6; - v.first += a2; - v.second += a3; - w.first += a4; - w.second += a5 + a6; - x += a1; - y += a7; - - y += v.first; - v.first += x - y; - v.second += w.first; - w.first += v.second; - w.second += x - y; - x += w.second; - w.second = ROTR64(w.second, 34); - std::swap(u, z); - s += 64; - } while (s != end); - // Make s point to the last 64 bytes of input. 
- s = last64; - u *= 9; - v.second = ROTR64(v.second, 28); - v.first = ROTR64(v.first, 20); - w.first += ((len - 1) & 63); - u += y; - y += u; - x = ROTR64(y - x + v.first + Fetch64(s + 8), 37) * mul; - y = ROTR64(y ^ v.second ^ Fetch64(s + 48), 42) * mul; - x ^= w.second * 9; - y += v.first + Fetch64(s + 40); - z = ROTR64(z + w.first, 33) * mul; - v = WeakHashLen32WithSeeds(s, v.second * mul, x + w.first); - w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); - return farmhashuo::H( - HashLen16(v.first + x, w.first ^ y, mul) + z - u, - farmhashuo::H(v.second + y, w.second + z, k2, 30) ^ x, - k2, - 31); -} - -template < bool bswap > -static uint64_t farmhashuo::Hash64WithSeed(const uint8_t *s, size_t len, uint64_t seed) { - return len <= 64 ? farmhashna::Hash64WithSeed(s, len, seed) : - farmhashuo::Hash64WithSeeds(s, len, 0, seed); -} - -template < bool bswap > -static uint64_t farmhashuo::Hash64(const uint8_t *s, size_t len) { - return len <= 64 ? farmhashna::Hash64(s, len) : - farmhashuo::Hash64WithSeeds(s, len, 81, 0); + static inline uint64_t H( uint64_t x, uint64_t y, uint64_t mul, int r ); + + template + static uint64_t Hash64( const uint8_t * s, size_t len ); + + template + static uint64_t Hash64WithSeeds( const uint8_t * s, size_t len, uint64_t seed0, uint64_t seed1 ); + + template + static uint64_t Hash64WithSeed( const uint8_t * s, size_t len, uint64_t seed ); +} // namespace farmhashuo + +static inline uint64_t farmhashuo::H( uint64_t x, uint64_t y, uint64_t mul, int r ) { + uint64_t a = (x ^ y) * mul; + + a ^= (a >> 47); + uint64_t b = (y ^ a) * mul; + return ROTR64(b, r) * mul; +} + +template +static uint64_t farmhashuo::Hash64WithSeeds( const uint8_t * s, size_t len, uint64_t seed0, uint64_t seed1 ) { + if (len <= 64) { + return farmhashna::Hash64WithSeeds(s, len, seed0, seed1); + } + + // For strings over 64 bytes we loop. Internal state consists of + // 64 bytes: u, v, w, x, y, and z. 
+ uint64_t x = seed0; + uint64_t y = seed1 * k2 + 113; + uint64_t z = ShiftMix(y * k2) * k2; + pair v = make_pair(seed0, seed1); + pair w = make_pair( 0, 0); + uint64_t u = x - z; + x *= k2; + uint64_t mul = k2 + (u & 0x82); + + // Set end so that after the loop we have 1 to 64 bytes left to process. + const uint8_t * end = s + ((len - 1) / 64) * 64; + const uint8_t * last64 = end + ((len - 1) & 63) - 63; + assert(s + len - 64 == last64); + do { + uint64_t a0 = Fetch64(s ); + uint64_t a1 = Fetch64(s + 8); + uint64_t a2 = Fetch64(s + 16); + uint64_t a3 = Fetch64(s + 24); + uint64_t a4 = Fetch64(s + 32); + uint64_t a5 = Fetch64(s + 40); + uint64_t a6 = Fetch64(s + 48); + uint64_t a7 = Fetch64(s + 56); + x += a0 + a1; + y += a2; + z += a3; + v.first += a4; + v.second += a5 + a1; + w.first += a6; + w.second += a7; + + x = ROTR64(x , 26); + x *= 9; + y = ROTR64(y , 29); + z *= mul; + v.first = ROTR64(v.first , 33); + v.second = ROTR64(v.second, 30); + w.first ^= x; + w.first *= 9; + z = ROTR64(z , 32); + z += w.second; + w.second += z; + z *= 9; + std::swap(u, y); + + z += a0 + a6; + v.first += a2; + v.second += a3; + w.first += a4; + w.second += a5 + a6; + x += a1; + y += a7; + + y += v.first; + v.first += x - y; + v.second += w.first; + w.first += v.second; + w.second += x - y; + x += w.second; + w.second = ROTR64(w.second, 34); + std::swap(u, z); + s += 64; + } while (s != end); + // Make s point to the last 64 bytes of input. 
+ s = last64; + u *= 9; + v.second = ROTR64(v.second , 28); + v.first = ROTR64(v.first , 20); + w.first += ((len - 1) & 63); + u += y; + y += u; + x = ROTR64(y - x + v.first + Fetch64(s + 8), 37) * mul; + y = ROTR64(y ^ v.second ^ Fetch64(s + 48), 42) * mul; + x ^= w.second * 9; + y += v.first + Fetch64(s + 40); + z = ROTR64(z + w.first, 33) * mul; + v = WeakHashLen32WithSeeds(s , v.second * mul, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second , y + Fetch64(s + 16)); + return farmhashuo::H(HashLen16(v.first + x, w.first ^ y, mul) + z - u, farmhashuo::H( + v.second + y, w.second + z, k2, 30) ^ x, k2, 31); +} + +template +static uint64_t farmhashuo::Hash64WithSeed( const uint8_t * s, size_t len, uint64_t seed ) { + return len <= 64 ? farmhashna::Hash64WithSeed(s, len, seed) : + farmhashuo::Hash64WithSeeds(s, len, 0, seed); +} + +template +static uint64_t farmhashuo::Hash64( const uint8_t * s, size_t len ) { + return len <= 64 ? farmhashna::Hash64(s, len) : + farmhashuo::Hash64WithSeeds(s, len, 81, 0); } //------------------------------------------------------------ namespace farmhashxo { - template < bool bswap > - static inline uint64_t H32(const uint8_t *s, size_t len, uint64_t mul, - uint64_t seed0 = 0, uint64_t seed1 = 0); - template < bool bswap > - static inline uint64_t HashLen33to64(const uint8_t *s, size_t len); - template < bool bswap > - static inline uint64_t HashLen65to96(const uint8_t *s, size_t len); - - template < bool bswap > - static uint64_t Hash64(const uint8_t *s, size_t len); - template < bool bswap > - static uint64_t Hash64WithSeeds(const uint8_t *s, size_t len, uint64_t seed0, uint64_t seed1); - template < bool bswap > - static uint64_t Hash64WithSeed(const uint8_t *s, size_t len, uint64_t seed); -} - -template < bool bswap > -static inline uint64_t farmhashxo::H32(const uint8_t *s, size_t len, uint64_t mul, - uint64_t seed0, uint64_t seed1) { - uint64_t a = Fetch64(s) * k1; - uint64_t b = Fetch64(s + 8); - uint64_t c = 
Fetch64(s + len - 8) * mul; - uint64_t d = Fetch64(s + len - 16) * k2; - uint64_t u = ROTR64(a + b, 43) + ROTR64(c, 30) + d + seed0; - uint64_t v = a + ROTR64(b + k2, 18) + c + seed1; - a = ShiftMix((u ^ v) * mul); - b = ShiftMix((v ^ a) * mul); - return b; + template + static inline uint64_t H32( const uint8_t * s, size_t len, uint64_t mul, uint64_t seed0 = 0, uint64_t seed1 = 0 ); + + template + static inline uint64_t HashLen33to64( const uint8_t * s, size_t len ); + + template + static inline uint64_t HashLen65to96( const uint8_t * s, size_t len ); + + template + static uint64_t Hash64( const uint8_t * s, size_t len ); + + template + static uint64_t Hash64WithSeeds( const uint8_t * s, size_t len, uint64_t seed0, uint64_t seed1 ); + + template + static uint64_t Hash64WithSeed( const uint8_t * s, size_t len, uint64_t seed ); +} // namespace farmhashxo + +template +static inline uint64_t farmhashxo::H32( const uint8_t * s, size_t len, uint64_t mul, uint64_t seed0, uint64_t seed1 ) { + uint64_t a = Fetch64(s ) * k1; + uint64_t b = Fetch64(s + 8); + uint64_t c = Fetch64(s + len - 8) * mul; + uint64_t d = Fetch64(s + len - 16) * k2; + uint64_t u = ROTR64(a + b, 43) + ROTR64(c, 30) + d + seed0; + uint64_t v = a + ROTR64(b + k2, 18) + c + seed1; + + a = ShiftMix((u ^ v) * mul); + b = ShiftMix((v ^ a) * mul); + return b; } // Return an 8-byte hash for 33 to 64 bytes. -template < bool bswap > -static inline uint64_t farmhashxo::HashLen33to64(const uint8_t *s, size_t len) { - uint64_t mul0 = k2 - 30; - uint64_t mul1 = k2 - 30 + 2 * len; - uint64_t h0 = H32(s, 32, mul0); - uint64_t h1 = H32(s + len - 32, 32, mul1); - return ((h1 * mul1) + h0) * mul1; +template +static inline uint64_t farmhashxo::HashLen33to64( const uint8_t * s, size_t len ) { + uint64_t mul0 = k2 - 30; + uint64_t mul1 = k2 - 30 + 2 * len; + uint64_t h0 = H32(s, 32, mul0); + uint64_t h1 = H32(s + len - 32, 32, mul1); + + return ((h1 * mul1) + h0) * mul1; } // Return an 8-byte hash for 65 to 96 bytes. 
-template < bool bswap > -static inline uint64_t farmhashxo::HashLen65to96(const uint8_t *s, size_t len) { - uint64_t mul0 = k2 - 114; - uint64_t mul1 = k2 - 114 + 2 * len; - uint64_t h0 = H32(s, 32, mul0); - uint64_t h1 = H32(s + 32, 32, mul1); - uint64_t h2 = H32(s + len - 32, 32, mul1, h0, h1); - return (h2 * 9 + (h0 >> 17) + (h1 >> 21)) * mul1; -} - -template < bool bswap > -static uint64_t farmhashxo::Hash64(const uint8_t *s, size_t len) { +template +static inline uint64_t farmhashxo::HashLen65to96( const uint8_t * s, size_t len ) { + uint64_t mul0 = k2 - 114; + uint64_t mul1 = k2 - 114 + 2 * len; + uint64_t h0 = H32(s , 32, mul0); + uint64_t h1 = H32(s + 32, 32, mul1); + uint64_t h2 = H32(s + len - 32, 32, mul1, h0, h1); + + return (h2 * 9 + (h0 >> 17) + (h1 >> 21)) * mul1; +} + +template +static uint64_t farmhashxo::Hash64( const uint8_t * s, size_t len ) { if (len <= 32) { if (len <= 16) { return farmhashna::HashLen0to16(s, len); @@ -533,226 +557,227 @@ static uint64_t farmhashxo::Hash64(const uint8_t *s, size_t len) { } } -template < bool bswap > -static uint64_t farmhashxo::Hash64WithSeeds(const uint8_t *s, size_t len, uint64_t seed0, uint64_t seed1) { - return farmhashuo::Hash64WithSeeds(s, len, seed0, seed1); +template +static uint64_t farmhashxo::Hash64WithSeeds( const uint8_t * s, size_t len, uint64_t seed0, uint64_t seed1 ) { + return farmhashuo::Hash64WithSeeds(s, len, seed0, seed1); } -template < bool bswap > -static uint64_t farmhashxo::Hash64WithSeed(const uint8_t *s, size_t len, uint64_t seed) { - return farmhashuo::Hash64WithSeed(s, len, seed); +template +static uint64_t farmhashxo::Hash64WithSeed( const uint8_t * s, size_t len, uint64_t seed ) { + return farmhashuo::Hash64WithSeed(s, len, seed); } //------------------------------------------------------------ #if defined(HAVE_SSE_4_1) namespace farmhashte { - template < bool bswap > - static inline uint64_t Hash64Long(const uint8_t* s, size_t n, - uint64_t seed0, uint64_t seed1); + template + 
static inline uint64_t Hash64Long( const uint8_t * s, size_t n, uint64_t seed0, uint64_t seed1 ); - template < bool bswap > - static uint64_t Hash64(const uint8_t *s, size_t len); - template < bool bswap > - static uint64_t Hash64WithSeeds(const uint8_t *s, size_t len, uint64_t seed0, uint64_t seed1); - template < bool bswap > - static uint64_t Hash64WithSeed(const uint8_t *s, size_t len, uint64_t seed); -} + template + static uint64_t Hash64( const uint8_t * s, size_t len ); + + template + static uint64_t Hash64WithSeeds( const uint8_t * s, size_t len, uint64_t seed0, uint64_t seed1 ); + + template + static uint64_t Hash64WithSeed( const uint8_t * s, size_t len, uint64_t seed ); +} // namespace farmhashte // Requires n >= 256. Requires SSE4.1. Should be slightly faster if the // compiler uses AVX instructions (e.g., use the -mavx flag with GCC). -template < bool bswap > -static inline uint64_t farmhashte::Hash64Long(const uint8_t* s, size_t n, - uint64_t seed0, uint64_t seed1) { - const __m128i kShuf = - _mm_set_epi8(4, 11, 10, 5, 8, 15, 6, 9, 12, 2, 14, 13, 0, 7, 3, 1); - const __m128i kMult = - _mm_set_epi8(0xbd, 0xd6, 0x33, 0x39, 0x45, 0x54, 0xfa, 0x03, - 0x34, 0x3e, 0x33, 0xed, 0xcc, 0x9e, 0x2d, 0x51); - uint64_t seed2 = (seed0 + 113) * (seed1 + 9); - uint64_t seed3 = (ROTR64(seed0, 23) + 27) * (ROTR64(seed1, 30) + 111); - __m128i d0 = _mm_cvtsi64_si128(seed0); - __m128i d1 = _mm_cvtsi64_si128(seed1); - __m128i d2 = Shuf(kShuf, d0); - __m128i d3 = Shuf(kShuf, d1); - __m128i d4 = Xor(d0, d1); - __m128i d5 = Xor(d1, d2); - __m128i d6 = Xor(d2, d4); - __m128i d7 = _mm_set1_epi32(seed2 >> 32); - __m128i d8 = Mul(kMult, d2); - __m128i d9 = _mm_set1_epi32(seed3 >> 32); - __m128i d10 = _mm_set1_epi32(seed3); - __m128i d11 = Add64(d2, _mm_set1_epi32(seed2)); - const uint8_t* end = s + (n & ~static_cast(255)); - do { - __m128i z; - z = Fetch128(s); - d0 = Add64(d0, z); - d1 = Shuf(kShuf, d1); - d2 = Xor(d2, d0); - d4 = Xor(d4, z); - d4 = Xor(d4, d1); - std::swap(d0, 
d6); - z = Fetch128(s + 16); - d5 = Add64(d5, z); - d6 = Shuf(kShuf, d6); - d8 = Shuf(kShuf, d8); - d7 = Xor(d7, d5); - d0 = Xor(d0, z); - d0 = Xor(d0, d6); - std::swap(d5, d11); - z = Fetch128(s + 32); - d1 = Add64(d1, z); - d2 = Shuf(kShuf, d2); - d4 = Shuf(kShuf, d4); - d5 = Xor(d5, z); - d5 = Xor(d5, d2); - std::swap(d10, d4); - z = Fetch128(s + 48); - d6 = Add64(d6, z); - d7 = Shuf(kShuf, d7); - d0 = Shuf(kShuf, d0); - d8 = Xor(d8, d6); - d1 = Xor(d1, z); - d1 = Add64(d1, d7); - z = Fetch128(s + 64); - d2 = Add64(d2, z); - d5 = Shuf(kShuf, d5); - d4 = Add64(d4, d2); - d6 = Xor(d6, z); - d6 = Xor(d6, d11); - std::swap(d8, d2); - z = Fetch128(s + 80); - d7 = Xor(d7, z); - d8 = Shuf(kShuf, d8); - d1 = Shuf(kShuf, d1); - d0 = Add64(d0, d7); - d2 = Add64(d2, z); - d2 = Add64(d2, d8); - std::swap(d1, d7); - z = Fetch128(s + 96); - d4 = Shuf(kShuf, d4); - d6 = Shuf(kShuf, d6); - d8 = Mul(kMult, d8); - d5 = Xor(d5, d11); - d7 = Xor(d7, z); - d7 = Add64(d7, d4); - std::swap(d6, d0); - z = Fetch128(s + 112); - d8 = Add64(d8, z); - d0 = Shuf(kShuf, d0); - d2 = Shuf(kShuf, d2); - d1 = Xor(d1, d8); - d10 = Xor(d10, z); - d10 = Xor(d10, d0); - std::swap(d11, d5); - z = Fetch128(s + 128); - d4 = Add64(d4, z); - d5 = Shuf(kShuf, d5); - d7 = Shuf(kShuf, d7); - d6 = Add64(d6, d4); - d8 = Xor(d8, z); - d8 = Xor(d8, d5); - std::swap(d4, d10); - z = Fetch128(s + 144); - d0 = Add64(d0, z); - d1 = Shuf(kShuf, d1); - d2 = Add64(d2, d0); - d4 = Xor(d4, z); - d4 = Xor(d4, d1); - z = Fetch128(s + 160); - d5 = Add64(d5, z); - d6 = Shuf(kShuf, d6); - d8 = Shuf(kShuf, d8); - d7 = Xor(d7, d5); - d0 = Xor(d0, z); - d0 = Xor(d0, d6); - std::swap(d2, d8); - z = Fetch128(s + 176); - d1 = Add64(d1, z); - d2 = Shuf(kShuf, d2); - d4 = Shuf(kShuf, d4); - d5 = Mul(kMult, d5); - d5 = Xor(d5, z); - d5 = Xor(d5, d2); - std::swap(d7, d1); - z = Fetch128(s + 192); - d6 = Add64(d6, z); - d7 = Shuf(kShuf, d7); - d0 = Shuf(kShuf, d0); - d8 = Add64(d8, d6); - d1 = Xor(d1, z); - d1 = Xor(d1, d7); - 
std::swap(d0, d6); - z = Fetch128(s + 208); - d2 = Add64(d2, z); - d5 = Shuf(kShuf, d5); - d4 = Xor(d4, d2); - d6 = Xor(d6, z); - d6 = Xor(d6, d9); - std::swap(d5, d11); - z = Fetch128(s + 224); - d7 = Add64(d7, z); - d8 = Shuf(kShuf, d8); - d1 = Shuf(kShuf, d1); - d0 = Xor(d0, d7); - d2 = Xor(d2, z); - d2 = Xor(d2, d8); - std::swap(d10, d4); - z = Fetch128(s + 240); - d3 = Add64(d3, z); - d4 = Shuf(kShuf, d4); - d6 = Shuf(kShuf, d6); - d7 = Mul(kMult, d7); - d5 = Add64(d5, d3); - d7 = Xor(d7, z); - d7 = Xor(d7, d4); - std::swap(d3, d9); - s += 256; - } while (s != end); - d6 = Add64(Mul(kMult, d6), _mm_cvtsi64_si128(n)); - if (n % 256 != 0) { - d7 = Add64(_mm_shuffle_epi32(d8, (0 << 6) + (3 << 4) + (2 << 2) + (1 << 0)), d7); - d8 = Add64(Mul(kMult, d8), _mm_cvtsi64_si128(farmhashxo::Hash64(s, n % 256))); - } - __m128i t[8]; - d0 = Mul(kMult, Shuf(kShuf, Mul(kMult, d0))); - d3 = Mul(kMult, Shuf(kShuf, Mul(kMult, d3))); - d9 = Mul(kMult, Shuf(kShuf, Mul(kMult, d9))); - d1 = Mul(kMult, Shuf(kShuf, Mul(kMult, d1))); - d0 = Add64(d11, d0); - d3 = Xor(d7, d3); - d9 = Add64(d8, d9); - d1 = Add64(d10, d1); - d4 = Add64(d3, d4); - d5 = Add64(d9, d5); - d6 = Xor(d1, d6); - d2 = Add64(d0, d2); - t[0] = d0; - t[1] = d3; - t[2] = d9; - t[3] = d1; - t[4] = d4; - t[5] = d5; - t[6] = d6; - t[7] = d2; - return farmhashxo::Hash64(reinterpret_cast(t), sizeof(t)); -} - -template < bool bswap > -static uint64_t farmhashte::Hash64(const uint8_t *s, size_t len) { - // Empirically, farmhashxo seems faster until length 512. 
+template +static inline uint64_t farmhashte::Hash64Long( const uint8_t * s, size_t n, uint64_t seed0, uint64_t seed1 ) { + const __m128i kShuf = + _mm_set_epi8( 4, 11, 10, 5, 8, 15, 6, 9, 12, 2, 14, 13, 0, 7, 3, 1); + const __m128i kMult = + _mm_set_epi8(0xbd, 0xd6, 0x33, 0x39, 0x45, 0x54, 0xfa, + 0x03, 0x34, 0x3e, 0x33, 0xed, 0xcc, 0x9e, 0x2d, 0x51); + uint64_t seed2 = (seed0 + 113) * (seed1 + 9); + uint64_t seed3 = (ROTR64(seed0, 23) + 27) * (ROTR64(seed1, 30) + 111); + __m128i d0 = _mm_cvtsi64_si128(seed0); + __m128i d1 = _mm_cvtsi64_si128(seed1); + __m128i d2 = Shuf(kShuf, d0); + __m128i d3 = Shuf(kShuf, d1); + __m128i d4 = Xor(d0, d1); + __m128i d5 = Xor(d1, d2); + __m128i d6 = Xor(d2, d4); + __m128i d7 = _mm_set1_epi32(seed2 >> 32); + __m128i d8 = Mul(kMult, d2); + __m128i d9 = _mm_set1_epi32(seed3 >> 32); + __m128i d10 = _mm_set1_epi32(seed3 ); + __m128i d11 = Add64(d2, _mm_set1_epi32(seed2)); + const uint8_t * end = s + (n & ~static_cast(255)); + + do { + __m128i z; + z = Fetch128(s); + d0 = Add64(d0, z); + d1 = Shuf(kShuf, d1); + d2 = Xor(d2, d0); + d4 = Xor(d4, z ); + d4 = Xor(d4, d1); + std::swap(d0, d6); + z = Fetch128(s + 16); + d5 = Add64(d5, z); + d6 = Shuf(kShuf, d6); + d8 = Shuf(kShuf, d8); + d7 = Xor(d7, d5); + d0 = Xor(d0, z ); + d0 = Xor(d0, d6); + std::swap(d5, d11); + z = Fetch128(s + 32); + d1 = Add64(d1, z); + d2 = Shuf(kShuf, d2); + d4 = Shuf(kShuf, d4); + d5 = Xor(d5, z ); + d5 = Xor(d5, d2); + std::swap(d10, d4); + z = Fetch128(s + 48); + d6 = Add64(d6, z); + d7 = Shuf(kShuf, d7); + d0 = Shuf(kShuf, d0); + d8 = Xor(d8, d6); + d1 = Xor(d1, z ); + d1 = Add64(d1, d7); + z = Fetch128(s + 64); + d2 = Add64(d2, z); + d5 = Shuf(kShuf, d5); + d4 = Add64(d4, d2); + d6 = Xor(d6, z ); + d6 = Xor(d6, d11); + std::swap(d8, d2); + z = Fetch128(s + 80); + d7 = Xor(d7, z); + d8 = Shuf(kShuf, d8); + d1 = Shuf(kShuf, d1); + d0 = Add64(d0, d7); + d2 = Add64(d2, z ); + d2 = Add64(d2, d8); + std::swap(d1, d7); + z = Fetch128(s + 96); + d4 = Shuf(kShuf, d4); 
+ d6 = Shuf(kShuf, d6); + d8 = Mul(kMult, d8); + d5 = Xor(d5, d11); + d7 = Xor(d7, z ); + d7 = Add64(d7, d4); + std::swap(d6, d0); + z = Fetch128(s + 112); + d8 = Add64(d8, z); + d0 = Shuf(kShuf, d0); + d2 = Shuf(kShuf, d2); + d1 = Xor(d1 , d8); + d10 = Xor(d10, z ); + d10 = Xor(d10, d0); + std::swap(d11, d5); + z = Fetch128(s + 128); + d4 = Add64(d4, z); + d5 = Shuf(kShuf, d5); + d7 = Shuf(kShuf, d7); + d6 = Add64(d6, d4); + d8 = Xor(d8, z ); + d8 = Xor(d8, d5); + std::swap(d4, d10); + z = Fetch128(s + 144); + d0 = Add64(d0, z); + d1 = Shuf(kShuf, d1); + d2 = Add64(d2, d0); + d4 = Xor(d4, z ); + d4 = Xor(d4, d1); + z = Fetch128(s + 160); + d5 = Add64(d5, z); + d6 = Shuf(kShuf, d6); + d8 = Shuf(kShuf, d8); + d7 = Xor(d7, d5); + d0 = Xor(d0, z ); + d0 = Xor(d0, d6); + std::swap(d2, d8); + z = Fetch128(s + 176); + d1 = Add64(d1, z); + d2 = Shuf(kShuf, d2); + d4 = Shuf(kShuf, d4); + d5 = Mul(kMult, d5); + d5 = Xor(d5, z ); + d5 = Xor(d5, d2); + std::swap(d7, d1); + z = Fetch128(s + 192); + d6 = Add64(d6, z); + d7 = Shuf(kShuf, d7); + d0 = Shuf(kShuf, d0); + d8 = Add64(d8, d6); + d1 = Xor(d1, z ); + d1 = Xor(d1, d7); + std::swap(d0, d6); + z = Fetch128(s + 208); + d2 = Add64(d2, z); + d5 = Shuf(kShuf, d5); + d4 = Xor(d4, d2); + d6 = Xor(d6, z ); + d6 = Xor(d6, d9); + std::swap(d5, d11); + z = Fetch128(s + 224); + d7 = Add64(d7, z); + d8 = Shuf(kShuf, d8); + d1 = Shuf(kShuf, d1); + d0 = Xor(d0, d7); + d2 = Xor(d2, z ); + d2 = Xor(d2, d8); + std::swap(d10, d4); + z = Fetch128(s + 240); + d3 = Add64(d3, z); + d4 = Shuf(kShuf, d4); + d6 = Shuf(kShuf, d6); + d7 = Mul(kMult, d7); + d5 = Add64(d5, d3); + d7 = Xor(d7, z ); + d7 = Xor(d7, d4); + std::swap(d3, d9); + s += 256; + } while (s != end); + d6 = Add64(Mul(kMult, d6), _mm_cvtsi64_si128(n)); + if (n % 256 != 0) { + d7 = Add64(_mm_shuffle_epi32(d8, (0 << 6) + (3 << 4) + (2 << 2) + (1 << 0)), d7 ); + d8 = Add64(Mul(kMult, d8), _mm_cvtsi64_si128(farmhashxo::Hash64(s, n % 256))); + } + __m128i t[8]; + d0 = Mul(kMult, 
Shuf(kShuf, Mul(kMult, d0))); + d3 = Mul(kMult, Shuf(kShuf, Mul(kMult, d3))); + d9 = Mul(kMult, Shuf(kShuf, Mul(kMult, d9))); + d1 = Mul(kMult, Shuf(kShuf, Mul(kMult, d1))); + d0 = Add64(d11, d0); + d3 = Xor(d7, d3); + d9 = Add64(d8 , d9); + d1 = Add64(d10, d1); + d4 = Add64(d3 , d4); + d5 = Add64(d9 , d5); + d6 = Xor(d1, d6); + d2 = Add64(d0, d2); + t[0] = d0; + t[1] = d3; + t[2] = d9; + t[3] = d1; + t[4] = d4; + t[5] = d5; + t[6] = d6; + t[7] = d2; + return farmhashxo::Hash64(reinterpret_cast(t), sizeof(t)); +} + +template +static uint64_t farmhashte::Hash64( const uint8_t * s, size_t len ) { + // Empirically, farmhashxo seems faster until length 512. return len >= 512 ? farmhashte::Hash64Long(s, len, k2, k1) : farmhashxo::Hash64(s, len); } -template < bool bswap > -static uint64_t farmhashte::Hash64WithSeed(const uint8_t *s, size_t len, uint64_t seed) { +template +static uint64_t farmhashte::Hash64WithSeed( const uint8_t * s, size_t len, uint64_t seed ) { return len >= 512 ? farmhashte::Hash64Long(s, len, k1, seed) : farmhashxo::Hash64WithSeed(s, len, seed); } -template < bool bswap > -static uint64_t farmhashte::Hash64WithSeeds(const uint8_t *s, size_t len, uint64_t seed0, uint64_t seed1) { +template +static uint64_t farmhashte::Hash64WithSeeds( const uint8_t * s, size_t len, uint64_t seed0, uint64_t seed1 ) { return len >= 512 ? 
farmhashte::Hash64Long(s, len, seed0, seed1) : farmhashxo::Hash64WithSeeds(s, len, seed0, seed1); } @@ -762,140 +787,150 @@ static uint64_t farmhashte::Hash64WithSeeds(const uint8_t *s, size_t len, uint64 //------------------------------------------------------------ #if defined(HAVE_SSE_4_1) namespace farmhashnt { - template < bool bswap > - static uint32_t Hash32(const uint8_t *s, size_t len); - template < bool bswap > - static uint32_t Hash32WithSeed(const uint8_t *s, size_t len, uint32_t seed); + template + static uint32_t Hash32( const uint8_t * s, size_t len ); + + template + static uint32_t Hash32WithSeed( const uint8_t * s, size_t len, uint32_t seed ); } -template < bool bswap > -static uint32_t farmhashnt::Hash32(const uint8_t *s, size_t len) { - return static_cast(farmhashte::Hash64(s, len)); +template +static uint32_t farmhashnt::Hash32( const uint8_t * s, size_t len ) { + return static_cast(farmhashte::Hash64(s, len)); } -template < bool bswap > -static uint32_t farmhashnt::Hash32WithSeed(const uint8_t *s, size_t len, uint32_t seed) { - return static_cast(farmhashte::Hash64WithSeed(s, len, seed)); +template +static uint32_t farmhashnt::Hash32WithSeed( const uint8_t * s, size_t len, uint32_t seed ) { + return static_cast(farmhashte::Hash64WithSeed(s, len, seed)); } + #endif //------------------------------------------------------------ namespace farmhashmk { - static inline uint32_t Hash32Len0to4(const uint8_t *s, size_t len, uint32_t seed = 0); - template < bool bswap > - static inline uint32_t Hash32Len5to12(const uint8_t *s, size_t len, uint32_t seed = 0); - template < bool bswap > - static inline uint32_t Hash32Len13to24(const uint8_t *s, size_t len, uint32_t seed = 0); - - template < bool bswap > - static uint32_t Hash32(const uint8_t *s, size_t len); - template < bool bswap > - static uint32_t Hash32WithSeed(const uint8_t *s, size_t len, uint32_t seed); -} - -template < bool bswap > -static inline uint32_t farmhashmk::Hash32Len13to24(const uint8_t 
*s, size_t len, uint32_t seed) { - uint32_t a = Fetch32(s - 4 + (len >> 1)); - uint32_t b = Fetch32(s + 4); - uint32_t c = Fetch32(s + len - 8); - uint32_t d = Fetch32(s + (len >> 1)); - uint32_t e = Fetch32(s); - uint32_t f = Fetch32(s + len - 4); - uint32_t h = d * c1 + len + seed; - a = ROTR32(a, 12) + f; - h = Mur(c, h) + a; - a = ROTR32(a, 3) + c; - h = Mur(e, h) + a; - a = ROTR32(a + f, 12) + d; - h = Mur(b ^ seed, h) + a; - return fmix(h); -} - -static inline uint32_t farmhashmk::Hash32Len0to4(const uint8_t *s, size_t len, uint32_t seed) { - uint32_t b = seed; - uint32_t c = 9; - for (size_t i = 0; i < len; i++) { - int8_t v = s[i]; - b = b * c1 + v; - c ^= b; - } - return fmix(Mur(b, Mur(len, c))); -} - -template < bool bswap > -static inline uint32_t farmhashmk::Hash32Len5to12(const uint8_t *s, size_t len, uint32_t seed) { - uint32_t a = len, b = len * 5, c = 9, d = b + seed; - a += Fetch32(s); - b += Fetch32(s + len - 4); - c += Fetch32(s + ((len >> 1) & 4)); - return fmix(seed ^ Mur(c, Mur(b, Mur(a, d)))); -} - -template < bool bswap > -static uint32_t farmhashmk::Hash32(const uint8_t *s, size_t len) { + static inline uint32_t Hash32Len0to4( const uint8_t * s, size_t len, uint32_t seed = 0 ); + + template + static inline uint32_t Hash32Len5to12( const uint8_t * s, size_t len, uint32_t seed = 0 ); + + template + static inline uint32_t Hash32Len13to24( const uint8_t * s, size_t len, uint32_t seed = 0 ); + + template + static uint32_t Hash32( const uint8_t * s, size_t len ); + + template + static uint32_t Hash32WithSeed( const uint8_t * s, size_t len, uint32_t seed ); +} // namespace farmhashmk + +template +static inline uint32_t farmhashmk::Hash32Len13to24( const uint8_t * s, size_t len, uint32_t seed ) { + uint32_t a = Fetch32(s - 4 + (len >> 1)); + uint32_t b = Fetch32(s + 4); + uint32_t c = Fetch32(s + len - 8); + uint32_t d = Fetch32(s + (len >> 1)); + uint32_t e = Fetch32(s); + uint32_t f = Fetch32(s + len - 4); + uint32_t h = d * c1 + len + seed; + + 
a = ROTR32(a, 12) + f; + h = Mur(c, h) + a; + a = ROTR32(a, 3) + c; + h = Mur(e, h) + a; + a = ROTR32(a + f, 12) + d; + h = Mur(b ^ seed, h) + a; + return fmix(h); +} + +static inline uint32_t farmhashmk::Hash32Len0to4( const uint8_t * s, size_t len, uint32_t seed ) { + uint32_t b = seed; + uint32_t c = 9; + + for (size_t i = 0; i < len; i++) { + int8_t v = s[i]; + b = b * c1 + v; + c ^= b; + } + return fmix(Mur(b, Mur(len, c))); +} + +template +static inline uint32_t farmhashmk::Hash32Len5to12( const uint8_t * s, size_t len, uint32_t seed ) { + uint32_t a = len, b = len * 5, c = 9, d = b + seed; + + a += Fetch32(s); + b += Fetch32(s + len - 4); + c += Fetch32(s + ((len >> 1) & 4)); + return fmix(seed ^ Mur(c, Mur(b, Mur(a, d)))); +} + +template +static uint32_t farmhashmk::Hash32( const uint8_t * s, size_t len ) { if (len <= 24) { return len <= 12 ? - (len <= 4 ? farmhashmk::Hash32Len0to4(s, len) : farmhashmk::Hash32Len5to12(s, len)) : - farmhashmk::Hash32Len13to24(s, len); - } - - // len > 24 - uint32_t h = len, g = c1 * len, f = g; - uint32_t a0 = ROTR32(Fetch32(s + len - 4) * c1, 17) * c2; - uint32_t a1 = ROTR32(Fetch32(s + len - 8) * c1, 17) * c2; - uint32_t a2 = ROTR32(Fetch32(s + len - 16) * c1, 17) * c2; - uint32_t a3 = ROTR32(Fetch32(s + len - 12) * c1, 17) * c2; - uint32_t a4 = ROTR32(Fetch32(s + len - 20) * c1, 17) * c2; - h ^= a0; - h = ROTR32(h, 19); - h = h * 5 + 0xe6546b64; - h ^= a2; - h = ROTR32(h, 19); - h = h * 5 + 0xe6546b64; - g ^= a1; - g = ROTR32(g, 19); - g = g * 5 + 0xe6546b64; - g ^= a3; - g = ROTR32(g, 19); - g = g * 5 + 0xe6546b64; - f += a4; - f = ROTR32(f, 19) + 113; - size_t iters = (len - 1) / 20; - do { - uint32_t a = Fetch32(s); - uint32_t b = Fetch32(s + 4); - uint32_t c = Fetch32(s + 8); - uint32_t d = Fetch32(s + 12); - uint32_t e = Fetch32(s + 16); - h += a; - g += b; - f += c; - h = Mur(d, h) + e; - g = Mur(c, g) + a; - f = Mur(b + e * c1, f) + d; - f += g; - g += f; - s += 20; - } while (--iters != 0); - g = ROTR32(g, 11) * 
c1; - g = ROTR32(g, 17) * c1; - f = ROTR32(f, 11) * c1; - f = ROTR32(f, 17) * c1; - h = ROTR32(h + g, 19); - h = h * 5 + 0xe6546b64; - h = ROTR32(h, 17) * c1; - h = ROTR32(h + f, 19); - h = h * 5 + 0xe6546b64; - h = ROTR32(h, 17) * c1; - return h; -} - -template < bool bswap > -static uint32_t farmhashmk::Hash32WithSeed(const uint8_t *s, size_t len, uint32_t seed) { + (len <= 4 ? farmhashmk::Hash32Len0to4(s, len) : farmhashmk::Hash32Len5to12(s, len)) : + farmhashmk::Hash32Len13to24(s, len); + } + + // len > 24 + uint32_t h = len, g = c1 * len, f = g; + uint32_t a0 = ROTR32(Fetch32(s + len - 4) * c1, 17) * c2; + uint32_t a1 = ROTR32(Fetch32(s + len - 8) * c1, 17) * c2; + uint32_t a2 = ROTR32(Fetch32(s + len - 16) * c1, 17) * c2; + uint32_t a3 = ROTR32(Fetch32(s + len - 12) * c1, 17) * c2; + uint32_t a4 = ROTR32(Fetch32(s + len - 20) * c1, 17) * c2; + h ^= a0; + h = ROTR32(h, 19); + h = h * 5 + 0xe6546b64; + h ^= a2; + h = ROTR32(h, 19); + h = h * 5 + 0xe6546b64; + g ^= a1; + g = ROTR32(g, 19); + g = g * 5 + 0xe6546b64; + g ^= a3; + g = ROTR32(g, 19); + g = g * 5 + 0xe6546b64; + f += a4; + f = ROTR32(f, 19) + 113; + size_t iters = (len - 1) / 20; + do { + uint32_t a = Fetch32(s ); + uint32_t b = Fetch32(s + 4); + uint32_t c = Fetch32(s + 8); + uint32_t d = Fetch32(s + 12); + uint32_t e = Fetch32(s + 16); + h += a; + g += b; + f += c; + h = Mur(d, h) + e; + g = Mur(c, g) + a; + f = Mur(b + e * c1, f) + d; + f += g; + g += f; + s += 20; + } while (--iters != 0); + g = ROTR32(g , 11) * c1; + g = ROTR32(g , 17) * c1; + f = ROTR32(f , 11) * c1; + f = ROTR32(f , 17) * c1; + h = ROTR32(h + g, 19); + h = h * 5 + 0xe6546b64; + h = ROTR32(h , 17) * c1; + h = ROTR32(h + f, 19); + h = h * 5 + 0xe6546b64; + h = ROTR32(h , 17) * c1; + return h; +} + +template +static uint32_t farmhashmk::Hash32WithSeed( const uint8_t * s, size_t len, uint32_t seed ) { if (len <= 24) { - if (len >= 13) return farmhashmk::Hash32Len13to24(s, len, seed * c1); - else if (len >= 5) return 
farmhashmk::Hash32Len5to12(s, len, seed); - else return farmhashmk::Hash32Len0to4(s, len, seed); + if (len >= 13) { return farmhashmk::Hash32Len13to24(s, len, seed * c1); } else if (len >= 5) { + return farmhashmk::Hash32Len5to12(s, len, seed); + } else { + return farmhashmk::Hash32Len0to4(s, len, seed); + } } uint32_t h = farmhashmk::Hash32Len13to24(s, 24, seed ^ len); return Mur(farmhashmk::Hash32(s + 24, len - 24) + seed, h); @@ -904,190 +939,194 @@ static uint32_t farmhashmk::Hash32WithSeed(const uint8_t *s, size_t len, uint32_ //------------------------------------------------------------ #if defined(HAVE_X86_64_CRC32C) && defined(HAVE_X86_64_AES) namespace farmhashsu { - template < bool bswap > - static uint32_t Hash32(const uint8_t *s, size_t len); - template < bool bswap > - static uint32_t Hash32WithSeed(const uint8_t *s, size_t len, uint32_t seed); + template + static uint32_t Hash32( const uint8_t * s, size_t len ); + + template + static uint32_t Hash32WithSeed( const uint8_t * s, size_t len, uint32_t seed ); } -template < bool bswap > -static uint32_t farmhashsu::Hash32(const uint8_t *s, size_t len) { +template +static uint32_t farmhashsu::Hash32( const uint8_t * s, size_t len ) { const uint32_t seed = 81; + if (len <= 24) { return len <= 12 ? - (len <= 4 ? 
- farmhashmk::Hash32Len0to4(s, len) : - farmhashmk::Hash32Len5to12(s, len)) : - farmhashmk::Hash32Len13to24(s, len); - } - - if (len < 40) { - uint32_t a = len, b = seed * c2, c = a + b; - a += Fetch32(s + len - 4); - b += Fetch32(s + len - 20); - c += Fetch32(s + len - 16); - uint32_t d = a; - a = ROTR32(a, 21); - a = Mur(a, Mur(b, _mm_crc32_u32(c, d))); - a += Fetch32(s + len - 12); - b += Fetch32(s + len - 8); - d += a; - a += d; - b = Mur(b, d) * c2; - a = _mm_crc32_u32(a, b + c); - return farmhashmk::Hash32Len13to24(s, (len + 1) / 2, a) + b; - } - -#undef Mulc1 -#define Mulc1(x) Mul((x), cc1) - -#undef Mulc2 -#define Mulc2(x) Mul((x), cc2) - -#undef Murk -#define Murk(a, h) \ - Add32(k, \ - Mul5( \ - Rol19( \ - Xor( \ - Mulc2( \ - Rol17( \ - Mulc1(a))), \ + (len <= 4 ? + farmhashmk::Hash32Len0to4(s, len) : + farmhashmk::Hash32Len5to12(s, len)) : + farmhashmk::Hash32Len13to24(s, len); + } + + if (len < 40) { + uint32_t a = len, b = seed * c2, c = a + b; + a += Fetch32(s + len - 4); + b += Fetch32(s + len - 20); + c += Fetch32(s + len - 16); + uint32_t d = a; + a = ROTR32(a, 21); + a = Mur(a, Mur(b, _mm_crc32_u32(c, d))); + a += Fetch32(s + len - 12); + b += Fetch32(s + len - 8); + d += a; + a += d; + b = Mur(b, d) * c2; + a = _mm_crc32_u32(a, b + c); + return farmhashmk::Hash32Len13to24(s, (len + 1) / 2, a) + b; + } + + #undef Mulc1 + #define Mulc1(x) Mul((x), cc1) + + #undef Mulc2 + #define Mulc2(x) Mul((x), cc2) + + #undef Murk +#define Murk(a, h) \ + Add32(k, \ + Mul5( \ + Rol19( \ + Xor( \ + Mulc2( \ + Rol17( \ + Mulc1(a))), \ (h))))) - const __m128i cc1 = _mm_set1_epi32(c1); - const __m128i cc2 = _mm_set1_epi32(c2); - __m128i h = _mm_set1_epi32(seed); - __m128i g = _mm_set1_epi32(c1 * seed); - __m128i f = g; - __m128i k = _mm_set1_epi32(0xe6546b64); - __m128i q; - if (len < 80) { - __m128i a = Fetch128(s); - __m128i b = Fetch128(s + 16); - __m128i c = Fetch128(s + (len - 15) / 2); - __m128i d = Fetch128(s + len - 32); - __m128i e = Fetch128(s + len - 16); 
- h = Add32(h, a); - g = Add32(g, b); - q = g; - g = Shuffle0321(g); - f = Add32(f, c); - __m128i be = Add32(b, Mulc1(e)); - h = Add32(h, f); - f = Add32(f, h); - h = Add32(Murk(d, h), e); - k = Xor(k, _mm_shuffle_epi8(g, f)); - g = Add32(Xor(c, g), a); - f = Add32(Xor(be, f), d); - k = Add32(k, be); - k = Add32(k, _mm_shuffle_epi8(f, h)); - f = Add32(f, g); - g = Add32(g, f); - g = Add32(_mm_set1_epi32(len), Mulc1(g)); - } else { - // len >= 80 - // The following is loosely modelled after farmhashmk::Hash32. - size_t iters = (len - 1) / 80; - len -= iters * 80; - -#undef Chunk -#define Chunk() do { \ - __m128i a = Fetch128(s); \ - __m128i b = Fetch128(s + 16); \ - __m128i c = Fetch128(s + 32); \ - __m128i d = Fetch128(s + 48); \ - __m128i e = Fetch128(s + 64); \ - h = Add32(h, a); \ - g = Add32(g, b); \ - g = Shuffle0321(g); \ - f = Add32(f, c); \ - __m128i be = Add32(b, Mulc1(e)); \ - h = Add32(h, f); \ - f = Add32(f, h); \ - h = Add32(h, d); \ - q = Add32(q, e); \ - h = Rol17(h); \ - h = Mulc1(h); \ - k = Xor(k, _mm_shuffle_epi8(g, f)); \ - g = Add32(Xor(c, g), a); \ - f = Add32(Xor(be, f), d); \ - std::swap(f, q); \ - q = _mm_aesimc_si128(q); \ - k = Add32(k, be); \ - k = Add32(k, _mm_shuffle_epi8(f, h)); \ - f = Add32(f, g); \ - g = Add32(g, f); \ - f = Mulc1(f); \ + const __m128i cc1 = _mm_set1_epi32(c1); + const __m128i cc2 = _mm_set1_epi32( c2 ); + __m128i h = _mm_set1_epi32( seed ); + __m128i g = _mm_set1_epi32( c1 * seed); + __m128i f = g; + __m128i k = _mm_set1_epi32(0xe6546b64 ); + __m128i q; + if (len < 80) { + __m128i a = Fetch128(s ); + __m128i b = Fetch128(s + 16); + __m128i c = Fetch128(s + (len - 15) / 2); + __m128i d = Fetch128(s + len - 32); + __m128i e = Fetch128(s + len - 16); + h = Add32(h, a); + g = Add32(g, b); + q = g; + g = Shuffle0321(g); + f = Add32(f, c); + __m128i be = Add32(b, Mulc1(e)); + h = Add32(h, f); + f = Add32(f, h); + h = Add32(Murk(d, h), e); + k = Xor(k, _mm_shuffle_epi8(g, f)); + g = Add32(Xor(c, g) , a); + f = 
Add32(Xor(be, f), d); + k = Add32(k, be ); + k = Add32(k, _mm_shuffle_epi8(f, h)); + f = Add32(f, g); + g = Add32(g, f); + g = Add32(_mm_set1_epi32(len), Mulc1(g)); + } else { + // len >= 80 + // The following is loosely modelled after farmhashmk::Hash32. + size_t iters = (len - 1) / 80; + len -= iters * 80; + + #undef Chunk +#define Chunk() do { \ + __m128i a = Fetch128(s); \ + __m128i b = Fetch128(s + 16); \ + __m128i c = Fetch128(s + 32); \ + __m128i d = Fetch128(s + 48); \ + __m128i e = Fetch128(s + 64); \ + h = Add32(h, a); \ + g = Add32(g, b); \ + g = Shuffle0321(g); \ + f = Add32(f, c); \ + __m128i be = Add32(b, Mulc1(e)); \ + h = Add32(h, f); \ + f = Add32(f, h); \ + h = Add32(h, d); \ + q = Add32(q, e); \ + h = Rol17(h); \ + h = Mulc1(h); \ + k = Xor(k, _mm_shuffle_epi8(g, f)); \ + g = Add32(Xor(c, g), a); \ + f = Add32(Xor(be, f), d); \ + std::swap(f, q); \ + q = _mm_aesimc_si128(q); \ + k = Add32(k, be); \ + k = Add32(k, _mm_shuffle_epi8(f, h)); \ + f = Add32(f, g); \ + g = Add32(g, f); \ + f = Mulc1(f); \ } while (0) - q = g; - while (iters-- != 0) { - Chunk(); - s += 80; + q = g; + while (iters-- != 0) { + Chunk(); + s += 80; + } + + if (len != 0) { + h = Add32(h, _mm_set1_epi32(len)); + s = s + len - 80; + Chunk(); + } } - if (len != 0) { - h = Add32(h, _mm_set1_epi32(len)); - s = s + len - 80; - Chunk(); + g = Shuffle0321(g); + k = Xor(k, g); + k = Xor(k, q); + h = Xor(h, q); + f = Mulc1(f); + k = Mulc2(k); + g = Mulc1(g); + h = Mulc2(h); + k = Add32(k, _mm_shuffle_epi8(g, f)); + h = Add32(h, f); + f = Add32(f, h); + g = Add32(g, k); + k = Add32(k, g); + k = Xor(k, _mm_shuffle_epi8(f, h)); + __m128i buf[4]; + buf[0] = f; + buf[1] = g; + buf[2] = k; + buf[3] = h; + s = reinterpret_cast(buf); + uint32_t x = Fetch32(s ); + uint32_t y = Fetch32(s + 4); + uint32_t z = Fetch32(s + 8); + x = _mm_crc32_u32(x , Fetch32(s + 12)); + y = _mm_crc32_u32(y , Fetch32(s + 16)); + z = _mm_crc32_u32(z * c1, Fetch32(s + 20)); + x = _mm_crc32_u32(x , Fetch32(s + 24)); + 
y = _mm_crc32_u32(y * c1, Fetch32(s + 28)); + uint32_t o = y; + z = _mm_crc32_u32(z , Fetch32(s + 32)); + x = _mm_crc32_u32(x * c1, Fetch32(s + 36)); + y = _mm_crc32_u32(y , Fetch32(s + 40)); + z = _mm_crc32_u32(z * c1, Fetch32(s + 44)); + x = _mm_crc32_u32(x , Fetch32(s + 48)); + y = _mm_crc32_u32(y * c1, Fetch32(s + 52)); + z = _mm_crc32_u32(z , Fetch32(s + 56)); + x = _mm_crc32_u32(x , Fetch32(s + 60)); + return (o - x + y - z) * c1; +} + + #undef Chunk + #undef Murk + #undef Mulc2 + #undef Mulc1 + +template +static uint32_t farmhashsu::Hash32WithSeed( const uint8_t * s, size_t len, uint32_t seed ) { + if (len <= 24) { + if (len >= 13) { return farmhashmk::Hash32Len13to24(s, len, seed * c1); } else if (len >= 5) { + return farmhashmk::Hash32Len5to12(s, len, seed); + } else { + return farmhashmk::Hash32Len0to4(s, len, seed); + } } - } - - g = Shuffle0321(g); - k = Xor(k, g); - k = Xor(k, q); - h = Xor(h, q); - f = Mulc1(f); - k = Mulc2(k); - g = Mulc1(g); - h = Mulc2(h); - k = Add32(k, _mm_shuffle_epi8(g, f)); - h = Add32(h, f); - f = Add32(f, h); - g = Add32(g, k); - k = Add32(k, g); - k = Xor(k, _mm_shuffle_epi8(f, h)); - __m128i buf[4]; - buf[0] = f; - buf[1] = g; - buf[2] = k; - buf[3] = h; - s = reinterpret_cast(buf); - uint32_t x = Fetch32(s); - uint32_t y = Fetch32(s+4); - uint32_t z = Fetch32(s+8); - x = _mm_crc32_u32(x, Fetch32(s+12)); - y = _mm_crc32_u32(y, Fetch32(s+16)); - z = _mm_crc32_u32(z * c1, Fetch32(s+20)); - x = _mm_crc32_u32(x, Fetch32(s+24)); - y = _mm_crc32_u32(y * c1, Fetch32(s+28)); - uint32_t o = y; - z = _mm_crc32_u32(z, Fetch32(s+32)); - x = _mm_crc32_u32(x * c1, Fetch32(s+36)); - y = _mm_crc32_u32(y, Fetch32(s+40)); - z = _mm_crc32_u32(z * c1, Fetch32(s+44)); - x = _mm_crc32_u32(x, Fetch32(s+48)); - y = _mm_crc32_u32(y * c1, Fetch32(s+52)); - z = _mm_crc32_u32(z, Fetch32(s+56)); - x = _mm_crc32_u32(x, Fetch32(s+60)); - return (o - x + y - z) * c1; -} - -#undef Chunk -#undef Murk -#undef Mulc2 -#undef Mulc1 - -template < bool bswap > 
-static uint32_t farmhashsu::Hash32WithSeed(const uint8_t *s, size_t len, uint32_t seed) { - if (len <= 24) { - if (len >= 13) return farmhashmk::Hash32Len13to24(s, len, seed * c1); - else if (len >= 5) return farmhashmk::Hash32Len5to12(s, len, seed); - else return farmhashmk::Hash32Len0to4(s, len, seed); - } - uint32_t h = farmhashmk::Hash32Len13to24(s, 24, seed ^ len); - return _mm_crc32_u32(farmhashsu::Hash32(s + 24, len - 24) + seed, h); + uint32_t h = farmhashmk::Hash32Len13to24(s, 24, seed ^ len); + return _mm_crc32_u32(farmhashsu::Hash32(s + 24, len - 24) + seed, h); } #endif @@ -1095,451 +1134,465 @@ static uint32_t farmhashsu::Hash32WithSeed(const uint8_t *s, size_t len, uint32_ //------------------------------------------------------------ #if defined(HAVE_X86_64_CRC32C) namespace farmhashsa { - template < bool bswap > - static uint32_t Hash32(const uint8_t *s, size_t len); - template < bool bswap > - static uint32_t Hash32WithSeed(const uint8_t *s, size_t len, uint32_t seed); -} - -template < bool bswap > -static uint32_t farmhashsa::Hash32(const uint8_t *s, size_t len) { - const uint32_t seed = 81; - if (len <= 24) { - return len <= 12 ? - (len <= 4 ? 
- farmhashmk::Hash32Len0to4(s, len) : - farmhashmk::Hash32Len5to12(s, len)) : - farmhashmk::Hash32Len13to24(s, len); - } - - if (len < 40) { - uint32_t a = len, b = seed * c2, c = a + b; - a += Fetch32(s + len - 4); - b += Fetch32(s + len - 20); - c += Fetch32(s + len - 16); - uint32_t d = a; - a = ROTR32(a, 21); - a = Mur(a, Mur(b, Mur(c, d))); - a += Fetch32(s + len - 12); - b += Fetch32(s + len - 8); - d += a; - a += d; - b = Mur(b, d) * c2; - a = _mm_crc32_u32(a, b + c); - return farmhashmk::Hash32Len13to24(s, (len + 1) / 2, a) + b; - } - -#undef Mulc1 -#define Mulc1(x) Mul((x), cc1) - -#undef Mulc2 -#define Mulc2(x) Mul((x), cc2) - -#undef Murk -#define Murk(a, h) \ - Add32(k, \ - Mul5( \ - Rol19( \ - Xor( \ - Mulc2( \ - Rol17( \ - Mulc1(a))), \ + template + static uint32_t Hash32( const uint8_t * s, size_t len ); + + template + static uint32_t Hash32WithSeed( const uint8_t * s, size_t len, uint32_t seed ); +} + +template +static uint32_t farmhashsa::Hash32( const uint8_t * s, size_t len ) { + const uint32_t seed = 81; + + if (len <= 24) { + return len <= 12 ? + (len <= 4 ? 
+ farmhashmk::Hash32Len0to4(s, len) : + farmhashmk::Hash32Len5to12(s, len)) : + farmhashmk::Hash32Len13to24(s, len); + } + + if (len < 40) { + uint32_t a = len, b = seed * c2, c = a + b; + a += Fetch32(s + len - 4); + b += Fetch32(s + len - 20); + c += Fetch32(s + len - 16); + uint32_t d = a; + a = ROTR32(a, 21); + a = Mur(a, Mur(b, Mur(c, d))); + a += Fetch32(s + len - 12); + b += Fetch32(s + len - 8); + d += a; + a += d; + b = Mur(b, d) * c2; + a = _mm_crc32_u32(a, b + c); + return farmhashmk::Hash32Len13to24(s, (len + 1) / 2, a) + b; + } + + #undef Mulc1 + #define Mulc1(x) Mul((x), cc1) + + #undef Mulc2 + #define Mulc2(x) Mul((x), cc2) + + #undef Murk +#define Murk(a, h) \ + Add32(k, \ + Mul5( \ + Rol19( \ + Xor( \ + Mulc2( \ + Rol17( \ + Mulc1(a))), \ (h))))) - const __m128i cc1 = _mm_set1_epi32(c1); - const __m128i cc2 = _mm_set1_epi32(c2); - __m128i h = _mm_set1_epi32(seed); - __m128i g = _mm_set1_epi32(c1 * seed); - __m128i f = g; - __m128i k = _mm_set1_epi32(0xe6546b64); - if (len < 80) { - __m128i a = Fetch128(s); - __m128i b = Fetch128(s + 16); - __m128i c = Fetch128(s + (len - 15) / 2); - __m128i d = Fetch128(s + len - 32); - __m128i e = Fetch128(s + len - 16); - h = Add32(h, a); - g = Add32(g, b); - g = Shuffle0321(g); - f = Add32(f, c); - __m128i be = Add32(b, Mulc1(e)); - h = Add32(h, f); - f = Add32(f, h); - h = Add32(Murk(d, h), e); - k = Xor(k, _mm_shuffle_epi8(g, f)); - g = Add32(Xor(c, g), a); - f = Add32(Xor(be, f), d); - k = Add32(k, be); - k = Add32(k, _mm_shuffle_epi8(f, h)); - f = Add32(f, g); - g = Add32(g, f); - g = Add32(_mm_set1_epi32(len), Mulc1(g)); - } else { - // len >= 80 - // The following is loosely modelled after farmhashmk::Hash32. 
- size_t iters = (len - 1) / 80; - len -= iters * 80; - -#undef Chunk -#define Chunk() do { \ - __m128i a = Fetch128(s); \ - __m128i b = Fetch128(s + 16); \ - __m128i c = Fetch128(s + 32); \ - __m128i d = Fetch128(s + 48); \ - __m128i e = Fetch128(s + 64); \ - h = Add32(h, a); \ - g = Add32(g, b); \ - g = Shuffle0321(g); \ - f = Add32(f, c); \ - __m128i be = Add32(b, Mulc1(e)); \ - h = Add32(h, f); \ - f = Add32(f, h); \ - h = Add32(Murk(d, h), e); \ - k = Xor(k, _mm_shuffle_epi8(g, f)); \ - g = Add32(Xor(c, g), a); \ - f = Add32(Xor(be, f), d); \ - k = Add32(k, be); \ - k = Add32(k, _mm_shuffle_epi8(f, h)); \ - f = Add32(f, g); \ - g = Add32(g, f); \ - f = Mulc1(f); \ + const __m128i cc1 = _mm_set1_epi32(c1); + const __m128i cc2 = _mm_set1_epi32( c2 ); + __m128i h = _mm_set1_epi32( seed ); + __m128i g = _mm_set1_epi32( c1 * seed); + __m128i f = g; + __m128i k = _mm_set1_epi32(0xe6546b64 ); + if (len < 80) { + __m128i a = Fetch128(s ); + __m128i b = Fetch128(s + 16); + __m128i c = Fetch128(s + (len - 15) / 2); + __m128i d = Fetch128(s + len - 32); + __m128i e = Fetch128(s + len - 16); + h = Add32(h, a); + g = Add32(g, b); + g = Shuffle0321(g); + f = Add32(f, c); + __m128i be = Add32(b, Mulc1(e)); + h = Add32(h, f); + f = Add32(f, h); + h = Add32(Murk(d, h), e); + k = Xor(k, _mm_shuffle_epi8(g, f)); + g = Add32(Xor(c, g) , a); + f = Add32(Xor(be, f), d); + k = Add32(k, be ); + k = Add32(k, _mm_shuffle_epi8(f, h)); + f = Add32(f, g); + g = Add32(g, f); + g = Add32(_mm_set1_epi32(len), Mulc1(g)); + } else { + // len >= 80 + // The following is loosely modelled after farmhashmk::Hash32. 
+ size_t iters = (len - 1) / 80; + len -= iters * 80; + + #undef Chunk +#define Chunk() do { \ + __m128i a = Fetch128(s); \ + __m128i b = Fetch128(s + 16); \ + __m128i c = Fetch128(s + 32); \ + __m128i d = Fetch128(s + 48); \ + __m128i e = Fetch128(s + 64); \ + h = Add32(h, a); \ + g = Add32(g, b); \ + g = Shuffle0321(g); \ + f = Add32(f, c); \ + __m128i be = Add32(b, Mulc1(e)); \ + h = Add32(h, f); \ + f = Add32(f, h); \ + h = Add32(Murk(d, h), e); \ + k = Xor(k, _mm_shuffle_epi8(g, f)); \ + g = Add32(Xor(c, g), a); \ + f = Add32(Xor(be, f), d); \ + k = Add32(k, be); \ + k = Add32(k, _mm_shuffle_epi8(f, h)); \ + f = Add32(f, g); \ + g = Add32(g, f); \ + f = Mulc1(f); \ } while (0) - while (iters-- != 0) { - Chunk(); - s += 80; + while (iters-- != 0) { + Chunk(); + s += 80; + } + + if (len != 0) { + h = Add32(h, _mm_set1_epi32(len)); + s = s + len - 80; + Chunk(); + } } - if (len != 0) { - h = Add32(h, _mm_set1_epi32(len)); - s = s + len - 80; - Chunk(); + g = Shuffle0321(g); + k = Xor(k, g); + f = Mulc1(f); + k = Mulc2(k); + g = Mulc1(g); + h = Mulc2(h); + k = Add32(k, _mm_shuffle_epi8(g, f)); + h = Add32(h, f); + f = Add32(f, h); + g = Add32(g, k); + k = Add32(k, g); + k = Xor(k, _mm_shuffle_epi8(f, h)); + __m128i buf[4]; + buf[0] = f; + buf[1] = g; + buf[2] = k; + buf[3] = h; + s = reinterpret_cast(buf); + uint32_t x = Fetch32(s ); + uint32_t y = Fetch32(s + 4); + uint32_t z = Fetch32(s + 8); + x = _mm_crc32_u32(x , Fetch32(s + 12)); + y = _mm_crc32_u32(y , Fetch32(s + 16)); + z = _mm_crc32_u32(z * c1, Fetch32(s + 20)); + x = _mm_crc32_u32(x , Fetch32(s + 24)); + y = _mm_crc32_u32(y * c1, Fetch32(s + 28)); + uint32_t o = y; + z = _mm_crc32_u32(z , Fetch32(s + 32)); + x = _mm_crc32_u32(x * c1, Fetch32(s + 36)); + y = _mm_crc32_u32(y , Fetch32(s + 40)); + z = _mm_crc32_u32(z * c1, Fetch32(s + 44)); + x = _mm_crc32_u32(x , Fetch32(s + 48)); + y = _mm_crc32_u32(y * c1, Fetch32(s + 52)); + z = _mm_crc32_u32(z , Fetch32(s + 56)); + x = _mm_crc32_u32(x , Fetch32(s + 
60)); + return (o - x + y - z) * c1; +} + + #undef Chunk + #undef Murk + #undef Mulc2 + #undef Mulc1 + +template +static uint32_t farmhashsa::Hash32WithSeed( const uint8_t * s, size_t len, uint32_t seed ) { + if (len <= 24) { + if (len >= 13) { return farmhashmk::Hash32Len13to24(s, len, seed * c1); } else if (len >= 5) { + return farmhashmk::Hash32Len5to12(s, len, seed); + } else { + return farmhashmk::Hash32Len0to4(s, len, seed); + } } - } - - g = Shuffle0321(g); - k = Xor(k, g); - f = Mulc1(f); - k = Mulc2(k); - g = Mulc1(g); - h = Mulc2(h); - k = Add32(k, _mm_shuffle_epi8(g, f)); - h = Add32(h, f); - f = Add32(f, h); - g = Add32(g, k); - k = Add32(k, g); - k = Xor(k, _mm_shuffle_epi8(f, h)); - __m128i buf[4]; - buf[0] = f; - buf[1] = g; - buf[2] = k; - buf[3] = h; - s = reinterpret_cast(buf); - uint32_t x = Fetch32(s); - uint32_t y = Fetch32(s+4); - uint32_t z = Fetch32(s+8); - x = _mm_crc32_u32(x, Fetch32(s+12)); - y = _mm_crc32_u32(y, Fetch32(s+16)); - z = _mm_crc32_u32(z * c1, Fetch32(s+20)); - x = _mm_crc32_u32(x, Fetch32(s+24)); - y = _mm_crc32_u32(y * c1, Fetch32(s+28)); - uint32_t o = y; - z = _mm_crc32_u32(z, Fetch32(s+32)); - x = _mm_crc32_u32(x * c1, Fetch32(s+36)); - y = _mm_crc32_u32(y, Fetch32(s+40)); - z = _mm_crc32_u32(z * c1, Fetch32(s+44)); - x = _mm_crc32_u32(x, Fetch32(s+48)); - y = _mm_crc32_u32(y * c1, Fetch32(s+52)); - z = _mm_crc32_u32(z, Fetch32(s+56)); - x = _mm_crc32_u32(x, Fetch32(s+60)); - return (o - x + y - z) * c1; -} - -#undef Chunk -#undef Murk -#undef Mulc2 -#undef Mulc1 - -template < bool bswap > -static uint32_t farmhashsa::Hash32WithSeed(const uint8_t *s, size_t len, uint32_t seed) { - if (len <= 24) { - if (len >= 13) return farmhashmk::Hash32Len13to24(s, len, seed * c1); - else if (len >= 5) return farmhashmk::Hash32Len5to12(s, len, seed); - else return farmhashmk::Hash32Len0to4(s, len, seed); - } - uint32_t h = farmhashmk::Hash32Len13to24(s, 24, seed ^ len); - return _mm_crc32_u32(farmhashsa::Hash32(s + 24, len - 24) + 
seed, h); + uint32_t h = farmhashmk::Hash32Len13to24(s, 24, seed ^ len); + return _mm_crc32_u32(farmhashsa::Hash32(s + 24, len - 24) + seed, h); } #endif //------------------------------------------------------------ namespace farmhashcc { - static inline uint32_t Hash32Len0to4(const uint8_t *s, size_t len); - template < bool bswap > - static inline uint32_t Hash32Len5to12(const uint8_t *s, size_t len); - template < bool bswap > - static inline uint32_t Hash32Len13to24(const uint8_t *s, size_t len); - - template < bool bswap > - static uint32_t Hash32(const uint8_t *s, size_t len); - template < bool bswap > - static uint32_t Hash32WithSeed(const uint8_t *s, size_t len, uint32_t seed); - - template < bool bswap > - static inline uint64_t HashLen0to16(const uint8_t *s, size_t len); - template < bool bswap > - static inline uint128_t CityMurmur(const uint8_t *s, size_t len, uint128_t seed); - - template < bool bswap > - static uint128_t Hash128WithSeed(const uint8_t *s, size_t len, uint128_t seed); -} - -template < bool bswap > -static inline uint32_t farmhashcc::Hash32Len13to24(const uint8_t *s, size_t len) { - uint32_t a = Fetch32(s - 4 + (len >> 1)); - uint32_t b = Fetch32(s + 4); - uint32_t c = Fetch32(s + len - 8); - uint32_t d = Fetch32(s + (len >> 1)); - uint32_t e = Fetch32(s); - uint32_t f = Fetch32(s + len - 4); - uint32_t h = len; - - return fmix(Mur(f, Mur(e, Mur(d, Mur(c, Mur(b, Mur(a, h))))))); -} - -static inline uint32_t farmhashcc::Hash32Len0to4(const uint8_t *s, size_t len) { - uint32_t b = 0; - uint32_t c = 9; - for (size_t i = 0; i < len; i++) { - int8_t v = s[i]; - b = b * c1 + v; - c ^= b; - } - return fmix(Mur(b, Mur(len, c))); -} - -template < bool bswap > -static inline uint32_t farmhashcc::Hash32Len5to12(const uint8_t *s, size_t len) { - uint32_t a = len, b = len * 5, c = 9, d = b; - a += Fetch32(s); - b += Fetch32(s + len - 4); - c += Fetch32(s + ((len >> 1) & 4)); - return fmix(Mur(c, Mur(b, Mur(a, d)))); -} - -template < bool bswap > 
-static uint32_t farmhashcc::Hash32(const uint8_t *s, size_t len) { - if (len <= 24) { - return len <= 12 ? - (len <= 4 ? farmhashcc::Hash32Len0to4(s, len) : farmhashcc::Hash32Len5to12(s, len)) : - farmhashcc::Hash32Len13to24(s, len); - } - - // len > 24 - uint32_t h = len, g = c1 * len, f = g; - uint32_t a0 = ROTR32(Fetch32(s + len - 4) * c1, 17) * c2; - uint32_t a1 = ROTR32(Fetch32(s + len - 8) * c1, 17) * c2; - uint32_t a2 = ROTR32(Fetch32(s + len - 16) * c1, 17) * c2; - uint32_t a3 = ROTR32(Fetch32(s + len - 12) * c1, 17) * c2; - uint32_t a4 = ROTR32(Fetch32(s + len - 20) * c1, 17) * c2; - h ^= a0; - h = ROTR32(h, 19); - h = h * 5 + 0xe6546b64; - h ^= a2; - h = ROTR32(h, 19); - h = h * 5 + 0xe6546b64; - g ^= a1; - g = ROTR32(g, 19); - g = g * 5 + 0xe6546b64; - g ^= a3; - g = ROTR32(g, 19); - g = g * 5 + 0xe6546b64; - f += a4; - f = ROTR32(f, 19); - f = f * 5 + 0xe6546b64; - size_t iters = (len - 1) / 20; - do { - uint32_t a0 = ROTR32(Fetch32(s) * c1, 17) * c2; - uint32_t a1 = Fetch32(s + 4); - uint32_t a2 = ROTR32(Fetch32(s + 8) * c1, 17) * c2; - uint32_t a3 = ROTR32(Fetch32(s + 12) * c1, 17) * c2; - uint32_t a4 = Fetch32(s + 16); + static inline uint32_t Hash32Len0to4( const uint8_t * s, size_t len ); + + template + static inline uint32_t Hash32Len5to12( const uint8_t * s, size_t len ); + + template + static inline uint32_t Hash32Len13to24( const uint8_t * s, size_t len ); + + template + static uint32_t Hash32( const uint8_t * s, size_t len ); + + template + static uint32_t Hash32WithSeed( const uint8_t * s, size_t len, uint32_t seed ); + + template + static inline uint64_t HashLen0to16( const uint8_t * s, size_t len ); + + template + static inline uint128_t CityMurmur( const uint8_t * s, size_t len, uint128_t seed ); + + template + static uint128_t Hash128WithSeed( const uint8_t * s, size_t len, uint128_t seed ); +} // namespace farmhashcc + +template +static inline uint32_t farmhashcc::Hash32Len13to24( const uint8_t * s, size_t len ) { + uint32_t a = 
Fetch32(s - 4 + (len >> 1)); + uint32_t b = Fetch32(s + 4); + uint32_t c = Fetch32(s + len - 8); + uint32_t d = Fetch32(s + (len >> 1)); + uint32_t e = Fetch32(s); + uint32_t f = Fetch32(s + len - 4); + uint32_t h = len; + + return fmix(Mur(f, Mur(e, Mur(d, Mur(c, Mur(b, Mur(a, h))))))); +} + +static inline uint32_t farmhashcc::Hash32Len0to4( const uint8_t * s, size_t len ) { + uint32_t b = 0; + uint32_t c = 9; + + for (size_t i = 0; i < len; i++) { + int8_t v = s[i]; + b = b * c1 + v; + c ^= b; + } + return fmix(Mur(b, Mur(len, c))); +} + +template +static inline uint32_t farmhashcc::Hash32Len5to12( const uint8_t * s, size_t len ) { + uint32_t a = len, b = len * 5, c = 9, d = b; + + a += Fetch32(s); + b += Fetch32(s + len - 4); + c += Fetch32(s + ((len >> 1) & 4)); + return fmix(Mur(c, Mur(b, Mur(a, d)))); +} + +template +static uint32_t farmhashcc::Hash32( const uint8_t * s, size_t len ) { + if (len <= 24) { + return len <= 12 ? + (len <= 4 ? farmhashcc::Hash32Len0to4(s, len) : farmhashcc::Hash32Len5to12(s, len)) : + farmhashcc::Hash32Len13to24(s, len); + } + + // len > 24 + uint32_t h = len, g = c1 * len, f = g; + uint32_t a0 = ROTR32(Fetch32(s + len - 4) * c1, 17) * c2; + uint32_t a1 = ROTR32(Fetch32(s + len - 8) * c1, 17) * c2; + uint32_t a2 = ROTR32(Fetch32(s + len - 16) * c1, 17) * c2; + uint32_t a3 = ROTR32(Fetch32(s + len - 12) * c1, 17) * c2; + uint32_t a4 = ROTR32(Fetch32(s + len - 20) * c1, 17) * c2; h ^= a0; - h = ROTR32(h, 18); + h = ROTR32(h, 19); + h = h * 5 + 0xe6546b64; + h ^= a2; + h = ROTR32(h, 19); + h = h * 5 + 0xe6546b64; + g ^= a1; + g = ROTR32(g, 19); + g = g * 5 + 0xe6546b64; + g ^= a3; + g = ROTR32(g, 19); + g = g * 5 + 0xe6546b64; + f += a4; + f = ROTR32(f, 19); + f = f * 5 + 0xe6546b64; + size_t iters = (len - 1) / 20; + do { + uint32_t a0 = ROTR32(Fetch32(s) * c1, 17) * c2; + uint32_t a1 = Fetch32(s + 4); + uint32_t a2 = ROTR32(Fetch32(s + 8) * c1, 17) * c2; + uint32_t a3 = ROTR32(Fetch32(s + 12) * c1, 17) * c2; + uint32_t a4 = 
Fetch32(s + 16); + h ^= a0; + h = ROTR32(h, 18); + h = h * 5 + 0xe6546b64; + f += a1; + f = ROTR32(f, 19); + f = f * c1; + g += a2; + g = ROTR32(g, 18); + g = g * 5 + 0xe6546b64; + h ^= a3 + a1; + h = ROTR32(h, 19); + h = h * 5 + 0xe6546b64; + g ^= a4; + g = BSWAP(g) * 5; + h += a4 * 5; + h = BSWAP(h); + f += a0; + PERMUTE3(f, h, g); + s += 20; + } while (--iters != 0); + g = ROTR32(g , 11) * c1; + g = ROTR32(g , 17) * c1; + f = ROTR32(f , 11) * c1; + f = ROTR32(f , 17) * c1; + h = ROTR32(h + g, 19); h = h * 5 + 0xe6546b64; - f += a1; - f = ROTR32(f, 19); - f = f * c1; - g += a2; - g = ROTR32(g, 18); - g = g * 5 + 0xe6546b64; - h ^= a3 + a1; - h = ROTR32(h, 19); + h = ROTR32(h , 17) * c1; + h = ROTR32(h + f, 19); h = h * 5 + 0xe6546b64; - g ^= a4; - g = BSWAP(g) * 5; - h += a4 * 5; - h = BSWAP(h); - f += a0; - PERMUTE3(f, h, g); - s += 20; - } while (--iters != 0); - g = ROTR32(g, 11) * c1; - g = ROTR32(g, 17) * c1; - f = ROTR32(f, 11) * c1; - f = ROTR32(f, 17) * c1; - h = ROTR32(h + g, 19); - h = h * 5 + 0xe6546b64; - h = ROTR32(h, 17) * c1; - h = ROTR32(h + f, 19); - h = h * 5 + 0xe6546b64; - h = ROTR32(h, 17) * c1; - return h; -} - -template < bool bswap > -static uint32_t farmhashcc::Hash32WithSeed(const uint8_t *s, size_t len, uint32_t seed) { - if (len <= 24) { - if (len >= 13) return farmhashmk::Hash32Len13to24(s, len, seed * c1); - else if (len >= 5) return farmhashmk::Hash32Len5to12(s, len, seed); - else return farmhashmk::Hash32Len0to4(s, len, seed); - } - uint32_t h = farmhashmk::Hash32Len13to24(s, 24, seed ^ len); - return Mur(farmhashcc::Hash32(s + 24, len - 24) + seed, h); -} - -template < bool bswap > -static inline uint64_t farmhashcc::HashLen0to16(const uint8_t *s, size_t len) { - if (len >= 8) { - uint64_t mul = k2 + len * 2; - uint64_t a = Fetch64(s) + k2; - uint64_t b = Fetch64(s + len - 8); - uint64_t c = ROTR64(b, 37) * mul + a; - uint64_t d = (ROTR64(a, 25) + b) * mul; - return HashLen16(c, d, mul); - } - if (len >= 4) { - uint64_t mul = k2 + 
len * 2; - uint64_t a = Fetch32(s); - return HashLen16(len + (a << 3), Fetch32(s + len - 4), mul); - } - if (len > 0) { - uint8_t a = s[0]; - uint8_t b = s[len >> 1]; - uint8_t c = s[len - 1]; - uint32_t y = static_cast(a) + (static_cast(b) << 8); - uint32_t z = len + (static_cast(c) << 2); - return ShiftMix(y * k2 ^ z * k0) * k2; - } - return k2; -} - -template < bool bswap > -static inline uint128_t farmhashcc::CityMurmur(const uint8_t *s, size_t len, uint128_t seed) { - uint64_t a = Uint128Low64(seed); - uint64_t b = Uint128High64(seed); - uint64_t c = 0; - uint64_t d = 0; - signed long l = len - 16; - if (l <= 0) { // len <= 16 - a = ShiftMix(a * k1) * k1; - c = b * k1 + farmhashcc::HashLen0to16(s, len); - d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c)); - } else { // len > 16 - c = HashLen16(Fetch64(s + len - 8) + k1, a); - d = HashLen16(b + len, c + Fetch64(s + len - 16)); - a += d; + h = ROTR32(h , 17) * c1; + return h; +} + +template +static uint32_t farmhashcc::Hash32WithSeed( const uint8_t * s, size_t len, uint32_t seed ) { + if (len <= 24) { + if (len >= 13) { return farmhashmk::Hash32Len13to24(s, len, seed * c1); } else if (len >= 5) { + return farmhashmk::Hash32Len5to12(s, len, seed); + } else { + return farmhashmk::Hash32Len0to4(s, len, seed); + } + } + uint32_t h = farmhashmk::Hash32Len13to24(s, 24, seed ^ len); + return Mur(farmhashcc::Hash32(s + 24, len - 24) + seed, h); +} + +template +static inline uint64_t farmhashcc::HashLen0to16( const uint8_t * s, size_t len ) { + if (len >= 8) { + uint64_t mul = k2 + len * 2; + uint64_t a = Fetch64(s) + k2; + uint64_t b = Fetch64(s + len - 8); + uint64_t c = ROTR64(b, 37) * mul + a; + uint64_t d = (ROTR64(a, 25) + b) * mul; + return HashLen16(c, d, mul); + } + if (len >= 4) { + uint64_t mul = k2 + len * 2; + uint64_t a = Fetch32(s); + return HashLen16(len + (a << 3), Fetch32(s + len - 4), mul); + } + if (len > 0) { + uint8_t a = s[0]; + uint8_t b = s[len >> 1]; + uint8_t c = s[len - 1]; + uint32_t y = 
static_cast(a) + (static_cast(b) << 8); + uint32_t z = len + (static_cast(c) << 2); + return ShiftMix(y * k2 ^ z * k0) * k2; + } + return k2; +} + +template +static inline uint128_t farmhashcc::CityMurmur( const uint8_t * s, size_t len, uint128_t seed ) { + uint64_t a = Uint128Low64(seed); + uint64_t b = Uint128High64(seed); + uint64_t c = 0; + uint64_t d = 0; + signed long l = len - 16; + + if (l <= 0) { // len <= 16 + a = ShiftMix(a * k1) * k1; + c = b * k1 + farmhashcc::HashLen0to16(s, len); + d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c)); + } else { // len > 16 + c = HashLen16(Fetch64(s + len - 8) + k1, a ); + d = HashLen16(b + len, c + Fetch64(s + len - 16)); + a += d; + do { + a ^= ShiftMix(Fetch64(s) * k1) * k1; + a *= k1; + b ^= a; + c ^= ShiftMix(Fetch64(s + 8) * k1) * k1; + c *= k1; + d ^= c; + s += 16; + l -= 16; + } while (l > 0); + } + a = HashLen16(a, c); + b = HashLen16(d, b); + return Uint128(a ^ b, HashLen16(b, a)); +} + +template +static uint128_t farmhashcc::Hash128WithSeed( const uint8_t * s, size_t len, uint128_t seed ) { + if (len < 128) { + return farmhashcc::CityMurmur(s, len, seed); + } + + // We expect len >= 128 to be the common case. Keep 56 bytes of state: + // v, w, x, y, and z. + pair v, w; + uint64_t x = Uint128Low64(seed); + uint64_t y = Uint128High64(seed); + uint64_t z = len * k1; + v.first = ROTR64(y ^ k1 , 49) * k1 + Fetch64(s); + v.second = ROTR64(v.first, 42) * k1 + Fetch64(s + 8); + w.first = ROTR64(y + z , 35) * k1 + x; + w.second = ROTR64(x + Fetch64(s + 88), 53) * k1; + + // This is the same inner loop as CityHash64(), manually unrolled. 
do { - a ^= ShiftMix(Fetch64(s) * k1) * k1; - a *= k1; - b ^= a; - c ^= ShiftMix(Fetch64(s + 8) * k1) * k1; - c *= k1; - d ^= c; - s += 16; - l -= 16; - } while (l > 0); - } - a = HashLen16(a, c); - b = HashLen16(d, b); - return Uint128(a ^ b, HashLen16(b, a)); -} - -template < bool bswap > -static uint128_t farmhashcc::Hash128WithSeed(const uint8_t *s, size_t len, uint128_t seed) { - if (len < 128) { - return farmhashcc::CityMurmur(s, len, seed); - } - - // We expect len >= 128 to be the common case. Keep 56 bytes of state: - // v, w, x, y, and z. - pair v, w; - uint64_t x = Uint128Low64(seed); - uint64_t y = Uint128High64(seed); - uint64_t z = len * k1; - v.first = ROTR64(y ^ k1, 49) * k1 + Fetch64(s); - v.second = ROTR64(v.first, 42) * k1 + Fetch64(s + 8); - w.first = ROTR64(y + z, 35) * k1 + x; - w.second = ROTR64(x + Fetch64(s + 88), 53) * k1; - - // This is the same inner loop as CityHash64(), manually unrolled. - do { - x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * k1; - y = ROTR64(y + v.second + Fetch64(s + 48), 42) * k1; - x ^= w.second; - y += v.first + Fetch64(s + 40); - z = ROTR64(z + w.first, 33) * k1; - v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); - w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); - std::swap(z, x); - s += 64; - x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * k1; - y = ROTR64(y + v.second + Fetch64(s + 48), 42) * k1; - x ^= w.second; - y += v.first + Fetch64(s + 40); - z = ROTR64(z + w.first, 33) * k1; - v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); - w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); - std::swap(z, x); - s += 64; - len -= 128; - } while (likely(len >= 128)); - x += ROTR64(v.first + z, 49) * k0; - y = y * k0 + ROTR64(w.second, 37); - z = z * k0 + ROTR64(w.first, 27); - w.first *= 9; - v.first *= k0; - // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. 
- for (size_t tail_done = 0; tail_done < len; ) { - tail_done += 32; - y = ROTR64(x + y, 42) * k0 + v.second; - w.first += Fetch64(s + len - tail_done + 16); - x = x * k0 + w.first; - z += w.second + Fetch64(s + len - tail_done); - w.second += v.first; - v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second); + x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * k1; + y = ROTR64(y + v.second + Fetch64 (s + 48), 42) * k1; + x ^= w.second; + y += v.first + Fetch64(s + 40); + z = ROTR64(z + w.first, 33) * k1; + v = WeakHashLen32WithSeeds(s , v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second , y + Fetch64(s + 16)); + std::swap(z, x); + s += 64; + x = ROTR64(x + y + v.first + Fetch64(s + 8), 37) * k1; + y = ROTR64(y + v.second + Fetch64 (s + 48), 42) * k1; + x ^= w.second; + y += v.first + Fetch64(s + 40); + z = ROTR64(z + w.first, 33) * k1; + v = WeakHashLen32WithSeeds(s , v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second , y + Fetch64(s + 16)); + std::swap(z, x); + s += 64; + len -= 128; + } while (likely(len >= 128)); + x += ROTR64(v.first + z, 49) * k0; + y = y * k0 + ROTR64(w.second, 37); + z = z * k0 + ROTR64(w.first , 27); + w.first *= 9; v.first *= k0; - } - // At this point our 56 bytes of state should contain more than - // enough information for a strong 128-bit hash. We use two - // different 56-byte-to-8-byte hashes to get a 16-byte final result. - x = HashLen16(x, v.first); - y = HashLen16(y + z, w.first); - return Uint128(HashLen16(x + v.second, w.second) + y, - HashLen16(x + w.second, y + v.second)); + // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. 
+ for (size_t tail_done = 0; tail_done < len;) { + tail_done += 32; + y = ROTR64(x + y, 42) * k0 + v.second; + w.first += Fetch64 (s + len - tail_done + 16); + x = x * k0 + w.first; + z += w.second + Fetch64 (s + len - tail_done); + w.second += v.first; + v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second); + v.first *= k0; + } + // At this point our 56 bytes of state should contain more than + // enough information for a strong 128-bit hash. We use two + // different 56-byte-to-8-byte hashes to get a 16-byte final result. + x = HashLen16(x , v.first); + y = HashLen16(y + z, w.first); + return Uint128(HashLen16(x + v.second, w.second) + y, HashLen16(x + w.second, y + v.second)); } //------------------------------------------------------------ -template < bool bswap > -static void FarmHashNA(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void FarmHashNA( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = farmhashna::Hash64WithSeed((const uint8_t *)in, len, seed); + PUT_U64(h, (uint8_t *)out, 0); } -template < bool bswap > -static void FarmHashUO(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void FarmHashUO( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = farmhashuo::Hash64WithSeed((const uint8_t *)in, len, seed); + PUT_U64(h, (uint8_t *)out, 0); } @@ -1547,309 +1600,323 @@ static void FarmHashUO(const void * in, const size_t len, const seed_t seed, voi // version, the XO version won't be tested explicitly. 
#if defined(HAVE_SSE_4_1) -template < bool bswap > -static void FarmHashTE(const void * in, const size_t len, const seed_t seed, void * out) { + +template +static void FarmHashTE( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = farmhashte::Hash64WithSeed((const uint8_t *)in, len, seed); + PUT_U64(h, (uint8_t *)out, 0); } -template < bool bswap > -static void FarmHashNT(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void FarmHashNT( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = farmhashnt::Hash32WithSeed((const uint8_t *)in, len, seed); + PUT_U32(h, (uint8_t *)out, 0); } + #endif -template < bool bswap > -static void FarmHashMK(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void FarmHashMK( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = farmhashmk::Hash32WithSeed((const uint8_t *)in, len, seed); + PUT_U32(h, (uint8_t *)out, 0); } #if defined(HAVE_X86_64_CRC32C) && defined(HAVE_X86_64_AES) -template < bool bswap > -static void FarmHashSU(const void * in, const size_t len, const seed_t seed, void * out) { + +template +static void FarmHashSU( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = farmhashsu::Hash32WithSeed((const uint8_t *)in, len, seed); + PUT_U32(h, (uint8_t *)out, 0); } + #endif #if defined(HAVE_X86_64_CRC32C) -template < bool bswap > -static void FarmHashSA(const void * in, const size_t len, const seed_t seed, void * out) { + +template +static void FarmHashSA( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = farmhashsa::Hash32WithSeed((const uint8_t *)in, len, seed); + PUT_U32(h, (uint8_t *)out, 0); } + #endif -template < bool bswap > -static void FarmHashCC_32(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void FarmHashCC_32( const void * in, const size_t len, const 
seed_t seed, void * out ) { uint32_t h = farmhashcc::Hash32WithSeed((const uint8_t *)in, len, seed); + PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap, uint32_t seedmode > -static void FarmHashCC_128(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void FarmHashCC_128( const void * in, const size_t len, const seed_t seed, void * out ) { uint128_t seed128; - switch(seedmode) { + + switch (seedmode) { case 1: seed128 = Uint128((uint64_t)seed, 0); break; case 2: seed128 = Uint128(0, (uint64_t)seed); break; case 3: seed128 = Uint128((uint64_t)seed, (uint64_t)seed); break; default: exit(1); } uint128_t h = farmhashcc::Hash128WithSeed((const uint8_t *)in, len, seed128); - PUT_U64(Uint128Low64(h), (uint8_t *)out, 0); + PUT_U64(Uint128Low64(h) , (uint8_t *)out, 0); PUT_U64(Uint128High64(h), (uint8_t *)out, 8); } -template < bool bswap, uint32_t seedmode > -static void FarmHashCityMurmur_128(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void FarmHashCityMurmur_128( const void * in, const size_t len, const seed_t seed, void * out ) { uint128_t seed128; - switch(seedmode) { + + switch (seedmode) { case 1: seed128 = Uint128((uint64_t)seed, 0); break; case 2: seed128 = Uint128(0, (uint64_t)seed); break; case 3: seed128 = Uint128((uint64_t)seed, (uint64_t)seed); break; default: exit(1); } uint128_t h = farmhashcc::CityMurmur((const uint8_t *)in, len, seed128); - PUT_U64(Uint128Low64(h), (uint8_t *)out, 0); + PUT_U64(Uint128Low64(h) , (uint8_t *)out, 0); PUT_U64(Uint128High64(h), (uint8_t *)out, 8); } REGISTER_FAMILY(farmhash, - $.src_url = "https://github.com/google/farmhash", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/google/farmhash", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(FarmHash_64__NA, - $.desc = "FarmHash Hash64WithSeed (NA version)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - 
FLAG_IMPL_LICENSE_MIT , - $.bits = 64, - $.sort_order = 10, - $.verification_LE = 0xEBC4A679, - $.verification_BE = 0xB24C5C09, - $.hashfn_native = FarmHashNA, - $.hashfn_bswap = FarmHashNA -); + $.desc = "FarmHash Hash64WithSeed (NA version)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.sort_order = 10, + $.verification_LE = 0xEBC4A679, + $.verification_BE = 0xB24C5C09, + $.hashfn_native = FarmHashNA, + $.hashfn_bswap = FarmHashNA + ); REGISTER_HASH(FarmHash_64__UO, - $.desc = "FarmHash Hash64WithSeed (UO version)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 64, - $.sort_order = 20, - $.verification_LE = 0x5438EF2C, - $.verification_BE = 0x72B8113E, - $.hashfn_native = FarmHashUO, - $.hashfn_bswap = FarmHashUO -); + $.desc = "FarmHash Hash64WithSeed (UO version)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.sort_order = 20, + $.verification_LE = 0x5438EF2C, + $.verification_BE = 0x72B8113E, + $.hashfn_native = FarmHashUO, + $.hashfn_bswap = FarmHashUO + ); #if defined(HAVE_SSE_4_1) REGISTER_HASH(FarmHash_64__TE, - $.desc = "FarmHash Hash64WithSeed (TE version)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 64, - $.sort_order = 30, - $.verification_LE = 0xF1BF42C3, - $.verification_BE = 0x7188736E, - $.hashfn_native = FarmHashTE, - $.hashfn_bswap = FarmHashTE -); + $.desc = "FarmHash Hash64WithSeed (TE version)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.sort_order = 30, + $.verification_LE = 0xF1BF42C3, + $.verification_BE = 0x7188736E, + $.hashfn_native = FarmHashTE, + $.hashfn_bswap = FarmHashTE + ); REGISTER_HASH(FarmHash_32__NT, - 
$.desc = "FarmHash Hash32WithSeed (NT version)", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 32, - $.sort_order = 40, - $.verification_LE = 0x47AB39AF, - $.verification_BE = 0x6AE8BA9B, - $.hashfn_native = FarmHashNT, - $.hashfn_bswap = FarmHashNT -); + $.desc = "FarmHash Hash32WithSeed (NT version)", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.sort_order = 40, + $.verification_LE = 0x47AB39AF, + $.verification_BE = 0x6AE8BA9B, + $.hashfn_native = FarmHashNT, + $.hashfn_bswap = FarmHashNT + ); #endif REGISTER_HASH(FarmHash_32__MK, - $.desc = "FarmHash Hash32WithSeed (MK version)", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 32, - $.sort_order = 50, - $.verification_LE = 0x0DC9AF39, - $.verification_BE = 0x6B67BB90, - $.hashfn_native = FarmHashMK, - $.hashfn_bswap = FarmHashMK -); + $.desc = "FarmHash Hash32WithSeed (MK version)", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.sort_order = 50, + $.verification_LE = 0x0DC9AF39, + $.verification_BE = 0x6B67BB90, + $.hashfn_native = FarmHashMK, + $.hashfn_bswap = FarmHashMK + ); #if defined(HAVE_X86_64_CRC32C) && defined(HAVE_X86_64_AES) REGISTER_HASH(FarmHash_32__SU, - $.desc = "FarmHash Hash32WithSeed (SU version)", - $.hash_flags = - FLAG_HASH_SMALL_SEED | - FLAG_HASH_AES_BASED | - FLAG_HASH_CRC_BASED , - $.impl_flags = - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 32, - $.sort_order = 60, - $.verification_LE = 0xE7A53C98, - $.verification_BE = 0x9CC06B52, - $.hashfn_native = FarmHashSU, - $.hashfn_bswap = FarmHashSU -); + $.desc = "FarmHash Hash32WithSeed (SU version)", + $.hash_flags 
= + FLAG_HASH_SMALL_SEED | + FLAG_HASH_AES_BASED | + FLAG_HASH_CRC_BASED, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.sort_order = 60, + $.verification_LE = 0xE7A53C98, + $.verification_BE = 0x9CC06B52, + $.hashfn_native = FarmHashSU, + $.hashfn_bswap = FarmHashSU + ); #endif #if defined(HAVE_X86_64_CRC32C) REGISTER_HASH(FarmHash_32__SA, - $.desc = "FarmHash Hash32WithSeed (SA version)", - $.hash_flags = - FLAG_HASH_SMALL_SEED | - FLAG_HASH_CRC_BASED , - $.impl_flags = - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 32, - $.sort_order = 70, - $.verification_LE = 0x553B1655, - $.verification_BE = 0x19A1CCEA, - $.hashfn_native = FarmHashSA, - $.hashfn_bswap = FarmHashSA -); + $.desc = "FarmHash Hash32WithSeed (SA version)", + $.hash_flags = + FLAG_HASH_SMALL_SEED | + FLAG_HASH_CRC_BASED, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.sort_order = 70, + $.verification_LE = 0x553B1655, + $.verification_BE = 0x19A1CCEA, + $.hashfn_native = FarmHashSA, + $.hashfn_bswap = FarmHashSA + ); #endif REGISTER_HASH(FarmHash_32__CC, - $.desc = "FarmHash Hash32WithSeed (CC version)", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 32, - $.sort_order = 80, - $.verification_LE = 0x61DEEE7E, - $.verification_BE = 0xAE9514F0, - $.hashfn_native = FarmHashCC_32, - $.hashfn_bswap = FarmHashCC_32 -); + $.desc = "FarmHash Hash32WithSeed (CC version)", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.sort_order = 80, + $.verification_LE = 0x61DEEE7E, + $.verification_BE = 0xAE9514F0, + $.hashfn_native = FarmHashCC_32, + $.hashfn_bswap = FarmHashCC_32 + ); REGISTER_HASH(FarmHash_128__CC__seed1, - $.desc = "FarmHash Hash128WithSeed (CC version, seeded low 64 
bit)", - $.hash_flags = - FLAG_HASH_XL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.sort_order = 90, - $.verification_LE = 0x305C0D9A, - $.verification_BE = 0xDC1669A2, - $.hashfn_native = FarmHashCC_128, - $.hashfn_bswap = FarmHashCC_128 -); + $.desc = "FarmHash Hash128WithSeed (CC version, seeded low 64 bit)", + $.hash_flags = + FLAG_HASH_XL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.sort_order = 90, + $.verification_LE = 0x305C0D9A, + $.verification_BE = 0xDC1669A2, + $.hashfn_native = FarmHashCC_128, + $.hashfn_bswap = FarmHashCC_128 + ); REGISTER_HASH(FarmHash_128__CC__seed2, - $.desc = "FarmHash Hash128WithSeed (CC version, seeded high 64 bit)", - $.hash_flags = - FLAG_HASH_XL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.sort_order = 100, - $.verification_LE = 0x0DB4D383, - $.verification_BE = 0xFA39DBEA, - $.hashfn_native = FarmHashCC_128, - $.hashfn_bswap = FarmHashCC_128 -); + $.desc = "FarmHash Hash128WithSeed (CC version, seeded high 64 bit)", + $.hash_flags = + FLAG_HASH_XL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.sort_order = 100, + $.verification_LE = 0x0DB4D383, + $.verification_BE = 0xFA39DBEA, + $.hashfn_native = FarmHashCC_128, + $.hashfn_bswap = FarmHashCC_128 + ); REGISTER_HASH(FarmHash_128__CC__seed3, - $.desc = "FarmHash Hash128WithSeed (CC version, seeded low+high 64 bit)", - $.hash_flags = - FLAG_HASH_XL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.sort_order = 110, - $.verification_LE = 0xA93EBF71, - $.verification_BE = 0x38CD0ED1, - $.hashfn_native = FarmHashCC_128, - $.hashfn_bswap = FarmHashCC_128 -); + $.desc = "FarmHash Hash128WithSeed (CC version, seeded low+high 64 
bit)", + $.hash_flags = + FLAG_HASH_XL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.sort_order = 110, + $.verification_LE = 0xA93EBF71, + $.verification_BE = 0x38CD0ED1, + $.hashfn_native = FarmHashCC_128, + $.hashfn_bswap = FarmHashCC_128 + ); REGISTER_HASH(FarmHash_128__CM__seed1, - $.desc = "FarmHash CityMurmur (CM version, seeded low 64 bit)", - $.hash_flags = - FLAG_HASH_XL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.sort_order = 120, - $.verification_LE = 0x6593FD6D, - $.verification_BE = 0xF84ED47F, - $.hashfn_native = FarmHashCityMurmur_128, - $.hashfn_bswap = FarmHashCityMurmur_128 -); + $.desc = "FarmHash CityMurmur (CM version, seeded low 64 bit)", + $.hash_flags = + FLAG_HASH_XL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.sort_order = 120, + $.verification_LE = 0x6593FD6D, + $.verification_BE = 0xF84ED47F, + $.hashfn_native = FarmHashCityMurmur_128, + $.hashfn_bswap = FarmHashCityMurmur_128 + ); REGISTER_HASH(FarmHash_128__CM__seed2, - $.desc = "FarmHash CityMurmur (CM version, seeded high 64 bit)", - $.hash_flags = - FLAG_HASH_XL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.sort_order = 130, - $.verification_LE = 0xF1483884, - $.verification_BE = 0x5185F2C4, - $.hashfn_native = FarmHashCityMurmur_128, - $.hashfn_bswap = FarmHashCityMurmur_128 -); + $.desc = "FarmHash CityMurmur (CM version, seeded high 64 bit)", + $.hash_flags = + FLAG_HASH_XL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.sort_order = 130, + $.verification_LE = 0xF1483884, + $.verification_BE = 0x5185F2C4, + $.hashfn_native = FarmHashCityMurmur_128, + $.hashfn_bswap = FarmHashCityMurmur_128 + ); 
REGISTER_HASH(FarmHash_128__CM__seed3, - $.desc = "FarmHash CityMurmur (CM version, seeded low+high 64 bit)", - $.hash_flags = - FLAG_HASH_XL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 128, - $.sort_order = 140, - $.verification_LE = 0x6D028510, - $.verification_BE = 0xFC258701, - $.hashfn_native = FarmHashCityMurmur_128, - $.hashfn_bswap = FarmHashCityMurmur_128 -); + $.desc = "FarmHash CityMurmur (CM version, seeded low+high 64 bit)", + $.hash_flags = + FLAG_HASH_XL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.sort_order = 140, + $.verification_LE = 0x6D028510, + $.verification_BE = 0xFC258701, + $.hashfn_native = FarmHashCityMurmur_128, + $.hashfn_bswap = FarmHashCityMurmur_128 + ); diff --git a/hashes/farsh.cpp b/hashes/farsh.cpp index d5d888d1..522b5c81 100644 --- a/hashes/farsh.cpp +++ b/hashes/farsh.cpp @@ -28,130 +28,138 @@ #include "Hashlib.h" #if defined(HAVE_AVX2) || defined(HAVE_SSE_2) -#include "Intrinsics.h" + #include "Intrinsics.h" #endif -#define FARSH_MAX_HASHES 32 /* number of 32-bit hashes supported by the built-in key */ -#define FARSH_BASE_KEY_SIZE 1024 /* size of user-supplied key required to compute 32-bit hash with index 0 */ -#define FARSH_EXTRA_KEY_SIZE 16 /* extra bytes required to compute 32-bit hash with every next index */ +#define FARSH_MAX_HASHES 32 /* number of 32-bit hashes supported by the built-in key */ +#define FARSH_BASE_KEY_SIZE 1024 /* size of user-supplied key required to compute 32-bit hash with index 0 */ +#define FARSH_EXTRA_KEY_SIZE 16 /* extra bytes required to compute 32-bit hash with every next index */ #define STRIPE FARSH_BASE_KEY_SIZE -#define STRIPE_ELEMENTS (STRIPE/sizeof(uint32_t)) /* should be power of 2 due to use of 'x % STRIPE_ELEMENTS' below */ -#define EXTRA_ELEMENTS (((FARSH_MAX_HASHES-1) * FARSH_EXTRA_KEY_SIZE) / sizeof(uint32_t)) +#define STRIPE_ELEMENTS (STRIPE / 
sizeof(uint32_t)) /* + * should be power of 2 due to use of 'x % STRIPE_ELEMENTS' below + * + */ +#define EXTRA_ELEMENTS (((FARSH_MAX_HASHES - 1) * FARSH_EXTRA_KEY_SIZE) / sizeof(uint32_t)) /* STRIPE bytes of key material plus extra keys for hashes up to 1024 bits long */ -alignas(32) static const uint32_t FARSH_KEYS [STRIPE_ELEMENTS + EXTRA_ELEMENTS] = { - 0xb8fe6c39,0x23a44bbe,0x7c01812c,0xf721ad1c,0xded46de9,0x839097db,0x7240a4a4,0xb7b3671f, - 0xcb79e64e,0xccc0e578,0x825ad07d,0xccff7221,0xb8084674,0xf743248e,0xe03590e6,0x813a264c, - 0x3c2852bb,0x91c300cb,0x88d0658b,0x1b532ea3,0x71644897,0xa20df94e,0x3819ef46,0xa9deacd8, - 0xa8fa763f,0xe39c343f,0xf9dcbbc7,0xc70b4f1d,0x8a51e04b,0xcdb45931,0xc89f7ec9,0xd9787364, - 0x4f6a0752,0xa79b079c,0x8fc49499,0x8ec9b7a9,0x33c92249,0x4eb6404f,0xfb2afb4e,0xa4814255, - 0x2f0e1b98,0xace93b24,0x188850cd,0x6c5c74a7,0x66fa4404,0xeac5ac83,0x34d3ebc3,0xc581a0ff, - 0xfa1363eb,0x170ddd51,0xb7f0da49,0xd3165526,0x29d4689e,0x2b16be58,0x7d47a1fc,0x8ff8b8d1, - 0x7ad031ce,0x45cb3a8f,0x95160428,0xafd7fbca,0xbb4b407e,0x995274a4,0xeb9a2d93,0x3be78908, - 0xed475f6c,0x919cd8f2,0xd3861e5a,0x6e31390c,0xfe6a3a49,0xdcad0914,0x06508beb,0xa88399f3, - 0xb058112f,0xe8b0fa79,0x29b4da06,0xedc253fb,0xc3e96dad,0x6e372b83,0x4f78b153,0xfffa6e86, - 0x21beeeec,0x01caea02,0x1267e50d,0x11e6092f,0xe819d298,0x832f80dd,0x0c4e2477,0xbc7886eb, - 0x01506637,0x8ba89668,0x6d11e7a0,0xfc12fd15,0x86a54c19,0x593ce3dd,0xd2b13fe5,0x8e772b53, - 0xae4a60cc,0x647a3b1b,0x547786e0,0x3ec4378e,0x8d7acf89,0xca36f947,0x0e89d5ef,0xaada6a3c, - 0x6da4a109,0x9ac6e11c,0x686691ef,0xa357bd2b,0xd16f1b9a,0x38c70303,0x7d4622b3,0x2968fa8f, - 0x8ca5bcb9,0xfcd61005,0x228b5e96,0x2c9dcc19,0x57cf243c,0x3c53f9c1,0x0cc7952c,0x686de4f0, - 0x93a747b5,0x4e87a510,0x975e91ae,0x4c10b98e,0x8a7f068c,0x346b19ab,0x353ca625,0xf20a50e0, - 0xce9921f6,0xdf66e014,0x0a11ef4b,0x8bc84ddf,0x84d25d22,0xc823936d,0x94741ec3,0x88278a60, - 0xb8649331,0x7a707a10,0x7292cad6,0xa7c644c2,0xbd156bfa,0x646c9578,0xb7f4dfd5,0x9f8277a7, - 
0x7013924e,0xad674cc3,0x2cae9d05,0x912a9a22,0xf67c53fa,0x8d7e22a9,0x59ae372b,0x850199f3, - 0x63a2102c,0xd6ff1261,0x56738ee1,0xaa95145b,0xfdd12832,0x5b684deb,0x0784de94,0xaa62390e, - 0xbb7ccf19,0x0fefd572,0x565b41ca,0x2206d202,0x2d608479,0x4c0fcd3d,0xd36d3be3,0x155a9a65, - 0x10f9e732,0xac9b0f1e,0x1f72a03b,0xea9440ae,0x5b674b4f,0x31a827d1,0xecca954f,0x3d2cd61e, - 0x768d3da4,0x93745ac1,0x1d5d58cb,0x4b86f3b6,0x2aba923a,0x0e65814c,0x8ae063d9,0xcd6969b0, - 0x36641585,0x742af59d,0x613a1316,0x338ea471,0x47861af3,0x30479dc3,0x1270a481,0x08771069, - 0xe3c4f0d2,0x0229874c,0x5a8a3bc1,0xe30d9733,0xd05be5a2,0xe2af31ba,0x222049f9,0x9f923b6a, - 0x033f64ec,0xe528b62b,0x8201efbd,0x2107d877,0xd8312ef1,0xa5679f99,0x1730b51b,0x752616d2, - 0x05305909,0x0dca440b,0x2093cdd9,0x6409ab50,0xba5c8ecc,0x8d4708ea,0x429f0917,0xb762fab0, - 0x5161ea75,0x45eba0eb,0xb6f34b41,0x52047123,0xe4181523,0x8d74e90a,0x54fa401c,0xddda0cc7, - 0x63df182a,0xc6403ef6,0x348ec6e8,0xb9ff57f5,0xf652b8bd,0x0f86b0f3,0xfb3a088a,0x4dc71533, - 0x7b3617d2,0xa34e87eb,0xba2a9bdd,0xe3381306,0x14bad6bb,0xc96dc7c2,0x333b54b6,0x9be47cfa, - 0x1dcf9299,0xe7ea5f99,0xb38feacd,0xc3cfe2f7,0x5b87e822,0x39c5ab56,0x18f4a18f,0x2d484d9c, - 0x4163d519,0x79769e98,0xf58a67f0,0x40590c02,0x319671c0,0x266b133a,0xaf81b287,0x6a31f737, +alignas(32) static const uint32_t FARSH_KEYS[STRIPE_ELEMENTS + EXTRA_ELEMENTS] = { + 0xb8fe6c39, 0x23a44bbe, 0x7c01812c, 0xf721ad1c, 0xded46de9, 0x839097db, 0x7240a4a4, 0xb7b3671f, + 0xcb79e64e, 0xccc0e578, 0x825ad07d, 0xccff7221, 0xb8084674, 0xf743248e, 0xe03590e6, 0x813a264c, + 0x3c2852bb, 0x91c300cb, 0x88d0658b, 0x1b532ea3, 0x71644897, 0xa20df94e, 0x3819ef46, 0xa9deacd8, + 0xa8fa763f, 0xe39c343f, 0xf9dcbbc7, 0xc70b4f1d, 0x8a51e04b, 0xcdb45931, 0xc89f7ec9, 0xd9787364, + 0x4f6a0752, 0xa79b079c, 0x8fc49499, 0x8ec9b7a9, 0x33c92249, 0x4eb6404f, 0xfb2afb4e, 0xa4814255, + 0x2f0e1b98, 0xace93b24, 0x188850cd, 0x6c5c74a7, 0x66fa4404, 0xeac5ac83, 0x34d3ebc3, 0xc581a0ff, + 0xfa1363eb, 0x170ddd51, 0xb7f0da49, 0xd3165526, 
0x29d4689e, 0x2b16be58, 0x7d47a1fc, 0x8ff8b8d1, + 0x7ad031ce, 0x45cb3a8f, 0x95160428, 0xafd7fbca, 0xbb4b407e, 0x995274a4, 0xeb9a2d93, 0x3be78908, + 0xed475f6c, 0x919cd8f2, 0xd3861e5a, 0x6e31390c, 0xfe6a3a49, 0xdcad0914, 0x06508beb, 0xa88399f3, + 0xb058112f, 0xe8b0fa79, 0x29b4da06, 0xedc253fb, 0xc3e96dad, 0x6e372b83, 0x4f78b153, 0xfffa6e86, + 0x21beeeec, 0x01caea02, 0x1267e50d, 0x11e6092f, 0xe819d298, 0x832f80dd, 0x0c4e2477, 0xbc7886eb, + 0x01506637, 0x8ba89668, 0x6d11e7a0, 0xfc12fd15, 0x86a54c19, 0x593ce3dd, 0xd2b13fe5, 0x8e772b53, + 0xae4a60cc, 0x647a3b1b, 0x547786e0, 0x3ec4378e, 0x8d7acf89, 0xca36f947, 0x0e89d5ef, 0xaada6a3c, + 0x6da4a109, 0x9ac6e11c, 0x686691ef, 0xa357bd2b, 0xd16f1b9a, 0x38c70303, 0x7d4622b3, 0x2968fa8f, + 0x8ca5bcb9, 0xfcd61005, 0x228b5e96, 0x2c9dcc19, 0x57cf243c, 0x3c53f9c1, 0x0cc7952c, 0x686de4f0, + 0x93a747b5, 0x4e87a510, 0x975e91ae, 0x4c10b98e, 0x8a7f068c, 0x346b19ab, 0x353ca625, 0xf20a50e0, + 0xce9921f6, 0xdf66e014, 0x0a11ef4b, 0x8bc84ddf, 0x84d25d22, 0xc823936d, 0x94741ec3, 0x88278a60, + 0xb8649331, 0x7a707a10, 0x7292cad6, 0xa7c644c2, 0xbd156bfa, 0x646c9578, 0xb7f4dfd5, 0x9f8277a7, + 0x7013924e, 0xad674cc3, 0x2cae9d05, 0x912a9a22, 0xf67c53fa, 0x8d7e22a9, 0x59ae372b, 0x850199f3, + 0x63a2102c, 0xd6ff1261, 0x56738ee1, 0xaa95145b, 0xfdd12832, 0x5b684deb, 0x0784de94, 0xaa62390e, + 0xbb7ccf19, 0x0fefd572, 0x565b41ca, 0x2206d202, 0x2d608479, 0x4c0fcd3d, 0xd36d3be3, 0x155a9a65, + 0x10f9e732, 0xac9b0f1e, 0x1f72a03b, 0xea9440ae, 0x5b674b4f, 0x31a827d1, 0xecca954f, 0x3d2cd61e, + 0x768d3da4, 0x93745ac1, 0x1d5d58cb, 0x4b86f3b6, 0x2aba923a, 0x0e65814c, 0x8ae063d9, 0xcd6969b0, + 0x36641585, 0x742af59d, 0x613a1316, 0x338ea471, 0x47861af3, 0x30479dc3, 0x1270a481, 0x08771069, + 0xe3c4f0d2, 0x0229874c, 0x5a8a3bc1, 0xe30d9733, 0xd05be5a2, 0xe2af31ba, 0x222049f9, 0x9f923b6a, + 0x033f64ec, 0xe528b62b, 0x8201efbd, 0x2107d877, 0xd8312ef1, 0xa5679f99, 0x1730b51b, 0x752616d2, + 0x05305909, 0x0dca440b, 0x2093cdd9, 0x6409ab50, 0xba5c8ecc, 0x8d4708ea, 0x429f0917, 
0xb762fab0, + 0x5161ea75, 0x45eba0eb, 0xb6f34b41, 0x52047123, 0xe4181523, 0x8d74e90a, 0x54fa401c, 0xddda0cc7, + 0x63df182a, 0xc6403ef6, 0x348ec6e8, 0xb9ff57f5, 0xf652b8bd, 0x0f86b0f3, 0xfb3a088a, 0x4dc71533, + 0x7b3617d2, 0xa34e87eb, 0xba2a9bdd, 0xe3381306, 0x14bad6bb, 0xc96dc7c2, 0x333b54b6, 0x9be47cfa, + 0x1dcf9299, 0xe7ea5f99, 0xb38feacd, 0xc3cfe2f7, 0x5b87e822, 0x39c5ab56, 0x18f4a18f, 0x2d484d9c, + 0x4163d519, 0x79769e98, 0xf58a67f0, 0x40590c02, 0x319671c0, 0x266b133a, 0xaf81b287, 0x6a31f737, - 0xe3bc0197,0x55079913,0x9f72c696,0x363e00c8,0x53153947,0xebfd127f,0x00f60519,0x46a6b62a, - 0x93b83380,0x3fe29324,0xdfc67091,0x0f62386d,0xdc375e79,0x8fea3f3e,0xdf8463d0,0x3702fa7b, - 0x3954435e,0x87caa648,0xa9158bee,0x08f30c25,0x66b82936,0xe7fc3feb,0x183c5450,0xd7ef4345, - 0x798c7963,0xc02cf557,0x098553d1,0xfa4312aa,0xe29ef883,0x7caf128d,0x74b3a07d,0xc8efdf5b, - 0x8db23782,0x2c409f4a,0xdae469da,0x4d3e1b3f,0x2e7b9a58,0xc83e3753,0xcefd96a6,0x44ddb068, - 0x5faed141,0xdee7d0f1,0xc223dbb4,0x7bfbe104,0x114d6e1d,0x52039cd5,0x307c0a9c,0xa6289c12, - 0x20ee8b3e,0x03724b0b,0xba68ae4a,0x93c5f2a1,0x9af27bb2,0x480f0eba,0xc14c6bbe,0xe7331f87, - 0xf0104df4,0x22c05363,0xb7e6d08a,0x6f15c449,0x4b9ee2cd,0x6b2c78ae,0x25ed2673,0xb6256596, - 0x99ad4803,0x654f8f10,0xe89eca64,0xd9a506df,0x530dc5fa,0xfe75be5c,0xa543833d,0xf739fd45, - 0x1605b488,0xe50f614a,0xe930df83,0x4540195d,0xf2da0f32,0x6b04f79c,0xe3c73c99,0xb3a5265c, - 0x5a1be07d,0xbda13d2a,0xeddc281c,0xe9d9a39a,0xde9beff1,0x573c1747,0x40be5b3e,0x3756e968, - 0x968077b6,0x6525a28f,0x747d0735,0x8a0ec11d,0x49c03af5,0xf3def45b,0xc3c9214d,0x9ea2e76d, - 0xfad3a715,0xcaa7ad89,0xde828e4c,0xa5769bd5,0x467cdb5a,0xd5f2cacb,0x68ebd182,0x8d40341a, - 0x21556887,0x000a5f6f,0x5ad8a473,0xafe7e886,0x98997d39,0x945ad218,0x46be0c93,0x93a5bd3a, - 0x3ffa4a8c,0xd834d936,0x2f022a2a,0x20791c6b,0x5db51516,0x8defeed2,0x9dee28a5,0x5188eba7, - 0xab4f8c67,0x48ceac96,0x2a11e16f,0xc1593b6d + 0xe3bc0197, 0x55079913, 0x9f72c696, 0x363e00c8, 0x53153947, 0xebfd127f, 0x00f60519, 
0x46a6b62a, + 0x93b83380, 0x3fe29324, 0xdfc67091, 0x0f62386d, 0xdc375e79, 0x8fea3f3e, 0xdf8463d0, 0x3702fa7b, + 0x3954435e, 0x87caa648, 0xa9158bee, 0x08f30c25, 0x66b82936, 0xe7fc3feb, 0x183c5450, 0xd7ef4345, + 0x798c7963, 0xc02cf557, 0x098553d1, 0xfa4312aa, 0xe29ef883, 0x7caf128d, 0x74b3a07d, 0xc8efdf5b, + 0x8db23782, 0x2c409f4a, 0xdae469da, 0x4d3e1b3f, 0x2e7b9a58, 0xc83e3753, 0xcefd96a6, 0x44ddb068, + 0x5faed141, 0xdee7d0f1, 0xc223dbb4, 0x7bfbe104, 0x114d6e1d, 0x52039cd5, 0x307c0a9c, 0xa6289c12, + 0x20ee8b3e, 0x03724b0b, 0xba68ae4a, 0x93c5f2a1, 0x9af27bb2, 0x480f0eba, 0xc14c6bbe, 0xe7331f87, + 0xf0104df4, 0x22c05363, 0xb7e6d08a, 0x6f15c449, 0x4b9ee2cd, 0x6b2c78ae, 0x25ed2673, 0xb6256596, + 0x99ad4803, 0x654f8f10, 0xe89eca64, 0xd9a506df, 0x530dc5fa, 0xfe75be5c, 0xa543833d, 0xf739fd45, + 0x1605b488, 0xe50f614a, 0xe930df83, 0x4540195d, 0xf2da0f32, 0x6b04f79c, 0xe3c73c99, 0xb3a5265c, + 0x5a1be07d, 0xbda13d2a, 0xeddc281c, 0xe9d9a39a, 0xde9beff1, 0x573c1747, 0x40be5b3e, 0x3756e968, + 0x968077b6, 0x6525a28f, 0x747d0735, 0x8a0ec11d, 0x49c03af5, 0xf3def45b, 0xc3c9214d, 0x9ea2e76d, + 0xfad3a715, 0xcaa7ad89, 0xde828e4c, 0xa5769bd5, 0x467cdb5a, 0xd5f2cacb, 0x68ebd182, 0x8d40341a, + 0x21556887, 0x000a5f6f, 0x5ad8a473, 0xafe7e886, 0x98997d39, 0x945ad218, 0x46be0c93, 0x93a5bd3a, + 0x3ffa4a8c, 0xd834d936, 0x2f022a2a, 0x20791c6b, 0x5db51516, 0x8defeed2, 0x9dee28a5, 0x5188eba7, + 0xab4f8c67, 0x48ceac96, 0x2a11e16f, 0xc1593b6d }; /* Internal: hash exactly STRIPE bytes */ -template < bool bswap > -static uint64_t farsh_full_block (const uint8_t *data, const uint32_t *key) { +template +static uint64_t farsh_full_block( const uint8_t * data, const uint32_t * key ) { #if defined(HAVE_AVX2) - __m256i sum = _mm256_setzero_si256(); __m128i sum128; int i; - const __m256i *xdata = (const __m256i *) data; - const __m256i *xkey = (const __m256i *) key; + __m256i sum = _mm256_setzero_si256(); __m128i sum128; int i; + const __m256i * xdata = (const __m256i *)data; + const __m256i * xkey = (const 
__m256i *)key; - for (i=0; i < STRIPE/sizeof(__m256i); i++) { - __m256i d = _mm256_loadu_si256 (xdata+i); + for (i = 0; i < STRIPE / sizeof(__m256i); i++) { + __m256i d = _mm256_loadu_si256(xdata + i); if (bswap) { d = mm256_bswap32(d); } - __m256i k = _mm256_loadu_si256 (xkey+i); - __m256i dk = _mm256_add_epi32(d,k); // uint32 dk[8] = {d0+k0, d1+k1 .. d7+k7} - __m256i res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk,0x31)); // uint64 res[4] = {dk0*dk1, dk2*dk3, dk4*dk5, dk6*dk7} - sum = _mm256_add_epi64(sum,res); + __m256i k = _mm256_loadu_si256(xkey + i ); + __m256i dk = _mm256_add_epi32(d, k); // uint32 dk[8] = {d0+k0, d1+k1 .. d7+k7} + __m256i res = _mm256_mul_epu32(dk, _mm256_shuffle_epi32(dk, 0x31)); // uint64 res[4] = {dk0*dk1, dk2*dk3, + // dk4*dk5, dk6*dk7} + sum = _mm256_add_epi64(sum, res); } - sum = _mm256_add_epi64 (sum, _mm256_shuffle_epi32(sum,3*4+2)); // return sum of four 64-bit values in the sum - sum128 = _mm_add_epi64 (_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum,1)); + sum = _mm256_add_epi64(sum, _mm256_shuffle_epi32(sum, 3 * 4 + 2)); // return sum of four 64-bit values in + // the sum + sum128 = _mm_add_epi64(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); return _mm_cvtsi128_si64(sum128); #elif defined(HAVE_SSE_2) - __m128i sum = _mm_setzero_si128(); int i; - const __m128i *xdata = (const __m128i *) data; - const __m128i *xkey = (const __m128i *) key; + __m128i sum = _mm_setzero_si128(); int i; + const __m128i * xdata = (const __m128i *)data; + const __m128i * xkey = (const __m128i *)key; - for (i=0; i < STRIPE/sizeof(__m128i); i++) { - __m128i d = _mm_loadu_si128 (xdata+i); + for (i = 0; i < STRIPE / sizeof(__m128i); i++) { + __m128i d = _mm_loadu_si128(xdata + i); if (bswap) { d = mm_bswap32(d); } - __m128i k = _mm_load_si128 (xkey+i); - __m128i dk = _mm_add_epi32(d,k); // uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} - __m128i res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk,0x31)); // uint64 res[2] = 
{dk0*dk1,dk2*dk3} - sum = _mm_add_epi64(sum,res); + __m128i k = _mm_load_si128(xkey + i); + __m128i dk = _mm_add_epi32(d, k); // uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} + __m128i res = _mm_mul_epu32(dk, _mm_shuffle_epi32(dk, 0x31)); // uint64 res[2] = {dk0*dk1,dk2*dk3} + sum = _mm_add_epi64(sum, res); } - sum = _mm_add_epi64 (sum, _mm_shuffle_epi32(sum,3*4+2)); // return sum of two 64-bit values in the sum + sum = _mm_add_epi64(sum, _mm_shuffle_epi32(sum, 3 * 4 + 2)); // return sum of two 64-bit values in the sum return _mm_cvtsi128_si64(sum); #else uint64_t sum = 0; int i; - for (i=0; i < STRIPE_ELEMENTS; i+=2) { - sum += (GET_U32(data, i*4) + key[i]) * - (uint64_t)(GET_U32(data, (i+1)*4) + key[i+1]); + for (i = 0; i < STRIPE_ELEMENTS; i += 2) { + sum += (GET_U32(data, i * 4) + key[i]) * + (uint64_t)(GET_U32(data, (i + 1) * 4) + key[i + 1]); } return sum; #endif } /* Internal: hash less than STRIPE bytes, with careful handling of partial uint32_t pair at the end of buffer */ -template < bool bswap > -static uint64_t farsh_partial_block(const uint8_t *data, size_t bytes, const uint32_t *key) { +template +static uint64_t farsh_partial_block( const uint8_t * data, size_t bytes, const uint32_t * key ) { uint64_t sum = 0; int i; - size_t elements = (bytes/sizeof(uint32_t)) & (~1); + size_t elements = (bytes / sizeof(uint32_t)) & (~1); - uint32_t extra_data[2] = {0}; - size_t extra_bytes = bytes - elements*sizeof(uint32_t); - memcpy (extra_data, data+4*elements, extra_bytes); + uint32_t extra_data[2] = { 0 }; + size_t extra_bytes = bytes - elements * sizeof(uint32_t); - for (i=0; i < elements; i+=2) - sum += (GET_U32(data, i*4) + key[i]) * - (uint64_t)(GET_U32(data, (i+1)*4) + key[i+1]); - if (extra_bytes) - sum += (COND_BSWAP(extra_data[0],bswap) + key[i]) * - (uint64_t)(COND_BSWAP(extra_data[1],bswap) + key[i+1]); + memcpy(extra_data, data + 4 * elements, extra_bytes); + + for (i = 0; i < elements; i += 2) { + sum += (GET_U32(data, i * 4) + key[i]) * + 
(uint64_t)(GET_U32(data, (i + 1) * 4) + key[i + 1]); + } + if (extra_bytes) { + sum += (COND_BSWAP(extra_data[0], bswap) + key[i]) * + (uint64_t)(COND_BSWAP(extra_data[1], bswap) + key[i + 1]); + } return sum; } @@ -160,21 +168,21 @@ static uint64_t farsh_partial_block(const uint8_t *data, size_t bytes, const uin static const uint64_t PRIME64_1 = UINT64_C(11400714785074694791); static const uint64_t PRIME64_2 = UINT64_C(14029467366897019727); -static const uint64_t PRIME64_3 = UINT64_C( 1609587929392839161); -static const uint64_t PRIME64_4 = UINT64_C( 9650029242287828579); +static const uint64_t PRIME64_3 = UINT64_C(1609587929392839161); +static const uint64_t PRIME64_4 = UINT64_C(9650029242287828579); /* Internal: combine hash of the current block with overall hashsum */ -static uint64_t farsh_combine (uint64_t sum, uint64_t h) { - h *= PRIME64_2; - h += h >> 31; - h *= PRIME64_1; +static uint64_t farsh_combine( uint64_t sum, uint64_t h ) { + h *= PRIME64_2; + h += h >> 31; + h *= PRIME64_1; sum ^= h; - sum = (sum+(sum>>27)) * PRIME64_1 + PRIME64_4; + sum = (sum + (sum >> 27)) * PRIME64_1 + PRIME64_4; return sum; } /* Internal: compute the final hashsum value */ -static uint32_t farsh_final (uint64_t sum) { +static uint32_t farsh_final( uint64_t sum ) { sum ^= sum >> 33; sum *= PRIME64_2; sum ^= sum >> 29; @@ -185,111 +193,117 @@ static uint32_t farsh_final (uint64_t sum) { /* End of hash mixing code kidnapped from the xxHash64 */ /* ////////////////////////////////////////////////////////////////////////// */ - /* Public API functions documented in farsh.h */ -template < bool bswap > -static uint32_t farsh_keyed (const void *data, size_t bytes, const void *key, uint64_t seed) { - uint64_t sum = seed; - const uint8_t * ptr = (const uint8_t *) data; - const uint32_t * key_ptr = (const uint32_t *) key; +template +static uint32_t farsh_keyed( const void * data, size_t bytes, const void * key, uint64_t seed ) { + uint64_t sum = seed; + const uint8_t * ptr = (const 
uint8_t * )data; + const uint32_t * key_ptr = (const uint32_t *)key; + while (bytes >= STRIPE) { - size_t chunk = STRIPE; - uint64_t h = farsh_full_block(ptr, key_ptr); - sum = farsh_combine (sum, h); + size_t chunk = STRIPE; + uint64_t h = farsh_full_block(ptr, key_ptr); + sum = farsh_combine(sum, h); ptr += chunk; bytes -= chunk; } if (bytes) { - size_t chunk = bytes; - uint64_t h = farsh_partial_block(ptr, chunk, key_ptr); - sum = farsh_combine (sum, h); + size_t chunk = bytes; + uint64_t h = farsh_partial_block(ptr, chunk, key_ptr); + sum = farsh_combine(sum, h); ptr += chunk; bytes -= chunk; } - return farsh_final(sum) ^ key_ptr[bytes%STRIPE_ELEMENTS]; /* ensure that zeroes at the end of data will affect the hash value */ + return farsh_final(sum) ^ key_ptr[bytes % STRIPE_ELEMENTS]; /* + * ensure that zeroes at the end of data will affect the + * hash value + */ } -template < bool bswap > -static void farsh_keyed_n (const void *data, size_t bytes, const void *key, int n, uint64_t seed, void *hash) { - uint32_t * hash_ptr = (uint32_t*)hash; - for (int i = 0; i < n; i++) - hash_ptr[i] = COND_BSWAP(farsh_keyed(data, bytes, (const uint8_t*)key + i*FARSH_EXTRA_KEY_SIZE, seed), bswap); +template +static void farsh_keyed_n( const void * data, size_t bytes, const void * key, int n, uint64_t seed, void * hash ) { + uint32_t * hash_ptr = (uint32_t *)hash; + + for (int i = 0; i < n; i++) { + hash_ptr[i] = COND_BSWAP(farsh_keyed(data, bytes, (const uint8_t *)key + i * FARSH_EXTRA_KEY_SIZE, + seed), bswap); + } } -template < bool bswap > -static void farsh_n (const void *data, size_t bytes, int k, int n, uint64_t seed, void *hash) { +template +static void farsh_n( const void * data, size_t bytes, int k, int n, uint64_t seed, void * hash ) { /* FARSH_KEYS contains only material for the hashes 0..FARSH_MAX_HASHES-1 */ - if (k+n > FARSH_MAX_HASHES) return; + if (k + n > FARSH_MAX_HASHES) { return; } - farsh_keyed_n(data, bytes, (const uint8_t*)FARSH_KEYS + 
k*FARSH_EXTRA_KEY_SIZE, n, seed, hash); + farsh_keyed_n(data, bytes, (const uint8_t *)FARSH_KEYS + k * FARSH_EXTRA_KEY_SIZE, n, seed, hash); } -template < bool bswap, uint32_t hashcount > -static void farsh(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void farsh( const void * in, const size_t len, const seed_t seed, void * out ) { farsh_n(in, len, 0, hashcount, (uint64_t)seed, out); } REGISTER_FAMILY(farsh, - $.src_url = "https://github.com/Bulat-Ziganshin/FARSH", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/Bulat-Ziganshin/FARSH", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(FARSH_32, - $.desc = "FARSH 32-bit (1 hash output)", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0xBCDE332C, - $.verification_BE = 0x1AD2B744, - $.hashfn_native = farsh, - $.hashfn_bswap = farsh -); + $.desc = "FARSH 32-bit (1 hash output)", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0xBCDE332C, + $.verification_BE = 0x1AD2B744, + $.hashfn_native = farsh, + $.hashfn_bswap = farsh + ); REGISTER_HASH(FARSH_64, - $.desc = "FARSH 64-bit (2 hash outputs)", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xDE2FDAEE, - $.verification_BE = 0xEFE7812E, - $.hashfn_native = farsh, - $.hashfn_bswap = farsh -); + $.desc = "FARSH 64-bit (2 hash outputs)", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xDE2FDAEE, + $.verification_BE = 0xEFE7812E, + $.hashfn_native = farsh, + $.hashfn_bswap = 
farsh + ); REGISTER_HASH(FARSH_128, - $.desc = "FARSH 128-bit (4 hash outputs)", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_SLOW | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x82B6CBEC, - $.verification_BE = 0x51150D39, - $.hashfn_native = farsh, - $.hashfn_bswap = farsh -); + $.desc = "FARSH 128-bit (4 hash outputs)", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_SLOW | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x82B6CBEC, + $.verification_BE = 0x51150D39, + $.hashfn_native = farsh, + $.hashfn_bswap = farsh + ); REGISTER_HASH(FARSH_256, - $.desc = "FARSH 256-bit (8 hash outputs)", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_LICENSE_MIT, - $.bits = 256, - $.verification_LE = 0xFEBEA0BC, - $.verification_BE = 0x75FAC191, - $.hashfn_native = farsh, - $.hashfn_bswap = farsh -); + $.desc = "FARSH 256-bit (8 hash outputs)", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_LICENSE_MIT, + $.bits = 256, + $.verification_LE = 0xFEBEA0BC, + $.verification_BE = 0x75FAC191, + $.hashfn_native = farsh, + $.hashfn_bswap = farsh + ); diff --git a/hashes/fasthash.cpp b/hashes/fasthash.cpp index 6c8a09fa..1e9da60d 100644 --- a/hashes/fasthash.cpp +++ b/hashes/fasthash.cpp @@ -22,39 +22,39 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. -*/ + */ #include "Platform.h" #include "Hashlib.h" //------------------------------------------------------------ // Compression function for Merkle-Damgard construction. // This function is generated using the framework provided. 
-static inline uint64_t mix(uint64_t h) { +static inline uint64_t mix( uint64_t h ) { h ^= h >> 23; h *= UINT64_C(0x2127599bf4325c37); h ^= h >> 47; return h; } -static inline uint32_t fold(uint64_t h) { +static inline uint32_t fold( uint64_t h ) { // the following trick converts the 64-bit hashcode to Fermat // residue, which shall retain information from both the higher // and lower parts of hashcode. return h - (h >> 32); } -template < bool bswap > -static uint64_t fasthash_impl(const uint8_t * pos, size_t len, uint64_t seed) { - const uint64_t m = UINT64_C(0x880355f21e6d1965); +template +static uint64_t fasthash_impl( const uint8_t * pos, size_t len, uint64_t seed ) { + const uint64_t m = UINT64_C(0x880355f21e6d1965); const uint8_t * end = pos + (len & ~7); uint64_t h = seed ^ (len * m); uint64_t v; while (pos != end) { - v = GET_U64(pos, 0); - h ^= mix(v); - h *= m; + v = GET_U64(pos, 0); + h ^= mix(v); + h *= m; pos += 8; } @@ -66,58 +66,60 @@ static uint64_t fasthash_impl(const uint8_t * pos, size_t len, uint64_t seed) { case 5: v ^= (uint64_t)pos[4] << 32; case 4: v ^= (uint64_t)pos[3] << 24; case 3: v ^= (uint64_t)pos[2] << 16; - case 2: v ^= (uint64_t)pos[1] << 8; + case 2: v ^= (uint64_t)pos[1] << 8; case 1: v ^= (uint64_t)pos[0]; - h ^= mix(v); - h *= m; + h ^= mix(v); + h *= m; } return mix(h); } //------------------------------------------------------------ -template < bool bswap > -static void fasthash64(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void fasthash64( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = fasthash_impl((const uint8_t *)in, len, (uint64_t)seed); + PUT_U64(h, (uint8_t *)out, 0); } -template < bool bswap > -static void fasthash32(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void fasthash32( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = fasthash_impl((const uint8_t *)in, len, 
(uint64_t)seed); + PUT_U32(fold(h), (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(fasthash, - $.src_url = "https://github.com/ztanml/fast-hash", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/ztanml/fast-hash", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); REGISTER_HASH(fasthash_32, - $.desc = "fast-hash, 32-bit version", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0xE9481AFC, - $.verification_BE = 0x48BCE1ED, - $.hashfn_native = fasthash32, - $.hashfn_bswap = fasthash32 -); + $.desc = "fast-hash, 32-bit version", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0xE9481AFC, + $.verification_BE = 0x48BCE1ED, + $.hashfn_native = fasthash32, + $.hashfn_bswap = fasthash32 + ); REGISTER_HASH(fasthash_64, - $.desc = "fast-hash, 64-bit version", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xA16231A7, - $.verification_BE = 0x82AD8DDB, - $.hashfn_native = fasthash64, - $.hashfn_bswap = fasthash64 -); + $.desc = "fast-hash, 64-bit version", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xA16231A7, + $.verification_BE = 0x82AD8DDB, + $.hashfn_native = fasthash64, + $.hashfn_bswap = fasthash64 + ); diff --git a/hashes/fletcher.cpp b/hashes/fletcher.cpp index 204ea6f2..bd223d18 100644 --- a/hashes/fletcher.cpp +++ b/hashes/fletcher.cpp @@ -31,12 +31,13 @@ // Hash based on 1 lane of ZFS's fletcher2 checksum. ZFS is always // guaranteed blocks of multiples-of-128 bytes for checksums, so it // does two of these on alternate sets of words. 
-template < bool fullhash, bool bswap > -static void fletcher2(const uint8_t * key, size_t len, uint64_t seed, uint8_t * out) { +template +static void fletcher2( const uint8_t * key, size_t len, uint64_t seed, uint8_t * out ) { const uint8_t * const endc = key + len; const uint8_t * const endw = key + (len & ~7); // Legacy homegrown seeding for SMHasher3 uint64_t A = seed, B = 0; + for (; key < endw; key += 8) { A += GET_U64(key, 0); B += A; @@ -56,12 +57,13 @@ static void fletcher2(const uint8_t * key, size_t len, uint64_t seed, uint8_t * // Hash based on 1 lane of ZFS's fletcher4 checksum. ZFS is always // guaranteed blocks of multiples-of-128 bytes for checksums, so it // does two of these on alternate sets of words. -template < bool fullhash, bool bswap > -static void fletcher4(const uint8_t * key, size_t len, uint64_t seed, uint8_t * out) { +template +static void fletcher4( const uint8_t * key, size_t len, uint64_t seed, uint8_t * out ) { const uint8_t * const endc = key + len; const uint8_t * const endw = key + (len & ~3); // Legacy homegrown seeding for SMHasher3 uint64_t A = seed, B = 0, C = 0, D = 0; + for (; key < endw; key += 4) { A += GET_U32(key, 0); B += A; @@ -90,10 +92,10 @@ static void fletcher4(const uint8_t * key, size_t len, uint64_t seed, uint8_t * // overflow operations. This is important to the mathematical // operation of the checksum, and it was excluded from the ZFS // implementations. 
-template < bool bswap > -static uint32_t fletcher32(const uint8_t * key, size_t len, uint64_t seed) { +template +static uint32_t fletcher32( const uint8_t * key, size_t len, uint64_t seed ) { // Legacy homegrown seeding for SMHasher3 - uint32_t c0 = (uint32_t)(seed + len), c1 = (uint32_t)((seed >> 32) + len); + uint32_t c0 = (uint32_t)(seed + len), c1 = (uint32_t)((seed >> 32) + len); while (len > 1) { // 360 16-bit blocks can be processed without the possibility @@ -105,23 +107,23 @@ static uint32_t fletcher32(const uint8_t * key, size_t len, uint64_t seed) { c1 += c0; } len -= blklen; - c0 = c0 % 65535; - c1 = c1 % 65535; - }; + c0 = c0 % 65535; + c1 = c1 % 65535; + } if (len) { c0 += *key; c1 += c0; - c0 = c0 % 65535; - c1 = c1 % 65535; + c0 = c0 % 65535; + c1 = c1 % 65535; } - return (c1 << 16 | c0); + return c1 << 16 | c0; } -template < bool bswap > -static uint64_t fletcher64(const uint8_t * key, size_t len, uint64_t seed) { +template +static uint64_t fletcher64( const uint8_t * key, size_t len, uint64_t seed ) { // Legacy homegrown seeding for SMHasher3 - uint64_t c0 = seed + len, c1 = seed + len; + uint64_t c0 = seed + len, c1 = seed + len; while (len > 3) { // 92681 32-bit blocks can be processed without the possibility @@ -133,144 +135,146 @@ static uint64_t fletcher64(const uint8_t * key, size_t len, uint64_t seed) { c1 += c0; } len -= blklen; - c0 = c0 % 4294967295; - c1 = c1 % 4294967295; - }; + c0 = c0 % 4294967295; + c1 = c1 % 4294967295; + } if (len > 0) { do { c0 += *key++; c1 += c0; len--; } while (len > 0); - c0 = c0 % 4294967295; - c1 = c1 % 4294967295; + c0 = c0 % 4294967295; + c1 = c1 % 4294967295; } - return (c1 << 32 | c0); + return c1 << 32 | c0; } //------------------------------------------------------------ -template < bool bswap > -static void fletcher2_64(const void * in, const size_t len, const seed_t seed, void * out) { - fletcher2((const uint8_t *)in, len, (uint64_t)seed, (uint8_t *)out); +template +static void fletcher2_64( const 
void * in, const size_t len, const seed_t seed, void * out ) { + fletcher2((const uint8_t *)in, len, (uint64_t)seed, (uint8_t *)out); } -template < bool bswap > -static void fletcher2_128(const void * in, const size_t len, const seed_t seed, void * out) { - fletcher2((const uint8_t *)in, len, (uint64_t)seed, (uint8_t *)out); +template +static void fletcher2_128( const void * in, const size_t len, const seed_t seed, void * out ) { + fletcher2((const uint8_t *)in, len, (uint64_t)seed, (uint8_t *)out); } -template < bool bswap > -static void fletcher4_64(const void * in, const size_t len, const seed_t seed, void * out) { - fletcher4((const uint8_t *)in, len, (uint64_t)seed, (uint8_t *)out); +template +static void fletcher4_64( const void * in, const size_t len, const seed_t seed, void * out ) { + fletcher4((const uint8_t *)in, len, (uint64_t)seed, (uint8_t *)out); } -template < bool bswap > -static void fletcher4_256(const void * in, const size_t len, const seed_t seed, void * out) { - fletcher4((const uint8_t *)in, len, (uint64_t)seed, (uint8_t *)out); +template +static void fletcher4_256( const void * in, const size_t len, const seed_t seed, void * out ) { + fletcher4((const uint8_t *)in, len, (uint64_t)seed, (uint8_t *)out); } -template < bool bswap > -static void fletcher32(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void fletcher32( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = fletcher32((const uint8_t *)in, len, (uint64_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void fletcher64(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void fletcher64( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = fletcher64((const uint8_t *)in, len, (uint64_t)seed); + PUT_U64(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(fletcher, - $.src_url = 
"https://github.com/rurban/smhasher/blob/master/Hashes.cpp", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/rurban/smhasher/blob/master/Hashes.cpp", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(fletcher2__64, - $.desc = "fletcher2 from ZFS (one lane, best 64 bits)", - $.sort_order = 10, - $.hash_flags = - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x890767C0, - $.verification_BE = 0x8FC6FD34, - $.hashfn_native = fletcher2_64, - $.hashfn_bswap = fletcher2_64 -); + $.desc = "fletcher2 from ZFS (one lane, best 64 bits)", + $.sort_order = 10, + $.hash_flags = + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x890767C0, + $.verification_BE = 0x8FC6FD34, + $.hashfn_native = fletcher2_64, + $.hashfn_bswap = fletcher2_64 + ); REGISTER_HASH(fletcher2, - $.desc = "fletcher2 from ZFS (one lane, all 128 bits)", - $.hash_flags = - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x70FD3480, - $.verification_BE = 0xFC346DA5, - $.hashfn_native = fletcher2_128, - $.hashfn_bswap = fletcher2_128 -); + $.desc = "fletcher2 from ZFS (one lane, all 128 bits)", + $.hash_flags = + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x70FD3480, + $.verification_BE = 0xFC346DA5, + $.hashfn_native = fletcher2_128, + $.hashfn_bswap = fletcher2_128 + ); REGISTER_HASH(fletcher4__64, - $.desc = "fletcher4 from ZFS (one lane, best 64 bits)", - $.sort_order = 20, - $.hash_flags = - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x47660EB7, - $.verification_BE = 0xA502FD23, - $.hashfn_native = fletcher4_64, - $.hashfn_bswap = fletcher4_64 -); + $.desc = "fletcher4 
from ZFS (one lane, best 64 bits)", + $.sort_order = 20, + $.hash_flags = + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x47660EB7, + $.verification_BE = 0xA502FD23, + $.hashfn_native = fletcher4_64, + $.hashfn_bswap = fletcher4_64 + ); REGISTER_HASH(fletcher4, - $.desc = "fletcher4 from ZFS (one lane, all 256 bits)", - $.hash_flags = - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_LICENSE_MIT, - $.bits = 256, - $.verification_LE = 0x1F1358EF, - $.verification_BE = 0x94EECE23, - $.hashfn_native = fletcher4_256, - $.hashfn_bswap = fletcher4_256 -); + $.desc = "fletcher4 from ZFS (one lane, all 256 bits)", + $.hash_flags = + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_LICENSE_MIT, + $.bits = 256, + $.verification_LE = 0x1F1358EF, + $.verification_BE = 0x94EECE23, + $.hashfn_native = fletcher4_256, + $.hashfn_bswap = fletcher4_256 + ); REGISTER_HASH(Fletcher_32, - $.desc = "Fletcher's checksum, 32-bit, IV == len", - $.hash_flags = - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_MODULUS | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x4FE14644, - $.verification_BE = 0x05853CCE, - $.hashfn_native = fletcher32, - $.hashfn_bswap = fletcher32 -); + $.desc = "Fletcher's checksum, 32-bit, IV == len", + $.hash_flags = + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_MODULUS | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x4FE14644, + $.verification_BE = 0x05853CCE, + $.hashfn_native = fletcher32, + $.hashfn_bswap = fletcher32 + ); REGISTER_HASH(Fletcher_64, - $.desc = "Fletcher's checksum, 64-bit, IV == len", - $.sort_order = 0, - $.hash_flags = - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_MODULUS | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x2E16C3AA, - $.verification_BE = 0x1E644927, - $.hashfn_native = fletcher64, - $.hashfn_bswap = fletcher64 -); + $.desc = "Fletcher's 
checksum, 64-bit, IV == len", + $.sort_order = 0, + $.hash_flags = + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_MODULUS | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x2E16C3AA, + $.verification_BE = 0x1E644927, + $.hashfn_native = fletcher64, + $.hashfn_bswap = fletcher64 + ); diff --git a/hashes/floppsyhash.cpp b/hashes/floppsyhash.cpp index 212c2258..d2ba7bac 100644 --- a/hashes/floppsyhash.cpp +++ b/hashes/floppsyhash.cpp @@ -35,32 +35,32 @@ static_assert(std::numeric_limits::is_iec559, "IEEE 754 floating point r //------------------------------------------------------------ // Q function : Continued Egyptian Fraction update function -template < bool old> -static FORCE_INLINE void q(double * state, double key_val, - double numerator, double denominator) { +template +static FORCE_INLINE void q( double * state, double key_val, double numerator, double denominator ) { double frac = numerator / denominator; + state[0] += frac; - state[0] = 1.0 / state[0]; + state[0] = 1.0 / state[0]; if (!old) { key_val += M_PI; } state[1] += key_val; - state[1] = numerator / state[1]; + state[1] = numerator / state[1]; } // round function : process the message -template < bool old> -static FORCE_INLINE void round(const uint8_t * msg, size_t len, double * state) { +template +static FORCE_INLINE void round( const uint8_t * msg, size_t len, double * state ) { double numerator = 1.0; // Loop - for (size_t i = 0; i < len; i++ ) { + for (size_t i = 0; i < len; i++) { double val = (double)msg[i]; double tmp; if (old) { - tmp = (double)(msg[i] + i + 1); + tmp = (double)(msg[i] + i + 1); } else { - tmp = val * M_E; - tmp += (double)(i + 1); + tmp = val * M_E; + tmp += (double)(i + 1); } double denominator = tmp / state[1]; @@ -71,33 +71,33 @@ static FORCE_INLINE void round(const uint8_t * msg, size_t len, double * state) if (old) { double tmp; - tmp = M_PI + state[1]; + tmp = M_PI + state[1]; state[0] *= tmp; - tmp = M_E + state[0]; + tmp = M_E + state[0]; state[1] 
*= tmp; } } // setup function : setup the state -static FORCE_INLINE void setup(double * state, double init = 0) { +static FORCE_INLINE void setup( double * state, double init = 0 ) { if (init == 0) { state[0] = (double)3.0; - state[1] = (double)1.0/7.0; + state[1] = (double)1.0 / 7.0; } else { double tmp = 1.0 / init; - tmp += init; - state[0] = pow(tmp, 1.0/3.0); - state[1] = pow(tmp, 1.0/7.0); + tmp += init; + state[0] = pow(tmp, 1.0 / 3.0); + state[1] = pow(tmp, 1.0 / 7.0); } } //------------------------------------------------------------ -//static_assert(sizeof(double) == 8); -template < bool old, bool bswap > -static void floppsyhash(const void * in, const size_t len, const seed_t seed, void * out) { +// static_assert(sizeof(double) == 8); +template +static void floppsyhash( const void * in, const size_t len, const seed_t seed, void * out ) { const uint8_t * data = (const uint8_t *)in; - double state[2]; - uint8_t seedbuf[4]; + double state[2]; + uint8_t seedbuf[4]; PUT_U32((uint32_t)seed, seedbuf, 0); @@ -126,40 +126,40 @@ static void floppsyhash(const void * in, const size_t len, const seed_t seed, vo //------------------------------------------------------------ REGISTER_FAMILY(floppsy, - $.src_url = "https://github.com/dosyago/floppsy", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/dosyago/floppsy", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); REGISTER_HASH(floppsyhash, - $.desc = "Floppsyhash v1.1.10 (floating-point hash using continued Egyptian fractions)", - $.hash_flags = - FLAG_HASH_SMALL_SEED | - FLAG_HASH_FLOATING_POINT , - $.impl_flags = - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_DIVIDE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x5F9F6226, - $.verification_BE = 0x4D4F96F0, - $.hashfn_native = floppsyhash, - $.hashfn_bswap = floppsyhash -); + $.desc = "Floppsyhash v1.1.10 (floating-point hash using continued Egyptian fractions)", + $.hash_flags = + 
FLAG_HASH_SMALL_SEED | + FLAG_HASH_FLOATING_POINT, + $.impl_flags = + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_DIVIDE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x5F9F6226, + $.verification_BE = 0x4D4F96F0, + $.hashfn_native = floppsyhash, + $.hashfn_bswap = floppsyhash + ); REGISTER_HASH(floppsyhash__old, - $.desc = "Floppsyhash (old version, fka \"tifuhash\")", - $.hash_flags = - FLAG_HASH_SMALL_SEED | - FLAG_HASH_FLOATING_POINT , - $.impl_flags = - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_DIVIDE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x644236D4, - $.verification_BE = 0x7A3D2F7E, - $.hashfn_native = floppsyhash, - $.hashfn_bswap = floppsyhash -); + $.desc = "Floppsyhash (old version, fka \"tifuhash\")", + $.hash_flags = + FLAG_HASH_SMALL_SEED | + FLAG_HASH_FLOATING_POINT, + $.impl_flags = + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_DIVIDE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x644236D4, + $.verification_BE = 0x7A3D2F7E, + $.hashfn_native = floppsyhash, + $.hashfn_bswap = floppsyhash + ); diff --git a/hashes/fnv.cpp b/hashes/fnv.cpp index 558530c3..6cf28509 100644 --- a/hashes/fnv.cpp +++ b/hashes/fnv.cpp @@ -27,24 +27,24 @@ #include "Platform.h" #include "Hashlib.h" -template < typename hashT, bool bswap > -static void fibonacci(const void * in, const size_t len, const seed_t seed, void * out) { - hashT h = (hashT)seed; - const hashT * dw = (const hashT *)in; - const hashT * const endw = &dw[len/sizeof(hashT)]; - const uint64_t C = UINT64_C(11400714819323198485); +template +static void fibonacci( const void * in, const size_t len, const seed_t seed, void * out ) { + hashT h = (hashT)seed; + const hashT * dw = (const hashT *)in; + const hashT * const endw = &dw[len / sizeof(hashT)]; + const uint64_t C = UINT64_C(11400714819323198485); hashT w; - //word stepper + // word stepper while (dw < endw) { memcpy(&w, dw++, sizeof(w)); - w = COND_BSWAP(w, 
bswap); + w = COND_BSWAP(w, bswap); h += w * C; } - //byte stepper - if (len & (sizeof(hashT)-1)) { + // byte stepper + if (len & (sizeof(hashT) - 1)) { uint8_t * dc = (uint8_t *)dw; - const uint8_t *const endc = &((const uint8_t*)in)[len]; + const uint8_t * const endc = &((const uint8_t *)in)[len]; while (dc < endc) { h += *dc++ * C; } @@ -56,13 +56,13 @@ static void fibonacci(const void * in, const size_t len, const seed_t seed, void // All seeding below this is homegrown for SMHasher3 -template < typename hashT, bool bswap > -static void FNV1a(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void FNV1a( const void * in, const size_t len, const seed_t seed, void * out ) { const uint8_t * data = (const uint8_t *)in; - const hashT C1 = (sizeof(hashT)==4)? UINT32_C(2166136261) : - UINT64_C(0xcbf29ce484222325); - const hashT C2 = (sizeof(hashT)==4)? UINT32_C(16777619) : - UINT64_C(0x100000001b3); + const hashT C1 = (sizeof(hashT) == 4) ? UINT32_C(2166136261) : + UINT64_C(0xcbf29ce484222325); + const hashT C2 = (sizeof(hashT) == 4) ? UINT32_C( 16777619) : + UINT64_C(0x100000001b3); hashT h = (hashT)seed; h ^= C1; @@ -75,27 +75,27 @@ static void FNV1a(const void * in, const size_t len, const seed_t seed, void * o memcpy(out, &h, sizeof(h)); } -template < typename hashT, bool bswap > -static void FNV2(const void * in, const size_t len, const seed_t seed, void * out) { - const hashT * dw = (const hashT *)in; - const hashT * const endw = &dw[len/sizeof(hashT)]; - const uint64_t C1 = (sizeof(hashT)==4)? UINT32_C(2166136261) : - UINT64_C(0xcbf29ce484222325); - const uint64_t C2 = (sizeof(hashT)==4)? UINT32_C(16777619) : - UINT64_C(0x100000001b3); +template +static void FNV2( const void * in, const size_t len, const seed_t seed, void * out ) { + const hashT * dw = (const hashT *)in; + const hashT * const endw = &dw[len / sizeof(hashT)]; + const uint64_t C1 = (sizeof(hashT) == 4) ? 
UINT32_C(2166136261) : + UINT64_C(0xcbf29ce484222325); + const uint64_t C2 = (sizeof(hashT) == 4) ? UINT32_C( 16777619) : + UINT64_C(0x100000001b3); hashT h = C1 ^ (hashT)seed; hashT w; - //word stepper + // word stepper while (dw < endw) { memcpy(&w, dw++, sizeof(w)); h ^= COND_BSWAP(w, bswap); h *= C2; } - //byte stepper - if (len & (sizeof(hashT)-1)) { + // byte stepper + if (len & (sizeof(hashT) - 1)) { uint8_t * dc = (uint8_t *)dw; - const uint8_t *const endc = &((const uint8_t*)in)[len]; + const uint8_t * const endc = &((const uint8_t *)in)[len]; while (dc < endc) { h ^= *dc++; h *= C2; @@ -106,45 +106,45 @@ static void FNV2(const void * in, const size_t len, const seed_t seed, void * ou memcpy(out, &h, sizeof(h)); } -template < bool bswap > -static void FNV_YoshimitsuTRIAD(const void * in, const size_t olen, const seed_t seed, void * out) { - const uint8_t *p = (const uint8_t *)in; - const uint32_t PRIME = 709607; - uint32_t hash32A = UINT32_C(2166136261) ^ seed; - uint32_t hash32B = UINT32_C(2166136261) + olen; - uint32_t hash32C = UINT32_C(2166136261); - size_t len = olen; +template +static void FNV_YoshimitsuTRIAD( const void * in, const size_t olen, const seed_t seed, void * out ) { + const uint8_t * p = (const uint8_t *)in; + const uint32_t PRIME = 709607; + uint32_t hash32A = UINT32_C(2166136261) ^ seed; + uint32_t hash32B = UINT32_C(2166136261) + olen; + uint32_t hash32C = UINT32_C(2166136261); + size_t len = olen; for (; len >= 3 * 2 * sizeof(uint32_t); len -= 3 * 2 * sizeof(uint32_t), p += 3 * 2 * sizeof(uint32_t)) { - hash32A = (hash32A ^ (ROTL32(GET_U32(p, 0), 5) ^ GET_U32(p, 4))) * PRIME; - hash32B = (hash32B ^ (ROTL32(GET_U32(p, 8), 5) ^ GET_U32(p, 12))) * PRIME; - hash32C = (hash32C ^ (ROTL32(GET_U32(p, 16), 5) ^ GET_U32(p, 20))) * PRIME; + hash32A = (hash32A ^ (ROTL32(GET_U32(p, 0), 5) ^ GET_U32(p, 4))) * PRIME; + hash32B = (hash32B ^ (ROTL32(GET_U32(p, 8), 5) ^ GET_U32(p, 12))) * PRIME; + hash32C = (hash32C ^ (ROTL32(GET_U32(p, 16), 5) ^ 
GET_U32(p, 20))) * PRIME; } if (p != (const uint8_t *)in) { hash32A = (hash32A ^ ROTL32(hash32C, 5)) * PRIME; } - //Cases 0. .31 + // Cases 0. .31 if (len & (4 * sizeof(uint32_t))) { - hash32A = (hash32A ^ (ROTL32(GET_U32(p, 0), 5) ^ GET_U32(p, 4))) * PRIME; - hash32B = (hash32B ^ (ROTL32(GET_U32(p, 8), 5) ^ GET_U32(p, 12))) * PRIME; - p += 8 * sizeof(uint16_t); + hash32A = (hash32A ^ (ROTL32(GET_U32(p, 0), 5) ^ GET_U32(p, 4))) * PRIME; + hash32B = (hash32B ^ (ROTL32(GET_U32(p, 8), 5) ^ GET_U32(p, 12))) * PRIME; + p += 8 * sizeof(uint16_t); } - //Cases 0. .15 + // Cases 0. .15 if (len & (2 * sizeof(uint32_t))) { hash32A = (hash32A ^ GET_U32(p, 0)) * PRIME; hash32B = (hash32B ^ GET_U32(p, 4)) * PRIME; - p += 4 * sizeof(uint16_t); + p += 4 * sizeof(uint16_t); } - //Cases:0. .7 + // Cases:0. .7 if (len & sizeof(uint32_t)) { hash32A = (hash32A ^ GET_U16(p, 0)) * PRIME; hash32B = (hash32B ^ GET_U16(p, 2)) * PRIME; - p += 2 * sizeof(uint16_t); + p += 2 * sizeof(uint16_t); } - //Cases:0. .3 + // Cases:0. 
.3 if (len & sizeof(uint16_t)) { hash32A = (hash32A ^ GET_U16(p, 0)) * PRIME; - p += sizeof(uint16_t); + p += sizeof(uint16_t); } if (len & 1) { hash32A = (hash32A ^ *p) * PRIME; @@ -157,28 +157,28 @@ static void FNV_YoshimitsuTRIAD(const void * in, const size_t olen, const seed_t memcpy(out, &hash32A, 4); } -template < bool keeplsb > -static FORCE_INLINE uint64_t _PADr_KAZE(uint64_t x, int n) { - if (n >= 64) return 0; +template +static FORCE_INLINE uint64_t _PADr_KAZE( uint64_t x, int n ) { + if (n >= 64) { return 0; } if (keeplsb) { return (x << n) >> n; } else { - return (x >> n); + return x >> n; } } -template < bool bswap > -static void FNV_Totenschiff(const void * in, const size_t olen, const seed_t seed, void * out) { - const uint8_t * p = (uint8_t *)in; - const uint32_t PRIME = 591798841; - uint32_t hash32; - uint64_t hash64 = (uint64_t)seed ^ UINT64_C(14695981039346656037); - uint64_t PADDEDby8; - size_t len = olen; +template +static void FNV_Totenschiff( const void * in, const size_t olen, const seed_t seed, void * out ) { + const uint8_t * p = (uint8_t *)in; + const uint32_t PRIME = 591798841; + uint32_t hash32; + uint64_t hash64 = (uint64_t)seed ^ UINT64_C(14695981039346656037); + uint64_t PADDEDby8; + size_t len = olen; for (; len > 8; len -= 8, p += 8) { PADDEDby8 = GET_U64(p, 0); - hash64 = (hash64 ^ PADDEDby8) * PRIME; + hash64 = (hash64 ^ PADDEDby8) * PRIME; } // Here len is 1..8. 
when (8-8) the QWORD remains intact @@ -205,29 +205,29 @@ static void FNV_Totenschiff(const void * in, const size_t olen, const seed_t see // // Many thanks go to Yurii 'Hordi' Hordiienko, he lessened with 3 // instructions the original 'Pippip', thus: -template < bool bswap > -static void FNV_Pippip_Yurii(const void * in, const size_t len, const seed_t seed, void * out) { - const uint8_t * str = (uint8_t *)in; - const uint32_t PRIME = 591798841; - uint32_t hash32; - uint64_t hash64 = (uint64_t)seed ^ UINT64_C(14695981039346656037); - size_t Cycles, NDhead; +template +static void FNV_Pippip_Yurii( const void * in, const size_t len, const seed_t seed, void * out ) { + const uint8_t * str = (uint8_t *)in; + const uint32_t PRIME = 591798841; + uint32_t hash32; + uint64_t hash64 = (uint64_t)seed ^ UINT64_C(14695981039346656037); + size_t Cycles, NDhead; if (len > 8) { Cycles = ((len - 1) >> 4) + 1; NDhead = len - (Cycles << 3); #pragma nounroll for (; Cycles--; str += 8) { - hash64 = (hash64 ^ (GET_U64(str, 0))) * PRIME; + hash64 = (hash64 ^ (GET_U64(str, 0) )) * PRIME; hash64 = (hash64 ^ (GET_U64(str, NDhead))) * PRIME; } } else { if (isLE() ^ bswap) { - hash64 = (hash64 ^ _PADr_KAZE(GET_U64(str, 0), (8 - len) << 3)) * - PRIME; + hash64 = (hash64 ^ _PADr_KAZE(GET_U64(str, 0), (8 - len) << 3)) * + PRIME; } else { hash64 = (hash64 ^ _PADr_KAZE(GET_U64(str, 0), (8 - len) << 3)) * - PRIME; + PRIME; } } hash32 = (uint32_t)(hash64 ^ (hash64 >> 32)); @@ -239,153 +239,153 @@ static void FNV_Pippip_Yurii(const void * in, const size_t len, const seed_t see // Also https://www.codeproject.com/articles/716530/fastest-hash-function-for-table-lookups-in-c REGISTER_FAMILY(fnv, - $.src_url = "http://www.sanmayce.com/Fastest_Hash/index.html", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "http://www.sanmayce.com/Fastest_Hash/index.html", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); REGISTER_HASH(fibonacci_32, - $.desc = "32-bit wordwise Fibonacci hash 
(Knuth)", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x09952480, - $.verification_BE = 0x006F7705, - $.hashfn_native = fibonacci, - $.hashfn_bswap = fibonacci, - $.badseeds = {0, UINT64_C(0xffffffff00000000)} /* !! all keys ending with 0x0000_0000 */ -); + $.desc = "32-bit wordwise Fibonacci hash (Knuth)", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x09952480, + $.verification_BE = 0x006F7705, + $.hashfn_native = fibonacci, + $.hashfn_bswap = fibonacci, + $.badseeds = { 0, UINT64_C (0xffffffff00000000) } /* !! all keys ending with 0x0000_0000 */ + ); REGISTER_HASH(fibonacci_64, - $.desc = "64-bit wordwise Fibonacci hash (Knuth)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xFE3BD380, - $.verification_BE = 0x3E67D58C, - $.hashfn_native = fibonacci, - $.hashfn_bswap = fibonacci, - $.badseeds = {0, UINT64_C(0xffffffff00000000)} /* !! all keys ending with 0x0000_0000 */ -); + $.desc = "64-bit wordwise Fibonacci hash (Knuth)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xFE3BD380, + $.verification_BE = 0x3E67D58C, + $.hashfn_native = fibonacci, + $.hashfn_bswap = fibonacci, + $.badseeds = { 0, UINT64_C (0xffffffff00000000) } /* !! 
all keys ending with 0x0000_0000 */ + ); REGISTER_HASH(FNV_1a_32, - $.desc = "32-bit bytewise FNV-1a (Fowler-Noll-Vo)", - $.hash_flags = - FLAG_HASH_NO_SEED | - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0xE3CBBE91, - $.verification_BE = 0x656F95A0, - $.hashfn_native = FNV1a, - $.hashfn_bswap = FNV1a, - $.badseeds = {0x811c9dc5} -); + $.desc = "32-bit bytewise FNV-1a (Fowler-Noll-Vo)", + $.hash_flags = + FLAG_HASH_NO_SEED | + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0xE3CBBE91, + $.verification_BE = 0x656F95A0, + $.hashfn_native = FNV1a, + $.hashfn_bswap = FNV1a, + $.badseeds = { 0x811c9dc5 } + ); REGISTER_HASH(FNV_1a_64, - $.desc = "64-bit bytewise FNV-1a (Fowler-Noll-Vo)", - $.hash_flags = - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x103455FC, - $.verification_BE = 0x4B032B63, - $.hashfn_native = FNV1a, - $.hashfn_bswap = FNV1a, - $.badseeds = {0x811c9dc5, 0xcbf29ce4, 0x84222325, UINT64_C(0xcbf29ce484222325)} -); + $.desc = "64-bit bytewise FNV-1a (Fowler-Noll-Vo)", + $.hash_flags = + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x103455FC, + $.verification_BE = 0x4B032B63, + $.hashfn_native = FNV1a, + $.hashfn_bswap = FNV1a, + $.badseeds = { 0x811c9dc5, 0xcbf29ce4, 0x84222325, UINT64_C (0xcbf29ce484222325) } + ); REGISTER_HASH(FNV_1a_32__wordwise, - $.desc = "32-bit wordwise hash based on FNV-1a", - $.hash_flags = - FLAG_HASH_NO_SEED | - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x739801C5, - $.verification_BE = 0xC5999647, - $.hashfn_native = FNV2, - $.hashfn_bswap = FNV2 -); + $.desc = "32-bit wordwise hash based on FNV-1a", + 
$.hash_flags = + FLAG_HASH_NO_SEED | + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x739801C5, + $.verification_BE = 0xC5999647, + $.hashfn_native = FNV2, + $.hashfn_bswap = FNV2 + ); REGISTER_HASH(FNV_1a_64__wordwise, - $.desc = "64-bit wordwise hash based on FNV1-a", - $.hash_flags = - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x1967C625, - $.verification_BE = 0x06F5053E, - $.hashfn_native = FNV2, - $.hashfn_bswap = FNV2 -); + $.desc = "64-bit wordwise hash based on FNV1-a", + $.hash_flags = + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x1967C625, + $.verification_BE = 0x06F5053E, + $.hashfn_native = FNV2, + $.hashfn_bswap = FNV2 + ); REGISTER_HASH(FNV_YoshimitsuTRIAD, - $.desc = "FNV-YoshimitsuTRIAD 32-bit (sanmayce)", - $.hash_flags = - FLAG_HASH_NO_SEED | - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0xD8AFFD71, - $.verification_BE = 0x85C2EC2F, - $.hashfn_native = FNV_YoshimitsuTRIAD, - $.hashfn_bswap = FNV_YoshimitsuTRIAD, - $.badseeds = {0x811c9dc5, 0x23d4a49d} -); + $.desc = "FNV-YoshimitsuTRIAD 32-bit (sanmayce)", + $.hash_flags = + FLAG_HASH_NO_SEED | + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0xD8AFFD71, + $.verification_BE = 0x85C2EC2F, + $.hashfn_native = FNV_YoshimitsuTRIAD, + $.hashfn_bswap = FNV_YoshimitsuTRIAD, + $.badseeds = { 0x811c9dc5, 0x23d4a49d } + ); REGISTER_HASH(FNV_Totenschiff, - $.desc = "FNV-Totenschiff 32-bit (sanmayce)", - $.hash_flags = - FLAG_HASH_NO_SEED | - FLAG_HASH_SMALL_SEED, - $.impl_flags = - 
FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_READ_PAST_EOB| - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x95D95ACF, - $.verification_BE = 0xC16E2C8F, - $.hashfn_native = FNV_Totenschiff, - $.hashfn_bswap = FNV_Totenschiff, - $.badseeds = {0x811c9dc5} -); + $.desc = "FNV-Totenschiff 32-bit (sanmayce)", + $.hash_flags = + FLAG_HASH_NO_SEED | + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x95D95ACF, + $.verification_BE = 0xC16E2C8F, + $.hashfn_native = FNV_Totenschiff, + $.hashfn_bswap = FNV_Totenschiff, + $.badseeds = { 0x811c9dc5 } + ); REGISTER_HASH(FNV_PippipYurii, - $.desc = "FNV-Pippip-Yurii 32-bit (sanmayce)", - $.hash_flags = - FLAG_HASH_NO_SEED | - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_READ_PAST_EOB| - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0xE79AE3E4, - $.verification_BE = 0x90C8C706, - $.hashfn_native = FNV_Pippip_Yurii, - $.hashfn_bswap = FNV_Pippip_Yurii, - $.badseeds = {0x811c9dc5} -); + $.desc = "FNV-Pippip-Yurii 32-bit (sanmayce)", + $.hash_flags = + FLAG_HASH_NO_SEED | + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0xE79AE3E4, + $.verification_BE = 0x90C8C706, + $.hashfn_native = FNV_Pippip_Yurii, + $.hashfn_bswap = FNV_Pippip_Yurii, + $.badseeds = { 0x811c9dc5 } + ); diff --git a/hashes/halftimehash.cpp b/hashes/halftimehash.cpp index 6d5aa658..08534b8a 100644 --- a/hashes/halftimehash.cpp +++ b/hashes/halftimehash.cpp @@ -37,701 +37,762 @@ //------------------------------------------------------------ namespace halftime_hash { + namespace advanced { + namespace { +//------------------------------------------------------------ + inline uint64_t Xor( uint64_t a, uint64_t b 
) { return a ^ b; } -namespace advanced { + inline uint64_t Plus( uint64_t a, uint64_t b ) { return a + b; } -namespace { + inline uint64_t Minus( uint64_t a, uint64_t b ) { return a - b; } -//------------------------------------------------------------ - inline uint64_t Xor(uint64_t a, uint64_t b) { return a ^ b; } - inline uint64_t Plus(uint64_t a, uint64_t b) { return a + b; } - inline uint64_t Minus(uint64_t a, uint64_t b) { return a - b; } - inline uint64_t LeftShift(uint64_t a, int s) { return a << s; } - inline uint64_t RightShift32(uint64_t a) { return a >> 32; } - inline uint64_t Sum(uint64_t a) { return a; } - inline uint64_t Negate(uint64_t a) { return -a; } - - inline uint64_t Plus32(uint64_t a, uint64_t b) { - uint64_t result; - uint32_t temp[2] = {(uint32_t)a + (uint32_t)b, - (uint32_t)(a >> 32) + (uint32_t)(b >> 32)}; - result = temp[0] + (((uint64_t)temp[1]) << 32); - return result; - } + inline uint64_t LeftShift( uint64_t a, int s ) { return a << s; } - inline uint64_t Times(uint64_t a, uint64_t b) { - constexpr uint64_t mask = (((uint64_t)1) << 32) - 1; - return (a & mask) * (b & mask); - } + inline uint64_t RightShift32( uint64_t a ) { return a >> 32; } - template < bool bswap > - struct BlockWrapperScalar { - using Block = uint64_t; + inline uint64_t Sum( uint64_t a ) { return a; } - static uint64_t LoadBlock(const void* x) { - auto y = reinterpret_cast(x); - return GET_U64(y, 0); - } + inline uint64_t Negate( uint64_t a ) { return -a; } - static uint64_t LoadBlockNative(const void* x) { - auto y = reinterpret_cast(x); - return GET_U64(y, 0); - } + inline uint64_t Plus32( uint64_t a, uint64_t b ) { + uint64_t result; + uint32_t temp[2] = { + (uint32_t)a + (uint32_t)b, + (uint32_t)(a >> 32) + (uint32_t)(b >> 32) + }; + + result = temp[0] + (((uint64_t)temp[1]) << 32); + return result; + } + + inline uint64_t Times( uint64_t a, uint64_t b ) { + constexpr uint64_t mask = (((uint64_t)1) << 32) - 1; + + return (a & mask) * (b & mask); + } + + 
template + struct BlockWrapperScalar { + using Block = uint64_t; + + static uint64_t LoadBlock( const void * x ) { + auto y = reinterpret_cast(x); + + return GET_U64(y, 0); + } - static uint64_t LoadOne(uint64_t entropy) { return entropy; } - }; + static uint64_t LoadBlockNative( const void * x ) { + auto y = reinterpret_cast(x); + + return GET_U64(y, 0); + } + + static uint64_t LoadOne( uint64_t entropy ) { return entropy; } + }; #if defined(HAVE_ARM_NEON) - using u128 = uint64x2_t; - - inline u128 LeftShift(u128 a, int i) { return vshlq_s64(a, vdupq_n_s64(i)); } - inline u128 Plus(u128 a, u128 b) { return vaddq_s64(a, b); } - inline u128 Minus(u128 a, u128 b) { return vsubq_s64(a, b); } - inline u128 Plus32(u128 a, u128 b) { return vaddq_s32(a, b); } - inline u128 RightShift32(u128 a) { return vshrq_n_u64(a, 32); } - - inline u128 Times(u128 a, u128 b) { - uint32x2_t a_lo = vmovn_u64(a); - uint32x2_t b_lo = vmovn_u64(b); - return vmull_u32(a_lo, b_lo); - } + using u128 = uint64x2_t; - inline u128 Xor(u128 a, u128 b) { return veorq_s32(a, b); } + inline u128 LeftShift( u128 a, int i ) { return vshlq_s64(a, vdupq_n_s64(i)); } - static inline u128 Negate(u128 a) { - const auto zero = vdupq_n_s64(0); - return Minus(zero, a); - } + inline u128 Plus( u128 a, u128 b ) { return vaddq_s64(a, b); } - inline uint64_t Sum(u128 a) { return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); } + inline u128 Minus( u128 a, u128 b ) { return vsubq_s64(a, b); } - template < bool bswap > - struct BlockWrapper128 { - using Block = u128; + inline u128 Plus32( u128 a, u128 b ) { return vaddq_s32(a, b); } - static u128 LoadBlock(const void* x) { - auto y = reinterpret_cast(x); - if (bswap) { - return vrev64q_u8(vld1q_s32(y)); + inline u128 RightShift32( u128 a ) { return vshrq_n_u64(a, 32); } + + inline u128 Times( u128 a, u128 b ) { + uint32x2_t a_lo = vmovn_u64(a); + uint32x2_t b_lo = vmovn_u64(b); + + return vmull_u32(a_lo, b_lo); } - return vld1q_s32(y); - } - static u128 
LoadBlockNative(const void* x) { - auto y = reinterpret_cast(x); - return vld1q_s32(y); - } + inline u128 Xor( u128 a, u128 b ) { return veorq_s32(a, b); } + + static inline u128 Negate( u128 a ) { + const auto zero = vdupq_n_s64(0); + + return Minus(zero, a); + } + + inline uint64_t Sum( u128 a ) { return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); } + + template + struct BlockWrapper128 { + using Block = u128; + + static u128 LoadBlock( const void * x ) { + auto y = reinterpret_cast(x); - static u128 LoadOne(uint64_t entropy) { return vdupq_n_s64(entropy); } - }; + if (bswap) { + return vrev64q_u8(vld1q_s32(y)); + } + return vld1q_s32(y); + } + + static u128 LoadBlockNative( const void * x ) { + auto y = reinterpret_cast(x); + + return vld1q_s32(y); + } + + static u128 LoadOne( uint64_t entropy ) { return vdupq_n_s64(entropy); } + }; #elif defined(HAVE_SSE_2) - using u128 = __m128i; - - inline u128 LeftShift(u128 a, int i) { return _mm_slli_epi64(a, i); } - inline u128 Plus(u128 a, u128 b) { return _mm_add_epi64(a, b); } - inline u128 Minus(u128 a, u128 b) { return _mm_sub_epi64(a, b); } - inline u128 Plus32(u128 a, u128 b) { return _mm_add_epi32(a, b); } - inline u128 RightShift32(u128 a) { return _mm_srli_epi64(a, 32); } - inline u128 Times(u128 a, u128 b) { return _mm_mul_epu32(a, b); } - inline u128 Xor(u128 a, u128 b) { return _mm_xor_si128(a, b); } - - static inline u128 Negate(u128 a) { - const auto zero = _mm_set1_epi64x(0); - return Minus(zero, a); - } + using u128 = __m128i; + + inline u128 LeftShift( u128 a, int i ) { return _mm_slli_epi64(a, i); } + + inline u128 Plus( u128 a, u128 b ) { return _mm_add_epi64(a, b); } + + inline u128 Minus( u128 a, u128 b ) { return _mm_sub_epi64(a, b); } - inline uint64_t Sum(u128 a) { return (uint64_t)a[0] + (uint64_t)a[1]; } + inline u128 Plus32( u128 a, u128 b ) { return _mm_add_epi32(a, b); } - template < bool bswap > - struct BlockWrapper128 { - using Block = u128; + inline u128 RightShift32( u128 a ) { return 
_mm_srli_epi64(a, 32); } - static u128 LoadBlock(const void* x) { - auto y = reinterpret_cast(x); - if (bswap) { - return mm_bswap64(_mm_loadu_si128(y)); + inline u128 Times( u128 a, u128 b ) { return _mm_mul_epu32(a, b); } + + inline u128 Xor( u128 a, u128 b ) { return _mm_xor_si128(a, b); } + + static inline u128 Negate( u128 a ) { + const auto zero = _mm_set1_epi64x(0); + + return Minus(zero, a); } - return _mm_loadu_si128(y); - } - static u128 LoadBlockNative(const void* x) { - auto y = reinterpret_cast(x); - return _mm_loadu_si128(y); - } + inline uint64_t Sum( u128 a ) { return (uint64_t)a[0] + (uint64_t)a[1]; } + + template + struct BlockWrapper128 { + using Block = u128; + + static u128 LoadBlock( const void * x ) { + auto y = reinterpret_cast(x); - static u128 LoadOne(uint64_t entropy) { return _mm_set1_epi64x(entropy); } - }; + if (bswap) { + return mm_bswap64(_mm_loadu_si128(y)); + } + return _mm_loadu_si128(y); + } + + static u128 LoadBlockNative( const void * x ) { + auto y = reinterpret_cast(x); + + return _mm_loadu_si128(y); + } + + static u128 LoadOne( uint64_t entropy ) { return _mm_set1_epi64x(entropy); } + }; #endif #if defined(HAVE_AVX2) - using u256 = __m256i; - - inline u256 Plus(u256 a, u256 b) { return _mm256_add_epi64(a, b); } - inline u256 Plus32(u256 a, u256 b) { return _mm256_add_epi32(a, b); } - inline u256 Times(u256 a, u256 b) { return _mm256_mul_epu32(a, b); } - inline u256 Xor(u256 a, u256 b) { return _mm256_xor_si256(a, b); } - inline u256 LeftShift(u256 a, int i) { return _mm256_slli_epi64(a, i); } - inline u256 RightShift32(u256 a) { return _mm256_srli_epi64(a, 32); } - inline u256 Minus(u256 a, u256 b) { return _mm256_sub_epi64(a, b); } - - static inline u256 Negate(u256 a) { - const auto zero = _mm256_set1_epi64x(0); - return Minus(zero, a); - } + using u256 = __m256i; - inline uint64_t Sum(u256 a) { - auto c = _mm256_extracti128_si256(a, 0); - auto d = _mm256_extracti128_si256(a, 1); - c = _mm_add_epi64(c, d); - 
static_assert(sizeof(c[0]) == sizeof(uint64_t), "u256 too granular"); - static_assert(sizeof(c) == 2 * sizeof(uint64_t), "u256 too granular"); - return (uint64_t)c[0] + (uint64_t)c[1]; - } + inline u256 Plus( u256 a, u256 b ) { return _mm256_add_epi64(a, b); } + + inline u256 Plus32( u256 a, u256 b ) { return _mm256_add_epi32(a, b); } + + inline u256 Times( u256 a, u256 b ) { return _mm256_mul_epu32(a, b); } - template < bool bswap > - struct BlockWrapper256 { - using Block = u256; + inline u256 Xor( u256 a, u256 b ) { return _mm256_xor_si256(a, b); } - static u256 LoadBlock(const void* x) { - auto y = reinterpret_cast(x); - if (bswap) { - return mm256_bswap64(_mm256_loadu_si256(y)); + inline u256 LeftShift( u256 a, int i ) { return _mm256_slli_epi64(a, i); } + + inline u256 RightShift32( u256 a ) { return _mm256_srli_epi64(a, 32); } + + inline u256 Minus( u256 a, u256 b ) { return _mm256_sub_epi64(a, b); } + + static inline u256 Negate( u256 a ) { + const auto zero = _mm256_set1_epi64x(0); + + return Minus(zero, a); } - return _mm256_loadu_si256(y); - } - static u256 LoadBlockNative(const void* x) { - auto y = reinterpret_cast(x); - return _mm256_loadu_si256(y); - } + inline uint64_t Sum( u256 a ) { + auto c = _mm256_extracti128_si256(a, 0); + auto d = _mm256_extracti128_si256(a, 1); - static u256 LoadOne(uint64_t entropy) { return _mm256_set1_epi64x(entropy); } - }; + c = _mm_add_epi64(c, d); + static_assert(sizeof(c[0]) == sizeof(uint64_t) , "u256 too granular"); + static_assert(sizeof(c) == 2 * sizeof(uint64_t), "u256 too granular"); + return (uint64_t)c[0] + (uint64_t)c[1]; + } + + template + struct BlockWrapper256 { + using Block = u256; + + static u256 LoadBlock( const void * x ) { + auto y = reinterpret_cast(x); + + if (bswap) { + return mm256_bswap64(_mm256_loadu_si256(y)); + } + return _mm256_loadu_si256(y); + } + + static u256 LoadBlockNative( const void * x ) { + auto y = reinterpret_cast(x); + + return _mm256_loadu_si256(y); + } + + static u256 
LoadOne( uint64_t entropy ) { return _mm256_set1_epi64x(entropy); } + }; #endif #if defined(HAVE_AVX512_F) - using u512 = __m512i; - - inline u512 Plus(u512 a, u512 b) { return _mm512_add_epi64(a, b); } - inline u512 Plus32(u512 a, u512 b) { return _mm512_add_epi32(a, b); } - inline u512 Times(u512 a, u512 b) { return _mm512_mul_epu32(a, b); } - inline u512 Xor(u512 a, u512 b) { return _mm512_xor_epi32(a, b); } - inline uint64_t Sum(u512 a) { return _mm512_reduce_add_epi64(a); } - inline u512 RightShift32(u512 a) { return _mm512_srli_epi64(a, 32); } - // inline u512 RightShift32(u512 a, int i) { return _mm512_shuffle_epi32(a, - // _MM_PERM_ACAC); } - inline u512 LeftShift(u512 a, int i) { return _mm512_slli_epi64(a, i); } - inline u512 Minus(u512 a, u512 b) { return _mm512_sub_epi64(a, b); } - inline u512 Negate(u512 a) { return Minus(_mm512_set1_epi64(0), a); } - - template < bool bswap > - struct BlockWrapper512 { - using Block = u512; - - static Block LoadBlock(const void* x) { - if (bswap) { - return mm512_bswap64(_mm512_loadu_si512(x)); - } - return _mm512_loadu_si512(x); - } + using u512 = __m512i; - static Block LoadBlockNative(const void* x) { - return _mm512_loadu_si512(x); - } + inline u512 Plus( u512 a, u512 b ) { return _mm512_add_epi64(a, b); } - static Block LoadOne(uint64_t entropy) { - return _mm512_set1_epi64(entropy); - } - }; + inline u512 Plus32( u512 a, u512 b ) { return _mm512_add_epi32(a, b); } + + inline u512 Times( u512 a, u512 b ) { return _mm512_mul_epu32(a, b); } + + inline u512 Xor( u512 a, u512 b ) { return _mm512_xor_epi32(a, b); } + + inline uint64_t Sum( u512 a ) { return _mm512_reduce_add_epi64(a); } + + inline u512 RightShift32( u512 a ) { return _mm512_srli_epi64(a, 32); } + + // inline u512 RightShift32(u512 a, int i) { return _mm512_shuffle_epi32(a, + // _MM_PERM_ACAC); } + inline u512 LeftShift( u512 a, int i ) { return _mm512_slli_epi64(a, i); } + + inline u512 Minus( u512 a, u512 b ) { return _mm512_sub_epi64(a, b); } + + 
inline u512 Negate( u512 a ) { return Minus(_mm512_set1_epi64(0), a); } + + template + struct BlockWrapper512 { + using Block = u512; + + static Block LoadBlock( const void * x ) { + if (bswap) { + return mm512_bswap64(_mm512_loadu_si512(x)); + } + return _mm512_loadu_si512(x); + } + + static Block LoadBlockNative( const void * x ) { + return _mm512_loadu_si512(x); + } + + static Block LoadOne( uint64_t entropy ) { + return _mm512_set1_epi64(entropy); + } + }; #endif - template - T MultiplyAdd(const T & summand, const T & factor1, const T & factor2) { - return Plus(summand, Times(factor1, factor2)); - } + template + T MultiplyAdd( const T & summand, const T & factor1, const T & factor2 ) { + return Plus(summand, Times(factor1, factor2)); + } #if defined(HAVE_ARM_NEON) - template <> - u128 MultiplyAdd(const u128 & summand, const u128 & factor1, const u128 & factor2) { - return vmlal_u32(summand, vmovn_u64(factor1), vmovn_u64(factor2)); - } -#endif + template <> + u128 MultiplyAdd( const u128 & summand, const u128 & factor1, const u128 & factor2 ) { + return vmlal_u32(summand, vmovn_u64(factor1), vmovn_u64(factor2)); + } + +#endif //------------------------------------------------------------ -template -inline void Encode3(Block raw_io[9 * 3]) { - auto io = reinterpret_cast(raw_io); - constexpr unsigned x = 0, y = 1, z = 2; - - const Block* iter = io[0]; - io[7][x] = io[8][x] = iter[x]; - io[7][y] = io[8][y] = iter[y]; - io[7][z] = io[8][z] = iter[z]; - iter += 1; - - auto DistributeRaw = [io, iter](unsigned slot, unsigned label, - std::initializer_list rest) { - for (unsigned i : rest) { - io[slot][i] = Xor(io[slot][i], iter[label]); - } - }; - - auto Distribute3 = [&iter, DistributeRaw, x, y, z](unsigned idx, - std::initializer_list a, - std::initializer_list b, - std::initializer_list c) { - DistributeRaw(idx, x, a); - DistributeRaw(idx, y, b); - DistributeRaw(idx, z, c); - iter += 1; - }; - - while (iter != io[9]) { - Distribute3(7, {x}, {y}, {z}); - } - - iter = 
io[1]; - Distribute3(8, {z}, {x, z}, {y}); - Distribute3(8, {x, z}, {x, y, z}, {y, z}); - Distribute3(8, {y}, {y, z}, {x, z}); - Distribute3(8, {x, y}, {z}, {x}); - Distribute3(8, {y, z}, {x, y}, {x, y, z}); - Distribute3(8, {x, y, z}, {x}, {x, y}); -} + template + inline void Encode3( Block raw_io[9 * 3] ) { + auto io = reinterpret_cast(raw_io); + constexpr unsigned x = 0, y = 1, z = 2; + + const Block * iter = io[0]; + + io[7][x] = io[8][x] = iter[x]; + io[7][y] = io[8][y] = iter[y]; + io[7][z] = io[8][z] = iter[z]; + iter += 1; + + auto DistributeRaw = [io, iter]( unsigned slot, unsigned label, + std::initializer_list rest ) { + for (unsigned i: rest) { + io[slot][i] = Xor(io[slot][i], iter[label]); + } + }; + + auto Distribute3 = [&iter, DistributeRaw, x, y, z]( unsigned idx, + std::initializer_list a , + std::initializer_list b , + std::initializer_list c ) { + DistributeRaw(idx, x, a); + DistributeRaw(idx, y, b); + DistributeRaw(idx, z, c); + iter += 1; + }; + + while (iter != io[9]) { + Distribute3(7, { x }, { y }, { z }); + } + + iter = io[1]; + Distribute3(8, { z } , { x, z } , { y } ); + Distribute3(8, { x, z } , { x, y, z }, { y, z }); + Distribute3(8, { y } , { y, z } , { x, z }); + Distribute3(8, { x, y } , { z } , { x } ); + Distribute3(8, { y, z } , { x, y } , { x, y, z }); + Distribute3(8, { x, y, z }, { x } , { x, y }); + } -template -inline void Encode2(Block raw_io[7 * 3]) { - auto io = reinterpret_cast(raw_io); - for (int i = 0; i < 3; ++i) { - io[6][i] = io[0][i]; - for (int j = 1; j < 6; ++j) { - io[6][i] = Xor(io[6][i], io[j][i]); - } - } -} + template + inline void Encode2( Block raw_io[7 * 3] ) { + auto io = reinterpret_cast(raw_io); + + for (int i = 0; i < 3; ++i) { + io[6][i] = io[0][i]; + for (int j = 1; j < 6; ++j) { + io[6][i] = Xor(io[6][i], io[j][i]); + } + } + } // https://docs.switzernet.com/people/emin-gabrielyan/051102-erasure-10-7-resilient/ -template -inline void Encode4(Block raw_io[10 * 3]) { - auto io = 
reinterpret_cast(raw_io); - - constexpr unsigned x = 0, y = 1, z = 2; - - const Block* iter = io[0]; - io[7][x] = io[8][x] = io[9][x] = iter[x]; - io[7][y] = io[8][y] = io[9][y] = iter[y]; - io[7][z] = io[8][z] = io[9][z] = iter[z]; - iter += 1; - - auto DistributeRaw = [io, iter](unsigned slot, unsigned label, - std::initializer_list rest) { - for (unsigned i : rest) { - io[slot][i] = Xor(io[slot][i], iter[label]); - } - }; - - auto Distribute3 = [&iter, DistributeRaw, x, y, z](unsigned idx, - std::initializer_list a, - std::initializer_list b, - std::initializer_list c) { - DistributeRaw(idx, x, a); - DistributeRaw(idx, y, b); - DistributeRaw(idx, z, c); - iter += 1; - }; - - while (iter != io[10]) { - Distribute3(7, {x}, {y}, {z}); - } - - iter = io[1]; - Distribute3(8, {z}, {x, z}, {y}); // 73 - Distribute3(8, {x, z}, {x, y, z}, {y, z}); // 140 - Distribute3(8, {y}, {y, z}, {x, z}); // 167 - Distribute3(8, {x, y}, {z}, {x}); // 198 - Distribute3(8, {y, z}, {x, y}, {x, y, z}); // 292 - Distribute3(8, {x, y, z}, {x}, {x, y}); // 323 - - iter = io[1]; - Distribute3(9, {x, z}, {x, y, z}, {y, z}); // 140 - Distribute3(9, {x, y}, {z}, {x}); // 198 - Distribute3(9, {z}, {x, z}, {y}); // 73 - Distribute3(9, {y, z}, {x, y}, {x, y, z}); // 292 - Distribute3(9, {x, y, z}, {x}, {x, y}); // 323 - Distribute3(9, {y}, {y, z}, {x, z}); // 167 -} + template + inline void Encode4( Block raw_io[10 * 3] ) { + auto io = reinterpret_cast(raw_io); + + constexpr unsigned x = 0, y = 1, z = 2; + + const Block * iter = io[0]; + + io[7][x] = io[8][x] = io[9][x] = iter[x]; + io[7][y] = io[8][y] = io[9][y] = iter[y]; + io[7][z] = io[8][z] = io[9][z] = iter[z]; + iter += 1; + + auto DistributeRaw = [io, iter]( unsigned slot, unsigned label, + std::initializer_list rest ) { + for (unsigned i: rest) { + io[slot][i] = Xor(io[slot][i], iter[label]); + } + }; + + auto Distribute3 = [&iter, DistributeRaw, x, y, z]( unsigned idx, + std::initializer_list a , + std::initializer_list b , + 
std::initializer_list c ) { + DistributeRaw(idx, x, a); + DistributeRaw(idx, y, b); + DistributeRaw(idx, z, c); + iter += 1; + }; + + while (iter != io[10]) { + Distribute3(7, { x }, { y }, { z }); + } + + iter = io[1]; + Distribute3(8, { z } , { x, z } , { y } ); // 73 + Distribute3(8, { x, z } , { x, y, z }, { y, z }); // 140 + Distribute3(8, { y } , { y, z } , { x, z }); // 167 + Distribute3(8, { x, y } , { z } , { x } ); // 198 + Distribute3(8, { y, z } , { x, y } , { x, y, z }); // 292 + Distribute3(8, { x, y, z }, { x } , { x, y }); // 323 + + iter = io[1]; + Distribute3(9, { x, z } , { x, y, z }, { y, z }); // 140 + Distribute3(9, { x, y } , { z } , { x } ); // 198 + Distribute3(9, { z } , { x, z } , { y } ); // 73 + Distribute3(9, { y, z } , { x, y } , { x, y, z }); // 292 + Distribute3(9, { x, y, z }, { x } , { x, y }); // 323 + Distribute3(9, { y } , { y, z } , { x, z }); // 167 + } // https://docs.switzernet.com/people/emin-gabrielyan/051103-erasure-9-5-resilient/ -template -inline void Encode5(Block raw_io[9 * 3]) { - auto io = reinterpret_cast(raw_io); - - constexpr unsigned x = 0, y = 1, z = 2; - - const Block* iter = io[0]; - io[5][x] = io[6][x] = iter[x]; - io[5][y] = io[6][y] = iter[y]; - io[5][z] = io[6][z] = iter[z]; - - io[7][x] = io[8][x] = iter[y]; - io[7][y] = io[8][y] = iter[z]; - io[7][z] = io[8][z] = Xor(iter[x], iter[y]); - iter += 1; - - auto DistributeRaw = [io, iter](unsigned slot, unsigned label, - std::initializer_list rest) { - for (unsigned i : rest) { - io[slot][i] = Xor(io[slot][i], iter[label]); - } - }; - - auto Distribute3 = [&iter, DistributeRaw, x, y, z](unsigned idx, - std::initializer_list a, - std::initializer_list b, - std::initializer_list c) { - DistributeRaw(idx, x, a); - DistributeRaw(idx, y, b); - DistributeRaw(idx, z, c); - iter += 1; - }; - - while (iter != io[9]) { - Distribute3(5, {x}, {y}, {z}); - } - - iter = io[1]; - Distribute3(6, {z}, {x, z}, {y}); // 73 - Distribute3(6, {x, z}, {x, y, z}, {y, z}); // 140 - 
Distribute3(6, {y}, {y, z}, {x, z}); // 167 - Distribute3(6, {x, y}, {z}, {x}); // 198 - - iter = io[1]; - Distribute3(7, {x, y, z}, {x}, {x, y}); // 323 - Distribute3(7, {x, z}, {x, y, z}, {y, z}); // 140 - Distribute3(7, {x}, {y}, {z}); // 11 - Distribute3(7, {y}, {y, z}, {x, z}); // 167 - - iter = io[1]; - Distribute3(8, {x}, {y}, {z}); // 11 - Distribute3(8, {x, y}, {z}, {x}); // 198 - Distribute3(8, {y, z}, {x, y}, {x, y, z}); // 292 - Distribute3(8, {x, z}, {x, y, z}, {y, z}); // 140 -} + template + inline void Encode5( Block raw_io[9 * 3] ) { + auto io = reinterpret_cast(raw_io); + + constexpr unsigned x = 0, y = 1, z = 2; + + const Block * iter = io[0]; + + io[5][x] = io[6][x] = iter[x]; + io[5][y] = io[6][y] = iter[y]; + io[5][z] = io[6][z] = iter[z]; + + io[7][x] = io[8][x] = iter[y]; + io[7][y] = io[8][y] = iter[z]; + io[7][z] = io[8][z] = Xor(iter[x], iter[y]); + iter += 1; + + auto DistributeRaw = [io, iter]( unsigned slot, unsigned label, + std::initializer_list rest ) { + for (unsigned i: rest) { + io[slot][i] = Xor(io[slot][i], iter[label]); + } + }; + + auto Distribute3 = [&iter, DistributeRaw, x, y, z]( unsigned idx, + std::initializer_list a , + std::initializer_list b , + std::initializer_list c ) { + DistributeRaw(idx, x, a); + DistributeRaw(idx, y, b); + DistributeRaw(idx, z, c); + iter += 1; + }; + + while (iter != io[9]) { + Distribute3(5, { x }, { y }, { z }); + } + + iter = io[1]; + Distribute3(6, { z } , { x, z } , { y } ); // 73 + Distribute3(6, { x, z } , { x, y, z }, { y, z }); // 140 + Distribute3(6, { y } , { y, z } , { x, z }); // 167 + Distribute3(6, { x, y } , { z } , { x } ); // 198 + + iter = io[1]; + Distribute3(7, { x, y, z }, { x } , { x, y }); // 323 + Distribute3(7, { x, z } , { x, y, z }, { y, z }); // 140 + Distribute3(7, { x } , { y }, { z }); // 11 + Distribute3(7, { y } , { y, z } , { x, z }); // 167 + + iter = io[1]; + Distribute3(8, { x } , { y }, { z }); // 11 + Distribute3(8, { x, y } , { z } , { x } ); // 198 + 
Distribute3(8, { y, z } , { x, y } , { x, y, z }); // 292 + Distribute3(8, { x, z } , { x, y, z }, { y, z }); // 140 + } -template -inline void Combine2(const Block input[7], Block output[2]); + template + inline void Combine2( const Block input[7], Block output[2] ); -template -inline void Combine3(const Block input[9], Block output[3]); + template + inline void Combine3( const Block input[9], Block output[3] ); -template -inline void Combine4(const Block input[10], Block output[3]); + template + inline void Combine4( const Block input[10], Block output[3] ); -template -inline void Combine5(const Block input[9], Block output[3]); + template + inline void Combine5( const Block input[9], Block output[3] ); -constexpr inline uint64_t FloorLog(uint64_t a, uint64_t b) { - return (0 == a) ? 0 : ((b < a) ? 0 : (1 + (FloorLog(a, b / a)))); -} + constexpr inline uint64_t FloorLog( uint64_t a, uint64_t b ) { + return (0 == a) ? 0 : ((b < a) ? 0 : (1 + (FloorLog(a, b / a)))); + } -template -struct EhcBadger { - using Block = typename BlockWrapper::Block; - - static Block Mix(const Block & accum, const Block & input, const Block & entropy) { - Block output = Plus32(entropy, input); - Block twin = RightShift32(output); - output = MultiplyAdd(accum, output, twin); - return output; - } - - static Block MixOne(const Block & accum, const Block & input, uint64_t entropy) { - return Mix(accum, input, BlockWrapper::LoadOne(entropy)); - } - - static Block MixNone(const Block & input, uint64_t entropy_word) { - Block entropy = BlockWrapper::LoadOne(entropy_word); - Block output = Plus32(entropy, input); - Block twin = RightShift32(output); - output = Times(output, twin); - return output; - } - - static void EhcUpperLayer(const Block (&input)[fanout][out_width], - const uint64_t entropy[out_width * (fanout - 1)], - Block (&output)[out_width]) { - for (unsigned i = 0; i < out_width; ++i) { - output[i] = input[0][i]; - for (unsigned j = 1; j < fanout; ++j) { - output[i] = 
MixOne(output[i], input[j][i], entropy[(fanout - 1) * i + j - 1]); - } - } - } - - static void Encode(Block io[encoded_dimension][in_width]) { - static_assert(2 <= out_width && out_width <= 5, "uhoh"); - if (out_width == 3) return Encode3(&io[0][0]); - if (out_width == 2) return Encode2(&io[0][0]); - if (out_width == 4) return Encode4(&io[0][0]); - if (out_width == 5) return Encode5(&io[0][0]); - } - - static Block SimpleTimes(std::integral_constant, const Block & x) { return Negate(x); } - static Block SimpleTimes(std::integral_constant, const Block & x) { return x; } - static Block SimpleTimes(std::integral_constant, const Block & x) { - return LeftShift(x, 1); - } - static Block SimpleTimes(std::integral_constant, const Block & x) { - return Plus(x, LeftShift(x, 1)); - } - static Block SimpleTimes(std::integral_constant, const Block & x) { - return LeftShift(x, 2); - } - static Block SimpleTimes(std::integral_constant, const Block & x) { - return Plus(x, LeftShift(x, 2)); - } - static Block SimpleTimes(std::integral_constant, const Block & x) { - return Minus(LeftShift(x, 3), x); - } - static Block SimpleTimes(std::integral_constant, const Block & x) { - return LeftShift(x, 3); - } - static Block SimpleTimes(std::integral_constant, const Block & x) { - return Plus(x, LeftShift(x, 3)); - } - - template - static Block SimplerTimes(const Block & x) { - return SimpleTimes(std::integral_constant{}, x); - } - - template - static void Dot2(Block sinks[2], const Block & x) { - sinks[0] = Plus(sinks[0], SimplerTimes(x)); - sinks[1] = Plus(sinks[1], SimplerTimes(x)); - } - - template - static void Dot3(Block sinks[3], const Block & x) { - Dot2(sinks, x); - sinks[2] = Plus(sinks[2], SimplerTimes(x)); - } - - template - static void Dot4(Block sinks[4], const Block & x) { - Dot3(sinks, x); - sinks[3] = Plus(sinks[3], SimplerTimes(x)); - } - - template - static void Dot5(Block sinks[5], const Block & x) { - Dot4(sinks, x); - sinks[4] = Plus(sinks[4], SimplerTimes(x)); - } - - 
static void Combine(const Block input[encoded_dimension], Block (&output)[out_width]) { - if (out_width == 3) return Combine3(input, output); - if (out_width == 2) return Combine2(input, output); - if (out_width == 4) return Combine4(input, output); - if (out_width == 5) return Combine5(input, output); - } - - static void Load(const uint8_t input[dimension * in_width * sizeof(Block)], - Block output[dimension][in_width]) { - static_assert(dimension * in_width <= 28, ""); + template + struct EhcBadger { + using Block = typename BlockWrapper::Block; + + static Block Mix( const Block & accum, const Block & input, const Block & entropy ) { + Block output = Plus32(entropy, input); + Block twin = RightShift32(output); + + output = MultiplyAdd(accum, output, twin); + return output; + } + + static Block MixOne( const Block & accum, const Block & input, uint64_t entropy ) { + return Mix(accum, input, BlockWrapper::LoadOne(entropy)); + } + + static Block MixNone( const Block & input, uint64_t entropy_word ) { + Block entropy = BlockWrapper::LoadOne(entropy_word); + Block output = Plus32(entropy, input); + Block twin = RightShift32(output); + + output = Times(output, twin); + return output; + } + + static void EhcUpperLayer( const Block (& input)[fanout][out_width], + const uint64_t entropy[out_width * (fanout - 1)], Block (& output)[out_width] ) { + for (unsigned i = 0; i < out_width; ++i) { + output[i] = input[0][i]; + for (unsigned j = 1; j < fanout; ++j) { + output[i] = MixOne(output[i], input[j][i], entropy[(fanout - 1) * i + j - 1]); + } + } + } + + static void Encode( Block io[encoded_dimension][in_width] ) { + static_assert(2 <= out_width && out_width <= 5, "uhoh"); + if (out_width == 3) { return Encode3(&io[0][0]); } + if (out_width == 2) { return Encode2(&io[0][0]); } + if (out_width == 4) { return Encode4(&io[0][0]); } + if (out_width == 5) { return Encode5(&io[0][0]); } + } + + static Block SimpleTimes( std::integral_constant, const Block & x ) { return Negate(x); 
} + + static Block SimpleTimes( std::integral_constant, const Block & x ) { return x; } + + static Block SimpleTimes( std::integral_constant, const Block & x ) { + return LeftShift(x, 1); + } + + static Block SimpleTimes( std::integral_constant, const Block & x ) { + return Plus(x, LeftShift(x, 1)); + } + + static Block SimpleTimes( std::integral_constant, const Block & x ) { + return LeftShift(x, 2); + } + + static Block SimpleTimes( std::integral_constant, const Block & x ) { + return Plus(x, LeftShift(x, 2)); + } + + static Block SimpleTimes( std::integral_constant, const Block & x ) { + return Minus(LeftShift(x, 3), x); + } + + static Block SimpleTimes( std::integral_constant, const Block & x ) { + return LeftShift(x, 3); + } + + static Block SimpleTimes( std::integral_constant, const Block & x ) { + return Plus(x, LeftShift(x, 3)); + } + + template + static Block SimplerTimes( const Block & x ) { + return SimpleTimes(std::integral_constant{}, x); + } + + template + static void Dot2( Block sinks[2], const Block & x ) { + sinks[0] = Plus(sinks[0], SimplerTimes(x)); + sinks[1] = Plus(sinks[1], SimplerTimes(x)); + } + + template + static void Dot3( Block sinks[3], const Block & x ) { + Dot2(sinks, x); + sinks[2] = Plus(sinks[2], SimplerTimes(x)); + } + + template + static void Dot4( Block sinks[4], const Block & x ) { + Dot3(sinks, x); + sinks[3] = Plus(sinks[3], SimplerTimes(x)); + } + + template + static void Dot5( Block sinks[5], const Block & x ) { + Dot4(sinks, x); + sinks[4] = Plus(sinks[4], SimplerTimes(x)); + } + + static void Combine( const Block input[encoded_dimension], Block (& output)[out_width] ) { + if (out_width == 3) { return Combine3(input, output); } + if (out_width == 2) { return Combine2(input, output); } + if (out_width == 4) { return Combine4(input, output); } + if (out_width == 5) { return Combine5(input, output); } + } + + static void Load( const uint8_t input[dimension * in_width * sizeof(Block)], + Block output[dimension][in_width] ) { + 
static_assert(dimension * in_width <= 28, ""); #if !defined(__clang__) -#pragma GCC unroll 28 + #pragma GCC unroll 28 #else -#pragma unroll + #pragma unroll #endif - for (unsigned i = 0; i < dimension; ++i) { + for (unsigned i = 0; i < dimension; ++i) { #if !defined(__clang__) -#pragma GCC unroll 28 + #pragma GCC unroll 28 #else -#pragma unroll + #pragma unroll #endif - for (unsigned j = 0; j < in_width; ++j) { - output[i][j] = - BlockWrapper::LoadBlock(&input[(i * in_width + j) * sizeof(Block)]); - } - } - } - - static void Hash(const Block (&input)[encoded_dimension][in_width], - const uint64_t entropy[encoded_dimension][in_width], - Block output[encoded_dimension]) { - for (unsigned i = 0; i < encoded_dimension; ++i) { - output[i] = MixNone(input[i][0], entropy[i][0]); - // TODO: should loading take care of this? - } - for (unsigned j = 1; j < in_width; ++j) { - for (unsigned i = 0; i < encoded_dimension; ++i) { - output[i] = MixOne(output[i], input[i][j], entropy[i][j]); - // TODO: this might be optional; it might not matter which way we iterate over - // entropy - } - } - } - - static void EhcBaseLayer(const uint8_t input[dimension * in_width * sizeof(Block)], - const uint64_t raw_entropy[encoded_dimension][in_width], - Block (&output)[out_width]) { - Block scratch[encoded_dimension][in_width]; - Block tmpout[encoded_dimension]; - Load(input, scratch); - Encode(scratch); - Hash(scratch, raw_entropy, tmpout); - Combine(tmpout, output); - } - - static void DfsTreeHash(const uint8_t* data, size_t block_group_length, - Block stack[][fanout][out_width], int stack_lengths[], - const uint64_t* entropy) { - auto entropy_matrix = reinterpret_cast(entropy); - for (size_t k = 0; k < block_group_length; ++k) { - int i = 0; - while (stack_lengths[i] == fanout) ++i; - for (int j = i - 1; j >= 0; --j) { - EhcUpperLayer( - stack[j], - &entropy[encoded_dimension * in_width + (fanout - 1) * out_width * j], - stack[j + 1][stack_lengths[j + 1]]); - stack_lengths[j] = 0; - 
stack_lengths[j + 1] += 1; - } - - EhcBaseLayer(&data[k * dimension * in_width * sizeof(Block)], entropy_matrix, - stack[0][stack_lengths[0]]); - stack_lengths[0] += 1; - } - } - - // auto b = sizeof(Block) / sizeof(uint64_t); - static constexpr size_t GEBN_b() { return sizeof(Block) / sizeof(uint64_t); } - // auto h = FloorLog(fanout, n / (b * dimension * in_width)); - static constexpr size_t GEBN_h(size_t n) { return FloorLog(fanout, n / (GEBN_b() * dimension * in_width)); } - static constexpr size_t GetEntropyBytesNeeded(size_t n) { - return sizeof(uint64_t) * (encoded_dimension * in_width + (fanout - 1) * out_width * GEBN_h(n) + - GEBN_b() * fanout * out_width * GEBN_h(n) + GEBN_b() * dimension * in_width + out_width - 1); - } - - struct BlockGreedy { - private: - const uint64_t* seeds; - Block accum[out_width] = {}; - - public: - BlockGreedy(const uint64_t seeds[]) : seeds(seeds) {} - - void Insert(const Block (&x)[out_width]) { - for (unsigned i = 0; i < out_width; ++i) { - accum[i] = Mix(accum[i], x[i], BlockWrapper::LoadBlockNative(seeds)); - seeds += sizeof(Block) / sizeof(uint64_t); - } - } - - void Insert(const Block & x) { - for (unsigned i = 0; i < out_width; ++i) { - accum[i] = - Mix(accum[i], x, - BlockWrapper::LoadBlockNative(&seeds[i * sizeof(Block) / sizeof(uint64_t)])); - } - // Toeplitz - seeds += sizeof(Block) / sizeof(uint64_t); - } - - void Hash(uint64_t output[out_width]) const { - for (unsigned i = 0; i < out_width; ++i) { - output[i] = Sum(accum[i]); - } - } - }; - - static void DfsGreedyFinalizer(const Block stack[][fanout][out_width], - const int stack_lengths[], const uint8_t* uint8_t_input, - size_t uint8_t_length, const uint64_t* entropy, - uint64_t output[out_width]) { - BlockGreedy b(entropy); - for (int j = 0; stack_lengths[j] > 0; ++j) { - for (int k = 0; k < stack_lengths[j]; k += 1) { - b.Insert(stack[j][k]); - } - } - - size_t i = 0; - for (; i + sizeof(Block) <= uint8_t_length; i += sizeof(Block)) { - 
b.Insert(BlockWrapper::LoadBlock(&uint8_t_input[i])); - } - - if (1) { - uint8_t extra[sizeof(Block)]; - memcpy(extra, &uint8_t_input[i], uint8_t_length - i); - memset(extra + uint8_t_length - i, 0, sizeof(extra) - uint8_t_length + i); - b.Insert(BlockWrapper::LoadBlock(extra)); - } else if (1) { - Block extra = {}; - memcpy(&extra, &uint8_t_input[i], uint8_t_length - i); - b.Insert(extra); - } else { - Block extra; - uint8_t* extra_uint8_t = reinterpret_cast(&extra); - for (unsigned j = 0; j < sizeof(Block); ++j) { - if (j < uint8_t_length - i) { - extra_uint8_t[j] = uint8_t_input[i + j]; - } else { - extra_uint8_t[j] = 0; - } - } - b.Insert(extra); - } - b.Hash(output); - } -}; // EhcBadger + for (unsigned j = 0; j < in_width; ++j) { + output[i][j] = + BlockWrapper::LoadBlock(&input[(i * in_width + j) * sizeof(Block)]); + } + } + } + + static void Hash( const Block (& input)[encoded_dimension][in_width], + const uint64_t entropy[encoded_dimension][in_width], Block output[encoded_dimension] ) { + for (unsigned i = 0; i < encoded_dimension; ++i) { + output[i] = MixNone(input[i][0], entropy[i][0]); + // TODO: should loading take care of this? 
+ } + for (unsigned j = 1; j < in_width; ++j) { + for (unsigned i = 0; i < encoded_dimension; ++i) { + output[i] = MixOne(output[i], input[i][j], entropy[i][j]); + // TODO: this might be optional; it might not matter which way we iterate over + // entropy + } + } + } + + static void EhcBaseLayer( const uint8_t input[dimension * in_width * sizeof(Block)], + const uint64_t raw_entropy[encoded_dimension][in_width], Block (& output)[out_width] ) { + Block scratch[encoded_dimension][in_width]; + Block tmpout[encoded_dimension]; + + Load(input, scratch); + Encode(scratch); + Hash(scratch, raw_entropy, tmpout); + Combine(tmpout, output); + } + + static void DfsTreeHash( const uint8_t * data, size_t block_group_length, + Block stack[][fanout][out_width], int stack_lengths[], const uint64_t * entropy ) { + auto entropy_matrix = reinterpret_cast(entropy); + + for (size_t k = 0; k < block_group_length; ++k) { + int i = 0; + while (stack_lengths[i] == fanout) { ++i; } + for (int j = i - 1; j >= 0; --j) { + EhcUpperLayer(stack[j], + &entropy[encoded_dimension * in_width + (fanout - 1) * out_width * j], + stack[j + 1][stack_lengths[j + 1]]); + stack_lengths[j] = 0; + stack_lengths[j + 1] += 1; + } + + EhcBaseLayer(&data[k * dimension * in_width * sizeof(Block)], + entropy_matrix, stack[0][stack_lengths[0]]); + stack_lengths[0] += 1; + } + } + + // auto b = sizeof(Block) / sizeof(uint64_t); + static constexpr size_t GEBN_b() { return sizeof(Block) / sizeof(uint64_t); } + + // auto h = FloorLog(fanout, n / (b * dimension * in_width)); + static constexpr size_t GEBN_h( size_t n ) { + return FloorLog(fanout, n / (GEBN_b() * dimension * in_width)); + } + + static constexpr size_t GetEntropyBytesNeeded( size_t n ) { + return sizeof(uint64_t) * (encoded_dimension * in_width + (fanout - 1) * out_width * GEBN_h(n) + + GEBN_b() * fanout * out_width * GEBN_h(n) + GEBN_b() * dimension * in_width + out_width - 1); + } + + struct BlockGreedy { + private: + const uint64_t * seeds; + Block 
accum[out_width] = {}; + + public: + BlockGreedy( const uint64_t seeds[] ) : + seeds( seeds ) {} + + void Insert( const Block (& x)[out_width] ) { + for (unsigned i = 0; i < out_width; ++i) { + accum[i] = Mix(accum[i], x[i], BlockWrapper::LoadBlockNative(seeds)); + seeds += sizeof(Block) / sizeof(uint64_t); + } + } + + void Insert( const Block & x ) { + for (unsigned i = 0; i < out_width; ++i) { + accum[i] = + Mix(accum[i], x, BlockWrapper::LoadBlockNative( + &seeds[i * sizeof(Block) / sizeof(uint64_t)])); + } + // Toeplitz + seeds += sizeof(Block) / sizeof(uint64_t); + } + + void Hash( uint64_t output[out_width] ) const { + for (unsigned i = 0; i < out_width; ++i) { + output[i] = Sum(accum[i]); + } + } + }; + + static void DfsGreedyFinalizer( const Block stack[][fanout][out_width], const int stack_lengths[], + const uint8_t * uint8_t_input, size_t uint8_t_length, const uint64_t * entropy, + uint64_t output[out_width] ) { + BlockGreedy b( entropy ); + + for (int j = 0; stack_lengths[j] > 0; ++j) { + for (int k = 0; k < stack_lengths[j]; k += 1) { + b.Insert(stack[j][k]); + } + } + + size_t i = 0; + for (; i + sizeof(Block) <= uint8_t_length; i += sizeof(Block)) { + b.Insert(BlockWrapper::LoadBlock(&uint8_t_input[i])); + } + + if (1) { + uint8_t extra[sizeof(Block)]; + memcpy(extra, &uint8_t_input[i], uint8_t_length - i); + memset(extra + uint8_t_length - i, 0, sizeof(extra) - uint8_t_length + i); + b.Insert(BlockWrapper::LoadBlock(extra)); + } else if (1) { + Block extra = {}; + memcpy(&extra, &uint8_t_input[i], uint8_t_length - i); + b.Insert(extra); + } else { + Block extra; + uint8_t * extra_uint8_t = reinterpret_cast(&extra); + for (unsigned j = 0; j < sizeof(Block); ++j) { + if (j < uint8_t_length - i) { + extra_uint8_t[j] = uint8_t_input[i + j]; + } else { + extra_uint8_t[j] = 0; + } + } + b.Insert(extra); + } + b.Hash(output); + } + }; // EhcBadger // evenness: 2 weight: 10 // 0 0 1 4 1 1 2 2 1 // 1 1 0 0 1 4 1 2 2 // 1 4 1 1 0 0 2 1 2 -template -inline void 
Combine3(const Block input[9], Block output[3]) { - output[1] = input[0]; - output[2] = input[0]; + template + inline void Combine3( const Block input[9], Block output[3] ) { + output[1] = input[0]; + output[2] = input[0]; - output[1] = Plus(output[1], input[1]); - output[2] = Plus(output[2], LeftShift(input[1], 2)); + output[1] = Plus(output[1], input[1]); + output[2] = Plus(output[2], LeftShift(input[1], 2)); - output[0] = input[2]; - output[2] = Plus(output[2], input[2]); + output[0] = input[2]; + output[2] = Plus(output[2], input[2]); - output[0] = Plus(output[0], LeftShift(input[3], 2)); - output[2] = Plus(output[2], input[3]); + output[0] = Plus(output[0], LeftShift(input[3], 2)); + output[2] = Plus(output[2], input[3]); - output[0] = Plus(output[0], input[4]); - output[1] = Plus(output[1], input[4]); + output[0] = Plus(output[0], input[4]); + output[1] = Plus(output[1], input[4]); - output[0] = Plus(output[0], input[5]); - output[1] = Plus(output[1], LeftShift(input[5], 2)); + output[0] = Plus(output[0], input[5]); + output[1] = Plus(output[1], LeftShift(input[5], 2)); - Badger::template Dot3<2, 1, 2>(output, input[6]); - Badger::template Dot3<2, 2, 1>(output, input[7]); - Badger::template Dot3<1, 2, 2>(output, input[8]); -} + Badger::template Dot3<2, 1, 2>(output, input[6]); + Badger::template Dot3<2, 2, 1>(output, input[7]); + Badger::template Dot3<1, 2, 2>(output, input[8]); + } -template -inline void Combine2(const Block input[7], Block output[2]) { - output[0] = input[0]; - output[1] = input[1]; + template + inline void Combine2( const Block input[7], Block output[2] ) { + output[0] = input[0]; + output[1] = input[1]; - Badger::template Dot2<1, 1>(output, input[2]); - Badger::template Dot2<1, 2>(output, input[3]); - Badger::template Dot2<2, 1>(output, input[4]); - Badger::template Dot2<1, 4>(output, input[5]); - Badger::template Dot2<4, 1>(output, input[6]); -} + Badger::template Dot2<1, 1>(output, input[2]); + Badger::template Dot2<1, 2>(output, 
input[3]); + Badger::template Dot2<2, 1>(output, input[4]); + Badger::template Dot2<1, 4>(output, input[5]); + Badger::template Dot2<4, 1>(output, input[6]); + } // evenness: 4 weight: 16 // 8 8 0 2 1 8 2 1 2 4 @@ -745,31 +806,31 @@ inline void Combine2(const Block input[7], Block output[2]) { // 2 0 1 0 4 0 1 1 1 1 // 1 1 0 1 0 0 4 1 2 8 -template -inline void Combine4(const Block input[10], Block output[4]) { - output[2] = LeftShift(input[0], 1); - output[3] = input[0]; + template + inline void Combine4( const Block input[10], Block output[4] ) { + output[2] = LeftShift(input[0], 1); + output[3] = input[0]; - output[1] = input[1]; - output[3] = Plus(output[3], input[1]); + output[1] = input[1]; + output[3] = Plus(output [3], input[1 ] ); - output[1] = Plus(output[1], LeftShift(input[2], 1)); - output[2] = Plus(output[2], input[2]); + output[1] = Plus(output [1], LeftShift(input[2], 1)); + output[2] = Plus(output [2], input[2 ] ); - output[0] = input[3]; - output[3] = Plus(output[3], input[3]); + output[0] = input[3]; + output[3] = Plus(output [3], input[3 ] ); - output[0] = Plus(output[0], input[4]); - output[2] = Plus(output[2], LeftShift(input[4], 2)); + output[0] = Plus(output [0], input[4 ] ); + output[2] = Plus(output [2], LeftShift(input[4], 2)); - output[0] = Plus(output[0], LeftShift(input[5], 2)); - output[1] = Plus(output[1], input[5]); + output[0] = Plus(output [0], LeftShift(input[5], 2)); + output[1] = Plus(output [1], input[5 ] ); - Badger::template Dot4<2, 1, 1, 4>(output, input[6]); - Badger::template Dot4<4, 2, 1, 1>(output, input[7]); - Badger::template Dot4<1, 4, 1, 2>(output, input[8]); - Badger::template Dot4<1, 1, 1, 8>(output, input[9]); -} + Badger::template Dot4<2, 1, 1, 4>(output, input[6]); + Badger::template Dot4<4, 2, 1, 1>(output, input[7]); + Badger::template Dot4<1, 4, 1, 2>(output, input[8]); + Badger::template Dot4<1, 1, 1, 8>(output, input[9]); + } // TODO: // 0 0 0 0 1 x x x x @@ -785,353 +846,381 @@ inline void Combine4(const 
Block input[10], Block output[4]) { // 0 0 0 1 0 1 4 9 8 // 0 0 0 0 1 1 5 3 9 -template -inline void Combine5(const Block input[10], Block output[5]) { - output[0] = input[0]; - output[1] = input[1]; - output[2] = input[2]; - output[3] = input[3]; - output[4] = input[4]; - - output[0] = Plus(output[0], input[5]); - output[1] = Plus(output[1], input[5]); - output[2] = Plus(output[2], input[5]); - output[3] = Plus(output[3], input[5]); - output[4] = Plus(output[4], input[5]); - - Badger::template Dot5<1, 2, 3, 4, 5>(output, input[6]); - Badger::template Dot5<2, 1, 8, 9, 3>(output, input[7]); - Badger::template Dot5<4, 7, 5, 8, 9>(output, input[8]); -} + template + inline void Combine5( const Block input[10], Block output[5] ) { + output[0] = input[0]; + output[1] = input[1]; + output[2] = input[2]; + output[3] = input[3]; + output[4] = input[4]; + + output[0] = Plus(output[0], input[5]); + output[1] = Plus(output[1], input[5]); + output[2] = Plus(output[2], input[5]); + output[3] = Plus(output[3], input[5]); + output[4] = Plus(output[4], input[5]); + + Badger::template Dot5<1, 2, 3, 4, 5>(output, input[6]); + Badger::template Dot5<2, 1, 8, 9, 3>(output, input[7]); + Badger::template Dot5<4, 7, 5, 8, 9>(output, input[8]); + } -template -inline uint64_t TabulateBytes(uint64_t input, const uint64_t entropy[256 * width]) { - const uint64_t(&table)[width][256] = - *reinterpret_cast(entropy); - uint64_t result = 0; - for (unsigned i = 0; i < width; ++i) { - uint8_t index = input >> (i * CHAR_BIT); - result ^= table[i][index]; - } - return result; -} + template + inline uint64_t TabulateBytes( uint64_t input, const uint64_t entropy[256 * width] ) { + const uint64_t(&table)[width][256] = + *reinterpret_cast(entropy); + uint64_t result = 0; + for (unsigned i = 0; i < width; ++i) { + uint8_t index = input >> (i * CHAR_BIT); + result ^= table[i][index]; + } + return result; + } -template -static void Hash(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - 
uint64_t output[out_width]) { - constexpr unsigned kMaxStackSize = 9; - constexpr unsigned kFanout = 8; + template + static void Hash( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + constexpr unsigned kMaxStackSize = 9; + constexpr unsigned kFanout = 8; - using Block = typename BlockWrapper::Block; + using Block = typename BlockWrapper::Block; - Block stack[kMaxStackSize][kFanout][out_width]; - int stack_lengths[kMaxStackSize] = {}; - size_t wide_length = length / sizeof(Block) / (dimension * in_width); + Block stack[kMaxStackSize][kFanout][out_width]; + int stack_lengths[kMaxStackSize] = {}; + size_t wide_length = length / sizeof(Block) / (dimension * in_width); - EhcBadger::DfsTreeHash(uint8_t_input, wide_length, stack, stack_lengths, entropy); - entropy += encoded_dimension * in_width + out_width * (kFanout - 1) * kMaxStackSize; + EhcBadger::DfsTreeHash(uint8_t_input, wide_length, stack, stack_lengths, entropy); + entropy += encoded_dimension * in_width + out_width * (kFanout - 1) * kMaxStackSize; - auto used_uint8_ts = wide_length * sizeof(Block) * (dimension * in_width); - uint8_t_input += used_uint8_ts; + auto used_uint8_ts = wide_length * sizeof(Block) * (dimension * in_width); + uint8_t_input += used_uint8_ts; - EhcBadger::DfsGreedyFinalizer(stack, stack_lengths, uint8_t_input, - length - used_uint8_ts, entropy, output); -} + EhcBadger::DfsGreedyFinalizer(stack, stack_lengths, uint8_t_input, + length - used_uint8_ts, entropy, output); + } -template -struct alignas(alignof(Block)) Repeat { - Block it[count]; -}; + template + struct alignas( alignof(Block)) Repeat { + Block it[count]; + }; + + template + struct RepeatWrapper { + using InnerBlock = typename InnerBlockWrapper::Block; + + using Block = Repeat; + + static Block LoadOne( uint64_t entropy ) { + Block result; + + for (unsigned i = 0; i < count; ++i) { + result.it[i] = InnerBlockWrapper::LoadOne(entropy); + } + return result; + } + + static 
Block LoadBlock( const void * x ) { + auto y = reinterpret_cast(x); + Block result; + + for (unsigned i = 0; i < count; ++i) { + result.it[i] = InnerBlockWrapper::LoadBlock(y + i * sizeof(InnerBlock)); + } + return result; + } + + static Block LoadBlockNative( const void * x ) { + auto y = reinterpret_cast(x); + Block result; + + for (unsigned i = 0; i < count; ++i) { + result.it[i] = InnerBlockWrapper::LoadBlockNative(y + i * sizeof(InnerBlock)); + } + return result; + } + }; + + template + inline Repeat Xor( const Repeat & a, const Repeat & b ) { + Repeat result; + + for (unsigned i = 0; i < count; ++i) { + result.it[i] = Xor(a.it[i], b.it[i]); + } + return result; + } -template -struct RepeatWrapper { - using InnerBlock = typename InnerBlockWrapper::Block; + template + inline Repeat Plus32( const Repeat & a, const Repeat & b ) { + Repeat result; - using Block = Repeat; + for (unsigned i = 0; i < count; ++i) { + result.it[i] = Plus32(a.it[i], b.it[i]); + } + return result; + } - static Block LoadOne(uint64_t entropy) { - Block result; - for (unsigned i = 0; i < count; ++i) { - result.it[i] = InnerBlockWrapper::LoadOne(entropy); - } - return result; - } - - static Block LoadBlock(const void* x) { - auto y = reinterpret_cast(x); - Block result; - for (unsigned i = 0; i < count; ++i) { - result.it[i] = InnerBlockWrapper::LoadBlock(y + i * sizeof(InnerBlock)); - } - return result; - } - - static Block LoadBlockNative(const void* x) { - auto y = reinterpret_cast(x); - Block result; - for (unsigned i = 0; i < count; ++i) { - result.it[i] = InnerBlockWrapper::LoadBlockNative(y + i * sizeof(InnerBlock)); - } - return result; - } -}; - -template -inline Repeat Xor(const Repeat & a, const Repeat & b) { - Repeat result; - for (unsigned i = 0; i < count; ++i) { - result.it[i] = Xor(a.it[i], b.it[i]); - } - return result; -} + template + inline Repeat Plus( const Repeat & a, const Repeat & b ) { + Repeat result; -template -inline Repeat Plus32(const Repeat & a, const Repeat & 
b) { - Repeat result; - for (unsigned i = 0; i < count; ++i) { - result.it[i] = Plus32(a.it[i], b.it[i]); - } - return result; -} + for (unsigned i = 0; i < count; ++i) { + result.it[i] = Plus(a.it[i], b.it[i]); + } + return result; + } -template -inline Repeat Plus(const Repeat & a, const Repeat & b) { - Repeat result; - for (unsigned i = 0; i < count; ++i) { - result.it[i] = Plus(a.it[i], b.it[i]); - } - return result; -} + template + inline Repeat Minus( const Repeat & a, const Repeat & b ) { + Repeat result; -template -inline Repeat Minus(const Repeat & a, const Repeat & b) { - Repeat result; - for (unsigned i = 0; i < count; ++i) { - result.it[i] = Minus(a.it[i], b.it[i]); - } - return result; -} + for (unsigned i = 0; i < count; ++i) { + result.it[i] = Minus(a.it[i], b.it[i]); + } + return result; + } -template -inline Repeat LeftShift(const Repeat & a, int s) { - Repeat result; - for (unsigned i = 0; i < count; ++i) { - result.it[i] = LeftShift(a.it[i], s); - } - return result; -} + template + inline Repeat LeftShift( const Repeat & a, int s ) { + Repeat result; -template -inline Repeat RightShift32(const Repeat & a) { - Repeat result; - for (unsigned i = 0; i < count; ++i) { - result.it[i] = RightShift32(a.it[i]); - } - return result; -} + for (unsigned i = 0; i < count; ++i) { + result.it[i] = LeftShift(a.it[i], s); + } + return result; + } -template -inline Repeat Times(const Repeat & a, const Repeat & b) { - Repeat result; - for (unsigned i = 0; i < count; ++i) { - result.it[i] = Times(a.it[i], b.it[i]); - } - return result; -} + template + inline Repeat RightShift32( const Repeat & a ) { + Repeat result; -template -inline uint64_t Sum(const Repeat & a) { - uint64_t result = 0; - for (unsigned i = 0; i < count; ++i) { - result += Sum(a.it[i]); - } - return result; -} + for (unsigned i = 0; i < count; ++i) { + result.it[i] = RightShift32(a.it[i]); + } + return result; + } -template -inline Repeat Negate(const Repeat & a) { - Repeat b; - for (unsigned i = 
0; i < count; ++i) { - b.it[i] = Negate(a.it[i]); - } - return b; -} + template + inline Repeat Times( const Repeat & a, const Repeat & b ) { + Repeat result; -} // namespace + for (unsigned i = 0; i < count; ++i) { + result.it[i] = Times(a.it[i], b.it[i]); + } + return result; + } + + template + inline uint64_t Sum( const Repeat & a ) { + uint64_t result = 0; + + for (unsigned i = 0; i < count; ++i) { + result += Sum(a.it[i]); + } + return result; + } + + template + inline Repeat Negate( const Repeat & a ) { + Repeat b; + + for (unsigned i = 0; i < count; ++i) { + b.it[i] = Negate(a.it[i]); + } + return b; + } + } // namespace //------------------------------------------------------------ -template -inline constexpr size_t GetEntropyBytesNeeded(size_t n) { - return (3 == out_width) - ? EhcBadger::GetEntropyBytesNeeded(n) - : (2 == out_width) - ? EhcBadger::GetEntropyBytesNeeded( - n) - : (4 == out_width) - ? EhcBadger::GetEntropyBytesNeeded( - n) - : EhcBadger::GetEntropyBytesNeeded( - n); -} + template + inline constexpr size_t GetEntropyBytesNeeded( size_t n ) { + return (3 == out_width) ? + EhcBadger::GetEntropyBytesNeeded(n) : + (2 == out_width) ? + EhcBadger::GetEntropyBytesNeeded(n) + : + (4 == out_width) ? + EhcBadger::GetEntropyBytesNeeded(n) + : + EhcBadger::GetEntropyBytesNeeded(n); + } // auto b = 8; -inline constexpr size_t MEBN_b() { return 8; } + inline constexpr size_t MEBN_b() { return 8; } + // auto h = FloorLog(8, ~0ull / 21); -inline constexpr size_t MEBN_h() { return FloorLog(8, ~0ull / 21); } + inline constexpr size_t MEBN_h() { return FloorLog(8, ~0ull / 21); } + // auto tab_words = 0;//6 * 8 * 256; // TODO: include words of tabulation? 
-inline constexpr size_t MEBN_tab_words() { return 0; } + inline constexpr size_t MEBN_tab_words() { return 0; } + // auto words = 21 + 7 * 5 * h + b * 8 * 5 * h + b * 21 + 5 - 1; -inline constexpr size_t MEBN_words() { return 21 + 7 * 5 * MEBN_h() + MEBN_b() * 8 * 5 * MEBN_h() + MEBN_b() * 21 + 5 - 1; } -inline constexpr size_t MaxEntropyBytesNeeded() { - return sizeof(uint64_t) * (MEBN_words() + MEBN_tab_words()); -} + inline constexpr size_t MEBN_words() { + return 21 + 7 * 5 * MEBN_h() + MEBN_b() * 8 * 5 * MEBN_h() + MEBN_b() * 21 + 5 - 1; + } -template -inline uint64_t TabulateAfter(const uint64_t* entropy, const uint8_t* uint8_t_input, - size_t length) { - const uint64_t(&table)[sizeof(uint64_t) * (1 + width)][256] = - *reinterpret_cast(entropy); - entropy += width * 256; - uint64_t output[width]; - Hasher(entropy, uint8_t_input, length, output); - uint64_t result = TabulateBytes(length, &table[0][0]); - for (int i = 0; i < width; ++i) { - result ^= TabulateBytes(output[i], &table[8 * (i + 1)][0]); - } - return result; -} + inline constexpr size_t MaxEntropyBytesNeeded() { + return sizeof(uint64_t) * (MEBN_words() + MEBN_tab_words()); + } + + template + inline uint64_t TabulateAfter( const uint64_t * entropy, const uint8_t * uint8_t_input, size_t length ) { + const uint64_t(&table)[sizeof(uint64_t) * (1 + width)][256] = + *reinterpret_cast(entropy); + entropy += width * 256; + uint64_t output[width]; + Hasher(entropy, uint8_t_input, length, output); + uint64_t result = TabulateBytes(length, &table[0][0]); + for (int i = 0; i < width; ++i) { + result ^= TabulateBytes(output[i], &table[8 * (i + 1)][0]); + } + return result; + } //------------------------------------------------------------ -template -inline void V4Scalar(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]) { - return Hash, 8>, dimension, in_width, - encoded_dimension, out_width>(entropy, uint8_t_input, length, output); -} + template + inline void 
V4Scalar( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + return Hash, 8>, dimension, in_width, + encoded_dimension, out_width>(entropy, uint8_t_input, length, output); + } -template -inline void V3Scalar(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]) { - return Hash, 4>, dimension, in_width, - encoded_dimension, out_width>(entropy, uint8_t_input, length, output); -} + template + inline void V3Scalar( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + return Hash, 4>, dimension, in_width, + encoded_dimension, out_width>(entropy, uint8_t_input, length, output); + } -template -inline void V2Scalar(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]) { - return Hash, 2>, dimension, in_width, - encoded_dimension, out_width>(entropy, uint8_t_input, length, output); -} + template + inline void V2Scalar( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + return Hash, 2>, dimension, in_width, + encoded_dimension, out_width>(entropy, uint8_t_input, length, output); + } -template -inline void V1Scalar(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]) { - return Hash, dimension, in_width, encoded_dimension, out_width>( - entropy, uint8_t_input, length, output); -} + template + inline void V1Scalar( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + return Hash, dimension, in_width, encoded_dimension, out_width>( + entropy, uint8_t_input, length, output); + } #if defined(HAVE_ARM_NEON) -template -inline void V2Neon(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]) { - return Hash, dimension, in_width, encoded_dimension, out_width>( - entropy, uint8_t_input, 
length, output); -} -template -inline void V3Neon(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]) { - return Hash, 2>, dimension, in_width, encoded_dimension, - out_width>(entropy, uint8_t_input, length, output); -} + template + inline void V2Neon( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + return Hash, dimension, in_width, encoded_dimension, out_width>( + entropy, uint8_t_input, length, output); + } + + template + inline void V3Neon( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + return Hash, 2>, dimension, in_width, encoded_dimension, + out_width>(entropy, uint8_t_input, length, output); + } + + template + inline void V4Neon( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + return Hash, 4>, dimension, in_width, encoded_dimension, + out_width>(entropy, uint8_t_input, length, output); + } -template -inline void V4Neon(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]) { - return Hash, 4>, dimension, in_width, encoded_dimension, - out_width>(entropy, uint8_t_input, length, output); -} #else // HAVE_ARM_NEON -#if defined(HAVE_SSE_2) -template -inline void V2Sse2(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]) { - return Hash, dimension, in_width, encoded_dimension, out_width>( - entropy, uint8_t_input, length, output); -} + #if defined(HAVE_SSE_2) + + template + inline void V2Sse2( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + return Hash, dimension, in_width, encoded_dimension, out_width>( + entropy, uint8_t_input, length, output); + } -template -inline void V3Sse2(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]) { - return 
Hash, 2>, dimension, in_width, encoded_dimension, - out_width>(entropy, uint8_t_input, length, output); -} + template + inline void V3Sse2( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + return Hash, 2>, dimension, in_width, encoded_dimension, + out_width>(entropy, uint8_t_input, length, output); + } -template -inline void V4Sse2(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]) { - return Hash, 4>, dimension, in_width, encoded_dimension, - out_width>(entropy, uint8_t_input, length, output); -} -#endif + template + inline void V4Sse2( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + return Hash, 4>, dimension, in_width, encoded_dimension, + out_width>(entropy, uint8_t_input, length, output); + } -#if defined(HAVE_AVX2) -template -inline void V3Avx2(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]) { - return Hash, dimension, in_width, encoded_dimension, out_width>( - entropy, uint8_t_input, length, output); -} + #endif -template -inline void V4Avx2(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]) { - return Hash, 2>, dimension, in_width, encoded_dimension, - out_width>(entropy, uint8_t_input, length, output); -} -#endif + #if defined(HAVE_AVX2) -#if defined(HAVE_AVX512_F) -template -inline void V4Avx512(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]) { - return Hash, dimension, in_width, encoded_dimension, out_width>( - entropy, uint8_t_input, length, output); -} -#endif + template + inline void V3Avx2( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + return Hash, dimension, in_width, encoded_dimension, out_width>( + entropy, uint8_t_input, length, output); + } + + template + inline void 
V4Avx2( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + return Hash, 2>, dimension, in_width, encoded_dimension, + out_width>(entropy, uint8_t_input, length, output); + } + + #endif + + #if defined(HAVE_AVX512_F) + + template + inline void V4Avx512( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ) { + return Hash, dimension, in_width, encoded_dimension, out_width>( + entropy, uint8_t_input, length, output); + } + + #endif #endif // HAVE_ARM_NEON -template -static inline void V4(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]); -template -static inline void V3(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]); -template -static inline void V2(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]); -template -static inline void V1(const uint64_t* entropy, const uint8_t* uint8_t_input, size_t length, - uint64_t output[out_width]); + template + static inline void V4( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ); + + template + static inline void V3( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ); + + template + static inline void V2( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ); + + template + static inline void V1( const uint64_t * entropy, const uint8_t * uint8_t_input, + size_t length, uint64_t output[out_width] ); //------------------------------------------------------------ #define SPECIALIZE(version, isa, out_width, dimension, in_width, encoded_dimension) \ @@ -1159,92 +1248,87 @@ static inline void V1(const uint64_t* entropy, const uint8_t* uint8_t_input, siz #if defined(HAVE_ARM_NEON) -SPECIALIZE_4(4, Neon) -SPECIALIZE_4(3, Neon) 
-SPECIALIZE_4(2, Neon) -SPECIALIZE_4(1, Scalar) + SPECIALIZE_4(4, Neon ) + SPECIALIZE_4(3, Neon ) + SPECIALIZE_4(2, Neon ) + SPECIALIZE_4(1, Scalar) #elif defined(HAVE_AVX512_F) -SPECIALIZE_4(4, Avx512) -SPECIALIZE_4(3, Avx2) -SPECIALIZE_4(2, Sse2) -SPECIALIZE_4(1, Scalar) + SPECIALIZE_4(4, Avx512) + SPECIALIZE_4(3, Avx2 ) + SPECIALIZE_4(2, Sse2 ) + SPECIALIZE_4(1, Scalar) #elif defined(HAVE_AVX2) -SPECIALIZE_4(4, Avx2) -SPECIALIZE_4(3, Avx2) -SPECIALIZE_4(2, Sse2) -SPECIALIZE_4(1, Scalar) + SPECIALIZE_4(4, Avx2 ) + SPECIALIZE_4(3, Avx2 ) + SPECIALIZE_4(2, Sse2 ) + SPECIALIZE_4(1, Scalar) #elif defined(HAVE_SSE_2) -SPECIALIZE_4(4, Sse2) -SPECIALIZE_4(3, Sse2) -SPECIALIZE_4(2, Sse2) -SPECIALIZE_4(1, Scalar) + SPECIALIZE_4(4, Sse2 ) + SPECIALIZE_4(3, Sse2 ) + SPECIALIZE_4(2, Sse2 ) + SPECIALIZE_4(1, Scalar) #else -SPECIALIZE_4(4, Scalar) -SPECIALIZE_4(3, Scalar) -SPECIALIZE_4(2, Scalar) -SPECIALIZE_4(1, Scalar) + SPECIALIZE_4(4, Scalar) + SPECIALIZE_4(3, Scalar) + SPECIALIZE_4(2, Scalar) + SPECIALIZE_4(1, Scalar) #endif - -} // namespace advanced + } // namespace advanced //------------------------------------------------------------ -static constexpr size_t kEntropyBytesNeeded = - 256 * 3 * sizeof(uint64_t) * sizeof(uint64_t) + - advanced::GetEntropyBytesNeeded< - advanced::RepeatWrapper, 8>, 2>(~0ul); - -template < bool bswap > -static inline uint64_t HalftimeHashStyle512( - const uint64_t entropy[kEntropyBytesNeeded / sizeof(uint64_t)], const uint8_t input[], - size_t length) { - return advanced::TabulateAfter, 2>(entropy, input, length); -} - -template < bool bswap > -static inline uint64_t HalftimeHashStyle256( - const uint64_t entropy[kEntropyBytesNeeded / sizeof(uint64_t)], const uint8_t input[], - size_t length) { - return advanced::TabulateAfter, 2>(entropy, input, length); -} + static constexpr size_t kEntropyBytesNeeded = + 256 * 3 * sizeof(uint64_t) * sizeof(uint64_t) + + advanced::GetEntropyBytesNeeded< + advanced::RepeatWrapper, 8>, 2>(~0ul); + + 
template + static inline uint64_t HalftimeHashStyle512( const uint64_t entropy[kEntropyBytesNeeded / sizeof(uint64_t)], + const uint8_t input[], size_t length ) { + return advanced::TabulateAfter, 2>(entropy, input, length); + } -template < bool bswap > -static inline uint64_t HalftimeHashStyle128( - const uint64_t entropy[kEntropyBytesNeeded / sizeof(uint64_t)], const uint8_t input[], - size_t length) { - return advanced::TabulateAfter, 2>(entropy, input, length); -} + template + static inline uint64_t HalftimeHashStyle256( const uint64_t entropy[kEntropyBytesNeeded / sizeof(uint64_t)], + const uint8_t input[], size_t length ) { + return advanced::TabulateAfter, 2>(entropy, input, length); + } -template < bool bswap > -static inline uint64_t HalftimeHashStyle64( - const uint64_t entropy[kEntropyBytesNeeded / sizeof(uint64_t)], const uint8_t input[], - size_t length) { - return advanced::TabulateAfter, 2>(entropy, input, length); -} + template + static inline uint64_t HalftimeHashStyle128( const uint64_t entropy[kEntropyBytesNeeded / sizeof(uint64_t)], + const uint8_t input[], size_t length ) { + return advanced::TabulateAfter, 2>(entropy, input, length); + } -} // namespace halftime_hash + template + static inline uint64_t HalftimeHashStyle64( const uint64_t entropy[kEntropyBytesNeeded / sizeof(uint64_t)], + const uint8_t input[], size_t length ) { + return advanced::TabulateAfter, 2>(entropy, input, length); + } +} // namespace halftime_hash //------------------------------------------------------------ alignas(64) static thread_local uint64_t - halftime_hash_random[8 * ((halftime_hash::kEntropyBytesNeeded / 64) + 1)]; +halftime_hash_random[8 * ((halftime_hash::kEntropyBytesNeeded / 64) + 1)]; // romu random number generator for seeding the HalftimeHash entropy -static uint64_t splitmix(uint64_t & state) { - uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15)); - z = (z ^ (z >> 30)) * UINT64_C(0xbf58476d1ce4e5b9); - z = (z ^ (z >> 27)) * 
UINT64_C(0x94d049bb133111eb); - return z ^ (z >> 31); +static uint64_t splitmix( uint64_t & state ) { + uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15)); + + z = (z ^ (z >> 30)) * UINT64_C(0xbf58476d1ce4e5b9); + z = (z ^ (z >> 27)) * UINT64_C(0x94d049bb133111eb); + return z ^ (z >> 31); } -static uintptr_t halftime_hash_seed_init(const seed_t seed) { +static uintptr_t halftime_hash_seed_init( const seed_t seed ) { uint64_t mState = seed; uint64_t wState = splitmix(mState); uint64_t xState = splitmix(mState); @@ -1273,108 +1357,112 @@ static uintptr_t halftime_hash_seed_init(const seed_t seed) { } //------------------------------------------------------------ -template < bool bswap > -static void HalftimeHash64(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void HalftimeHash64( const void * in, const size_t len, const seed_t seed, void * out ) { const uint64_t * random_words = (const uint64_t *)(uintptr_t)seed; - uint64_t h = halftime_hash::HalftimeHashStyle64(random_words, (const uint8_t *)in, (size_t)len); + uint64_t h = halftime_hash::HalftimeHashStyle64(random_words, (const uint8_t *)in, (size_t)len); + PUT_U64(h, (uint8_t *)out, 0); } -template < bool bswap > -static void HalftimeHash128(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void HalftimeHash128( const void * in, const size_t len, const seed_t seed, void * out ) { const uint64_t * random_words = (const uint64_t *)(uintptr_t)seed; - uint64_t h = halftime_hash::HalftimeHashStyle128(random_words, (const uint8_t *)in, (size_t)len); + uint64_t h = halftime_hash::HalftimeHashStyle128(random_words, (const uint8_t *)in, (size_t)len); + PUT_U64(h, (uint8_t *)out, 0); } -template < bool bswap > -static void HalftimeHash256(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void HalftimeHash256( const void * in, const size_t len, const seed_t seed, void * out ) { const uint64_t * random_words = 
(const uint64_t *)(uintptr_t)seed; - uint64_t h = halftime_hash::HalftimeHashStyle256(random_words, (const uint8_t *)in, (size_t)len); + uint64_t h = halftime_hash::HalftimeHashStyle256(random_words, (const uint8_t *)in, (size_t)len); + PUT_U64(h, (uint8_t *)out, 0); } -template < bool bswap > -static void HalftimeHash512(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void HalftimeHash512( const void * in, const size_t len, const seed_t seed, void * out ) { const uint64_t * random_words = (const uint64_t *)(uintptr_t)seed; - uint64_t h = halftime_hash::HalftimeHashStyle512(random_words, (const uint8_t *)in, (size_t)len); + uint64_t h = halftime_hash::HalftimeHashStyle512(random_words, (const uint8_t *)in, (size_t)len); + PUT_U64(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(halftimehash, - $.src_url = "https://github.com/jbapple/HalftimeHash", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/jbapple/HalftimeHash", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); REGISTER_HASH(HalftimeHash_64, - $.desc = "Halftime Hash (64-bit blocks)", - $.sort_order = 10, - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE , - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 64, - $.verification_LE = 0xED42E424, - $.verification_BE = 0x7EE5ED6F, - $.hashfn_native = HalftimeHash64, - $.hashfn_bswap = HalftimeHash64, - $.seedfn = halftime_hash_seed_init -); + $.desc = "Halftime Hash (64-bit blocks)", + $.sort_order = 10, + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xED42E424, + $.verification_BE = 0x7EE5ED6F, + $.hashfn_native = HalftimeHash64, + $.hashfn_bswap = HalftimeHash64, + $.seedfn = halftime_hash_seed_init + ); REGISTER_HASH(HalftimeHash_128, - 
$.desc = "Halftime Hash (128-bit blocks)", - $.sort_order = 20, - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE , - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 64, - $.verification_LE = 0x952DF141, - $.verification_BE = 0xD79E990B, - $.hashfn_native = HalftimeHash128, - $.hashfn_bswap = HalftimeHash128, - $.seedfn = halftime_hash_seed_init -); + $.desc = "Halftime Hash (128-bit blocks)", + $.sort_order = 20, + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x952DF141, + $.verification_BE = 0xD79E990B, + $.hashfn_native = HalftimeHash128, + $.hashfn_bswap = HalftimeHash128, + $.seedfn = halftime_hash_seed_init + ); REGISTER_HASH(HalftimeHash_256, - $.desc = "Halftime Hash (256-bit blocks)", - $.sort_order = 30, - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE , - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 64, - $.verification_LE = 0x912330EA, - $.verification_BE = 0x23C24991, - $.hashfn_native = HalftimeHash256, - $.hashfn_bswap = HalftimeHash256, - $.seedfn = halftime_hash_seed_init -); + $.desc = "Halftime Hash (256-bit blocks)", + $.sort_order = 30, + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x912330EA, + $.verification_BE = 0x23C24991, + $.hashfn_native = HalftimeHash256, + $.hashfn_bswap = HalftimeHash256, + $.seedfn = halftime_hash_seed_init + ); REGISTER_HASH(HalftimeHash_512, - $.desc = "Halftime Hash (512-bit blocks)", - $.sort_order = 40, - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE , - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT , - $.bits = 64, - $.verification_LE = 0x1E0F99EA, - $.verification_BE = 
0xA3A0AE42, - $.hashfn_native = HalftimeHash512, - $.hashfn_bswap = HalftimeHash512, - $.seedfn = halftime_hash_seed_init -); + $.desc = "Halftime Hash (512-bit blocks)", + $.sort_order = 40, + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x1E0F99EA, + $.verification_BE = 0xA3A0AE42, + $.hashfn_native = HalftimeHash512, + $.hashfn_bswap = HalftimeHash512, + $.seedfn = halftime_hash_seed_init + ); diff --git a/hashes/hasshe2.cpp b/hashes/hasshe2.cpp index bca5cc21..d10e1740 100644 --- a/hashes/hasshe2.cpp +++ b/hashes/hasshe2.cpp @@ -32,35 +32,39 @@ #include "Hashlib.h" #if defined(HAVE_SSE_2) -#include "Intrinsics.h" + #include "Intrinsics.h" #endif //------------------------------------------------------------ alignas(16) const static uint32_t coeffs[12] = { - /* Four carefully selected coefficients and interleaving zeros. */ - 0x98b365a1, 0, 0x52c69cab, 0, - 0xb76a9a41, 0, 0xcc4d2c7b, 0, - /* 128 bits of random data. */ - 0x564a4447, 0xc7265595, 0xe20c241d, 0x128fa608, + /* Four carefully selected coefficients and interleaving zeros. */ + 0x98b365a1, 0, 0x52c69cab, 0, + 0xb76a9a41, 0, 0xcc4d2c7b, 0, + /* 128 bits of random data. */ + 0x564a4447, 0xc7265595, 0xe20c241d, 0x128fa608, }; //------------------------------------------------------------ // Portable implementation of the hash -static void combine_and_mix(uint64_t state[4], const uint64_t input[2]) { - /* Phase 1: Perform four 32x32->64 bit multiplication with the - input block and words 1 and 3 coeffs, respectively. This - effectively propagates a bit change in input to 32 more - significant bit positions. Combine into internal state by - subtracting the result of multiplications from the internal - state. 
*/ +static void combine_and_mix( uint64_t state[4], const uint64_t input[2] ) { + /* + * Phase 1: Perform four 32x32->64 bit multiplication with the + * input block and words 1 and 3 coeffs, respectively. This + * effectively propagates a bit change in input to 32 more + * significant bit positions. Combine into internal state by + * subtracting the result of multiplications from the internal + * state. + */ state[0] -= ((uint64_t)(coeffs[0])) * (input[1] & 0xffffffff); - state[1] -= ((uint64_t)(coeffs[2])) * (input[1] >> 32); + state[1] -= ((uint64_t)(coeffs[2])) * (input[1] >> 32); state[2] -= ((uint64_t)(coeffs[4])) * (input[0] & 0xffffffff); - state[3] -= ((uint64_t)(coeffs[6])) * (input[0] >> 32); + state[3] -= ((uint64_t)(coeffs[6])) * (input[0] >> 32); - /* Phase 2: Perform shifts and xors to propagate the 32-bit - changes produced above into 64-bit (and even a little larger) - changes in the internal state. */ + /* + * Phase 2: Perform shifts and xors to propagate the 32-bit + * changes produced above into 64-bit (and even a little larger) + * changes in the internal state. + */ /* state ^= state >64> 29; */ /* state +64= state <64< 16; */ /* state ^= state >64> 21; */ @@ -74,61 +78,68 @@ static void combine_and_mix(uint64_t state[4], const uint64_t input[2]) { state[3] += (state[3] << 32) + (state[2] >> 32); state[2] += (state[2] << 32); - /* Phase 3: Propagate the changes among the four 64-bit words by - performing 64-bit subtractions and 32-bit word shuffling. */ - state[0] -= state[2]; - state[1] -= state[3]; + /* + * Phase 3: Propagate the changes among the four 64-bit words by + * performing 64-bit subtractions and 32-bit word shuffling. 
+ */ + state[0] -= state [2]; + state[1] -= state [3]; uint64_t tmp; - tmp = state[2]; + tmp = state [2]; state[2] = ((state[2] >> 32) + (state[3] << 32)) - state[0]; state[3] = ((state[3] >> 32) + (tmp << 32)) - state[1]; - tmp = state[1]; + tmp = state [1]; state[1] = ((state[0] >> 32) + (state[0] << 32)) - state[3]; state[0] = tmp - state[2]; - tmp = state[2]; + tmp = state [2]; state[2] = ((state[3] >> 32) + (state[2] << 32)) - state[0]; state[3] = ((tmp >> 32) + (state[3] << 32)) - state[1]; - tmp = state[0]; + tmp = state [0]; state[0] = ((state[1] >> 32) + (state[0] << 32)) - state[2]; state[1] = ((tmp >> 32) + (state[1] << 32)) - state[3]; - /* With good coefficients any one-bit flip in the input has now - changed all bits in the internal state with a probability - between 45% to 55%. */ + /* + * With good coefficients any one-bit flip in the input has now + * changed all bits in the internal state with a probability + * between 45% to 55%. + */ } -template < bool orig, bool bswap > -static void hasshe2_portable(const uint8_t * input_buf, size_t n_bytes, uint64_t seed, void *output_state) { +template +static void hasshe2_portable( const uint8_t * input_buf, size_t n_bytes, uint64_t seed, void * output_state ) { uint64_t state[4]; uint64_t input[2]; uint64_t seed2 = orig ? seed : (seed + (uint64_t)n_bytes); - /* Initialize internal state to something random. (Alternatively, - if hashing a chain of data, read in the previous hash result from - somewhere.) - - Seeding is homegrown for SMHasher3 - */ - state[0] = coeffs[ 8] + (((uint64_t)coeffs[ 9]) << 32); - state[1] = coeffs[10] + (((uint64_t)coeffs[11]) << 32); + /* + * Initialize internal state to something random. (Alternatively, + * if hashing a chain of data, read in the previous hash result from + * somewhere.) 
+ * + * Seeding is homegrown for SMHasher3 + */ + state[0] = coeffs[ 8] + (((uint64_t)coeffs[ 9]) << 32); + state[1] = coeffs[10] + (((uint64_t)coeffs[11]) << 32); state[0] ^= seed; state[1] ^= seed2; - state[2] = state[0]; - state[3] = state[1]; + state[2] = state[0]; + state[3] = state[1]; while (n_bytes >= 16) { - /* Read in 16 bytes, or 128 bits, from buf. Advance buf and - decrement n_bytes accordingly. */ + /* + * Read in 16 bytes, or 128 bits, from buf. Advance buf and + * decrement n_bytes accordingly. + */ for (int i = 0; i < 2; i++) { - input[i] = GET_U64(input_buf, i*8); + input[i] = GET_U64(input_buf, i * 8); } input_buf += 16; - n_bytes -= 16; + n_bytes -= 16; combine_and_mix(state, input); } @@ -137,15 +148,17 @@ static void hasshe2_portable(const uint8_t * input_buf, size_t n_bytes, uint64_t memcpy(buf, input_buf, n_bytes); memset(buf + n_bytes, 0, 16 - n_bytes); for (int i = 0; i < 2; i++) { - input[i] = GET_U64(buf, i*8); + input[i] = GET_U64(buf, i * 8); } combine_and_mix(state, input); } - /* Postprocessing. Copy half of the internal state into fake input, - replace it with the constant rnd_data, and do one combine and mix - phase more. */ + /* + * Postprocessing. Copy half of the internal state into fake input, + * replace it with the constant rnd_data, and do one combine and mix + * phase more. + */ input[0] = state[0]; input[1] = state[1]; state[0] = coeffs[ 8] + (((uint64_t)coeffs[ 9]) << 32); @@ -153,7 +166,7 @@ static void hasshe2_portable(const uint8_t * input_buf, size_t n_bytes, uint64_t combine_and_mix(state, input); for (int i = 0; i < 4; i++) { - PUT_U64(state[i], (uint8_t *)output_state, i*8); + PUT_U64(state[i], (uint8_t *)output_state, i * 8); } } @@ -197,97 +210,104 @@ static void hasshe2_portable(const uint8_t * input_buf, size_t n_bytes, uint64_t changed all bits in the internal state with a probability \ between 45% to 55%. 
*/ -template < bool orig, bool bswap > -static void hasshe2_sse2(const uint8_t * input_buf, size_t n_bytes, uint64_t seed, void *output_state) { - __m128i coeffs_1, coeffs_2, rnd_data, seed_xmm, input, state_1, state_2; - coeffs_1 = _mm_load_si128((__m128i *) coeffs); - coeffs_2 = _mm_load_si128((__m128i *) (coeffs + 4)); - rnd_data = _mm_load_si128((__m128i *) (coeffs + 8)); - seed_xmm = _mm_set_epi64x(orig ? seed : (seed + n_bytes), seed); - - /* Initialize internal state to something random. (Alternatively, - if hashing a chain of data, read in the previous hash result from - somewhere.) - - Seeding is homegrown for SMHasher3 - */ - state_1 = state_2 = _mm_xor_si128(rnd_data, seed_xmm); - - while (n_bytes >= 16) { - /* Read in 16 bytes, or 128 bits, from buf. Advance buf and - decrement n_bytes accordingly. */ - input = _mm_loadu_si128((__m128i *) input_buf); - if (bswap) { input = mm_bswap64(input); } - input_buf += 16; - n_bytes -= 16; - - COMBINE_AND_MIX(coeffs_1, coeffs_2, state_1, state_2, input); - } - if (n_bytes > 0) { - alignas(16) uint8_t buf[16]; - memcpy(buf, input_buf, n_bytes); - memset(buf + n_bytes, 0, 16 - n_bytes); - input = _mm_load_si128((__m128i *) buf); - if (bswap) { input = mm_bswap64(input); } - COMBINE_AND_MIX(coeffs_1, coeffs_2, state_1, state_2, input); - } - - /* Postprocessing. Copy half of the internal state into fake input, - replace it with the constant rnd_data, and do one combine and mix - phase more. 
*/ - input = state_1; - state_1 = rnd_data; - - COMBINE_AND_MIX(coeffs_1, coeffs_2, state_1, state_2, input); - - if (bswap) { - state_1 = mm_bswap64(state_1); - state_2 = mm_bswap64(state_2); - } - _mm_storeu_si128((__m128i *)output_state, state_1); - _mm_storeu_si128((__m128i *)((char*)output_state + 16), state_2); +template +static void hasshe2_sse2( const uint8_t * input_buf, size_t n_bytes, uint64_t seed, void * output_state ) { + __m128i coeffs_1, coeffs_2, rnd_data, seed_xmm, input, state_1, state_2; + + coeffs_1 = _mm_load_si128((__m128i *)coeffs ); + coeffs_2 = _mm_load_si128((__m128i *)(coeffs + 4)); + rnd_data = _mm_load_si128((__m128i *)(coeffs + 8)); + seed_xmm = _mm_set_epi64x(orig ? seed : (seed + n_bytes), seed); + + /* + * Initialize internal state to something random. (Alternatively, + * if hashing a chain of data, read in the previous hash result from + * somewhere.) + * + * Seeding is homegrown for SMHasher3 + */ + state_1 = state_2 = _mm_xor_si128(rnd_data, seed_xmm); + + while (n_bytes >= 16) { + /* + * Read in 16 bytes, or 128 bits, from buf. Advance buf and + * decrement n_bytes accordingly. + */ + input = _mm_loadu_si128((__m128i *)input_buf); + if (bswap) { input = mm_bswap64(input); } + input_buf += 16; + n_bytes -= 16; + + COMBINE_AND_MIX(coeffs_1, coeffs_2, state_1, state_2, input); + } + if (n_bytes > 0) { + alignas(16) uint8_t buf[16]; + memcpy(buf, input_buf, n_bytes); + memset(buf + n_bytes, 0, 16 - n_bytes); + input = _mm_load_si128((__m128i *)buf); + if (bswap) { input = mm_bswap64(input); } + COMBINE_AND_MIX(coeffs_1, coeffs_2, state_1, state_2, input); + } + + /* + * Postprocessing. Copy half of the internal state into fake input, + * replace it with the constant rnd_data, and do one combine and mix + * phase more. 
+ */ + input = state_1; + state_1 = rnd_data; + + COMBINE_AND_MIX(coeffs_1, coeffs_2, state_1, state_2, input); + + if (bswap) { + state_1 = mm_bswap64(state_1); + state_2 = mm_bswap64(state_2); + } + _mm_storeu_si128((__m128i *)output_state, state_1); + _mm_storeu_si128((__m128i *)((char *)output_state + 16), state_2); } + #endif -template < bool orig, bool bswap > -static void Hasshe2(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void Hasshe2( const void * in, const size_t len, const seed_t seed, void * out ) { #if defined(HAVE_SSE_2) - hasshe2_sse2((const uint8_t *)in, len, (uint64_t)seed, out); + hasshe2_sse2((const uint8_t *)in, len, (uint64_t)seed, out); #else - hasshe2_portable((const uint8_t *)in, len, (uint64_t)seed, out); + hasshe2_portable((const uint8_t *)in, len, (uint64_t)seed, out); #endif } REGISTER_FAMILY(hasshe2, - $.src_url = "http://cessu.blogspot.com/2008/11/hashing-with-sse2-revisited-or-my-hash.html", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "http://cessu.blogspot.com/2008/11/hashing-with-sse2-revisited-or-my-hash.html", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(hasshe2, - $.desc = "hasshe2 (SSE2-oriented hash)", - $.hash_flags = - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 256, - $.verification_LE = 0x68CBC5F1, - $.verification_BE = 0x562ECEB4, - $.hashfn_native = Hasshe2, - $.hashfn_bswap = Hasshe2 -); + $.desc = "hasshe2 (SSE2-oriented hash)", + $.hash_flags = + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 256, + $.verification_LE = 0x68CBC5F1, + $.verification_BE = 0x562ECEB4, + $.hashfn_native = Hasshe2, + $.hashfn_bswap = Hasshe2 + ); REGISTER_HASH(hasshe2__tweaked, - $.desc = "hasshe2 (SSE2-oriented hash, tweaked to add len into IV)", - $.hash_flags = - FLAG_HASH_NO_SEED, - 
$.impl_flags = - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 256, - $.verification_LE = 0xBAF6B1BF, - $.verification_BE = 0x35A87D75, - $.hashfn_native = Hasshe2, - $.hashfn_bswap = Hasshe2 -); + $.desc = "hasshe2 (SSE2-oriented hash, tweaked to add len into IV)", + $.hash_flags = + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 256, + $.verification_LE = 0xBAF6B1BF, + $.verification_BE = 0x35A87D75, + $.hashfn_native = Hasshe2, + $.hashfn_bswap = Hasshe2 + ); diff --git a/hashes/jodyhash.cpp b/hashes/jodyhash.cpp index a926e55f..9d4e8dfd 100644 --- a/hashes/jodyhash.cpp +++ b/hashes/jodyhash.cpp @@ -51,45 +51,45 @@ static const uint32_t tail_mask_32[] = { //------------------------------------------------------------ // Version increments when algorithm changes incompatibly -//#define JODY_HASH_VERSION 5 +// #define JODY_HASH_VERSION 5 #define JODY_HASH_CONSTANT UINT32_C(0x1f3d5b79) #define JODY_HASH_SHIFT 14 -template < typename T, bool bswap > -static T jody_block_hash(const uint8_t * RESTRICT data, const size_t count, const T start_hash) { +template +static T jody_block_hash( const uint8_t * RESTRICT data, const size_t count, const T start_hash ) { T hash = start_hash; T element; T partial_salt; const T * const tail_mask = (sizeof(T) == 4) ? - (const T *)tail_mask_32 : (const T *)tail_mask_64; + (const T *)tail_mask_32 : (const T *)tail_mask_64; size_t len; /* Don't bother trying to hash a zero-length block */ - if (count == 0) return hash; + if (count == 0) { return hash; } len = count / sizeof(T); for (; len > 0; len--) { element = (sizeof(T) == 4) ? 
- GET_U32(data, 0) : GET_U64(data, 0) ; - hash += element; - hash += JODY_HASH_CONSTANT; + GET_U32(data, 0) : GET_U64(data, 0); + hash += element; + hash += JODY_HASH_CONSTANT; /* bit rotate left */ - hash = (hash << JODY_HASH_SHIFT) | hash >> (sizeof(T) * 8 - JODY_HASH_SHIFT); - hash ^= element; + hash = (hash << JODY_HASH_SHIFT) | hash >> (sizeof(T) * 8 - JODY_HASH_SHIFT); + hash ^= element; /* bit rotate left */ - hash = (hash << JODY_HASH_SHIFT) | hash >> (sizeof(T) * 8 - JODY_HASH_SHIFT); - hash ^= JODY_HASH_CONSTANT; - hash += element; - data += sizeof(T); + hash = (hash << JODY_HASH_SHIFT) | hash >> (sizeof(T) * 8 - JODY_HASH_SHIFT); + hash ^= JODY_HASH_CONSTANT; + hash += element; + data += sizeof(T); } /* Handle data tail (for blocks indivisible by sizeof(T)) */ len = count & (sizeof(T) - 1); if (len) { partial_salt = JODY_HASH_CONSTANT & tail_mask[len]; - element = (sizeof(T) == 4) ? - GET_U32(data, 0) : GET_U64(data, 0) ; + element = (sizeof(T) == 4) ? + GET_U32(data, 0) : GET_U64(data, 0); if (isLE() ^ bswap) { element &= tail_mask[len]; } else { @@ -97,9 +97,9 @@ static T jody_block_hash(const uint8_t * RESTRICT data, const size_t count, cons } hash += element; hash += partial_salt; - hash = (hash << JODY_HASH_SHIFT) | hash >> (sizeof(T) * 8 - JODY_HASH_SHIFT); + hash = (hash << JODY_HASH_SHIFT) | hash >> (sizeof(T) * 8 - JODY_HASH_SHIFT); hash ^= element; - hash = (hash << JODY_HASH_SHIFT) | hash >> (sizeof(T) * 8 - JODY_HASH_SHIFT); + hash = (hash << JODY_HASH_SHIFT) | hash >> (sizeof(T) * 8 - JODY_HASH_SHIFT); hash ^= partial_salt; hash += element; } @@ -108,51 +108,53 @@ static T jody_block_hash(const uint8_t * RESTRICT data, const size_t count, cons } //------------------------------------------------------------ -template < bool bswap > -static void jodyhash32(const void * in, const size_t len, const seed_t seed, void * out) { - uint32_t h = jody_block_hash((const uint8_t *)in, len, (uint32_t)seed); +template +static void jodyhash32( const void * 
in, const size_t len, const seed_t seed, void * out ) { + uint32_t h = jody_block_hash((const uint8_t *)in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void jodyhash64(const void * in, const size_t len, const seed_t seed, void * out) { - uint64_t h = jody_block_hash((const uint8_t *)in, len, (uint64_t)seed); +template +static void jodyhash64( const void * in, const size_t len, const seed_t seed, void * out ) { + uint64_t h = jody_block_hash((const uint8_t *)in, len, (uint64_t)seed); + PUT_U64(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(jodyhash, - $.src_url = "https://github.com/jbruchon/jodyhash", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/jbruchon/jodyhash", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); REGISTER_HASH(jodyhash_32, - $.desc = "jodyhash v5, 32-bit", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0xFB47D60D, - $.verification_BE = 0xB94C9789, - $.hashfn_native = jodyhash32, - $.hashfn_bswap = jodyhash32 -); + $.desc = "jodyhash v5, 32-bit", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0xFB47D60D, + $.verification_BE = 0xB94C9789, + $.hashfn_native = jodyhash32, + $.hashfn_bswap = jodyhash32 + ); REGISTER_HASH(jodyhash_64, - $.desc = "jodyhash v5, 64-bit", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | // appending zero bytes might not alter hash! 
- FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x9F09E57F, - $.verification_BE = 0xF9CDDA2C, - $.hashfn_native = jodyhash64, - $.hashfn_bswap = jodyhash64 -); + $.desc = "jodyhash v5, 64-bit", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS |// appending zero bytes might not alter hash! + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x9F09E57F, + $.verification_BE = 0xF9CDDA2C, + $.hashfn_native = jodyhash64, + $.hashfn_bswap = jodyhash64 + ); diff --git a/hashes/khash.cpp b/hashes/khash.cpp index 3033f0a8..54759241 100644 --- a/hashes/khash.cpp +++ b/hashes/khash.cpp @@ -32,8 +32,9 @@ // "khash" is really *only* these two mathematical functions. // khash64_fn maps 2 64-bit inputs to a 64-bit output, // and khash32_fn maps 3 32-bit inputs to a 32-bit output. -static inline uint64_t khash64_fn(uint64_t input, uint64_t func) { +static inline uint64_t khash64_fn( uint64_t input, uint64_t func ) { uint64_t h = func; + h ^= input - 7; h ^= ROTR64(h, 31); h -= ROTR64(h, 11); @@ -46,35 +47,36 @@ static inline uint64_t khash64_fn(uint64_t input, uint64_t func) { h ^= input - 2; h -= ROTR64(h, 19); - h += ROTR64(h, 5); + h += ROTR64(h, 5); h -= ROTR64(h, 31); return h; } -static inline uint32_t khash32_fn(uint32_t input, uint32_t func1, uint32_t func2) { +static inline uint32_t khash32_fn( uint32_t input, uint32_t func1, uint32_t func2 ) { uint32_t h = input; + h = ROTR32(h, 16); h ^= func2; h -= 5; h = ROTR32(h, 17); h += func1; - h = ROTR32(h, 1); + h = ROTR32(h, 1); h += ROTR32(h, 27); - h ^= ROTR32(h, 3); + h ^= ROTR32(h, 3); h -= ROTR32(h, 17); h -= ROTR32(h, 27); h ^= input - 107; h -= ROTR32(h, 11); - h ^= ROTR32(h, 7); - h -= ROTR32(h, 5); + h ^= ROTR32(h, 7); + h -= ROTR32(h, 5); return h; } // Just initialize with the fractional part of sqrt(2) -//#define khash64(input) khash64_fn(input, 0x6a09e667f3bcc908) -//#define 
khash32(input) khash32_fn(input, 0x6a09e667, 0xf3bcc908) +// #define khash64(input) khash64_fn(input, 0x6a09e667f3bcc908) +// #define khash32(input) khash32_fn(input, 0x6a09e667, 0xf3bcc908) //------------------------------------------------------------ // These hash functions operate on any amount of data, and hash it @@ -85,18 +87,19 @@ static inline uint32_t khash32_fn(uint32_t input, uint32_t func1, uint32_t func2 // handle 64-bit seeds but return the existing results when the high // 32 bits are zero, so that the verification value is unchanged. -template < bool bswap > -static void khash32(const void * in, const size_t len, const seed_t seed, void * out) { - uint32_t seedlo = (uint32_t)(seed); - uint32_t seedhi = (uint32_t)(seed >> 32); - uint32_t hash = ~seedlo; - const uint32_t K = UINT32_C(0xf3bcc908) ^ seedhi; +template +static void khash32( const void * in, const size_t len, const seed_t seed, void * out ) { + uint32_t seedlo = (uint32_t)(seed ); + uint32_t seedhi = (uint32_t)(seed >> 32); + uint32_t hash = ~seedlo; + const uint32_t K = UINT32_C(0xf3bcc908) ^ seedhi; const uint8_t * const endw = &((const uint8_t *)in)[len & ~3]; - uint8_t * dw = (uint8_t*)in; + uint8_t * dw = (uint8_t *)in; + while (dw < endw) { hash ^= khash32_fn(GET_U32(dw, 0), seed, K); - dw += 4; + dw += 4; } const size_t flen = len & 3; if (flen) { @@ -111,16 +114,17 @@ static void khash32(const void * in, const size_t len, const seed_t seed, void * PUT_U32(hash, (uint8_t *)out, 0); } -template < bool bswap > -static void khash64(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void khash64( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t seed64 = ((uint64_t)seed ^ UINT64_C(0x6a09e66700000000)); - uint64_t hash = ~seed64; + uint64_t hash = ~seed64; const uint8_t * const endw = &((const uint8_t *)in)[len & ~7]; - uint8_t * dw = (uint8_t*)in; + uint8_t * dw = (uint8_t *)in; + while (dw < endw) { hash ^= 
khash64_fn(GET_U64(dw, 0), seed64); - dw += 8; + dw += 8; } const size_t flen = len & 7; if (flen) { @@ -137,38 +141,38 @@ static void khash64(const void * in, const size_t len, const seed_t seed, void * //------------------------------------------------------------ REGISTER_FAMILY(khash, - $.src_url = "https://github.com/Keith-Cancel/k-hash", - $.src_status = HashFamilyInfo::SRC_ACTIVE -); + $.src_url = "https://github.com/Keith-Cancel/k-hash", + $.src_status = HashFamilyInfo::SRC_ACTIVE + ); REGISTER_HASH(khash_32, - $.desc = "K-Hash 32 bit mixer-based hash", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0xA17DA29E, - $.verification_BE = 0x59073F57, - $.hashfn_native = khash32, - $.hashfn_bswap = khash32 -); + $.desc = "K-Hash 32 bit mixer-based hash", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0xA17DA29E, + $.verification_BE = 0x59073F57, + $.hashfn_native = khash32, + $.hashfn_bswap = khash32 + ); REGISTER_HASH(khash_64, - $.desc = "K-Hash 64 bit mixer-based hash", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x44BD88C4, - $.verification_BE = 0xCF3003D1, - $.hashfn_native = khash64, - $.hashfn_bswap = khash64 -); + $.desc = "K-Hash 64 bit mixer-based hash", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x44BD88C4, + $.verification_BE = 0xCF3003D1, + $.hashfn_native = khash64, + $.hashfn_bswap = khash64 + ); diff --git a/hashes/komihash.cpp b/hashes/komihash.cpp index 2e63dd8d..49a902ef 100644 --- a/hashes/komihash.cpp +++ b/hashes/komihash.cpp 
@@ -29,6 +29,7 @@ #include "Mathmult.h" //------------------------------------------------------------ + /* * Function builds an unsigned 64-bit value out of remaining bytes in a * message, and pads it with the "final byte". This function can only be @@ -39,26 +40,25 @@ * @param MsgLen Message's remaining length, in bytes; can be 0. * @param fb Final byte used for padding. */ -template < bool bswap > -static inline uint64_t kh_lpu64ec_l3(const uint8_t* const Msg, - const size_t MsgLen, uint64_t fb) { +template +static inline uint64_t kh_lpu64ec_l3( const uint8_t * const Msg, const size_t MsgLen, uint64_t fb ) { if (MsgLen < 4) { - const uint8_t* const Msg3 = Msg + MsgLen - 3; - const int ml8 = (int) (MsgLen << 3); - const uint64_t m = (uint64_t) Msg3[ 0 ] | (uint64_t) Msg3[ 1 ] << 8 | - (uint64_t) Msg3[ 2 ] << 16; + const uint8_t * const Msg3 = Msg + MsgLen - 3; + const int ml8 = (int)(MsgLen << 3); + const uint64_t m = (uint64_t)Msg3[0] | (uint64_t)Msg3[1] << 8 | + (uint64_t)Msg3[2] << 16; - return(fb << ml8 | m >> (24 - ml8)); + return fb << ml8 | m >> (24 - ml8); } - const int ml8 = (int) (MsgLen << 3); - const uint64_t mh = GET_U32(Msg + MsgLen - 4, 0); - const uint64_t ml = GET_U32(Msg, 0); + const int ml8 = (int)(MsgLen << 3); + const uint64_t mh = GET_U32(Msg + MsgLen - 4, 0); + const uint64_t ml = GET_U32(Msg , 0); if (isLE() ^ bswap) { - return(fb << ml8 | ml | (mh >> (64 - ml8)) << 32); + return fb << ml8 | ml | (mh >> (64 - ml8)) << 32; } else { - return(fb << ml8 | mh | (ml >> (64 - ml8)) << 32); + return fb << ml8 | mh | (ml >> (64 - ml8)) << 32; } } @@ -72,34 +72,33 @@ static inline uint64_t kh_lpu64ec_l3(const uint8_t* const Msg, * @param MsgLen Message's remaining length, in bytes; cannot be 0. * @param fb Final byte used for padding. 
*/ -template < bool bswap > -static inline uint64_t kh_lpu64ec_nz(const uint8_t* const Msg, - const size_t MsgLen, uint64_t fb) { +template +static inline uint64_t kh_lpu64ec_nz( const uint8_t * const Msg, const size_t MsgLen, uint64_t fb ) { if (MsgLen < 4) { fb <<= (MsgLen << 3); - uint64_t m = Msg[ 0 ]; + uint64_t m = Msg[0]; if (MsgLen > 1) { - m |= (uint64_t) Msg[ 1 ] << 8; + m |= (uint64_t)Msg[1] << 8; if (MsgLen > 2) { - m |= (uint64_t) Msg[ 2 ] << 16; + m |= (uint64_t)Msg[2] << 16; } } - return(fb | m); + return fb | m; } - const int ml8 = (int) (MsgLen << 3); - const uint64_t mh = GET_U32(Msg + MsgLen - 4, 0); - const uint64_t ml = GET_U32(Msg, 0); + const int ml8 = (int)(MsgLen << 3); + const uint64_t mh = GET_U32(Msg + MsgLen - 4, 0); + const uint64_t ml = GET_U32(Msg , 0); if (isLE() ^ bswap) { // mh has remaining bytes from MSB, so shift off low bits - return (fb << ml8 | ml | (mh >> (64 - ml8)) << 32); + return fb << ml8 | ml | (mh >> (64 - ml8)) << 32; } else { // mh has remaining bytes from LSB, so shift off high bits - return (fb << ml8 | mh | (ml >> (64 - ml8)) << 32); + return fb << ml8 | mh | (ml >> (64 - ml8)) << 32; } } @@ -113,37 +112,36 @@ static inline uint64_t kh_lpu64ec_nz(const uint8_t* const Msg, * @param MsgLen Message's remaining length, in bytes; can be 0. * @param fb Final byte used for padding. */ -template < bool bswap > -static inline uint64_t kh_lpu64ec_l4(const uint8_t* const Msg, - const size_t MsgLen, uint64_t fb) { - const int ml8 = (int) (MsgLen << 3); +template +static inline uint64_t kh_lpu64ec_l4( const uint8_t * const Msg, const size_t MsgLen, uint64_t fb ) { + const int ml8 = (int)(MsgLen << 3); if (MsgLen < 5) { if (isLE() ^ bswap) { - return(fb << ml8 | - ((uint64_t)GET_U32(Msg + MsgLen - 4, 0)) >> (32 - ml8)); + return fb << ml8 | + ((uint64_t)GET_U32(Msg + MsgLen - 4, 0)) >> (32 - ml8); } else { // If MsgLen is 0 then "32 - ml8" is 32, and a uint32_t // shifted right by 32 bits is Undefined Behavior. 
This // odd construction avoids that. - return(fb << ml8 | - (((uint64_t)GET_U32(Msg + MsgLen - 4, 0)) & - (((uint64_t)UINT32_C(-1)) >> (32 - ml8)))); + return fb << ml8 | + (((uint64_t)GET_U32(Msg + MsgLen - 4, 0)) & + (((uint64_t)UINT32_C(-1)) >> (32 - ml8))); } } else { if (isLE() ^ bswap) { - return(fb << ml8 | GET_U64(Msg + MsgLen - 8, 0) >> (64 - ml8)); + return fb << ml8 | GET_U64(Msg + MsgLen - 8, 0) >> (64 - ml8); } else { - return(fb << ml8 | (GET_U64(Msg + MsgLen - 8, 0) & (UINT64_C(-1) >> (64 - ml8)))); + return fb << ml8 | (GET_U64(Msg + MsgLen - 8, 0) & (UINT64_C(-1) >> (64 - ml8))); } } } //------------------------------------------------------------ // Wrapper around Mathmult.h routine -static inline void kh_m128(const uint64_t m1, const uint64_t m2, - uint64_t* const rl, uint64_t* const rh) { +static inline void kh_m128( const uint64_t m1, const uint64_t m2, uint64_t * const rl, uint64_t * const rh ) { uint64_t rlo, rhi; + mult64_128(rlo, rhi, m1, m2); *rl = rlo; *rh = rhi; @@ -151,29 +149,31 @@ static inline void kh_m128(const uint64_t m1, const uint64_t m2, // Common hashing round with 16-byte input, using the "r1l" and "r1h" // temporary variables. -#define KOMIHASH_HASH16(m) \ - kh_m128(Seed1 ^ GET_U64(m, 0), \ - Seed5 ^ GET_U64(m, 8), &r1l, &r1h); \ - Seed5 += r1h; \ +#define KOMIHASH_HASH16(m) \ + kh_m128(Seed1 ^ GET_U64(m, 0), \ + Seed5 ^ GET_U64(m, 8), &r1l, &r1h); \ + Seed5 += r1h; \ Seed1 = Seed5 ^ r1l; // Common hashing round without input, using the "r2l" and "r2h" temporary // variables. -#define KOMIHASH_HASHROUND() \ - kh_m128(Seed1, Seed5, &r2l, &r2h); \ - Seed5 += r2h; \ +#define KOMIHASH_HASHROUND() \ + kh_m128(Seed1, Seed5, &r2l, &r2h); \ + Seed5 += r2h; \ Seed1 = Seed5 ^ r2l; // Common hashing finalization round, with the final hashing input expected in // the "r2l" and "r2h" temporary variables. 
-#define KOMIHASH_HASHFIN() \ - kh_m128(r2l, r2h, &r1l, &r1h); \ - Seed5 += r1h; \ - Seed1 = Seed5 ^ r1l; \ +#define KOMIHASH_HASHFIN() \ + kh_m128(r2l, r2h, &r1l, &r1h); \ + Seed5 += r1h; \ + Seed1 = Seed5 ^ r1l; \ KOMIHASH_HASHROUND(); //------------------------------------------------------------ + // KOMIHASH hash function + /* * @param Msg0 The message to produce a hash from. The alignment of this * pointer is unimportant. @@ -184,10 +184,9 @@ static inline void kh_m128(const uint64_t m1, const uint64_t m2, * need endianness-correction if this value is shared between big- and * little-endian systems. */ -template < bool bswap > -static inline uint64_t komihash_impl(const void* const Msg0, size_t MsgLen, - const uint64_t UseSeed) { - const uint8_t* Msg = (const uint8_t*) Msg0; +template +static inline uint64_t komihash_impl( const void * const Msg0, size_t MsgLen, const uint64_t UseSeed ) { + const uint8_t * Msg = (const uint8_t *)Msg0; // The seeds are initialized to the first mantissa bits of PI. uint64_t Seed1 = UINT64_C(0x243F6A8885A308D3) ^ (UseSeed & UINT64_C(0x5555555555555555)); @@ -226,18 +225,16 @@ static inline uint64_t komihash_impl(const void* const Msg0, size_t MsgLen, // addition). Message's statistics and distribution are thus // unimportant. 
- r2h ^= kh_lpu64ec_l3(Msg + 8, MsgLen - 8, - 1 << (Msg[ MsgLen - 1 ] >> 7)); + r2h ^= kh_lpu64ec_l3(Msg + 8, MsgLen - 8, 1 << (Msg[MsgLen - 1] >> 7)); r2l ^= GET_U64(Msg, 0); } else if (likely(MsgLen != 0)) { - r2l ^= kh_lpu64ec_nz(Msg, MsgLen, - 1 << (Msg[ MsgLen - 1 ] >> 7)); + r2l ^= kh_lpu64ec_nz(Msg , MsgLen , 1 << (Msg[MsgLen - 1] >> 7)); } KOMIHASH_HASHFIN(); - return (Seed1); + return Seed1; } if (likely(MsgLen < 32)) { @@ -249,7 +246,7 @@ static inline uint64_t komihash_impl(const void* const Msg0, size_t MsgLen, if (MsgLen > 23) { r2h = Seed5 ^ kh_lpu64ec_l4(Msg + 24, MsgLen - 24, fb); - r2l = Seed1 ^ GET_U64(Msg, 16); + r2l = Seed1 ^ GET_U64 (Msg, 16); } else { r2l = Seed1 ^ kh_lpu64ec_l4(Msg + 16, MsgLen - 16, fb); r2h = Seed5; @@ -257,7 +254,7 @@ static inline uint64_t komihash_impl(const void* const Msg0, size_t MsgLen, KOMIHASH_HASHFIN(); - return (Seed1); + return Seed1; } if (MsgLen > 63) { @@ -272,19 +269,15 @@ static inline uint64_t komihash_impl(const void* const Msg0, size_t MsgLen, do { prefetch(Msg); - kh_m128(Seed1 ^ GET_U64(Msg, 0), - Seed5 ^ GET_U64(Msg, 8), &r1l, &r1h); + kh_m128(Seed1 ^ GET_U64(Msg, 0) , Seed5 ^ GET_U64(Msg, 8) , &r1l, &r1h); - kh_m128(Seed2 ^ GET_U64(Msg, 16), - Seed6 ^ GET_U64(Msg, 24), &r2l, &r2h); + kh_m128(Seed2 ^ GET_U64(Msg, 16), Seed6 ^ GET_U64(Msg, 24), &r2l, &r2h); - kh_m128(Seed3 ^ GET_U64(Msg, 32), - Seed7 ^ GET_U64(Msg, 40), &r3l, &r3h); + kh_m128(Seed3 ^ GET_U64(Msg, 32), Seed7 ^ GET_U64(Msg, 40), &r3l, &r3h); - kh_m128(Seed4 ^ GET_U64(Msg, 48), - Seed8 ^ GET_U64(Msg, 56), &r4l, &r4h); + kh_m128(Seed4 ^ GET_U64(Msg, 48), Seed8 ^ GET_U64(Msg, 56), &r4l, &r4h); - Msg += 64; + Msg += 64; MsgLen -= 64; // Such "shifting" arrangement (below) does not increase @@ -298,11 +291,10 @@ static inline uint64_t komihash_impl(const void* const Msg0, size_t MsgLen, Seed6 += r2h; Seed7 += r3h; Seed8 += r4h; - Seed2 = Seed5 ^ r2l; - Seed3 = Seed6 ^ r3l; - Seed4 = Seed7 ^ r4l; - Seed1 = Seed8 ^ r1l; - + Seed2 = Seed5 ^ r2l; 
+ Seed3 = Seed6 ^ r3l; + Seed4 = Seed7 ^ r4l; + Seed1 = Seed8 ^ r1l; } while (likely(MsgLen > 63)); Seed5 ^= Seed6 ^ Seed7 ^ Seed8; @@ -312,25 +304,25 @@ static inline uint64_t komihash_impl(const void* const Msg0, size_t MsgLen, prefetch(Msg); if (likely(MsgLen > 31)) { - KOMIHASH_HASH16(Msg); + KOMIHASH_HASH16(Msg ); KOMIHASH_HASH16(Msg + 16); - Msg += 32; + Msg += 32; MsgLen -= 32; } if (MsgLen > 15) { KOMIHASH_HASH16(Msg); - Msg += 16; + Msg += 16; MsgLen -= 16; } - const uint64_t fb = 1 << (Msg[ MsgLen - 1 ] >> 7); + const uint64_t fb = 1 << (Msg[MsgLen - 1] >> 7); if (MsgLen > 7) { r2h = Seed5 ^ kh_lpu64ec_l4(Msg + 8, MsgLen - 8, fb); - r2l = Seed1 ^ GET_U64(Msg, 0); + r2l = Seed1 ^ GET_U64 (Msg, 0); } else { r2l = Seed1 ^ kh_lpu64ec_l4(Msg, MsgLen, fb); r2h = Seed5; @@ -338,34 +330,35 @@ static inline uint64_t komihash_impl(const void* const Msg0, size_t MsgLen, KOMIHASH_HASHFIN(); - return (Seed1); + return Seed1; } //------------------------------------------------------------ -template < bool bswap > -static void komihash(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void komihash( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = komihash_impl(in, len, (uint64_t)seed); + PUT_U64(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(komihash, - $.src_url = "https://github.com/avaneev/komihash/", - $.src_status = HashFamilyInfo::SRC_ACTIVE -); + $.src_url = "https://github.com/avaneev/komihash/", + $.src_status = HashFamilyInfo::SRC_ACTIVE + ); REGISTER_HASH(komihash, - $.desc = "komihash v4.3", - $.hash_flags = - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_SHIFT_VARIABLE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x703624A4, - $.verification_BE = 0xB954DBAB, - $.hashfn_native = komihash, - $.hashfn_bswap = komihash -); + $.desc = "komihash 
v4.3", + $.hash_flags = + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_SHIFT_VARIABLE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x703624A4, + $.verification_BE = 0xB954DBAB, + $.hashfn_native = komihash, + $.hashfn_bswap = komihash + ); diff --git a/hashes/lookup3.cpp b/hashes/lookup3.cpp index dcdba501..0aa5edc7 100644 --- a/hashes/lookup3.cpp +++ b/hashes/lookup3.cpp @@ -8,8 +8,8 @@ #include "Hashlib.h" //------------------------------------------------------------ -#define mix(a,b,c) \ -{ \ +#define mix(a,b,c) \ +{ \ a -= c; a ^= ROTL32(c, 4); c += b; \ b -= a; b ^= ROTL32(a, 6); a += c; \ c -= b; c ^= ROTL32(b, 8); b += a; \ @@ -18,8 +18,8 @@ c -= b; c ^= ROTL32(b, 4); b += a; \ } -#define final(a,b,c) \ -{ \ +#define final(a,b,c) \ +{ \ c ^= b; c -= ROTL32(b,14); \ a ^= c; a -= ROTL32(c,11); \ b ^= a; b -= ROTL32(a,25); \ @@ -29,87 +29,87 @@ c ^= b; c -= ROTL32(b,24); \ } -template < bool hash64, bool bswap > -static void hashlittle(const uint8_t * key, size_t length, uint64_t seed64, uint8_t * out) { - uint32_t a,b,c; /* internal state */ +template +static void hashlittle( const uint8_t * key, size_t length, uint64_t seed64, uint8_t * out ) { + uint32_t a, b, c; /* internal state */ - /* Set up the internal state */ - a = b = c = 0xdeadbeef + ((uint32_t)length) + ((uint32_t)seed64); - c += (uint32_t)(seed64 >> 32); + /* Set up the internal state */ + a = b = c = 0xdeadbeef + ((uint32_t)length) + ((uint32_t)seed64); + c += (uint32_t)(seed64 >> 32); - /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ - while (length > 12) { - a += GET_U32(key, 0); - b += GET_U32(key, 4); - c += GET_U32(key, 8); - mix(a,b,c); - length -= 12; - key += 12; - } + /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ + while (length > 12) { + a += GET_U32(key, 0); + b += GET_U32(key, 4); + c += GET_U32(key, 8); + mix(a, b, c); + length -= 12; + 
key += 12; + } - /*----------------------------- handle the last (probably partial) block */ - switch(length) { - case 12: c+=GET_U32(key, 8); - b+=GET_U32(key, 4); - a+=GET_U32(key, 0); break; - case 11: c+=((uint32_t)key[10])<<16; /* fall through */ - case 10: c+=((uint32_t)key[9])<<8; /* fall through */ - case 9 : c+=key[8]; /* fall through */ - case 8 : b+=GET_U32(key, 4); - a+=GET_U32(key, 0); break; - case 7 : b+=((uint32_t)key[6])<<16; /* fall through */ - case 6 : b+=((uint32_t)key[5])<<8; /* fall through */ - case 5 : b+=key[4]; /* fall through */ - case 4 : a+=GET_U32(key, 0); break; - case 3 : a+=((uint32_t)key[2])<<16; /* fall through */ - case 2 : a+=((uint32_t)key[1])<<8; /* fall through */ - case 1 : a+=key[0]; break; - case 0 : goto out; /* zero length strings require no more mixing */ - } + /*----------------------------- handle the last (probably partial) block */ + switch (length) { + case 12: c += GET_U32(key, 8); + b += GET_U32(key, 4); + a += GET_U32(key, 0); break; + case 11: c += ((uint32_t)key[10]) << 16; /* fall through */ + case 10: c += ((uint32_t)key[ 9]) << 8; /* fall through */ + case 9: c += key[8]; /* fall through */ + case 8: b += GET_U32(key, 4); + a += GET_U32(key, 0); break; + case 7: b += ((uint32_t)key[ 6]) << 16; /* fall through */ + case 6: b += ((uint32_t)key[ 5]) << 8; /* fall through */ + case 5: b += key[4]; /* fall through */ + case 4: a += GET_U32(key, 0); break; + case 3: a += ((uint32_t)key[ 2]) << 16; /* fall through */ + case 2: a += ((uint32_t)key[ 1]) << 8; /* fall through */ + case 1: a += key[0]; break; + case 0: goto out; /* zero length strings require no more mixing */ + } - final(a,b,c); + final (a, b, c); - out: - PUT_U32(c, out, 0); - if (hash64) { PUT_U32(b, out, 4); } + out: + PUT_U32(c, out, 0); + if (hash64) { PUT_U32(b, out, 4); } } //------------------------------------------------------------ -template < bool hash64, bool bswap > -static void lookup3(const void * in, const size_t len, const seed_t 
seed, void * out) { - hashlittle((const uint8_t *)in, len, (uint64_t)seed, (uint8_t *)out); +template +static void lookup3( const void * in, const size_t len, const seed_t seed, void * out ) { + hashlittle((const uint8_t *)in, len, (uint64_t)seed, (uint8_t *)out); } //------------------------------------------------------------ REGISTER_FAMILY(lookup3, - $.src_url = "http://www.burtleburtle.net/bob/c/lookup3.c", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "http://www.burtleburtle.net/bob/c/lookup3.c", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(lookup3__32, - $.desc = "Bob Jenkins' lookup3 (32-bit output)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 32, - $.verification_LE = 0x3D83917A, - $.verification_BE = 0x18E6AA76, - $.hashfn_native = lookup3, - $.hashfn_bswap = lookup3 -); + $.desc = "Bob Jenkins' lookup3 (32-bit output)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 32, + $.verification_LE = 0x3D83917A, + $.verification_BE = 0x18E6AA76, + $.hashfn_native = lookup3, + $.hashfn_bswap = lookup3 + ); REGISTER_HASH(lookup3, - $.desc = "Bob Jenkins' lookup3 (64-bit output)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0x6AE8AB7C, - $.verification_BE = 0x074EBE4E, - $.hashfn_native = lookup3, - $.hashfn_bswap = lookup3 -); + $.desc = "Bob Jenkins' lookup3 (64-bit output)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0x6AE8AB7C, + $.verification_BE = 0x074EBE4E, + $.hashfn_native = lookup3, + $.hashfn_bswap = lookup3 + ); diff --git a/hashes/md5.cpp b/hashes/md5.cpp index 4f3a557c..974d3c25 100644 --- a/hashes/md5.cpp +++ b/hashes/md5.cpp @@ -50,9 +50,9 @@ //----------------------------------------------------------------------------- 
// Raw MD5 implementation typedef struct { - uint32_t total[2]; /*!< number of bytes processed */ - uint32_t state[4]; /*!< intermediate digest state */ - uint8_t buffer[64]; /*!< data block being processed */ + uint32_t total[2]; /*!< number of bytes processed */ + uint32_t state[4]; /*!< intermediate digest state */ + uint8_t buffer[64]; /*!< data block being processed */ uint8_t ipad[64]; /*!< HMAC: inner padding */ uint8_t opad[64]; /*!< HMAC: outer padding */ @@ -61,7 +61,7 @@ typedef struct { /* * MD5 context setup */ -static void md5_start(md5_context * ctx) { +static void md5_start( md5_context * ctx ) { ctx->total[0] = 0; ctx->total[1] = 0; @@ -74,32 +74,32 @@ static void md5_start(md5_context * ctx) { /* * MD5 process single data block */ -template < bool bswap > -static void md5_process(md5_context * ctx, uint8_t data[64]) { +template +static void md5_process( md5_context * ctx, uint8_t data[64] ) { uint32_t X[16], A, B, C, D; /* * These macros will cache the converted input data when byteswapping * is requested, and will just read directly from data when possible. */ -#define CACHEBLK(k) (bswap ? \ - (X[k]=GET_U32(data, 4*(k))) : \ +#define CACHEBLK(k) (bswap ? \ + (X[k]=GET_U32(data, 4*(k))) : \ (GET_U32(data,4*(k)))) -#define GETBLK(k) (bswap ? \ - (X[k]) : \ +#define GETBLK(k) (bswap ? 
\ + (X[k]) : \ (GET_U32(data,4*(k)))) -#define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n))) +#define S(x, n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n))) -#define P(a,b,c,d,k,s,t) \ - { \ - a += F(b,c,d) + CACHEBLK(k) + t; a = S(a,s) + b; \ +#define P(a,b,c,d,k,s,t) \ + { \ + a += F(b,c,d) + CACHEBLK(k) + t; a = S(a,s) + b; \ } -#define Q(a,b,c,d,k,s,t) \ - { \ - a += F(b,c,d) + GETBLK(k) + t; a = S(a,s) + b; \ +#define Q(a,b,c,d,k,s,t) \ + { \ + a += F(b,c,d) + GETBLK(k) + t; a = S(a,s) + b; \ } A = ctx->state[0]; @@ -107,89 +107,89 @@ static void md5_process(md5_context * ctx, uint8_t data[64]) { C = ctx->state[2]; D = ctx->state[3]; -#define F(x,y,z) (z ^ (x & (y ^ z))) -//#define F(x,y,z) ((x & y) | (~x & z)) - - P( A, B, C, D, 0, 7, 0xD76AA478 ); - P( D, A, B, C, 1, 12, 0xE8C7B756 ); - P( C, D, A, B, 2, 17, 0x242070DB ); - P( B, C, D, A, 3, 22, 0xC1BDCEEE ); - P( A, B, C, D, 4, 7, 0xF57C0FAF ); - P( D, A, B, C, 5, 12, 0x4787C62A ); - P( C, D, A, B, 6, 17, 0xA8304613 ); - P( B, C, D, A, 7, 22, 0xFD469501 ); - P( A, B, C, D, 8, 7, 0x698098D8 ); - P( D, A, B, C, 9, 12, 0x8B44F7AF ); - P( C, D, A, B, 10, 17, 0xFFFF5BB1 ); - P( B, C, D, A, 11, 22, 0x895CD7BE ); - P( A, B, C, D, 12, 7, 0x6B901122 ); - P( D, A, B, C, 13, 12, 0xFD987193 ); - P( C, D, A, B, 14, 17, 0xA679438E ); - P( B, C, D, A, 15, 22, 0x49B40821 ); +#define F(x, y, z) (z ^ (x & (y ^ z))) +// #define F(x,y,z) ((x & y) | (~x & z)) + + P(A, B, C, D, 0, 7, 0xD76AA478); + P(D, A, B, C, 1, 12, 0xE8C7B756); + P(C, D, A, B, 2, 17, 0x242070DB); + P(B, C, D, A, 3, 22, 0xC1BDCEEE); + P(A, B, C, D, 4, 7, 0xF57C0FAF); + P(D, A, B, C, 5, 12, 0x4787C62A); + P(C, D, A, B, 6, 17, 0xA8304613); + P(B, C, D, A, 7, 22, 0xFD469501); + P(A, B, C, D, 8, 7, 0x698098D8); + P(D, A, B, C, 9, 12, 0x8B44F7AF); + P(C, D, A, B, 10, 17, 0xFFFF5BB1); + P(B, C, D, A, 11, 22, 0x895CD7BE); + P(A, B, C, D, 12, 7, 0x6B901122); + P(D, A, B, C, 13, 12, 0xFD987193); + P(C, D, A, B, 14, 17, 0xA679438E); + P(B, C, D, A, 15, 22, 
0x49B40821); #undef F -#define F(x,y,z) (y ^ (z & (x ^ y))) -//#define F(x,y,z) ((z & x) | (~z & y)) - - Q( A, B, C, D, 1, 5, 0xF61E2562 ); - Q( D, A, B, C, 6, 9, 0xC040B340 ); - Q( C, D, A, B, 11, 14, 0x265E5A51 ); - Q( B, C, D, A, 0, 20, 0xE9B6C7AA ); - Q( A, B, C, D, 5, 5, 0xD62F105D ); - Q( D, A, B, C, 10, 9, 0x02441453 ); - Q( C, D, A, B, 15, 14, 0xD8A1E681 ); - Q( B, C, D, A, 4, 20, 0xE7D3FBC8 ); - Q( A, B, C, D, 9, 5, 0x21E1CDE6 ); - Q( D, A, B, C, 14, 9, 0xC33707D6 ); - Q( C, D, A, B, 3, 14, 0xF4D50D87 ); - Q( B, C, D, A, 8, 20, 0x455A14ED ); - Q( A, B, C, D, 13, 5, 0xA9E3E905 ); - Q( D, A, B, C, 2, 9, 0xFCEFA3F8 ); - Q( C, D, A, B, 7, 14, 0x676F02D9 ); - Q( B, C, D, A, 12, 20, 0x8D2A4C8A ); +#define F(x, y, z) (y ^ (z & (x ^ y))) +// #define F(x,y,z) ((z & x) | (~z & y)) + + Q(A, B, C, D, 1, 5, 0xF61E2562); + Q(D, A, B, C, 6, 9, 0xC040B340); + Q(C, D, A, B, 11, 14, 0x265E5A51); + Q(B, C, D, A, 0, 20, 0xE9B6C7AA); + Q(A, B, C, D, 5, 5, 0xD62F105D); + Q(D, A, B, C, 10, 9, 0x02441453); + Q(C, D, A, B, 15, 14, 0xD8A1E681); + Q(B, C, D, A, 4, 20, 0xE7D3FBC8); + Q(A, B, C, D, 9, 5, 0x21E1CDE6); + Q(D, A, B, C, 14, 9, 0xC33707D6); + Q(C, D, A, B, 3, 14, 0xF4D50D87); + Q(B, C, D, A, 8, 20, 0x455A14ED); + Q(A, B, C, D, 13, 5, 0xA9E3E905); + Q(D, A, B, C, 2, 9, 0xFCEFA3F8); + Q(C, D, A, B, 7, 14, 0x676F02D9); + Q(B, C, D, A, 12, 20, 0x8D2A4C8A); #undef F -#define F(x,y,z) (x ^ y ^ z) - - Q( A, B, C, D, 5, 4, 0xFFFA3942 ); - Q( D, A, B, C, 8, 11, 0x8771F681 ); - Q( C, D, A, B, 11, 16, 0x6D9D6122 ); - Q( B, C, D, A, 14, 23, 0xFDE5380C ); - Q( A, B, C, D, 1, 4, 0xA4BEEA44 ); - Q( D, A, B, C, 4, 11, 0x4BDECFA9 ); - Q( C, D, A, B, 7, 16, 0xF6BB4B60 ); - Q( B, C, D, A, 10, 23, 0xBEBFBC70 ); - Q( A, B, C, D, 13, 4, 0x289B7EC6 ); - Q( D, A, B, C, 0, 11, 0xEAA127FA ); - Q( C, D, A, B, 3, 16, 0xD4EF3085 ); - Q( B, C, D, A, 6, 23, 0x04881D05 ); - Q( A, B, C, D, 9, 4, 0xD9D4D039 ); - Q( D, A, B, C, 12, 11, 0xE6DB99E5 ); - Q( C, D, A, B, 15, 16, 0x1FA27CF8 ); - Q( B, C, D, A, 2, 
23, 0xC4AC5665 ); +#define F(x, y, z) (x ^ y ^ z) + + Q(A, B, C, D, 5, 4, 0xFFFA3942); + Q(D, A, B, C, 8, 11, 0x8771F681); + Q(C, D, A, B, 11, 16, 0x6D9D6122); + Q(B, C, D, A, 14, 23, 0xFDE5380C); + Q(A, B, C, D, 1, 4, 0xA4BEEA44); + Q(D, A, B, C, 4, 11, 0x4BDECFA9); + Q(C, D, A, B, 7, 16, 0xF6BB4B60); + Q(B, C, D, A, 10, 23, 0xBEBFBC70); + Q(A, B, C, D, 13, 4, 0x289B7EC6); + Q(D, A, B, C, 0, 11, 0xEAA127FA); + Q(C, D, A, B, 3, 16, 0xD4EF3085); + Q(B, C, D, A, 6, 23, 0x04881D05); + Q(A, B, C, D, 9, 4, 0xD9D4D039); + Q(D, A, B, C, 12, 11, 0xE6DB99E5); + Q(C, D, A, B, 15, 16, 0x1FA27CF8); + Q(B, C, D, A, 2, 23, 0xC4AC5665); #undef F -#define F(x,y,z) (y ^ (x | ~z)) - - Q( A, B, C, D, 0, 6, 0xF4292244 ); - Q( D, A, B, C, 7, 10, 0x432AFF97 ); - Q( C, D, A, B, 14, 15, 0xAB9423A7 ); - Q( B, C, D, A, 5, 21, 0xFC93A039 ); - Q( A, B, C, D, 12, 6, 0x655B59C3 ); - Q( D, A, B, C, 3, 10, 0x8F0CCC92 ); - Q( C, D, A, B, 10, 15, 0xFFEFF47D ); - Q( B, C, D, A, 1, 21, 0x85845DD1 ); - Q( A, B, C, D, 8, 6, 0x6FA87E4F ); - Q( D, A, B, C, 15, 10, 0xFE2CE6E0 ); - Q( C, D, A, B, 6, 15, 0xA3014314 ); - Q( B, C, D, A, 13, 21, 0x4E0811A1 ); - Q( A, B, C, D, 4, 6, 0xF7537E82 ); - Q( D, A, B, C, 11, 10, 0xBD3AF235 ); - Q( C, D, A, B, 2, 15, 0x2AD7D2BB ); - Q( B, C, D, A, 9, 21, 0xEB86D391 ); +#define F(x, y, z) (y ^ (x | ~z)) + + Q(A, B, C, D, 0, 6, 0xF4292244); + Q(D, A, B, C, 7, 10, 0x432AFF97); + Q(C, D, A, B, 14, 15, 0xAB9423A7); + Q(B, C, D, A, 5, 21, 0xFC93A039); + Q(A, B, C, D, 12, 6, 0x655B59C3); + Q(D, A, B, C, 3, 10, 0x8F0CCC92); + Q(C, D, A, B, 10, 15, 0xFFEFF47D); + Q(B, C, D, A, 1, 21, 0x85845DD1); + Q(A, B, C, D, 8, 6, 0x6FA87E4F); + Q(D, A, B, C, 15, 10, 0xFE2CE6E0); + Q(C, D, A, B, 6, 15, 0xA3014314); + Q(B, C, D, A, 13, 21, 0x4E0811A1); + Q(A, B, C, D, 4, 6, 0xF7537E82); + Q(D, A, B, C, 11, 10, 0xBD3AF235); + Q(C, D, A, B, 2, 15, 0x2AD7D2BB); + Q(B, C, D, A, 9, 21, 0xEB86D391); #undef F @@ -202,18 +202,18 @@ static void md5_process(md5_context * ctx, uint8_t data[64]) { /* * 
MD5 process buffer */ -template < bool bswap > -static void md5_update(md5_context * ctx, uint8_t * input, size_t ilen) { +template +static void md5_update( md5_context * ctx, uint8_t * input, size_t ilen ) { uint32_t fill, left; if (ilen == 0) { return; } if (ilen >= UINT32_C(0xffffffff)) { return; } - left = ctx->total[0] & 0x3F; + left = ctx->total[0] & 0x3F; fill = 64 - left; - ctx->total[0] += ilen; - ctx->total[0] &= 0xFFFFFFFF; + ctx->total [0] += ilen; + ctx->total [0] &= 0xFFFFFFFF; if (ctx->total[0] < (uint32_t)ilen) { ctx->total[1]++; } @@ -222,7 +222,7 @@ static void md5_update(md5_context * ctx, uint8_t * input, size_t ilen) { md5_process(ctx, ctx->buffer); input += fill; ilen -= fill; - left = 0; + left = 0; } while (ilen >= 64) { @@ -246,23 +246,23 @@ static const uint8_t md5_padding[64] = { /* * MD5 final digest */ -template < bool bswap > -static void md5_finish(md5_context * ctx, uint8_t output[16]) { +template +static void md5_finish( md5_context * ctx, uint8_t output[16] ) { uint32_t last, padn; uint32_t high, low; - uint8_t msglen[8]; + uint8_t msglen[8]; - high = (ctx->total[0] >> 29) - | (ctx->total[1] << 3); - low = (ctx->total[0] << 3); + high = (ctx->total[0] >> 29) | + (ctx->total[1] << 3); + low = (ctx->total[0] << 3); - PUT_U32(low, msglen, 0); + PUT_U32(low , msglen, 0); PUT_U32(high, msglen, 4); - last = ctx->total[0] & 0x3F; + last = ctx->total [0] & 0x3F; padn = (last < 56) ? 
(56 - last) : (120 - last); - md5_update(ctx, (uint8_t *) md5_padding, padn); + md5_update(ctx, (uint8_t *)md5_padding, padn); md5_update(ctx, msglen, 8); PUT_U32(ctx->state[0], output, 0); @@ -273,9 +273,10 @@ static void md5_finish(md5_context * ctx, uint8_t output[16]) { //----------------------------------------------------------------------------- // Homegrown MD5 seeding function -static FORCE_INLINE void seed_md5(md5_context * ctx, const seed_t seed) { +static FORCE_INLINE void seed_md5( md5_context * ctx, const seed_t seed ) { const uint32_t seedlo = seed & 0xFFFFFFFF; const uint32_t seedhi = (seed >> 32) & 0xFFFFFFFF; + ctx->state[0] ^= seedlo; ctx->state[1] ^= seedhi; ctx->state[2] += seedlo; @@ -284,14 +285,14 @@ static FORCE_INLINE void seed_md5(md5_context * ctx, const seed_t seed) { //----------------------------------------------------------------------------- // Wrappers for rest of SMHasher3 -template < uint32_t hashbits, bool bswap > -static void MD5(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void MD5( const void * in, const size_t len, const seed_t seed, void * out ) { md5_context md5_ctx; - uint8_t buf[16]; - uint8_t * hash = (hashbits >= 128) ? (uint8_t *)out : &buf[0]; + uint8_t buf[16]; + uint8_t * hash = (hashbits >= 128) ? (uint8_t *)out : &buf[0]; - md5_start (&md5_ctx); - seed_md5 (&md5_ctx, seed); + md5_start(&md5_ctx); + seed_md5(&md5_ctx, seed); md5_update(&md5_ctx, (uint8_t *)in, len); md5_finish(&md5_ctx, hash); @@ -300,74 +301,74 @@ static void MD5(const void * in, const size_t len, const seed_t seed, void * out // hash round, followed by "C" in the previous, etc. 
if (hashbits < 128) { if (hashbits <= 96) { - memcpy(out, &hash[4], (hashbits+7)/8); + memcpy(out, &hash[4], (hashbits + 7) / 8); } else { - memcpy(out, &hash[0], (hashbits+7)/8); + memcpy(out, &hash[0], (hashbits + 7) / 8); } } } REGISTER_FAMILY(md5, - $.src_url = "https://github.com/MattiaOng/md5-cracker/blob/master/md5.c", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/MattiaOng/md5-cracker/blob/master/md5.c", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(MD5__32, - $.desc = "MD5, bits 32-63", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_CRYPTOGRAPHIC_WEAK | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_GPL3 | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 32, - $.verification_LE = 0x4003D7EE, - $.verification_BE = 0x53A2E981, - $.hashfn_native = MD5<32,false>, - $.hashfn_bswap = MD5<32,true> -); + $.desc = "MD5, bits 32-63", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_CRYPTOGRAPHIC_WEAK | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_GPL3 | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 32, + $.verification_LE = 0x4003D7EE, + $.verification_BE = 0x53A2E981, + $.hashfn_native = MD5<32, false>, + $.hashfn_bswap = MD5<32, true> + ); REGISTER_HASH(MD5__64, - $.desc = "MD5, bits 32-95", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_CRYPTOGRAPHIC_WEAK | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_GPL3 | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 64, - $.verification_LE = 0xF2E011D4, - $.verification_BE = 0xDE2E1FAD, - $.hashfn_native = MD5<64,false>, - $.hashfn_bswap = MD5<64,true> -); + $.desc = "MD5, bits 32-95", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + 
FLAG_HASH_CRYPTOGRAPHIC_WEAK | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_GPL3 | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 64, + $.verification_LE = 0xF2E011D4, + $.verification_BE = 0xDE2E1FAD, + $.hashfn_native = MD5<64, false>, + $.hashfn_bswap = MD5<64, true> + ); REGISTER_HASH(MD5, - $.desc = "MD5", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_CRYPTOGRAPHIC_WEAK | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_GPL3 | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 128, - $.verification_LE = 0x1363415D, - $.verification_BE = 0x242A18E0, - $.hashfn_native = MD5<128,false>, - $.hashfn_bswap = MD5<128,true> -); + $.desc = "MD5", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_CRYPTOGRAPHIC_WEAK | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_GPL3 | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 128, + $.verification_LE = 0x1363415D, + $.verification_BE = 0x242A18E0, + $.hashfn_native = MD5<128, false>, + $.hashfn_bswap = MD5<128, true> + ); diff --git a/hashes/meowhash.cpp b/hashes/meowhash.cpp index e96ad6ca..9ad6f4c4 100644 --- a/hashes/meowhash.cpp +++ b/hashes/meowhash.cpp @@ -30,91 +30,91 @@ #if defined(HAVE_X86_64_AES) && defined(HAVE_SSE_4_1) -#include "Intrinsics.h" + #include "Intrinsics.h" typedef __m128i meow_u128; //------------------------------------------------------------ -//#define MEOW_HASH_VERSION 5 -//#define MEOW_HASH_VERSION_NAME "0.5/calico" +// #define MEOW_HASH_VERSION 5 +// #define MEOW_HASH_VERSION_NAME "0.5/calico" -#define MEOW_PAGESIZE 4096 -#define MEOW_PREFETCH 4096 -#define MEOW_PREFETCH_LIMIT 0x3ff + #define MEOW_PAGESIZE 4096 + #define MEOW_PREFETCH 4096 + #define MEOW_PREFETCH_LIMIT 0x3ff 
// fwojcik: Why is this needed? -#if defined(_MSC_VER) && !defined(__clang__) -#define INSTRUCTION_REORDER_BARRIER _ReadWriteBarrier() -#else -#define INSTRUCTION_REORDER_BARRIER -#endif + #if defined(_MSC_VER) && !defined(__clang__) + #define INSTRUCTION_REORDER_BARRIER _ReadWriteBarrier() + #else + #define INSTRUCTION_REORDER_BARRIER + #endif //------------------------------------------------------------ -#define MeowU64From(A, I) (_mm_extract_epi64((A), (I))) -#define MeowU32From(A, I) (_mm_extract_epi32((A), (I))) -#define prefetcht0(A) _mm_prefetch((char const *)(A), _MM_HINT_T0) -#define movdqu_imm(B) _mm_loadu_si128((meow_u128 *)(B)) -#define movdqu(A, B) A = _mm_loadu_si128((meow_u128 *)(B)) -#define movq(A, B, C) A = _mm_set_epi64x(C, B); -#define aesdec(A, B) A = _mm_aesdec_si128(A, B) -#define pshufb(A, B) A = _mm_shuffle_epi8(A, B) -#define pxor(A, B) A = _mm_xor_si128(A, B) -#define paddq(A, B) A = _mm_add_epi64(A, B) -#define pand(A, B) A = _mm_and_si128(A, B) -#define palignr(A, B, i) A = _mm_alignr_epi8(A, B, i) + #define MeowU64From(A, I) (_mm_extract_epi64((A), (I))) + #define MeowU32From(A, I) (_mm_extract_epi32((A), (I))) + #define prefetcht0(A) _mm_prefetch((char const *)(A), _MM_HINT_T0) + #define movdqu_imm(B) _mm_loadu_si128((meow_u128 *)(B)) + #define movdqu(A, B) A = _mm_loadu_si128((meow_u128 *)(B)) + #define movq(A, B, C) A = _mm_set_epi64x(C, B); + #define aesdec(A, B) A = _mm_aesdec_si128(A, B) + #define pshufb(A, B) A = _mm_shuffle_epi8(A, B) + #define pxor(A, B) A = _mm_xor_si128(A, B) + #define paddq(A, B) A = _mm_add_epi64(A, B) + #define pand(A, B) A = _mm_and_si128(A, B) + #define palignr(A, B, i) A = _mm_alignr_epi8(A, B, i) // NOTE(casey): pxor_clear is a nonsense thing that is only here // because compilers don't detect xor(a, a) is clearing a :( -#define pxor_clear(A, B) A = _mm_setzero_si128(); + #define pxor_clear(A, B) A = _mm_setzero_si128(); //------------------------------------------------------------ -#define 
MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4) \ - aesdec(r1, r2); \ - INSTRUCTION_REORDER_BARRIER; \ - paddq(r3, i1); \ - pxor(r2, i2); \ - aesdec(r2, r4); \ - INSTRUCTION_REORDER_BARRIER; \ - paddq(r5, i3); \ +#define MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4) \ + aesdec(r1, r2); \ + INSTRUCTION_REORDER_BARRIER; \ + paddq(r3, i1); \ + pxor(r2, i2); \ + aesdec(r2, r4); \ + INSTRUCTION_REORDER_BARRIER; \ + paddq(r5, i3); \ pxor(r4, i4); -#define MEOW_MIX(r1, r2, r3, r4, r5, ptr) \ - if (bswap) { \ - MEOW_MIX_REG(r1, r2, r3, r4, r5, \ - mm_bswap64(movdqu_imm((ptr) + 15)), \ - mm_bswap64(movdqu_imm((ptr) + 0)), \ - mm_bswap64(movdqu_imm((ptr) + 1)), \ - mm_bswap64(movdqu_imm((ptr) + 16))) \ - } else { \ - MEOW_MIX_REG(r1, r2, r3, r4, r5, \ - movdqu_imm((ptr) + 15), \ - movdqu_imm((ptr) + 0), \ - movdqu_imm((ptr) + 1), \ - movdqu_imm((ptr) + 16)) \ +#define MEOW_MIX(r1, r2, r3, r4, r5, ptr) \ + if (bswap) { \ + MEOW_MIX_REG(r1, r2, r3, r4, r5, \ + mm_bswap64(movdqu_imm((ptr) + 15)), \ + mm_bswap64(movdqu_imm((ptr) + 0)), \ + mm_bswap64(movdqu_imm((ptr) + 1)), \ + mm_bswap64(movdqu_imm((ptr) + 16))) \ + } else { \ + MEOW_MIX_REG(r1, r2, r3, r4, r5, \ + movdqu_imm((ptr) + 15), \ + movdqu_imm((ptr) + 0), \ + movdqu_imm((ptr) + 1), \ + movdqu_imm((ptr) + 16)) \ } -#define MEOW_SHUFFLE(r1, r2, r3, r4, r5, r6) \ - aesdec(r1, r4); \ - paddq(r2, r5); \ - pxor(r4, r6); \ - aesdec(r4, r2); \ - paddq(r5, r6); \ +#define MEOW_SHUFFLE(r1, r2, r3, r4, r5, r6) \ + aesdec(r1, r4); \ + paddq(r2, r5); \ + pxor(r4, r6); \ + aesdec(r4, r2); \ + paddq(r5, r6); \ pxor(r2, r3) //------------------------------------------------------------ static const uint8_t MeowShiftAdjust[32] = { - 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, - 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; static const uint8_t MeowMaskLen[32] = { - 255,255,255,255, - 255,255,255,255, - 255,255,255,255, - 
255,255,255,255, - 0,0,0,0, - 0,0,0,0, - 0,0,0,0, - 0,0,0,0 + 255, 255, 255, 255, + 255, 255, 255, 255, + 255, 255, 255, 255, + 255, 255, 255, 255, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 }; // NOTE(casey): The default seed is now a "nothing-up-our-sleeves" @@ -143,8 +143,8 @@ static const uint8_t MeowDefaultSeed[128] = { // // NOTE(casey): Single block version // -template < bool bswap > -static meow_u128 MeowHash(const void * Seed128Init, size_t Len, const void * SourceInit, uint64_t extraseed) { +template +static meow_u128 MeowHash( const void * Seed128Init, size_t Len, const void * SourceInit, uint64_t extraseed ) { const uint8_t * const SourceInit8 = (const uint8_t *)SourceInit; // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length) @@ -180,33 +180,31 @@ static meow_u128 MeowHash(const void * Seed128Init, size_t Len, const void * Sou prefetcht0(rax + MEOW_PREFETCH + 0x80); prefetcht0(rax + MEOW_PREFETCH + 0xc0); - MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); - MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); - MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); - MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); - MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); - MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); - MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); - MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); + MEOW_MIX(xmm0, xmm4, xmm6, xmm1, xmm2, rax + 0x00); + MEOW_MIX(xmm1, xmm5, xmm7, xmm2, xmm3, rax + 0x20); + MEOW_MIX(xmm2, xmm6, xmm0, xmm3, xmm4, rax + 0x40); + MEOW_MIX(xmm3, xmm7, xmm1, xmm4, xmm5, rax + 0x60); + MEOW_MIX(xmm4, xmm0, xmm2, xmm5, xmm6, rax + 0x80); + MEOW_MIX(xmm5, xmm1, xmm3, xmm6, xmm7, rax + 0xa0); + MEOW_MIX(xmm6, xmm2, xmm4, xmm7, xmm0, rax + 0xc0); + MEOW_MIX(xmm7, xmm3, xmm5, xmm0, xmm1, rax + 0xe0); rax += 0x100; } - } else { - // NOTE(casey): For small input, modern Intel x64's can't hit // full speed _with_ prefetching (because of port 
pressure), // so we use this loop. while (BlockCount--) { - MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); - MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); - MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); - MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); - MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); - MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); - MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); - MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); + MEOW_MIX(xmm0, xmm4, xmm6, xmm1, xmm2, rax + 0x00); + MEOW_MIX(xmm1, xmm5, xmm7, xmm2, xmm3, rax + 0x20); + MEOW_MIX(xmm2, xmm6, xmm0, xmm3, xmm4, rax + 0x40); + MEOW_MIX(xmm3, xmm7, xmm1, xmm4, xmm5, rax + 0x60); + MEOW_MIX(xmm4, xmm0, xmm2, xmm5, xmm6, rax + 0x80); + MEOW_MIX(xmm5, xmm1, xmm3, xmm6, xmm7, rax + 0xa0); + MEOW_MIX(xmm6, xmm2, xmm4, xmm7, xmm0, rax + 0xc0); + MEOW_MIX(xmm7, xmm3, xmm5, xmm0, xmm1, rax + 0xe0); rax += 0x100; } @@ -215,7 +213,7 @@ static meow_u128 MeowHash(const void * Seed128Init, size_t Len, const void * Sou // // NOTE(casey): Load any less-than-32-byte residual // - pxor_clear(xmm9, xmm9); + pxor_clear(xmm9 , xmm9 ); pxor_clear(xmm11, xmm11); // @@ -230,15 +228,15 @@ static meow_u128 MeowHash(const void * Seed128Init, size_t Len, const void * Sou // NOTE(casey): First, we have to load the part that is _not_ // 16-byte aligned const uint8_t * Last = SourceInit8 + (Len & ~0xf); - uint32_t Len8 = (Len & 0xf); + uint32_t Len8 = (Len & 0xf ); if (Len8) { // NOTE(casey): Load the mask early - movdqu(xmm8, &MeowMaskLen[0x10 - Len8]); + movdqu(xmm8 , &MeowMaskLen[0x10 - Len8]); const uint8_t * LastOk = (const uint8_t *)(((uintptr_t)(SourceInit8 + Len - 1) | (MEOW_PAGESIZE - 1)) - 16); - uint32_t Align = (Last > LastOk) ? ((uintptr_t)Last) & 0xf : 0; - movdqu(xmm10, &MeowShiftAdjust[Align]); - movdqu(xmm9, Last - Align); + uint32_t Align = (Last > LastOk) ? 
((uintptr_t)Last) & 0xf : 0; + movdqu(xmm10, &MeowShiftAdjust[Align] ); + movdqu(xmm9 , Last - Align); pshufb(xmm9, xmm10); // NOTE(jeffr): and off the extra bytes @@ -255,10 +253,10 @@ static meow_u128 MeowHash(const void * Seed128Init, size_t Len, const void * Sou // // NOTE(casey): Construct the residual and length injests // - xmm8 = xmm9; + xmm8 = xmm9; xmm10 = xmm9; - palignr(xmm8, xmm11, 15); - palignr(xmm10, xmm11, 1); + palignr(xmm8 , xmm11, 15); + palignr(xmm10, xmm11, 1); // NOTE(casey): We have room for a 128-bit nonce and a 64-bit none // here, but the decision was made to leave them zero'd so as not @@ -272,32 +270,39 @@ static meow_u128 MeowHash(const void * Seed128Init, size_t Len, const void * Sou pxor_clear(xmm14, xmm14); movq(xmm15, Len, extraseed); palignr(xmm12, xmm15, 15); - palignr(xmm14, xmm15, 1); + palignr(xmm14, xmm15, 1); // NOTE(casey): To maintain the mix-down pattern, we always Meow // Mix the less-than-32-byte residual, even if it was empty - MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11); + MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8 , xmm9 , xmm10, xmm11); // NOTE(casey): Append the length, to avoid problems with our // 32-byte padding - MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15); + MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15); // // NOTE(casey): Hash all full 32-byte blocks // uint32_t LaneCount = (Len >> 5) & 0x7; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount; - if(LaneCount == 0) goto MixDown; 
MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount; - if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount; + if (LaneCount == 0) { goto MixDown; } + MEOW_MIX(xmm2, xmm6, xmm0, xmm3, xmm4, rax + 0x00); --LaneCount; + if (LaneCount == 0) { goto MixDown; } + MEOW_MIX(xmm3, xmm7, xmm1, xmm4, xmm5, rax + 0x20); --LaneCount; + if (LaneCount == 0) { goto MixDown; } + MEOW_MIX(xmm4, xmm0, xmm2, xmm5, xmm6, rax + 0x40); --LaneCount; + if (LaneCount == 0) { goto MixDown; } + MEOW_MIX(xmm5, xmm1, xmm3, xmm6, xmm7, rax + 0x60); --LaneCount; + if (LaneCount == 0) { goto MixDown; } + MEOW_MIX(xmm6, xmm2, xmm4, xmm7, xmm0, rax + 0x80); --LaneCount; + if (LaneCount == 0) { goto MixDown; } + MEOW_MIX(xmm7, xmm3, xmm5, xmm0, xmm1, rax + 0xa0); --LaneCount; + if (LaneCount == 0) { goto MixDown; } + MEOW_MIX(xmm0, xmm4, xmm6, xmm1, xmm2, rax + 0xc0); --LaneCount; // // NOTE(casey): Mix the eight lanes down to one 128-bit hash // - MixDown: + MixDown: MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); @@ -323,77 +328,81 @@ static meow_u128 MeowHash(const void * Seed128Init, size_t Len, const void * Sou } //------------------------------------------------------------ -template < bool bswap > -static void MeowHash32(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void MeowHash32( const void * in, const size_t len, const seed_t seed, void * out ) { meow_u128 h = MeowHash(MeowDefaultSeed, len, in, (uint64_t)seed); + PUT_U32(MeowU32From(h, 0), (uint8_t *)out, 0); } -template < bool bswap > -static void MeowHash64(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void MeowHash64( const void * in, const size_t len, const seed_t seed, void * out ) { meow_u128 h = MeowHash(MeowDefaultSeed, len, in, (uint64_t)seed); + PUT_U64(MeowU64From(h, 0), (uint8_t *)out, 0); } -template 
< bool bswap > -static void MeowHash128(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void MeowHash128( const void * in, const size_t len, const seed_t seed, void * out ) { meow_u128 h = MeowHash(MeowDefaultSeed, len, in, (uint64_t)seed); + PUT_U64(MeowU64From(h, 0), (uint8_t *)out, 0); PUT_U64(MeowU64From(h, 1), (uint8_t *)out, 8); } + #endif //------------------------------------------------------------ REGISTER_FAMILY(meowhash, - $.src_url = "https://github.com/cmuratori/meow_hash", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/cmuratori/meow_hash", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); #if defined(HAVE_X86_64_AES) && defined(HAVE_SSE_4_1) REGISTER_HASH(MeowHash__32, - $.desc = "MeowHash (0.5/calico, low 32 bits)", - $.hash_flags = - FLAG_HASH_NO_SEED | - FLAG_HASH_AES_BASED, - $.impl_flags = - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_LICENSE_ZLIB, - $.bits = 32, - $.verification_LE = 0xE9E94FF2, - $.verification_BE = 0xD5BF086D, - $.hashfn_native = MeowHash32, - $.hashfn_bswap = MeowHash32 -); + $.desc = "MeowHash (0.5/calico, low 32 bits)", + $.hash_flags = + FLAG_HASH_NO_SEED | + FLAG_HASH_AES_BASED, + $.impl_flags = + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_LICENSE_ZLIB, + $.bits = 32, + $.verification_LE = 0xE9E94FF2, + $.verification_BE = 0xD5BF086D, + $.hashfn_native = MeowHash32, + $.hashfn_bswap = MeowHash32 + ); REGISTER_HASH(MeowHash__64, - $.desc = "MeowHash (0.5/calico, low 64 bits)", - $.hash_flags = - FLAG_HASH_NO_SEED | - FLAG_HASH_AES_BASED, - $.impl_flags = - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_LICENSE_ZLIB, - $.bits = 64, - $.verification_LE = 0x4C9F52A6, - $.verification_BE = 0xFA21003A, - $.hashfn_native = MeowHash64, - $.hashfn_bswap = MeowHash64 -); + $.desc = "MeowHash (0.5/calico, low 64 bits)", + $.hash_flags = + FLAG_HASH_NO_SEED | + FLAG_HASH_AES_BASED, + $.impl_flags = + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_LICENSE_ZLIB, + $.bits = 64, + 
$.verification_LE = 0x4C9F52A6, + $.verification_BE = 0xFA21003A, + $.hashfn_native = MeowHash64, + $.hashfn_bswap = MeowHash64 + ); REGISTER_HASH(MeowHash, - $.desc = "MeowHash (0.5/calico)", - $.hash_flags = - FLAG_HASH_NO_SEED | - FLAG_HASH_AES_BASED, - $.impl_flags = - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_LICENSE_ZLIB, - $.bits = 128, - $.verification_LE = 0x7C648489, - $.verification_BE = 0x4FD0834C, - $.hashfn_native = MeowHash128, - $.hashfn_bswap = MeowHash128 -); + $.desc = "MeowHash (0.5/calico)", + $.hash_flags = + FLAG_HASH_NO_SEED | + FLAG_HASH_AES_BASED, + $.impl_flags = + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_LICENSE_ZLIB, + $.bits = 128, + $.verification_LE = 0x7C648489, + $.verification_BE = 0x4FD0834C, + $.hashfn_native = MeowHash128, + $.hashfn_bswap = MeowHash128 + ); #endif diff --git a/hashes/metrohash.cpp b/hashes/metrohash.cpp index 6fa6fac1..05337fa7 100644 --- a/hashes/metrohash.cpp +++ b/hashes/metrohash.cpp @@ -27,163 +27,164 @@ #include "Hashlib.h" #if defined(HAVE_X86_64_CRC32C) -#include "Intrinsics.h" + #include "Intrinsics.h" #else -uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v); +uint64_t _mm_crc32_u64( uint64_t crc, uint64_t v ); + #endif #define VARIANTS_64 5 static const uint64_t MULTK64[VARIANTS_64][8] = { - { - 0xD6D018F5, 0xA2AA033B, 0x62992FC1, 0x30BC5B29, - 0x62992FC1, 0x62992FC1, 0x30BC5B29, 0x30BC5B29, - }, - { - 0xC83A91E1, 0x8648DBDB, 0x7BDEC03B, 0x2F5870A5, - 0xC83A91E1, 0x8648DBDB, 0x8648DBDB, 0x7BDEC03B, - }, - { - 0xD6D018F5, 0xA2AA033B, 0x62992FC1, 0x30BC5B29, - 0x62992FC1, 0x62992FC1, 0x30BC5B29, 0x30BC5B29, - }, - { - 0xC83A91E1, 0x8648DBDB, 0x7BDEC03B, 0x2F5870A5, - 0xC83A91E1, 0x8648DBDB, 0x8648DBDB, 0x7BDEC03B, - }, - { - 0xD6D018F5, 0xA2AA033B, 0x62992FC1, 0x30BC5B29, - 0xD6D018F5, 0xA2AA033B, 0xA2AA033B, 0x62992FC1, - }, + { + 0xD6D018F5, 0xA2AA033B, 0x62992FC1, 0x30BC5B29, + 0x62992FC1, 0x62992FC1, 0x30BC5B29, 0x30BC5B29, + }, + { + 0xC83A91E1, 0x8648DBDB, 0x7BDEC03B, 0x2F5870A5, + 0xC83A91E1, 
0x8648DBDB, 0x8648DBDB, 0x7BDEC03B, + }, + { + 0xD6D018F5, 0xA2AA033B, 0x62992FC1, 0x30BC5B29, + 0x62992FC1, 0x62992FC1, 0x30BC5B29, 0x30BC5B29, + }, + { + 0xC83A91E1, 0x8648DBDB, 0x7BDEC03B, 0x2F5870A5, + 0xC83A91E1, 0x8648DBDB, 0x8648DBDB, 0x7BDEC03B, + }, + { + 0xD6D018F5, 0xA2AA033B, 0x62992FC1, 0x30BC5B29, + 0xD6D018F5, 0xA2AA033B, 0xA2AA033B, 0x62992FC1, + }, }; static const uint8_t ROTK64[VARIANTS_64][9] = { - { 37, 29, 21, 55, 26, 48, 37, 28, 29 }, - { 33, 33, 35, 33, 15, 13, 25, 33, 33 }, - { 30, 29, 34, 36, 15, 15, 23, 28, 29 }, - { 33, 33, 35, 33, 15, 13, 25, 33, 33 }, - { 33, 33, 35, 33, 15, 13, 25, 33, 33 }, + { 37, 29, 21, 55, 26, 48, 37, 28, 29 }, + { 33, 33, 35, 33, 15, 13, 25, 33, 33 }, + { 30, 29, 34, 36, 15, 15, 23, 28, 29 }, + { 33, 33, 35, 33, 15, 13, 25, 33, 33 }, + { 33, 33, 35, 33, 15, 13, 25, 33, 33 }, }; -template < uint32_t variant, bool bswap > -static void MetroHash64(const void * in, const size_t len, const seed_t seed, void * out) { - if (variant >= VARIANTS_64) { return; } +template +static void MetroHash64( const void * in, const size_t len, const seed_t seed, void * out ) { + if (variant >= VARIANTS_64) { return; } - const uint64_t * K = &MULTK64[variant][0]; - const uint8_t * ROTK = &ROTK64[variant][0]; - const size_t length = len; - const uint8_t * ptr = (const uint8_t *)in; - const uint8_t * const end = ptr + len; - uint64_t v[4]; + const uint64_t * K = &MULTK64[variant][0]; + const uint8_t * ROTK = &ROTK64 [variant][0]; + const size_t length = len; + const uint8_t * ptr = (const uint8_t *)in; + const uint8_t * const end = ptr + len; + uint64_t v[4]; - uint64_t vseed = ((uint64_t)seed + K[2]) * K[0]; - if (variant != 0) { vseed += len; } + uint64_t vseed = ((uint64_t)seed + K[2]) * K[0]; + if (variant != 0) { vseed += len; } - v[0] = v[1] = v[2] = v[3] = vseed; + v[0] = v[1] = v[2] = v[3] = vseed; - // bulk update - while (ptr <= (end - 32)) { - if (variant <= 2) { - v[0] += GET_U64(ptr, 0) * K[0]; v[0] = ROTR64(v[0], 29) + 
v[2]; - v[1] += GET_U64(ptr, 8) * K[1]; v[1] = ROTR64(v[1], 29) + v[3]; - v[2] += GET_U64(ptr, 16) * K[2]; v[2] = ROTR64(v[2], 29) + v[0]; - v[3] += GET_U64(ptr, 24) * K[3]; v[3] = ROTR64(v[3], 29) + v[1]; - } else { - v[0] ^= _mm_crc32_u64(v[0], GET_U64(ptr, 0)); - v[1] ^= _mm_crc32_u64(v[1], GET_U64(ptr, 8)); - v[2] ^= _mm_crc32_u64(v[2], GET_U64(ptr, 16)); - v[3] ^= _mm_crc32_u64(v[3], GET_U64(ptr, 24)); + // bulk update + while (ptr <= (end - 32)) { + if (variant <= 2) { + v[0] += GET_U64(ptr, 0) * K[0]; v[0] = ROTR64(v[0], 29) + v[2]; + v[1] += GET_U64(ptr, 8) * K[1]; v[1] = ROTR64(v[1], 29) + v[3]; + v[2] += GET_U64(ptr, 16) * K[2]; v[2] = ROTR64(v[2], 29) + v[0]; + v[3] += GET_U64(ptr, 24) * K[3]; v[3] = ROTR64(v[3], 29) + v[1]; + } else { + v[0] ^= _mm_crc32_u64(v[0], GET_U64(ptr, 0)); + v[1] ^= _mm_crc32_u64(v[1], GET_U64(ptr, 8)); + v[2] ^= _mm_crc32_u64(v[2], GET_U64(ptr, 16)); + v[3] ^= _mm_crc32_u64(v[3], GET_U64(ptr, 24)); + } + ptr += 32; } - ptr += 32; - } - - if (len >= 32) { - v[2] ^= ROTR64(((v[0] + v[3]) * K[0]) + v[1], ROTK[0]) * K[1]; - v[3] ^= ROTR64(((v[1] + v[2]) * K[1]) + v[0], ROTK[0]) * K[0]; - v[0] ^= ROTR64(((v[0] + v[2]) * K[0]) + v[3], ROTK[0]) * K[1]; - v[1] ^= ROTR64(((v[1] + v[3]) * K[1]) + v[2], ROTK[0]) * K[0]; - - v[0] = vseed + (v[0] ^ v[1]); - } - - if ((end - ptr) >= 16) { - v[1] = v[0] + (GET_U64(ptr, 0) * K[4]); v[1] = ROTR64(v[1], ROTK[1]) * K[6]; - v[2] = v[0] + (GET_U64(ptr, 8) * K[5]); v[2] = ROTR64(v[2], ROTK[1]) * K[7]; - v[1] ^= ROTR64(v[1] * K[0], ROTK[2]) + v[2]; - v[2] ^= ROTR64(v[2] * K[3], ROTK[2]) + v[1]; - v[0] += v[2]; - ptr += 16; - } - - if ((end - ptr) >= 8) { - v[0] += GET_U64(ptr, 0) * K[3]; - v[0] ^= ROTR64(v[0], ROTK[3]) * K[1]; - ptr += 8; - } - - if ((end - ptr) >= 4) { - if (variant <= 2) { - v[0] += GET_U32(ptr, 0) * K[3]; - } else { - v[0] ^= _mm_crc32_u64(v[0], GET_U32(ptr, 0)); + + if (len >= 32) { + v[2] ^= ROTR64(((v[0] + v[3]) * K[0]) + v[1], ROTK[0]) * K[1]; + v[3] ^= ROTR64(((v[1] + v[2]) 
* K[1]) + v[0], ROTK[0]) * K[0]; + v[0] ^= ROTR64(((v[0] + v[2]) * K[0]) + v[3], ROTK[0]) * K[1]; + v[1] ^= ROTR64(((v[1] + v[3]) * K[1]) + v[2], ROTK[0]) * K[0]; + + v[0] = vseed + (v[0] ^ v[1]); } - v[0] ^= ROTR64(v[0], ROTK[4]) * K[1]; - ptr += 4; - } - if ((end - ptr) >= 2) { - if (variant <= 2) { - v[0] += GET_U16(ptr, 0) * K[3]; - } else { - v[0] ^= _mm_crc32_u64(v[0], GET_U16(ptr, 0)); + if ((end - ptr) >= 16) { + v[1] = v[0] + (GET_U64(ptr, 0) * K[4]); v[1] = ROTR64(v[1], ROTK[1]) * K[6]; + v[2] = v[0] + (GET_U64(ptr, 8) * K[5]); v[2] = ROTR64(v[2], ROTK[1]) * K[7]; + v[1] ^= ROTR64(v[1] * K[0], ROTK[2]) + v[2]; + v[2] ^= ROTR64(v[2] * K[3], ROTK[2]) + v[1]; + v[0] += v[2]; + ptr += 16; } - v[0] ^= ROTR64(v[0], ROTK[5]) * K[1]; - ptr += 2; - } - if ((end - ptr) >= 1) { - if (variant <= 2) { - v[0] += (*ptr) * K[3]; - } else { - v[0] ^= _mm_crc32_u64(v[0], *ptr); + if ((end - ptr) >= 8) { + v[0] += GET_U64(ptr, 0) * K[3]; + v[0] ^= ROTR64(v[0], ROTK[3]) * K[1]; + ptr += 8; + } + + if ((end - ptr) >= 4) { + if (variant <= 2) { + v[0] += GET_U32(ptr, 0) * K[3]; + } else { + v[0] ^= _mm_crc32_u64(v[0], GET_U32(ptr, 0)); + } + v[0] ^= ROTR64(v[0], ROTK[4]) * K[1]; + ptr += 4; + } + + if ((end - ptr) >= 2) { + if (variant <= 2) { + v[0] += GET_U16(ptr, 0) * K[3]; + } else { + v[0] ^= _mm_crc32_u64(v[0], GET_U16(ptr, 0)); + } + v[0] ^= ROTR64(v[0], ROTK[5]) * K[1]; + ptr += 2; + } + + if ((end - ptr) >= 1) { + if (variant <= 2) { + v[0] += (*ptr) * K[3]; + } else { + v[0] ^= _mm_crc32_u64(v[0], *ptr); + } + v[0] ^= ROTR64(v[0], ROTK[6]) * K[1]; } - v[0] ^= ROTR64(v[0], ROTK[6]) * K[1]; - } - v[0] ^= ROTR64(v[0], ROTK[7]); - v[0] *= K[0]; - v[0] ^= ROTR64(v[0], ROTK[8]); + v[0] ^= ROTR64(v[0], ROTK[7]); + v[0] *= K[0]; + v[0] ^= ROTR64(v[0], ROTK[8]); - PUT_U64(v[0], (uint8_t *)out, 0); + PUT_U64(v[0], (uint8_t *)out, 0); } #define VARIANTS_128 5 static const uint64_t MULTK128[VARIANTS_128][4] = { - { 0xC83A91E1, 0x8648DBDB, 0x7BDEC03B, 0x2F5870A5 }, // Standard 
mixing - { 0xC83A91E1, 0x8648DBDB, 0x7BDEC03B, 0x2F5870A5 }, - { 0xD6D018F5, 0xA2AA033B, 0x62992FC1, 0x30BC5B29 }, - { 0xC83A91E1, 0x8648DBDB, 0x7BDEC03B, 0x2F5870A5 }, // CRC-based mixing - { 0xEE783E2F, 0xAD07C493, 0x797A90BB, 0x2E4B2E1B } + { 0xC83A91E1, 0x8648DBDB, 0x7BDEC03B, 0x2F5870A5 }, // Standard mixing + { 0xC83A91E1, 0x8648DBDB, 0x7BDEC03B, 0x2F5870A5 }, + { 0xD6D018F5, 0xA2AA033B, 0x62992FC1, 0x30BC5B29 }, + { 0xC83A91E1, 0x8648DBDB, 0x7BDEC03B, 0x2F5870A5 }, // CRC-based mixing + { 0xEE783E2F, 0xAD07C493, 0x797A90BB, 0x2E4B2E1B } }; static const uint8_t ROTK128[VARIANTS_128][15] = { - { 21, 21, 21, 33, 45, 33, 27, 33, 46, 33, 22, 33, 58, 13, 37, }, - { 26, 26, 30, 33, 17, 33, 20, 33, 18, 33, 24, 33, 24, 13, 37, }, - { 33, 33, 33, 29, 29, 29, 29, 29, 25, 29, 30, 29, 18, 33, 33, }, - { 34, 37, 37, 34, 30, 36, 23, 0, 19, 0, 13, 0, 17, 11, 26, }, - { 12, 19, 19, 41, 10, 34, 22, 0, 14, 0, 15, 0, 18, 15, 27, } + { 21, 21, 21, 33, 45, 33, 27, 33, 46, 33, 22, 33, 58, 13, 37, }, + { 26, 26, 30, 33, 17, 33, 20, 33, 18, 33, 24, 33, 24, 13, 37, }, + { 33, 33, 33, 29, 29, 29, 29, 29, 25, 29, 30, 29, 18, 33, 33, }, + { 34, 37, 37, 34, 30, 36, 23, 0, 19, 0, 13, 0, 17, 11, 26, }, + { 12, 19, 19, 41, 10, 34, 22, 0, 14, 0, 15, 0, 18, 15, 27, } }; -template < uint32_t variant, bool bswap > -static void MetroHash128(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void MetroHash128( const void * in, const size_t len, const seed_t seed, void * out ) { if (variant >= VARIANTS_128) { return; } - const uint64_t * K = &MULTK128[variant][0]; - const uint8_t * ROTK = &ROTK128[variant][0]; - const size_t length = len; - const uint8_t * ptr = (const uint8_t *)in; - const uint8_t * const end = ptr + len; + const uint64_t * K = &MULTK128[variant][0]; + const uint8_t * ROTK = &ROTK128 [variant][0]; + const size_t length = len; + const uint8_t * ptr = (const uint8_t *)in; + const uint8_t * const end = ptr + len; uint64_t v[4]; @@ -229,14 +230,14 
@@ static void MetroHash128(const void * in, const size_t len, const seed_t seed, v v[1] += (GET_U64(ptr, 8) * K[2]); v[1] = ROTR64(v[1], ROTK[3]) * K[3]; v[0] ^= ROTR64(v[0] * K[2] + v[1], ROTK[4]) * K[1]; v[1] ^= ROTR64(v[1] * K[3] + v[0], ROTK[4]) * K[0]; - ptr += 16; + ptr += 16; } if ((end - ptr) >= 8) { v[0] += GET_U64(ptr, 0) * K[2]; - v[0] = ROTR64(v[0], ROTK[5]) * K[3]; + v[0] = ROTR64(v[0] , ROTK[5]) * K[3]; v[0] ^= ROTR64(v[0] * K[2] + v[1], ROTK[6]) * K[1]; - ptr += 8; + ptr += 8; } if ((end - ptr) >= 4) { @@ -247,7 +248,7 @@ static void MetroHash128(const void * in, const size_t len, const seed_t seed, v v[1] ^= _mm_crc32_u64(v[0], GET_U32(ptr, 0)); } v[1] ^= ROTR64(v[1] * K[3] + v[0], ROTK[8]) * K[0]; - ptr += 4; + ptr += 4; } if ((end - ptr) >= 2) { @@ -258,7 +259,7 @@ static void MetroHash128(const void * in, const size_t len, const seed_t seed, v v[0] ^= _mm_crc32_u64(v[1], GET_U16(ptr, 0)); } v[0] ^= ROTR64(v[0] * K[2] + v[1], ROTK[10]) * K[1]; - ptr += 2; + ptr += 2; } if ((end - ptr) >= 1) { @@ -286,174 +287,174 @@ static void MetroHash128(const void * in, const size_t len, const seed_t seed, v } REGISTER_FAMILY(metrohash, - $.src_url = "https://github.com/jandrewrogers/MetroHash/tree/c135424b3b83f1ca2502b7960f8d5705ddcec987", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/jandrewrogers/MetroHash/tree/c135424b3b83f1ca2502b7960f8d5705ddcec987", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(MetroHash_64, - $.desc = "Metrohash v1 base variant, 64-bit version", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x6FA828C9, - $.verification_BE = 0xFB8D54A5, - $.hashfn_native = MetroHash64<0, false>, - $.hashfn_bswap = MetroHash64<0, true>, - $.badseeds = {} -); + $.desc = "Metrohash v1 base variant, 64-bit version", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE 
| + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x6FA828C9, + $.verification_BE = 0xFB8D54A5, + $.hashfn_native = MetroHash64<0, false>, + $.hashfn_bswap = MetroHash64<0, true>, + $.badseeds = {} + ); REGISTER_HASH(MetroHash_64__var1, - $.desc = "Metrohash v1 variant 1, 64-bit version", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xEE88F7D2, - $.verification_BE = 0xCC0F03D7, - $.hashfn_native = MetroHash64<1, false>, - $.hashfn_bswap = MetroHash64<1, true>, - $.badseeds = {} -); + $.desc = "Metrohash v1 variant 1, 64-bit version", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xEE88F7D2, + $.verification_BE = 0xCC0F03D7, + $.hashfn_native = MetroHash64<1, false>, + $.hashfn_bswap = MetroHash64<1, true>, + $.badseeds = {} + ); REGISTER_HASH(MetroHash_64__var2, - $.desc = "Metrohash v1 variant 2, 64-bit version", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xE1FC7C6E, - $.verification_BE = 0x7F8C6EF1, - $.hashfn_native = MetroHash64<2, false>, - $.hashfn_bswap = MetroHash64<2, true>, - $.badseeds = {} -); + $.desc = "Metrohash v1 variant 2, 64-bit version", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xE1FC7C6E, + $.verification_BE = 0x7F8C6EF1, + $.hashfn_native = MetroHash64<2, false>, + $.hashfn_bswap = MetroHash64<2, true>, + $.badseeds = {} + ); #if defined(HAVE_X86_64_CRC32C) REGISTER_HASH(MetroHashCrc_64__var1, - $.desc = "Metrohash-crc v1 variant 1, 64-bit version (unofficial)", - $.hash_flags = - FLAG_HASH_CRC_BASED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - 
$.verification_LE = 0x29C68A50, - $.verification_BE = 0xACEEC1FC, - $.hashfn_native = MetroHash64<3, false>, - $.hashfn_bswap = MetroHash64<3, true>, - $.badseeds = {} -); + $.desc = "Metrohash-crc v1 variant 1, 64-bit version (unofficial)", + $.hash_flags = + FLAG_HASH_CRC_BASED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x29C68A50, + $.verification_BE = 0xACEEC1FC, + $.hashfn_native = MetroHash64<3, false>, + $.hashfn_bswap = MetroHash64<3, true>, + $.badseeds = {} + ); REGISTER_HASH(MetroHashCrc_64__var2, - $.desc = "Metrohash-crc v1 variant 2, 64-bit version (unofficial)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x2C00BD9F, - $.verification_BE = 0x590D5688, - $.hashfn_native = MetroHash64<4, false>, - $.hashfn_bswap = MetroHash64<4, true>, - $.badseeds = {} -); + $.desc = "Metrohash-crc v1 variant 2, 64-bit version (unofficial)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x2C00BD9F, + $.verification_BE = 0x590D5688, + $.hashfn_native = MetroHash64<4, false>, + $.hashfn_bswap = MetroHash64<4, true>, + $.badseeds = {} + ); #endif REGISTER_HASH(MetroHash_128, - $.desc = "Metrohash v1 base variant, 128-bit version", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x4A6673E7, - $.verification_BE = 0xD5F2CD8C, - $.hashfn_native = MetroHash128<0, false>, - $.hashfn_bswap = MetroHash128<0, true>, - $.badseeds = {} -); + $.desc = "Metrohash v1 base variant, 128-bit version", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x4A6673E7, + $.verification_BE = 
0xD5F2CD8C, + $.hashfn_native = MetroHash128<0, false>, + $.hashfn_bswap = MetroHash128<0, true>, + $.badseeds = {} + ); REGISTER_HASH(MetroHash_128__var1, - $.desc = "Metrohash v1 variant 1, 128-bit version", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x20E8A1D7, - $.verification_BE = 0x78661274, - $.hashfn_native = MetroHash128<1, false>, - $.hashfn_bswap = MetroHash128<1, true>, - $.badseeds = {} -); + $.desc = "Metrohash v1 variant 1, 128-bit version", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x20E8A1D7, + $.verification_BE = 0x78661274, + $.hashfn_native = MetroHash128<1, false>, + $.hashfn_bswap = MetroHash128<1, true>, + $.badseeds = {} + ); REGISTER_HASH(MetroHash_128__var2, - $.desc = "Metrohash v1 variant 2, 128-bit version", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x5437C684, - $.verification_BE = 0x01A244A6, - $.hashfn_native = MetroHash128<2, false>, - $.hashfn_bswap = MetroHash128<2, true>, - $.badseeds = {} -); + $.desc = "Metrohash v1 variant 2, 128-bit version", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x5437C684, + $.verification_BE = 0x01A244A6, + $.hashfn_native = MetroHash128<2, false>, + $.hashfn_bswap = MetroHash128<2, true>, + $.badseeds = {} + ); #if defined(HAVE_X86_64_CRC32C) REGISTER_HASH(MetroHashCrc_128__var1, - $.desc = "Metrohash-crc v1 variant 1, 128-bit version", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x5E75144E, - $.verification_BE = 0xCD4C6C7E, - $.hashfn_native = MetroHash128<3, 
false>, - $.hashfn_bswap = MetroHash128<3, true>, - $.badseeds = {} -); + $.desc = "Metrohash-crc v1 variant 1, 128-bit version", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x5E75144E, + $.verification_BE = 0xCD4C6C7E, + $.hashfn_native = MetroHash128<3, false>, + $.hashfn_bswap = MetroHash128<3, true>, + $.badseeds = {} + ); REGISTER_HASH(MetroHashCrc_128__var2, - $.desc = "Metrohash-crc v1 variant 2, 128-bit version", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x1ACF3E77, - $.verification_BE = 0x3772DA12, - $.hashfn_native = MetroHash128<4, false>, - $.hashfn_bswap = MetroHash128<4, true>, - $.badseeds = {} -); + $.desc = "Metrohash-crc v1 variant 2, 128-bit version", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x1ACF3E77, + $.verification_BE = 0x3772DA12, + $.hashfn_native = MetroHash128<4, false>, + $.hashfn_bswap = MetroHash128<4, true>, + $.badseeds = {} + ); #endif diff --git a/hashes/multiply_shift.cpp b/hashes/multiply_shift.cpp index bd95b0aa..3ac990ba 100644 --- a/hashes/multiply_shift.cpp +++ b/hashes/multiply_shift.cpp @@ -35,59 +35,63 @@ // https://arxiv.org/pdf/1504.06804.pdf // A randomly-generated table of 128-bit multiplicative constants -const static int MULTIPLY_SHIFT_RANDOM_WORDS = 1<<8; -static uint64_t multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS * 2]; +const static int MULTIPLY_SHIFT_RANDOM_WORDS = 1 << 8; +static uint64_t multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS * 2]; // This is just the Xorshift RNG, which was arbitrarily chosen. This // hash is labeled as system-dependent, since this would really be // replaced by *some* kind of srand()/rand() in practice. 
-static inline void mix(uint32_t & w, uint32_t & x, uint32_t & y, uint32_t & z) { +static inline void mix( uint32_t & w, uint32_t & x, uint32_t & y, uint32_t & z ) { uint32_t t = x ^ (x << 11); + x = y; y = z; z = w; w = w ^ (w >> 19) ^ t ^ (t >> 8); } -static uintptr_t multiply_shift_seed_init_slow(const seed_t seed) { +static uintptr_t multiply_shift_seed_init_slow( const seed_t seed ) { uint32_t w, x, y, z; - x = 0x498b3bc5 ^ (uint32_t)(seed); + + x = 0x498b3bc5 ^ (uint32_t)(seed ); y = 0x5a05089a ^ (uint32_t)(seed >> 32); w = z = 0; - for(int i = 0; i < 10; i++) mix(w, x, y, z); + for (int i = 0; i < 10; i++) { mix(w, x, y, z); } for (int i = 0; i < MULTIPLY_SHIFT_RANDOM_WORDS; i++) { mix(w, x, y, z); multiply_shift_random[2 * i + 1] = ((uint64_t)(x) << 32) | y; mix(w, x, y, z); multiply_shift_random[2 * i + 0] = ((uint64_t)(x) << 32) | y; - if (!multiply_shift_random[2 * i + 0]) + if (!multiply_shift_random[2 * i + 0]) { multiply_shift_random[2 * i + 0]++; + } } return 0; } -static bool multiply_shift_init(void) { +static bool multiply_shift_init( void ) { multiply_shift_seed_init_slow(0); return true; } // Vector multiply-shift (3.4) from Thorup's notes. -template < bool bswap > -static void multiply_shift32(const void * in, const size_t len_bytes, const seed_t seed, void * out) { +template +static void multiply_shift32( const void * in, const size_t len_bytes, const seed_t seed, void * out ) { const uint8_t * buf = (const uint8_t *)in; - const size_t len = len_bytes/4; + const size_t len = len_bytes / 4; // We mix in len_bytes in the basis, since smhasher considers two keys // of different length to be different, even if all the extra bits are 0. // This is needed for the AppendZero test. 
uint64_t h, t; - h = ((uint32_t)(seed)) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 1] + - ((uint32_t)(seed>>32)) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 2] + - ((uint32_t)(len_bytes)) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 3] + - ((uint32_t)(len_bytes>>32)) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 4]; + + h = ((uint32_t)(seed )) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 1] + + ((uint32_t)(seed >> 32)) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 2] + + ((uint32_t)(len_bytes )) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 3] + + ((uint32_t)(len_bytes >> 32)) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 4]; for (size_t i = 0; i < len; i++, buf += 4) { - t = GET_U32(buf, 0) * - multiply_shift_random[i % MULTIPLY_SHIFT_RANDOM_WORDS]; + t = GET_U32(buf, 0) * + multiply_shift_random[i % MULTIPLY_SHIFT_RANDOM_WORDS]; h += t; } @@ -96,9 +100,9 @@ static void multiply_shift32(const void * in, const size_t len_bytes, const seed if (remaining_bytes) { uint64_t last = 0; if (remaining_bytes & 2) { last = (last << 16) | GET_U16(buf, 0); buf += 2; } - if (remaining_bytes & 1) { last = (last << 8) | (*buf); } - t = last * - multiply_shift_random[len % MULTIPLY_SHIFT_RANDOM_WORDS]; + if (remaining_bytes & 1) { last = (last << 8) | (*buf); } + t = last * + multiply_shift_random[len % MULTIPLY_SHIFT_RANDOM_WORDS]; h += t; } @@ -106,31 +110,32 @@ static void multiply_shift32(const void * in, const size_t len_bytes, const seed } // Pair multiply-shift (3.5) from Thorup's notes. 
-template < bool bswap > -static void pair_multiply_shift32(const void * in, const size_t len_bytes, const seed_t seed, void * out) { +template +static void pair_multiply_shift32( const void * in, const size_t len_bytes, const seed_t seed, void * out ) { const uint8_t * buf = (const uint8_t *)in; - const size_t len = len_bytes/4; + const size_t len = len_bytes / 4; // We mix in len_bytes in the basis, since smhasher considers two keys // of different length to be different, even if all the extra bits are 0. // This is needed for the AppendZero test. uint64_t h, t; - h = ((uint32_t)(seed)) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 1] + - ((uint32_t)(seed>>32)) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 2] + - ((uint32_t)(len_bytes)) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 3] + - ((uint32_t)(len_bytes>>32)) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 4]; - - for (size_t i = 0; i < len/2; i++, buf += 8) { - t = GET_U64(buf, 0); - h += (((uint32_t)(t)) + multiply_shift_random[((2 * i) % MULTIPLY_SHIFT_RANDOM_WORDS) + 1]) * - (((uint32_t)(t>>32)) + multiply_shift_random[((2 * i) % MULTIPLY_SHIFT_RANDOM_WORDS) + 0]); + + h = ((uint32_t)(seed )) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 1] + + ((uint32_t)(seed >> 32)) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 2] + + ((uint32_t)(len_bytes )) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 3] + + ((uint32_t)(len_bytes >> 32)) * multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 4]; + + for (size_t i = 0; i < len / 2; i++, buf += 8) { + t = GET_U64(buf, 0); + h += (((uint32_t)(t )) + multiply_shift_random[((2 * i) % MULTIPLY_SHIFT_RANDOM_WORDS) + 1]) * + (((uint32_t)(t >> 32)) + multiply_shift_random[((2 * i) % MULTIPLY_SHIFT_RANDOM_WORDS) + 0]); } // Make sure we have the last word, if the number of words is odd if (len & 1) { - t = GET_U32(buf, 0) * - multiply_shift_random[(len - 1) % MULTIPLY_SHIFT_RANDOM_WORDS]; - h += t; + t = GET_U32(buf, 0) 
* + multiply_shift_random[(len - 1) % MULTIPLY_SHIFT_RANDOM_WORDS]; + h += t; buf += 4; } @@ -139,9 +144,9 @@ static void pair_multiply_shift32(const void * in, const size_t len_bytes, const if (remaining_bytes) { uint64_t last = 0; if (remaining_bytes & 2) { last = (last << 16) | GET_U16(buf, 0); buf += 2; } - if (remaining_bytes & 1) { last = (last << 8) | (*buf); } - t = last * - multiply_shift_random[len % MULTIPLY_SHIFT_RANDOM_WORDS]; + if (remaining_bytes & 1) { last = (last << 8) | (*buf); } + t = last * + multiply_shift_random[len % MULTIPLY_SHIFT_RANDOM_WORDS]; h += t; } @@ -156,20 +161,19 @@ static void pair_multiply_shift32(const void * in, const size_t len_bytes, const // for the moment. // // XXX Need to implement fma128_128() -template < bool bswap > -static void multiply_shift64(const void * in, const size_t len_bytes, const seed_t seed, void * out) { +template +static void multiply_shift64( const void * in, const size_t len_bytes, const seed_t seed, void * out ) { const uint8_t * buf = (const uint8_t *)in; - const size_t len = len_bytes/8; + const size_t len = len_bytes / 8; // We mix in len_bytes in the basis, since smhasher considers two keys // of different length to be different, even if all the extra bits are 0. // This is needed for the AppendZero test. 
uint64_t h, t, ignored; - mult128_128(ignored, h, (uint64_t)seed, 0, - multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 1], + + mult128_128(ignored, h, (uint64_t)seed , 0, multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 1], multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 2]); - mult128_128(ignored, t, (uint64_t)len_bytes, 0, - multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 3], + mult128_128(ignored, t, (uint64_t)len_bytes, 0, multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 3], multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 4]); h += t; @@ -186,9 +190,8 @@ static void multiply_shift64(const void * in, const size_t len_bytes, const seed uint64_t last = 0; if (remaining_bytes & 4) { last = GET_U32(buf, 0); buf += 4; } if (remaining_bytes & 2) { last = (last << 16) | GET_U16(buf, 0); buf += 2; } - if (remaining_bytes & 1) { last = (last << 8) | (*buf); } - mult128_128(ignored, t, last, 0, - multiply_shift_random[(len % MULTIPLY_SHIFT_RANDOM_WORDS) * 2 + 0], + if (remaining_bytes & 1) { last = (last << 8) | (*buf); } + mult128_128(ignored, t, last, 0, multiply_shift_random[(len % MULTIPLY_SHIFT_RANDOM_WORDS) * 2 + 0], multiply_shift_random[(len % MULTIPLY_SHIFT_RANDOM_WORDS) * 2 + 1]); h += t; } @@ -197,23 +200,22 @@ static void multiply_shift64(const void * in, const size_t len_bytes, const seed } // Pair multiply-shift (3.5) from Thorup's notes. -template < bool bswap > -static void pair_multiply_shift64(const void * in, const size_t len_bytes, const seed_t seed, void * out) { +template +static void pair_multiply_shift64( const void * in, const size_t len_bytes, const seed_t seed, void * out ) { const uint8_t * buf = (const uint8_t *)in; - const size_t len = len_bytes/8; + const size_t len = len_bytes / 8; // We mix in len_bytes in the basis, since smhasher considers two keys // of different length to be different, even if all the extra bits are 0. // This is needed for the AppendZero test. 
uint64_t h, t, ignored; - mult128_128(ignored, h, (uint64_t)seed, 0, - multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 1], + + mult128_128(ignored, h, (uint64_t)seed , 0, multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 1], multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 2]); - mult128_128(ignored, t, (uint64_t)len_bytes, 0, - multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 3], + mult128_128(ignored, t, (uint64_t)len_bytes, 0, multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 3], multiply_shift_random[MULTIPLY_SHIFT_RANDOM_WORDS - 4]); h += t; - for (size_t i = 0; i < len/2; i++, buf += 16) { + for (size_t i = 0; i < len / 2; i++, buf += 16) { uint64_t blk1lo, blk1hi, blk2lo, blk2hi; blk1lo = multiply_shift_random[((2 * i) % MULTIPLY_SHIFT_RANDOM_WORDS) * 2 + 2]; blk1hi = multiply_shift_random[((2 * i) % MULTIPLY_SHIFT_RANDOM_WORDS) * 2 + 3]; @@ -230,7 +232,7 @@ static void pair_multiply_shift64(const void * in, const size_t len_bytes, const mult128_128(ignored, t, GET_U64(buf, 0), 0, multiply_shift_random[((len - 1) % MULTIPLY_SHIFT_RANDOM_WORDS) * 2 + 0], multiply_shift_random[((len - 1) % MULTIPLY_SHIFT_RANDOM_WORDS) * 2 + 1]); - h += t; + h += t; buf += 8; } @@ -240,9 +242,8 @@ static void pair_multiply_shift64(const void * in, const size_t len_bytes, const uint64_t last = 0; if (remaining_bytes & 4) { last = GET_U32(buf, 0); buf += 4; } if (remaining_bytes & 2) { last = (last << 16) | GET_U16(buf, 0); buf += 2; } - if (remaining_bytes & 1) { last = (last << 8) | (*buf); } - mult128_128(ignored, t, last, 0, - multiply_shift_random[(len % MULTIPLY_SHIFT_RANDOM_WORDS) * 2 + 0], + if (remaining_bytes & 1) { last = (last << 8) | (*buf); } + mult128_128(ignored, t, last, 0, multiply_shift_random[(len % MULTIPLY_SHIFT_RANDOM_WORDS) * 2 + 0], multiply_shift_random[(len % MULTIPLY_SHIFT_RANDOM_WORDS) * 2 + 1]); h += t; } @@ -251,74 +252,74 @@ static void pair_multiply_shift64(const void * in, const size_t len_bytes, const } 
REGISTER_FAMILY(multiply_shift, - $.src_url = "https://github.com/rurban/smhasher/blob/2b5992fe015282c87c9069e3c664771b47555ff3/Hashes.cpp", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/rurban/smhasher/blob/2b5992fe015282c87c9069e3c664771b47555ff3/Hashes.cpp", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(multiply_shift_32, - $.desc = "Dietzfelbinger Multiply-shift on strings, 32-bit blocks", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_SYSTEM_SPECIFIC, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x34BAD85C, - $.verification_BE = 0x133CC3AC, - $.hashfn_native = multiply_shift32, - $.hashfn_bswap = multiply_shift32, -//$.seedfn = multiply_shift_seed_init_slow - $.initfn = multiply_shift_init -); + $.desc = "Dietzfelbinger Multiply-shift on strings, 32-bit blocks", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_SYSTEM_SPECIFIC, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x34BAD85C, + $.verification_BE = 0x133CC3AC, + $.hashfn_native = multiply_shift32, + $.hashfn_bswap = multiply_shift32, +// $.seedfn = multiply_shift_seed_init_slow + $.initfn = multiply_shift_init + ); REGISTER_HASH(pair_multiply_shift_32, - $.desc = "Dietzfelbinger Pair-multiply-shift strings, 32-bit blocks", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_SYSTEM_SPECIFIC, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0xFC284F0F, - $.verification_BE = 0x6E93B706, - $.hashfn_native = pair_multiply_shift32, - $.hashfn_bswap = pair_multiply_shift32, -//$.seedfn = multiply_shift_seed_init_slow - $.initfn = multiply_shift_init -); + $.desc = "Dietzfelbinger Pair-multiply-shift strings, 32-bit blocks", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_SYSTEM_SPECIFIC, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + 
FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0xFC284F0F, + $.verification_BE = 0x6E93B706, + $.hashfn_native = pair_multiply_shift32, + $.hashfn_bswap = pair_multiply_shift32, +// $.seedfn = multiply_shift_seed_init_slow + $.initfn = multiply_shift_init + ); REGISTER_HASH(multiply_shift, - $.desc = "Dietzfelbinger Multiply-shift on strings, 64-bit blocks", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_SYSTEM_SPECIFIC, - $.impl_flags = - FLAG_IMPL_MULTIPLY_128_128 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xB7A5E66D, - $.verification_BE = 0x6E3902A6, - $.hashfn_native = multiply_shift64, - $.hashfn_bswap = multiply_shift64, -//$.seedfn = multiply_shift_seed_init_slow - $.initfn = multiply_shift_init -); + $.desc = "Dietzfelbinger Multiply-shift on strings, 64-bit blocks", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_SYSTEM_SPECIFIC, + $.impl_flags = + FLAG_IMPL_MULTIPLY_128_128 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xB7A5E66D, + $.verification_BE = 0x6E3902A6, + $.hashfn_native = multiply_shift64, + $.hashfn_bswap = multiply_shift64, +// $.seedfn = multiply_shift_seed_init_slow + $.initfn = multiply_shift_init + ); REGISTER_HASH(pair_multiply_shift, - $.desc = "Dietzfelbinger Pair-multiply-shift strings, 64-bit blocks", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_SYSTEM_SPECIFIC, - $.impl_flags = - FLAG_IMPL_MULTIPLY_128_128 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x4FBA804D, - $.verification_BE = 0x2B7F643B, - $.hashfn_native = pair_multiply_shift64, - $.hashfn_bswap = pair_multiply_shift64, -//$.seedfn = multiply_shift_seed_init_slow - $.initfn = multiply_shift_init -); + $.desc = "Dietzfelbinger Pair-multiply-shift strings, 64-bit blocks", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_SYSTEM_SPECIFIC, + $.impl_flags = + FLAG_IMPL_MULTIPLY_128_128 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x4FBA804D, + 
$.verification_BE = 0x2B7F643B, + $.hashfn_native = pair_multiply_shift64, + $.hashfn_bswap = pair_multiply_shift64, +// $.seedfn = multiply_shift_seed_init_slow + $.initfn = multiply_shift_init + ); diff --git a/hashes/mum_mir.cpp b/hashes/mum_mir.cpp index 71a66833..07a2a1ce 100644 --- a/hashes/mum_mir.cpp +++ b/hashes/mum_mir.cpp @@ -1,4 +1,5 @@ -/* MUM and MIR hashes +/* + * MUM and MIR hashes * Copyright (C) 2021-2022 Frank J. T. Wojcik * Copyright (c) 2016 Vladimir Makarov * @@ -21,7 +22,7 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. -*/ + */ #include "Platform.h" #include "Hashlib.h" @@ -40,9 +41,10 @@ // // The code has been reworked to allow both forms to always be // calculable on every platform. -template < bool exact > -static inline uint64_t _mum(uint64_t v, uint64_t p) { +template +static inline uint64_t _mum( uint64_t v, uint64_t p ) { uint64_t hi, lo; + if (exact) { mult64_128(lo, hi, v, p); } else { @@ -60,9 +62,11 @@ static inline uint64_t _mum(uint64_t v, uint64_t p) { //----------------------------------------------------------------------------- // MUM hash internals -/* Here are different primes randomly generated with the equal - probability of their bit values. They are used to randomize input - values. */ +/* + * Here are different primes randomly generated with the equal + * probability of their bit values. They are used to randomize input + * values. 
+ */ static const uint64_t _mum_hash_step_prime = UINT64_C(0x2e0bb864e9ea7df5); static const uint64_t _mum_key_step_prime = UINT64_C(0xcdb32970830fcaa1); static const uint64_t _mum_block_start_prime = UINT64_C(0xc42b5e2e6480b23b); @@ -71,118 +75,122 @@ static const uint64_t _mum_tail_prime = UINT64_C(0xaf47d47c99b1461b); static const uint64_t _mum_finish_prime1 = UINT64_C(0xa9a7ae7ceff79f3f); static const uint64_t _mum_finish_prime2 = UINT64_C(0xaf47d47c99b1461b); -static const uint64_t _mum_primes [] = { - UINT64_C(0x9ebdcae10d981691), UINT64_C(0x32b9b9b97a27ac7d), - UINT64_C(0x29b5584d83d35bbd), UINT64_C(0x4b04e0e61401255f), - UINT64_C(0x25e8f7b1f1c9d027), UINT64_C(0x80d4c8c000f3e881), - UINT64_C(0xbd1255431904b9dd), UINT64_C(0x8a3bd4485eee6d81), - UINT64_C(0x3bc721b2aad05197), UINT64_C(0x71b1a19b907d6e33), - UINT64_C(0x525e6c1084a8534b), UINT64_C(0x9e4c2cd340c1299f), - UINT64_C(0xde3add92e94caa37), UINT64_C(0x7e14eadb1f65311d), - UINT64_C(0x3f5aa40f89812853), UINT64_C(0x33b15a3b587d15c9), +static const uint64_t _mum_primes [] = { + UINT64_C(0x9ebdcae10d981691), UINT64_C(0x32b9b9b97a27ac7d), + UINT64_C(0x29b5584d83d35bbd), UINT64_C(0x4b04e0e61401255f), + UINT64_C(0x25e8f7b1f1c9d027), UINT64_C(0x80d4c8c000f3e881), + UINT64_C(0xbd1255431904b9dd), UINT64_C(0x8a3bd4485eee6d81), + UINT64_C(0x3bc721b2aad05197), UINT64_C(0x71b1a19b907d6e33), + UINT64_C(0x525e6c1084a8534b), UINT64_C(0x9e4c2cd340c1299f), + UINT64_C(0xde3add92e94caa37), UINT64_C(0x7e14eadb1f65311d), + UINT64_C(0x3f5aa40f89812853), UINT64_C(0x33b15a3b587d15c9), }; // Since unroll_power actually affects hash *values*, not just speed, // it needs to be a template parameter, so all versions of the hash // can be tested on all platforms. 
-template < uint32_t version, uint32_t unroll_power, bool bswap, bool exactmul > -//_MUM_OPTIMIZE("unroll-loops") -static inline uint64_t _mum_hash_aligned(uint64_t seed, const void * key, size_t len) { - const uint32_t _MUM_UNROLL_FACTOR = 1 << unroll_power; - const uint8_t * str = (const uint8_t *)key; - uint64_t u64, result; - size_t i; - size_t n; - - if ((version == 1) || (version == 3)) { - result = _mum(seed, _mum_block_start_prime); - } else { - result = seed; - } - while (len > _MUM_UNROLL_FACTOR * sizeof (uint64_t)) { - /* - * This loop could be vectorized when we have vector insns for - * 64x64->128-bit multiplication. AVX2 currently only have a - * vector insn for 4 32x32->64-bit multiplication. - */ - if ((version == 1) || (version == 2)) { - for (i = 0; i < _MUM_UNROLL_FACTOR; i++) - result ^= _mum(GET_U64(str, i*8), _mum_primes[i]); +template +// _MUM_OPTIMIZE("unroll-loops") +static inline uint64_t _mum_hash_aligned( uint64_t seed, const void * key, size_t len ) { + const uint32_t _MUM_UNROLL_FACTOR = 1 << unroll_power; + const uint8_t * str = (const uint8_t *)key; + uint64_t u64, result; + size_t i; + size_t n; + + if ((version == 1) || (version == 3)) { + result = _mum(seed, _mum_block_start_prime); } else { - for (i = 0; i < _MUM_UNROLL_FACTOR; i+=2) - result ^= _mum(GET_U64(str, i*8 ) ^ _mum_primes[i], - GET_U64(str, i*8 + 8) ^ _mum_primes[i+1]); + result = seed; } - len -= _MUM_UNROLL_FACTOR * sizeof (uint64_t); - str += _MUM_UNROLL_FACTOR * sizeof (uint64_t); - /* - * We will use the same prime numbers on the next iterations -- - * randomize the state. 
- */ - result = _mum(result, _mum_unroll_prime); - } - n = len / sizeof (uint64_t); - for (i = 0; i < n; i++) - result ^= _mum(GET_U64(str, i*8), _mum_primes[i]); - len -= n * sizeof (uint64_t); str += n * sizeof (uint64_t); - switch (len) { - case 7: - u64 = GET_U32(str, 0); - u64 |= (uint64_t) str[4] << 32; - u64 |= (uint64_t) str[5] << 40; - u64 |= (uint64_t) str[6] << 48; - return result ^ _mum(u64, _mum_tail_prime); - case 6: - u64 = GET_U32(str, 0); - u64 |= (uint64_t) str[4] << 32; - u64 |= (uint64_t) str[5] << 40; - return result ^ _mum(u64, _mum_tail_prime); - case 5: - u64 = GET_U32(str, 0); - u64 |= (uint64_t) str[4] << 32; - return result ^ _mum(u64, _mum_tail_prime); - case 4: - u64 = GET_U32(str, 0); - return result ^ _mum(u64, _mum_tail_prime); - case 3: - u64 = str[0]; - u64 |= (uint64_t) str[1] << 8; - u64 |= (uint64_t) str[2] << 16; - return result ^ _mum(u64, _mum_tail_prime); - case 2: - u64 = str[0]; - u64 |= (uint64_t) str[1] << 8; - return result ^ _mum(u64, _mum_tail_prime); - case 1: - u64 = str[0]; - return result ^ _mum(u64, _mum_tail_prime); - } - return result; + while (len > _MUM_UNROLL_FACTOR * sizeof(uint64_t)) { + /* + * This loop could be vectorized when we have vector insns for + * 64x64->128-bit multiplication. AVX2 currently only have a + * vector insn for 4 32x32->64-bit multiplication. + */ + if ((version == 1) || (version == 2)) { + for (i = 0; i < _MUM_UNROLL_FACTOR; i++) { + result ^= _mum(GET_U64(str, i * 8) , _mum_primes[i]); + } + } else { + for (i = 0; i < _MUM_UNROLL_FACTOR; i += 2) { + result ^= _mum(GET_U64(str, i * 8) ^ _mum_primes[i], + GET_U64(str, i * 8 + 8) ^ _mum_primes[i + 1]); + } + } + len -= _MUM_UNROLL_FACTOR * sizeof(uint64_t); + str += _MUM_UNROLL_FACTOR * sizeof(uint64_t); + /* + * We will use the same prime numbers on the next iterations -- + * randomize the state. 
+ */ + result = _mum(result, _mum_unroll_prime); + } + n = len / sizeof(uint64_t); + for (i = 0; i < n; i++) { + result ^= _mum(GET_U64(str, i * 8), _mum_primes[i]); + } + len -= n * sizeof(uint64_t); str += n * sizeof(uint64_t); + switch (len) { + case 7: + u64 = GET_U32(str, 0); + u64 |= (uint64_t)str[4] << 32; + u64 |= (uint64_t)str[5] << 40; + u64 |= (uint64_t)str[6] << 48; + return result ^ _mum(u64, _mum_tail_prime); + case 6: + u64 = GET_U32(str, 0); + u64 |= (uint64_t)str[4] << 32; + u64 |= (uint64_t)str[5] << 40; + return result ^ _mum(u64, _mum_tail_prime); + case 5: + u64 = GET_U32(str, 0); + u64 |= (uint64_t)str[4] << 32; + return result ^ _mum(u64, _mum_tail_prime); + case 4: + u64 = GET_U32(str, 0); + return result ^ _mum(u64, _mum_tail_prime); + case 3: + u64 = str[0]; + u64 |= (uint64_t)str[1] << 8; + u64 |= (uint64_t)str[2] << 16; + return result ^ _mum(u64, _mum_tail_prime); + case 2: + u64 = str [0]; + u64 |= (uint64_t)str[1] << 8; + return result ^ _mum(u64, _mum_tail_prime); + case 1: + u64 = str [0]; + return result ^ _mum(u64, _mum_tail_prime); + } + return result; } /* Final randomization of H. 
*/ -template < uint32_t version, bool exactmul > -static inline uint64_t _mum_final (uint64_t h) { - if (version == 1) { - h ^= _mum(h, _mum_finish_prime1); - h ^= _mum(h, _mum_finish_prime2); - } else if (version == 2) { - h ^= ROTL64(h, 33); - h ^= _mum(h, _mum_finish_prime1); - } else { - h = _mum(h, h); - } - return h; +template +static inline uint64_t _mum_final( uint64_t h ) { + if (version == 1) { + h ^= _mum(h, _mum_finish_prime1); + h ^= _mum(h, _mum_finish_prime2); + } else if (version == 2) { + h ^= ROTL64(h, 33); + h ^= _mum(h, _mum_finish_prime1); + } else { + h = _mum(h, h); + } + return h; } //----------------------------------------------------------------------------- // MUM hash externals for SMHasher3 -template < uint32_t version, uint32_t unroll_power, bool bswap, bool exactmul > -static void mum_aligned(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void mum_aligned( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h; - h = _mum_hash_aligned(seed + len, in, len); - h = _mum_final(h); + + h = _mum_hash_aligned(seed + len, in, len); + h = _mum_final(h); PUT_U64(h, (uint8_t *)out, 0); } @@ -194,23 +202,26 @@ static void mum_aligned(const void * in, const size_t len, const seed_t seed, vo // "while (len >= .....". // Based on this, I'm removing the realign variants for now. 
#if defined(NOTYET) -template < uint32_t version, uint32_t unroll_power, bool bswap, bool exactmul > -static void mum_realign(const void * in, const size_t olen, const seed_t seed, void * out) { + +template +static void mum_realign( const void * in, const size_t olen, const seed_t seed, void * out ) { const uint8_t * str = (const uint8_t *)in; - const uint32_t _MUM_BLOCK_LEN = 1024; - uint64_t buf[_MUM_BLOCK_LEN / sizeof(uint64_t)]; - size_t len = olen; - uint64_t h = seed + olen; + const uint32_t _MUM_BLOCK_LEN = 1024; + uint64_t buf[_MUM_BLOCK_LEN / sizeof(uint64_t)]; + size_t len = olen; + uint64_t h = seed + olen; + while (len != 0) { size_t block_len = len < _MUM_BLOCK_LEN ? len : _MUM_BLOCK_LEN; memmove(buf, str, block_len); - h = _mum_hash_aligned(h, buf, block_len); + h = _mum_hash_aligned(h, buf, block_len); len -= block_len; str += block_len; } - h = _mum_final(h); + h = _mum_final(h); PUT_U64(h, (uint8_t *)out, 0); } + #endif //----------------------------------------------------------------------------- @@ -222,33 +233,33 @@ static void mum_realign(const void * in, const size_t olen, const seed_t seed, v * Hash for the same key can be different on different architectures. * To get machine-independent hash, use mir_hash_strict which is about * 1.5 times slower than mir_hash. 
-*/ -template < bool exact > -static inline uint64_t mir_mum(uint64_t v, uint64_t c) { - if (exact) { return _mum(v, c); } - uint64_t v1 = v >> 32, v2 = (uint32_t) v, c1 = c >> 32, c2 = (uint32_t) c, rm = v2 * c1 + v1 * c2; - return v1 * c1 + (rm >> 32) + v2 * c2 + (rm << 32); + */ +template +static inline uint64_t mir_mum( uint64_t v, uint64_t c ) { + if (exact) { return _mum(v, c); } + uint64_t v1 = v >> 32, v2 = (uint32_t)v, c1 = c >> 32, c2 = (uint32_t)c, rm = v2 * c1 + v1 * c2; + return v1 * c1 + (rm >> 32) + v2 * c2 + (rm << 32); } static const uint64_t p1 = UINT64_C(0x65862b62bdf5ef4d), p2 = UINT64_C(0x288eea216831e6a7); -template < bool exactmul > -static inline uint64_t mir_round(uint64_t state, uint64_t v) { - state ^= mir_mum(v, p1); - return state ^ mir_mum(state, p2); +template +static inline uint64_t mir_round( uint64_t state, uint64_t v ) { + state ^= mir_mum(v, p1); + return state ^ mir_mum(state, p2); } -template < bool bswap > -static inline uint64_t mir_get_key_part(const uint8_t * v, size_t len) { - size_t i, start = 0; - uint64_t tail = 0; - - if (len >= sizeof(uint32_t)) { - tail = ((uint64_t)(GET_U32(v, 0))) << 32; - start = 4; - } - for (i = start; i < len; i++) tail = (tail >> 8) | ((uint64_t) v[i] << 56); - return tail; +template +static inline uint64_t mir_get_key_part( const uint8_t * v, size_t len ) { + size_t i, start = 0; + uint64_t tail = 0; + + if (len >= sizeof(uint32_t)) { + tail = ((uint64_t)(GET_U32(v, 0))) << 32; + start = 4; + } + for (i = start; i < len; i++) { tail = (tail >> 8) | ((uint64_t)v[i] << 56); } + return tail; } //----------------------------------------------------------------------------- @@ -257,789 +268,789 @@ static inline uint64_t mir_get_key_part(const uint8_t * v, size_t len) { // The bswap and exactmul booleans cover all possible sets of hash // values from the original mir_hash() in both "strict" mode and // "relaxed" mode, regardless of machine endianness. 
-template < bool bswap, bool exactmul > -static void mir_hash(const void * in, const size_t olen, const seed_t seed, void * out) { - const uint8_t * v = (const uint8_t *)in; - uint64_t r = seed + olen; - size_t len = olen; - uint64_t blk; - - for (; len >= 16; len -= 16, v += 16) { - r ^= mir_mum(GET_U64(v, 0), p1); - r ^= mir_mum(GET_U64(v, 8), p2); - r ^= mir_mum(r, p1); - } - if (len >= 8) { - r ^= mir_mum(GET_U64(v, 0), p1); - len -= 8, v += 8; - } - if (len != 0) { - r ^= mir_mum(mir_get_key_part(v, len), p2); - } - r = mir_round(r, r); - PUT_U64(r, (uint8_t *)out, 0); +template +static void mir_hash( const void * in, const size_t olen, const seed_t seed, void * out ) { + const uint8_t * v = (const uint8_t *)in; + uint64_t r = seed + olen; + size_t len = olen; + uint64_t blk; + + for (; len >= 16; len -= 16, v += 16) { + r ^= mir_mum(GET_U64(v, 0), p1); + r ^= mir_mum(GET_U64(v, 8), p2); + r ^= mir_mum(r, p1); + } + if (len >= 8) { + r ^= mir_mum(GET_U64(v, 0), p1); + len -= 8, v += 8; + } + if (len != 0) { + r ^= mir_mum(mir_get_key_part(v, len), p2); + } + r = mir_round(r, r); + PUT_U64(r, (uint8_t *)out, 0); } //----------------------------------------------------------------------------- // Also https://github.com/vnmakarov/mir/blob/master/mir-hash.h REGISTER_FAMILY(mum_mir, - $.src_url = "https://github.com/vnmakarov/mum-hash", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/vnmakarov/mum-hash", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(mum1__exact__unroll1, - $.desc = "Mum-hash v1, unroll 2^1, exact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xCB93DE58, - $.verification_BE = 0xE820D0FB, - $.hashfn_native = mum_aligned<1,1,false,true>, - $.hashfn_bswap = mum_aligned<1,1,true,true> -); + $.desc = "Mum-hash v1, unroll 2^1, exact mult", + $.hash_flags = + 0, + $.impl_flags = + 
FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xCB93DE58, + $.verification_BE = 0xE820D0FB, + $.hashfn_native = mum_aligned<1, 1, false, true>, + $.hashfn_bswap = mum_aligned<1, 1, true, true> + ); REGISTER_HASH(mum1__exact__unroll2, - $.desc = "Mum-hash v1, unroll 2^2, exact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x3EEAE2D4, - $.verification_BE = 0xF23A691C, - $.hashfn_native = mum_aligned<1,2,false,true>, - $.hashfn_bswap = mum_aligned<1,2,true,true> -); + $.desc = "Mum-hash v1, unroll 2^2, exact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x3EEAE2D4, + $.verification_BE = 0xF23A691C, + $.hashfn_native = mum_aligned<1, 2, false, true>, + $.hashfn_bswap = mum_aligned<1, 2, true, true> + ); REGISTER_HASH(mum1__exact__unroll3, - $.desc = "Mum-hash v1, unroll 2^3, exact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x7C0A2F98, - $.verification_BE = 0x210F4BEB, - $.hashfn_native = mum_aligned<1,3,false,true>, - $.hashfn_bswap = mum_aligned<1,3,true,true> -); + $.desc = "Mum-hash v1, unroll 2^3, exact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x7C0A2F98, + $.verification_BE = 0x210F4BEB, + $.hashfn_native = mum_aligned<1, 3, false, true>, + $.hashfn_bswap = mum_aligned<1, 3, true, true> + ); REGISTER_HASH(mum1__exact__unroll4, - $.desc = "Mum-hash v1, unroll 2^4, exact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x280B2CC6, - 
$.verification_BE = 0x0609C4A6, - $.hashfn_native = mum_aligned<1,4,false,true>, - $.hashfn_bswap = mum_aligned<1,4,true,true> -); + $.desc = "Mum-hash v1, unroll 2^4, exact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x280B2CC6, + $.verification_BE = 0x0609C4A6, + $.hashfn_native = mum_aligned<1, 4, false, true>, + $.hashfn_bswap = mum_aligned<1, 4, true, true> + ); REGISTER_HASH(mum1__inexact__unroll1, - $.desc = "Mum-hash v1, unroll 2^1, inexact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x689214DF, - $.verification_BE = 0x14FBDFDD, - $.hashfn_native = mum_aligned<1,1,false,false>, - $.hashfn_bswap = mum_aligned<1,1,true,false> -); + $.desc = "Mum-hash v1, unroll 2^1, inexact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x689214DF, + $.verification_BE = 0x14FBDFDD, + $.hashfn_native = mum_aligned<1, 1, false, false>, + $.hashfn_bswap = mum_aligned<1, 1, true, false> + ); REGISTER_HASH(mum1__inexact__unroll2, - $.desc = "Mum-hash v1, unroll 2^2, inexact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xA973C6C0, - $.verification_BE = 0x9C12DFA3, - $.hashfn_native = mum_aligned<1,2,false,false>, - $.hashfn_bswap = mum_aligned<1,2,true,false> -); + $.desc = "Mum-hash v1, unroll 2^2, inexact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xA973C6C0, + $.verification_BE = 0x9C12DFA3, + $.hashfn_native = mum_aligned<1, 2, false, false>, + $.hashfn_bswap = mum_aligned<1, 2, true, false> + ); 
REGISTER_HASH(mum1__inexact__unroll3, - $.desc = "Mum-hash v1, unroll 2^3, inexact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x5FC8FC51, - $.verification_BE = 0x907AB469, - $.hashfn_native = mum_aligned<1,3,false,false>, - $.hashfn_bswap = mum_aligned<1,3,true,false> -); + $.desc = "Mum-hash v1, unroll 2^3, inexact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x5FC8FC51, + $.verification_BE = 0x907AB469, + $.hashfn_native = mum_aligned<1, 3, false, false>, + $.hashfn_bswap = mum_aligned<1, 3, true, false> + ); REGISTER_HASH(mum1__inexact__unroll4, - $.desc = "Mum-hash v1, unroll 2^4, inexact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x2EF256D3, - $.verification_BE = 0xBF27AAE6, - $.hashfn_native = mum_aligned<1,4,false,false>, - $.hashfn_bswap = mum_aligned<1,4,true,false> -); + $.desc = "Mum-hash v1, unroll 2^4, inexact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x2EF256D3, + $.verification_BE = 0xBF27AAE6, + $.hashfn_native = mum_aligned<1, 4, false, false>, + $.hashfn_bswap = mum_aligned<1, 4, true, false> + ); #if defined(NOTYET) REGISTER_HASH(mum1_realign__exact__unroll1, - $.desc = "Mum-hash v1, unroll 2^1, exact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x9E323D13, - $.verification_BE = 0x2E655802, - $.hashfn_native = mum_realign<1,1,false,true>, - $.hashfn_bswap = mum_realign<1,1,true,true> -); + $.desc = "Mum-hash v1, unroll 2^1, exact mult, for aligned-only 
reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x9E323D13, + $.verification_BE = 0x2E655802, + $.hashfn_native = mum_realign<1, 1, false, true>, + $.hashfn_bswap = mum_realign<1, 1, true, true> + ); REGISTER_HASH(mum1_realign__exact__unroll2, - $.desc = "Mum-hash v1, unroll 2^2, exact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x139A630F, - $.verification_BE = 0x2281185A, - $.hashfn_native = mum_realign<1,2,false,true>, - $.hashfn_bswap = mum_realign<1,2,true,true> -); + $.desc = "Mum-hash v1, unroll 2^2, exact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x139A630F, + $.verification_BE = 0x2281185A, + $.hashfn_native = mum_realign<1, 2, false, true>, + $.hashfn_bswap = mum_realign<1, 2, true, true> + ); REGISTER_HASH(mum1_realign__exact__unroll3, - $.desc = "Mum-hash v1, unroll 2^3, exact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x0F1AC6C6, - $.verification_BE = 0xE8BF6CE3, - $.hashfn_native = mum_realign<1,3,false,true>, - $.hashfn_bswap = mum_realign<1,3,true,true> -); + $.desc = "Mum-hash v1, unroll 2^3, exact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x0F1AC6C6, + $.verification_BE = 0xE8BF6CE3, + $.hashfn_native = mum_realign<1, 3, false, true>, + $.hashfn_bswap = mum_realign<1, 3, true, true> + ); REGISTER_HASH(mum1_realign__exact__unroll4, - $.desc = "Mum-hash v1, unroll 2^4, exact mult, for 
aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xF47885FE, - $.verification_BE = 0xA7961551, - $.hashfn_native = mum_realign<1,4,false,true>, - $.hashfn_bswap = mum_realign<1,4,true,true> -); + $.desc = "Mum-hash v1, unroll 2^4, exact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xF47885FE, + $.verification_BE = 0xA7961551, + $.hashfn_native = mum_realign<1, 4, false, true>, + $.hashfn_bswap = mum_realign<1, 4, true, true> + ); REGISTER_HASH(mum1_realign__inexact__unroll1, - $.desc = "Mum-hash v1, unroll 2^1, inexact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xE11FC923, - $.verification_BE = 0x99623861, - $.hashfn_native = mum_realign<1,1,false,false>, - $.hashfn_bswap = mum_realign<1,1,true,false> -); + $.desc = "Mum-hash v1, unroll 2^1, inexact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xE11FC923, + $.verification_BE = 0x99623861, + $.hashfn_native = mum_realign<1, 1, false, false>, + $.hashfn_bswap = mum_realign<1, 1, true, false> + ); REGISTER_HASH(mum1_realign__inexact__unroll2, - $.desc = "Mum-hash v1, unroll 2^2, inexact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xBAFC050E, - $.verification_BE = 0x9678D798, - $.hashfn_native = mum_realign<1,2,false,false>, - $.hashfn_bswap = mum_realign<1,2,true,false> -); + $.desc = "Mum-hash v1, unroll 2^2, inexact mult, for aligned-only reads", + 
$.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xBAFC050E, + $.verification_BE = 0x9678D798, + $.hashfn_native = mum_realign<1, 2, false, false>, + $.hashfn_bswap = mum_realign<1, 2, true, false> + ); REGISTER_HASH(mum1_realign__inexact__unroll3, - $.desc = "Mum-hash v1, unroll 2^3, inexact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x56FA3D86, - $.verification_BE = 0x8EDC90F0, - $.hashfn_native = mum_realign<1,3,false,false>, - $.hashfn_bswap = mum_realign<1,3,true,false> -); + $.desc = "Mum-hash v1, unroll 2^3, inexact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x56FA3D86, + $.verification_BE = 0x8EDC90F0, + $.hashfn_native = mum_realign<1, 3, false, false>, + $.hashfn_bswap = mum_realign<1, 3, true, false> + ); REGISTER_HASH(mum1_realign__inexact__unroll4, - $.desc = "Mum-hash v1, unroll 2^4, inexact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x59787144, - $.verification_BE = 0xFCAEA377, - $.hashfn_native = mum_realign<1,4,false,false>, - $.hashfn_bswap = mum_realign<1,4,true,false> -); + $.desc = "Mum-hash v1, unroll 2^4, inexact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x59787144, + $.verification_BE = 0xFCAEA377, + $.hashfn_native = mum_realign<1, 4, false, false>, + $.hashfn_bswap = mum_realign<1, 4, true, false> + ); #endif REGISTER_HASH(mum2__exact__unroll1, - $.desc = "Mum-hash v2, unroll 2^1, exact 
mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x9B36F94C, - $.verification_BE = 0x50F10B41, - $.hashfn_native = mum_aligned<2,1,false,true>, - $.hashfn_bswap = mum_aligned<2,1,true,true> -); + $.desc = "Mum-hash v2, unroll 2^1, exact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x9B36F94C, + $.verification_BE = 0x50F10B41, + $.hashfn_native = mum_aligned<2, 1, false, true>, + $.hashfn_bswap = mum_aligned<2, 1, true, true> + ); REGISTER_HASH(mum2__exact__unroll2, - $.desc = "Mum-hash v2, unroll 2^2, exact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x40427228, - $.verification_BE = 0x43DB198B, - $.hashfn_native = mum_aligned<2,2,false,true>, - $.hashfn_bswap = mum_aligned<2,2,true,true> -); + $.desc = "Mum-hash v2, unroll 2^2, exact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x40427228, + $.verification_BE = 0x43DB198B, + $.hashfn_native = mum_aligned<2, 2, false, true>, + $.hashfn_bswap = mum_aligned<2, 2, true, true> + ); REGISTER_HASH(mum2__exact__unroll3, - $.desc = "Mum-hash v2, unroll 2^3, exact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xB5D1CB5C, - $.verification_BE = 0xA718EDE8, - $.hashfn_native = mum_aligned<2,3,false,true>, - $.hashfn_bswap = mum_aligned<2,3,true,true> -); + $.desc = "Mum-hash v2, unroll 2^3, exact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xB5D1CB5C, + 
$.verification_BE = 0xA718EDE8, + $.hashfn_native = mum_aligned<2, 3, false, true>, + $.hashfn_bswap = mum_aligned<2, 3, true, true> + ); REGISTER_HASH(mum2__exact__unroll4, - $.desc = "Mum-hash v2, unroll 2^4, exact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x59AEDABF, - $.verification_BE = 0x3B1A2832, - $.hashfn_native = mum_aligned<2,4,false,true>, - $.hashfn_bswap = mum_aligned<2,4,true,true> -); + $.desc = "Mum-hash v2, unroll 2^4, exact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x59AEDABF, + $.verification_BE = 0x3B1A2832, + $.hashfn_native = mum_aligned<2, 4, false, true>, + $.hashfn_bswap = mum_aligned<2, 4, true, true> + ); REGISTER_HASH(mum2__inexact__unroll1, - $.desc = "Mum-hash v2, unroll 2^1, inexact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x1CC6D1E3, - $.verification_BE = 0x297D8E45, - $.hashfn_native = mum_aligned<2,1,false,false>, - $.hashfn_bswap = mum_aligned<2,1,true,false> -); + $.desc = "Mum-hash v2, unroll 2^1, inexact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x1CC6D1E3, + $.verification_BE = 0x297D8E45, + $.hashfn_native = mum_aligned<2, 1, false, false>, + $.hashfn_bswap = mum_aligned<2, 1, true, false> + ); REGISTER_HASH(mum2__inexact__unroll2, - $.desc = "Mum-hash v2, unroll 2^2, inexact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x62325A27, - $.verification_BE = 0x5324AEEA, - $.hashfn_native = mum_aligned<2,2,false,false>, - $.hashfn_bswap = 
mum_aligned<2,2,true,false> -); + $.desc = "Mum-hash v2, unroll 2^2, inexact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x62325A27, + $.verification_BE = 0x5324AEEA, + $.hashfn_native = mum_aligned<2, 2, false, false>, + $.hashfn_bswap = mum_aligned<2, 2, true, false> + ); REGISTER_HASH(mum2__inexact__unroll3, - $.desc = "Mum-hash v2, unroll 2^3, inexact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xF4DD9947, - $.verification_BE = 0x98C9448F, - $.hashfn_native = mum_aligned<2,3,false,false>, - $.hashfn_bswap = mum_aligned<2,3,true,false> -); + $.desc = "Mum-hash v2, unroll 2^3, inexact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xF4DD9947, + $.verification_BE = 0x98C9448F, + $.hashfn_native = mum_aligned<2, 3, false, false>, + $.hashfn_bswap = mum_aligned<2, 3, true, false> + ); REGISTER_HASH(mum2__inexact__unroll4, - $.desc = "Mum-hash v2, unroll 2^4, inexact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x62C46C55, - $.verification_BE = 0x0E9DDA53, - $.hashfn_native = mum_aligned<2,4,false,false>, - $.hashfn_bswap = mum_aligned<2,4,true,false> -); + $.desc = "Mum-hash v2, unroll 2^4, inexact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x62C46C55, + $.verification_BE = 0x0E9DDA53, + $.hashfn_native = mum_aligned<2, 4, false, false>, + $.hashfn_bswap = mum_aligned<2, 4, true, false> + ); #if defined(NOTYET) REGISTER_HASH(mum2_realign__exact__unroll1, - $.desc = "Mum-hash v2, unroll 2^1, exact mult, 
for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x3A8751BE, - $.verification_BE = 0xA3C3C380, - $.hashfn_native = mum_realign<2,1,false,true>, - $.hashfn_bswap = mum_realign<2,1,true,true> -); + $.desc = "Mum-hash v2, unroll 2^1, exact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x3A8751BE, + $.verification_BE = 0xA3C3C380, + $.hashfn_native = mum_realign<2, 1, false, true>, + $.hashfn_bswap = mum_realign<2, 1, true, true> + ); REGISTER_HASH(mum2_realign__exact__unroll2, - $.desc = "Mum-hash v2, unroll 2^2, exact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x7C85EF5A, - $.verification_BE = 0xE99D6D79, - $.hashfn_native = mum_realign<2,2,false,true>, - $.hashfn_bswap = mum_realign<2,2,true,true> -); + $.desc = "Mum-hash v2, unroll 2^2, exact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x7C85EF5A, + $.verification_BE = 0xE99D6D79, + $.hashfn_native = mum_realign<2, 2, false, true>, + $.hashfn_bswap = mum_realign<2, 2, true, true> + ); REGISTER_HASH(mum2_realign__exact__unroll3, - $.desc = "Mum-hash v2, unroll 2^3, exact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x93F25600, - $.verification_BE = 0xE13A6F00, - $.hashfn_native = mum_realign<2,3,false,true>, - $.hashfn_bswap = mum_realign<2,3,true,true> -); + $.desc = "Mum-hash v2, unroll 2^3, exact mult, for aligned-only reads", + $.hash_flags = 
+ 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x93F25600, + $.verification_BE = 0xE13A6F00, + $.hashfn_native = mum_realign<2, 3, false, true>, + $.hashfn_bswap = mum_realign<2, 3, true, true> + ); REGISTER_HASH(mum2_realign__exact__unroll4, - $.desc = "Mum-hash v2, unroll 2^4, exact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xA0DC8DF8, - $.verification_BE = 0x6B746384, - $.hashfn_native = mum_realign<2,4,false,true>, - $.hashfn_bswap = mum_realign<2,4,true,true> -); + $.desc = "Mum-hash v2, unroll 2^4, exact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xA0DC8DF8, + $.verification_BE = 0x6B746384, + $.hashfn_native = mum_realign<2, 4, false, true>, + $.hashfn_bswap = mum_realign<2, 4, true, true> + ); REGISTER_HASH(mum2_realign__inexact__unroll1, - $.desc = "Mum-hash v2, unroll 2^1, inexact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x2D06BA6A, - $.verification_BE = 0xF0F929DF, - $.hashfn_native = mum_realign<2,1,false,false>, - $.hashfn_bswap = mum_realign<2,1,true,false> -); + $.desc = "Mum-hash v2, unroll 2^1, inexact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x2D06BA6A, + $.verification_BE = 0xF0F929DF, + $.hashfn_native = mum_realign<2, 1, false, false>, + $.hashfn_bswap = mum_realign<2, 1, true, false> + ); REGISTER_HASH(mum2_realign__inexact__unroll2, - $.desc = "Mum-hash v2, unroll 2^2, inexact mult, for aligned-only 
reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xF645F70A, - $.verification_BE = 0xC384782D, - $.hashfn_native = mum_realign<2,2,false,false>, - $.hashfn_bswap = mum_realign<2,2,true,false> -); + $.desc = "Mum-hash v2, unroll 2^2, inexact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xF645F70A, + $.verification_BE = 0xC384782D, + $.hashfn_native = mum_realign<2, 2, false, false>, + $.hashfn_bswap = mum_realign<2, 2, true, false> + ); REGISTER_HASH(mum2_realign__inexact__unroll3, - $.desc = "Mum-hash v2, unroll 2^3, inexact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xA8F0601A, - $.verification_BE = 0x5F5895AB, - $.hashfn_native = mum_realign<2,3,false,false>, - $.hashfn_bswap = mum_realign<2,3,true,false> -); + $.desc = "Mum-hash v2, unroll 2^3, inexact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xA8F0601A, + $.verification_BE = 0x5F5895AB, + $.hashfn_native = mum_realign<2, 3, false, false>, + $.hashfn_bswap = mum_realign<2, 3, true, false> + ); REGISTER_HASH(mum2_realign__inexact__unroll4, - $.desc = "Mum-hash v2, unroll 2^4, inexact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x53A9484D, - $.verification_BE = 0x4C6EBD7D, - $.hashfn_native = mum_realign<2,4,false,false>, - $.hashfn_bswap = mum_realign<2,4,true,false> -); + $.desc = "Mum-hash v2, unroll 2^4, inexact mult, for aligned-only reads", + 
$.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x53A9484D, + $.verification_BE = 0x4C6EBD7D, + $.hashfn_native = mum_realign<2, 4, false, false>, + $.hashfn_bswap = mum_realign<2, 4, true, false> + ); #endif REGISTER_HASH(mum3__exact__unroll1, - $.desc = "Mum-hash v3, unroll 2^1, exact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x3D14C6E2, - $.verification_BE = 0x360A792D, - $.hashfn_native = mum_aligned<3,1,false,true>, - $.hashfn_bswap = mum_aligned<3,1,true,true> -); + $.desc = "Mum-hash v3, unroll 2^1, exact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x3D14C6E2, + $.verification_BE = 0x360A792D, + $.hashfn_native = mum_aligned<3, 1, false, true>, + $.hashfn_bswap = mum_aligned<3, 1, true, true> + ); REGISTER_HASH(mum3__exact__unroll2, - $.desc = "Mum-hash v3, unroll 2^2, exact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x3A556EB2, - $.verification_BE = 0x752891D0, - $.hashfn_native = mum_aligned<3,2,false,true>, - $.hashfn_bswap = mum_aligned<3,2,true,true> -); + $.desc = "Mum-hash v3, unroll 2^2, exact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x3A556EB2, + $.verification_BE = 0x752891D0, + $.hashfn_native = mum_aligned<3, 2, false, true>, + $.hashfn_bswap = mum_aligned<3, 2, true, true> + ); REGISTER_HASH(mum3__exact__unroll3, - $.desc = "Mum-hash v3, unroll 2^3, exact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 
64, - $.verification_LE = 0x8BD72B8C, - $.verification_BE = 0xDD8DD247, - $.hashfn_native = mum_aligned<3,3,false,true>, - $.hashfn_bswap = mum_aligned<3,3,true,true> -); + $.desc = "Mum-hash v3, unroll 2^3, exact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x8BD72B8C, + $.verification_BE = 0xDD8DD247, + $.hashfn_native = mum_aligned<3, 3, false, true>, + $.hashfn_bswap = mum_aligned<3, 3, true, true> + ); REGISTER_HASH(mum3__exact__unroll4, - $.desc = "Mum-hash v3, unroll 2^4, exact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x0AD998DF, - $.verification_BE = 0x05832709, - $.hashfn_native = mum_aligned<3,4,false,true>, - $.hashfn_bswap = mum_aligned<3,4,true,true> -); + $.desc = "Mum-hash v3, unroll 2^4, exact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x0AD998DF, + $.verification_BE = 0x05832709, + $.hashfn_native = mum_aligned<3, 4, false, true>, + $.hashfn_bswap = mum_aligned<3, 4, true, true> + ); REGISTER_HASH(mum3__inexact__unroll1, - $.desc = "Mum-hash v3, unroll 2^1, inexact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x475D666B, - $.verification_BE = 0xE75B31F7, - $.hashfn_native = mum_aligned<3,1,false,false>, - $.hashfn_bswap = mum_aligned<3,1,true,false> -); + $.desc = "Mum-hash v3, unroll 2^1, inexact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x475D666B, + $.verification_BE = 0xE75B31F7, + $.hashfn_native = mum_aligned<3, 1, false, false>, + $.hashfn_bswap = mum_aligned<3, 1, true, 
false> + ); REGISTER_HASH(mum3__inexact__unroll2, - $.desc = "Mum-hash v3, unroll 2^2, inexact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xF6E13B23, - $.verification_BE = 0x7B00F4F6, - $.hashfn_native = mum_aligned<3,2,false,false>, - $.hashfn_bswap = mum_aligned<3,2,true,false> -); + $.desc = "Mum-hash v3, unroll 2^2, inexact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xF6E13B23, + $.verification_BE = 0x7B00F4F6, + $.hashfn_native = mum_aligned<3, 2, false, false>, + $.hashfn_bswap = mum_aligned<3, 2, true, false> + ); REGISTER_HASH(mum3__inexact__unroll3, - $.desc = "Mum-hash v3, unroll 2^3, inexact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xB5560703, - $.verification_BE = 0x1220D737, - $.hashfn_native = mum_aligned<3,3,false,false>, - $.hashfn_bswap = mum_aligned<3,3,true,false> -); + $.desc = "Mum-hash v3, unroll 2^3, inexact mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xB5560703, + $.verification_BE = 0x1220D737, + $.hashfn_native = mum_aligned<3, 3, false, false>, + $.hashfn_bswap = mum_aligned<3, 3, true, false> + ); REGISTER_HASH(mum3__inexact__unroll4, - $.desc = "Mum-hash v3, unroll 2^4, inexact mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xE96A20C0, - $.verification_BE = 0xE784308C, - $.hashfn_native = mum_aligned<3,4,false,false>, - $.hashfn_bswap = mum_aligned<3,4,true,false> -); + $.desc = "Mum-hash v3, unroll 2^4, inexact mult", + $.hash_flags = + 0, + $.impl_flags = + 
FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xE96A20C0, + $.verification_BE = 0xE784308C, + $.hashfn_native = mum_aligned<3, 4, false, false>, + $.hashfn_bswap = mum_aligned<3, 4, true, false> + ); #if defined(NOTYET) REGISTER_HASH(mum3_realign__exact__unroll1, - $.desc = "Mum-hash v3, unroll 2^1, exact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x26B6E56E, - $.verification_BE = 0x3395CE6B, - $.hashfn_native = mum_realign<3,1,false,true>, - $.hashfn_bswap = mum_realign<3,1,true,true> -); + $.desc = "Mum-hash v3, unroll 2^1, exact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x26B6E56E, + $.verification_BE = 0x3395CE6B, + $.hashfn_native = mum_realign<3, 1, false, true>, + $.hashfn_bswap = mum_realign<3, 1, true, true> + ); REGISTER_HASH(mum3_realign__exact__unroll2, - $.desc = "Mum-hash v3, unroll 2^2, exact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x6A60097E, - $.verification_BE = 0xF7ABC648, - $.hashfn_native = mum_realign<3,2,false,true>, - $.hashfn_bswap = mum_realign<3,2,true,true> -); + $.desc = "Mum-hash v3, unroll 2^2, exact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x6A60097E, + $.verification_BE = 0xF7ABC648, + $.hashfn_native = mum_realign<3, 2, false, true>, + $.hashfn_bswap = mum_realign<3, 2, true, true> + ); REGISTER_HASH(mum3_realign__exact__unroll3, - $.desc = "Mum-hash v3, unroll 2^3, exact mult, for aligned-only reads", - 
$.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xD45801EE, - $.verification_BE = 0x1D6D8F1C, - $.hashfn_native = mum_realign<3,3,false,true>, - $.hashfn_bswap = mum_realign<3,3,true,true> -); + $.desc = "Mum-hash v3, unroll 2^3, exact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xD45801EE, + $.verification_BE = 0x1D6D8F1C, + $.hashfn_native = mum_realign<3, 3, false, true>, + $.hashfn_bswap = mum_realign<3, 3, true, true> + ); REGISTER_HASH(mum3_realign__exact__unroll4, - $.desc = "Mum-hash v3, unroll 2^4, exact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x65C49B24, - $.verification_BE = 0xE1C2CEEC, - $.hashfn_native = mum_realign<3,4,false,true>, - $.hashfn_bswap = mum_realign<3,4,true,true> -); + $.desc = "Mum-hash v3, unroll 2^4, exact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x65C49B24, + $.verification_BE = 0xE1C2CEEC, + $.hashfn_native = mum_realign<3, 4, false, true>, + $.hashfn_bswap = mum_realign<3, 4, true, true> + ); REGISTER_HASH(mum3_realign__inexact__unroll1, - $.desc = "Mum-hash v3, unroll 2^1, inexact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xFB3DE98D, - $.verification_BE = 0xBBF8D76F, - $.hashfn_native = mum_realign<3,1,false,false>, - $.hashfn_bswap = mum_realign<3,1,true,false> -); + $.desc = "Mum-hash v3, unroll 2^1, inexact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags 
= + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xFB3DE98D, + $.verification_BE = 0xBBF8D76F, + $.hashfn_native = mum_realign<3, 1, false, false>, + $.hashfn_bswap = mum_realign<3, 1, true, false> + ); REGISTER_HASH(mum3_realign__inexact__unroll2, - $.desc = "Mum-hash v3, unroll 2^2, inexact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xBFD7CE56, - $.verification_BE = 0x134317BB, - $.hashfn_native = mum_realign<3,2,false,false>, - $.hashfn_bswap = mum_realign<3,2,true,false> -); + $.desc = "Mum-hash v3, unroll 2^2, inexact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xBFD7CE56, + $.verification_BE = 0x134317BB, + $.hashfn_native = mum_realign<3, 2, false, false>, + $.hashfn_bswap = mum_realign<3, 2, true, false> + ); REGISTER_HASH(mum3_realign__inexact__unroll3, - $.desc = "Mum-hash v3, unroll 2^3, inexact mult, for aligned-only reads", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x68CB735E, - $.verification_BE = 0x47E5152C, - $.hashfn_native = mum_realign<3,3,false,false>, - $.hashfn_bswap = mum_realign<3,3,true,false> -); + $.desc = "Mum-hash v3, unroll 2^3, inexact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x68CB735E, + $.verification_BE = 0x47E5152C, + $.hashfn_native = mum_realign<3, 3, false, false>, + $.hashfn_bswap = mum_realign<3, 3, true, false> + ); REGISTER_HASH(mum3_realign__inexact__unroll4, - $.desc = "Mum-hash v3, unroll 2^4, inexact mult, for aligned-only reads", - 
$.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x4975BD5E, - $.verification_BE = 0x8A467520, - $.hashfn_native = mum_realign<3,4,false,false>, - $.hashfn_bswap = mum_realign<3,4,true,false> -); + $.desc = "Mum-hash v3, unroll 2^4, inexact mult, for aligned-only reads", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x4975BD5E, + $.verification_BE = 0x8A467520, + $.hashfn_native = mum_realign<3, 4, false, false>, + $.hashfn_bswap = mum_realign<3, 4, true, false> + ); #endif REGISTER_HASH(mir__exact, - $.desc = "MIR-hash, exact 128-bit mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x00A393C8, - $.verification_BE = 0x39F99A44, - $.hashfn_native = mir_hash, - $.hashfn_bswap = mir_hash, - $.badseeds = {0x0, 0x1, 0x2} // !! I think literally *ALL* seeds are bad -); + $.desc = "MIR-hash, exact 128-bit mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x00A393C8, + $.verification_BE = 0x39F99A44, + $.hashfn_native = mir_hash, + $.hashfn_bswap = mir_hash, + $.badseeds = { 0x0, 0x1, 0x2 } // !! 
I think literally *ALL* seeds are bad + ); REGISTER_HASH(mir__inexact, - $.desc = "MIR-hash, inexact 128-bit mult", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x422A66FC, - $.verification_BE = 0xA9A6A383, - $.hashfn_native = mir_hash, - $.hashfn_bswap = mir_hash, - $.seedfixfn = excludeBadseeds, - $.badseeds = {UINT64_C(0xfffffffffffffff0)} -); + $.desc = "MIR-hash, inexact 128-bit mult", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x422A66FC, + $.verification_BE = 0xA9A6A383, + $.hashfn_native = mir_hash, + $.hashfn_bswap = mir_hash, + $.seedfixfn = excludeBadseeds, + $.badseeds = { UINT64_C (0xfffffffffffffff0) } + ); diff --git a/hashes/murmur_oaat.cpp b/hashes/murmur_oaat.cpp index eff2a402..79801534 100644 --- a/hashes/murmur_oaat.cpp +++ b/hashes/murmur_oaat.cpp @@ -28,7 +28,7 @@ #include "Hashlib.h" //------------------------------------------------------------ -static uint32_t MurmurOAAT_impl(const uint8_t * data, size_t len, uint32_t seed) { +static uint32_t MurmurOAAT_impl( const uint8_t * data, size_t len, uint32_t seed ) { uint32_t h = seed; for (size_t i = 0; i < len; i++) { @@ -40,30 +40,31 @@ static uint32_t MurmurOAAT_impl(const uint8_t * data, size_t len, uint32_t seed) } //------------------------------------------------------------ -template < bool bswap > -static void MurmurOAAT(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void MurmurOAAT( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = MurmurOAAT_impl((const uint8_t *)in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(MurmurOAAT, - $.src_url = "https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + 
$.src_url = "https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(MurmurOAAT, - $.desc = "OAAT hash based on Murmur's mix", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_SLOW | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x5363BD98, - $.verification_BE = 0x29CCE130, - $.hashfn_native = MurmurOAAT, - $.hashfn_bswap = MurmurOAAT -); + $.desc = "OAAT hash based on Murmur's mix", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_SLOW | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x5363BD98, + $.verification_BE = 0x29CCE130, + $.hashfn_native = MurmurOAAT, + $.hashfn_bswap = MurmurOAAT + ); diff --git a/hashes/murmurhash1.cpp b/hashes/murmurhash1.cpp index f903f4b0..5a9fc58d 100644 --- a/hashes/murmurhash1.cpp +++ b/hashes/murmurhash1.cpp @@ -31,14 +31,15 @@ #include "Hashlib.h" //----------------------------------------------------------------------------- -template < bool bswap > -static void MurmurHash1(const void * in, const size_t olen, const seed_t seed, void * out) { - //uint32_t MurmurHash1 ( const void * key, int len, uint32_t seed ) +template +static void MurmurHash1( const void * in, const size_t olen, const seed_t seed, void * out ) { + // uint32_t MurmurHash1 ( const void * key, int len, uint32_t seed ) const uint32_t m = 0xc6a4a793; const uint32_t r = 16; - size_t len = olen; - uint32_t h = seed; + size_t len = olen; + uint32_t h = seed; + h ^= len * m; //---------- @@ -47,25 +48,25 @@ static void MurmurHash1(const void * in, const size_t olen, const seed_t seed, v while (len >= 4) { uint32_t k = GET_U32(data, 0); - h += k; - h *= m; - h ^= h >> 16; + h += k; + h *= m; + h ^= h >> 16; data += 4; - len -= 4; + len -= 4; } //---------- - switch(len) { + switch (len) { case 3: - h += data[2] << 16; /* 
FALLTHROUGH */ + h += data[2] << 16; /* FALLTHROUGH */ case 2: - h += data[1] << 8; /* FALLTHROUGH */ + h += data[1] << 8; /* FALLTHROUGH */ case 1: - h += data[0]; - h *= m; - h ^= h >> r; - }; + h += data[0]; + h *= m; + h ^= h >> r; + } //---------- h *= m; @@ -77,22 +78,22 @@ static void MurmurHash1(const void * in, const size_t olen, const seed_t seed, v } REGISTER_FAMILY(murmur1, - $.src_url = "https://github.com/aappleby/smhasher/", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/aappleby/smhasher/", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(MurmurHash1, - $.desc = "MurmurHash v1", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x9EA7D056, - $.verification_BE = 0x4B34A47A, - $.hashfn_native = MurmurHash1, - $.hashfn_bswap = MurmurHash1, - $.seedfixfn = excludeBadseeds, - $.badseeds = {0xc6a4a793} -); + $.desc = "MurmurHash v1", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x9EA7D056, + $.verification_BE = 0x4B34A47A, + $.hashfn_native = MurmurHash1, + $.hashfn_bswap = MurmurHash1, + $.seedfixfn = excludeBadseeds, + $.badseeds = { 0xc6a4a793 } + ); diff --git a/hashes/murmurhash2.cpp b/hashes/murmurhash2.cpp index 3ab24163..ccd6671f 100644 --- a/hashes/murmurhash2.cpp +++ b/hashes/murmurhash2.cpp @@ -31,87 +31,87 @@ #include "Hashlib.h" //----------------------------------------------------------------------------- -template < bool bswap > -static void MurmurHash2_32(const void * in, const size_t olen, const seed_t seed, void * out) { - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. 
- const uint32_t m = 0x5bd1e995; - const uint32_t r = 24; - size_t len = olen; - - // Initialize the hash to a 'random' value - uint32_t h = seed ^ olen; - - // Mix 4 bytes at a time into the hash - const uint8_t * data = (const uint8_t *)in; - - while (len >= 4) { - uint32_t k = GET_U32(data, 0); - - k *= m; - k ^= k >> r; - k *= m; - - h *= m; - h ^= k; - - data += 4; - len -= 4; - } - - // Handle the last few bytes of the input array - switch(len) { - case 3: h ^= data[2] << 16; /* FALLTHROUGH */ - case 2: h ^= data[1] << 8; /* FALLTHROUGH */ - case 1: h ^= data[0]; - h *= m; - } - - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - PUT_U32(h, (uint8_t *)out, 0); +template +static void MurmurHash2_32( const void * in, const size_t olen, const seed_t seed, void * out ) { + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + const uint32_t m = 0x5bd1e995; + const uint32_t r = 24; + size_t len = olen; + + // Initialize the hash to a 'random' value + uint32_t h = seed ^ olen; + + // Mix 4 bytes at a time into the hash + const uint8_t * data = (const uint8_t *)in; + + while (len >= 4) { + uint32_t k = GET_U32(data, 0); + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + switch (len) { + case 3: h ^= data[2] << 16; /* FALLTHROUGH */ + case 2: h ^= data[1] << 8; /* FALLTHROUGH */ + case 1: h ^= data[0]; + h *= m; + } + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. 
+ h ^= h >> 13; + h *= m; + h ^= h >> 15; + + PUT_U32(h, (uint8_t *)out, 0); } //----------------------------------------------------------------------------- // MurmurHash2, 64-bit versions, by Austin Appleby // 64-bit hash for 64-bit platforms -template < bool bswap > -static void MurmurHash2_64(const void * in, const size_t len, const seed_t seed, void * out) { - const uint64_t m = UINT64_C(0xc6a4a7935bd1e995); - const uint32_t r = 47; +template +static void MurmurHash2_64( const void * in, const size_t len, const seed_t seed, void * out ) { + const uint64_t m = UINT64_C(0xc6a4a7935bd1e995); + const uint32_t r = 47; - uint64_t h = seed ^ (len * m); + uint64_t h = seed ^ (len * m); const uint8_t * data = (const uint8_t *)in; - const uint8_t * end = data + len - (len & 7); + const uint8_t * end = data + len - (len & 7); while (data != end) { - uint64_t k = GET_U64(data, 0); + uint64_t k = GET_U64(data, 0); - k *= m; - k ^= k >> r; - k *= m; + k *= m; + k ^= k >> r; + k *= m; - h ^= k; - h *= m; + h ^= k; + h *= m; - data += 8; + data += 8; } - switch(len & 7) { + switch (len & 7) { case 7: h ^= uint64_t(data[6]) << 48; /* FALLTHROUGH */ case 6: h ^= uint64_t(data[5]) << 40; /* FALLTHROUGH */ case 5: h ^= uint64_t(data[4]) << 32; /* FALLTHROUGH */ case 4: h ^= uint64_t(data[3]) << 24; /* FALLTHROUGH */ case 3: h ^= uint64_t(data[2]) << 16; /* FALLTHROUGH */ - case 2: h ^= uint64_t(data[1]) << 8; /* FALLTHROUGH */ + case 2: h ^= uint64_t(data[1]) << 8; /* FALLTHROUGH */ case 1: h ^= uint64_t(data[0]); - h *= m; + h *= m; } h ^= h >> r; @@ -121,45 +121,44 @@ static void MurmurHash2_64(const void * in, const size_t len, const seed_t seed, PUT_U64(h, (uint8_t *)out, 0); } - // 64-bit hash for 32-bit platforms -template < bool bswap > -static void MurmurHash2_32_64(const void * in, const size_t olen, const seed_t seed, void * out) { - const uint32_t m = 0x5bd1e995; - const uint32_t r = 24; +template +static void MurmurHash2_32_64( const void * in, const size_t olen, 
const seed_t seed, void * out ) { + const uint32_t m = 0x5bd1e995; + const uint32_t r = 24; - uint32_t h1 = (uint32_t)(seed) ^ olen; - uint32_t h2 = (uint32_t)(seed >> 32); - size_t len = olen; + uint32_t h1 = (uint32_t)(seed ) ^ olen; + uint32_t h2 = (uint32_t)(seed >> 32); + size_t len = olen; const uint8_t * data = (const uint8_t *)in; while (len >= 8) { uint32_t k1 = GET_U32(data, 0); - k1 *= m; k1 ^= k1 >> r; k1 *= m; + k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; uint32_t k2 = GET_U32(data, 4); - k2 *= m; k2 ^= k2 >> r; k2 *= m; - h2 *= m; h2 ^= k2; + k2 *= m; k2 ^= k2 >> r; k2 *= m; + h2 *= m; h2 ^= k2; - len -= 8; + len -= 8; data += 8; } if (len >= 4) { uint32_t k1 = GET_U32(data, 0); - k1 *= m; k1 ^= k1 >> r; k1 *= m; - h1 *= m; h1 ^= k1; - len -= 4; + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; data += 4; } switch (len) { case 3: h2 ^= data[2] << 16; /* FALLTHROUGH */ - case 2: h2 ^= data[1] << 8; /* FALLTHROUGH */ + case 2: h2 ^= data[1] << 8; /* FALLTHROUGH */ case 1: h2 ^= data[0]; - h2 *= m; + h2 *= m; } h1 ^= h2 >> 18; h1 *= m; @@ -182,38 +181,38 @@ static void MurmurHash2_32_64(const void * in, const size_t olen, const seed_t s // collide with each other than expected, and also makes the function // more amenable to incremental implementations. 
-#define mmix(h,k) { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; } +#define mmix(h, k) { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; } -template < bool bswap > -static void MurmurHash2A_32(const void * in, const size_t olen, const seed_t seed, void * out) { - const uint32_t m = 0x5bd1e995; - const uint32_t r = 24; +template +static void MurmurHash2A_32( const void * in, const size_t olen, const seed_t seed, void * out ) { + const uint32_t m = 0x5bd1e995; + const uint32_t r = 24; - size_t len = olen; - uint32_t len32 = olen; - uint32_t h = (uint32_t)seed; + size_t len = olen; + uint32_t len32 = olen; + uint32_t h = (uint32_t)seed; const uint8_t * data = (const uint8_t *)in; - while (len >= 4 ) { + while (len >= 4) { uint32_t k = GET_U32(data, 0); - mmix(h,k); + mmix(h, k); data += 4; - len -= 4; + len -= 4; } uint32_t t = 0; switch (len) { case 3: t ^= data[2] << 16; /* FALLTHROUGH */ - case 2: t ^= data[1] << 8; /* FALLTHROUGH */ + case 2: t ^= data[1] << 8; /* FALLTHROUGH */ case 1: t ^= data[0]; } - mmix(h,t); - mmix(h,len32); + mmix(h, t ); + mmix(h, len32); h ^= h >> 13; h *= m; @@ -223,70 +222,70 @@ static void MurmurHash2A_32(const void * in, const size_t olen, const seed_t see } REGISTER_FAMILY(murmur2, - $.src_url = "https://github.com/aappleby/smhasher/", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/aappleby/smhasher/", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(MurmurHash2_32, - $.desc = "MurmurHash v2, 32-bit version", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x27864C1E, - $.verification_BE = 0xE87D9B54, - $.hashfn_native = MurmurHash2_32, - $.hashfn_bswap = MurmurHash2_32, - $.seedfixfn = excludeBadseeds, - $.badseeds = {0x10} -); + $.desc = "MurmurHash v2, 32-bit version", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + 
$.verification_LE = 0x27864C1E, + $.verification_BE = 0xE87D9B54, + $.hashfn_native = MurmurHash2_32, + $.hashfn_bswap = MurmurHash2_32, + $.seedfixfn = excludeBadseeds, + $.badseeds = { 0x10 } + ); REGISTER_HASH(MurmurHash2_64, - $.desc = "MurmurHash v2, 64-bit version", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x1F0D3804, - $.verification_BE = 0x8FDA498D, - $.hashfn_native = MurmurHash2_64, - $.hashfn_bswap = MurmurHash2_64, - $.seedfixfn = excludeBadseeds, - $.badseeds = {UINT64_C(0xc6a4a7935bd1e995)} -); + $.desc = "MurmurHash v2, 64-bit version", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x1F0D3804, + $.verification_BE = 0x8FDA498D, + $.hashfn_native = MurmurHash2_64, + $.hashfn_bswap = MurmurHash2_64, + $.seedfixfn = excludeBadseeds, + $.badseeds = { UINT64_C (0xc6a4a7935bd1e995) } + ); REGISTER_HASH(MurmurHash2_64__int32, - $.desc = "MurmurHash v2, 64-bit version using 32-bit variables", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xDD537C05, - $.verification_BE = 0xBF573795, - $.hashfn_native = MurmurHash2_32_64, - $.hashfn_bswap = MurmurHash2_32_64, - $.seedfixfn = excludeBadseeds, - $.badseeds = {0x10, UINT64_C(0xffffffff00000010)} -); + $.desc = "MurmurHash v2, 64-bit version using 32-bit variables", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xDD537C05, + $.verification_BE = 0xBF573795, + $.hashfn_native = MurmurHash2_32_64, + $.hashfn_bswap = MurmurHash2_32_64, + $.seedfixfn = excludeBadseeds, + $.badseeds = { 0x10, UINT64_C (0xffffffff00000010) } + ); REGISTER_HASH(MurmurHash2a, - $.desc = "MurmurHash v2a, 32-bit version using variant mixing", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - 
FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x7FBD4396, - $.verification_BE = 0x7D969EB5, - $.hashfn_native = MurmurHash2A_32, - $.hashfn_bswap = MurmurHash2A_32, - $.seedfixfn = excludeBadseeds, - $.badseeds = {0x2fc301c9} -); + $.desc = "MurmurHash v2a, 32-bit version using variant mixing", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x7FBD4396, + $.verification_BE = 0x7D969EB5, + $.hashfn_native = MurmurHash2A_32, + $.hashfn_bswap = MurmurHash2A_32, + $.seedfixfn = excludeBadseeds, + $.badseeds = { 0x2fc301c9 } + ); diff --git a/hashes/murmurhash3.cpp b/hashes/murmurhash3.cpp index d1b0d36a..d8446b09 100644 --- a/hashes/murmurhash3.cpp +++ b/hashes/murmurhash3.cpp @@ -30,323 +30,323 @@ #include "Platform.h" #include "Hashlib.h" -static FORCE_INLINE uint32_t fmix32(uint32_t h) { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - - return h; +static FORCE_INLINE uint32_t fmix32( uint32_t h ) { + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; } -static FORCE_INLINE uint64_t fmix64(uint64_t k) { - k ^= k >> 33; - k *= UINT64_C(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= UINT64_C(0xc4ceb9fe1a85ec53); - k ^= k >> 33; +static FORCE_INLINE uint64_t fmix64( uint64_t k ) { + k ^= k >> 33; + k *= UINT64_C(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= UINT64_C(0xc4ceb9fe1a85ec53); + k ^= k >> 33; - return k; + return k; } //----------------------------------------------------------------------------- // Block read - if your platform needs to do endian-swapping or can only // handle aligned reads, do the conversion here -template < bool bswap > -static FORCE_INLINE uint32_t getblock32(const uint8_t * p, int64_t i) { - return GET_U32(p+(4*i), 0); +template +static FORCE_INLINE uint32_t getblock32( const uint8_t * p, int64_t i ) { + return GET_U32(p + (4 * 
i), 0); } -template < bool bswap > -static FORCE_INLINE uint64_t getblock64(const uint8_t * p, int64_t i) { - return GET_U64(p+(8*i), 0); +template +static FORCE_INLINE uint64_t getblock64( const uint8_t * p, int64_t i ) { + return GET_U64(p + (8 * i), 0); } //----------------------------------------------------------------------------- -template < bool bswap > -static void MurmurHash3_32(const void * in, const size_t len, const seed_t seed, void * out) { - const uint8_t * data = (const uint8_t *)in; - const ssize_t nblocks = len / 4; +template +static void MurmurHash3_32( const void * in, const size_t len, const seed_t seed, void * out ) { + const uint8_t * data = (const uint8_t *)in; + const ssize_t nblocks = len / 4; - uint32_t h1 = (uint32_t)seed; + uint32_t h1 = (uint32_t )seed; - const uint32_t c1 = 0xcc9e2d51; - const uint32_t c2 = 0x1b873593; + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; - //---------- - // body + //---------- + // body - const uint8_t * blocks = data + nblocks*4; + const uint8_t * blocks = data + nblocks * 4; - for (ssize_t i = -nblocks; i; i++) { - uint32_t k1 = getblock32(blocks,i); + for (ssize_t i = -nblocks; i; i++) { + uint32_t k1 = getblock32(blocks, i); - k1 *= c1; - k1 = ROTL32(k1,15); - k1 *= c2; + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; - h1 ^= k1; - h1 = ROTL32(h1,13); - h1 = h1*5+0xe6546b64; - } + h1 ^= k1; + h1 = ROTL32(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + } - //---------- - // tail + //---------- + // tail - const uint8_t * tail = data + nblocks*4; + const uint8_t * tail = data + nblocks * 4; - uint32_t k1 = 0; + uint32_t k1 = 0; - switch(len & 3) { - case 3: k1 ^= tail[2] << 16; /* FALLTHROUGH */ - case 2: k1 ^= tail[1] << 8; /* FALLTHROUGH */ - case 1: k1 ^= tail[0]; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - }; + switch (len & 3) { + case 3: k1 ^= tail[2] << 16; /* FALLTHROUGH */ + case 2: k1 ^= tail[1] << 8; /* FALLTHROUGH */ + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1, 
15); k1 *= c2; h1 ^= k1; + } - //---------- - // finalization + //---------- + // finalization - h1 ^= (uint32_t)len; + h1 ^= (uint32_t)len; - h1 = fmix32(h1); + h1 = fmix32(h1); - PUT_U32(h1, (uint8_t *)out, 0); -} + PUT_U32(h1, (uint8_t *)out, 0); +} //----------------------------------------------------------------------------- -template < bool bswap > -static void MurmurHash3_32_128(const void * in, const size_t len, const seed_t seed, void * out) { - const uint8_t * data = (const uint8_t *)in; - const ssize_t nblocks = len / 16; - - uint32_t h1 = (uint32_t)seed; - uint32_t h2 = (uint32_t)seed; - uint32_t h3 = (uint32_t)seed; - uint32_t h4 = (uint32_t)seed; - - const uint32_t c1 = 0x239b961b; - const uint32_t c2 = 0xab0e9789; - const uint32_t c3 = 0x38b34ae5; - const uint32_t c4 = 0xa1e38b93; - - //---------- - // body - - const uint8_t * blocks = data + nblocks*16; - - for (ssize_t i = -nblocks; i; i++) { - uint32_t k1 = getblock32(blocks,i*4+0); - uint32_t k2 = getblock32(blocks,i*4+1); - uint32_t k3 = getblock32(blocks,i*4+2); - uint32_t k4 = getblock32(blocks,i*4+3); - - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - - h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; - - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; - - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; - - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; - } - - //---------- - // tail - - const uint8_t * tail = data + nblocks*16; - - uint32_t k1 = 0; - uint32_t k2 = 0; - uint32_t k3 = 0; - uint32_t k4 = 0; - - switch(len & 15) { - case 15: k4 ^= tail[14] << 16; /* FALLTHROUGH */ - case 14: k4 ^= tail[13] << 8; /* FALLTHROUGH */ - case 13: k4 ^= tail[12] << 0; /* FALLTHROUGH */ - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - /* FALLTHROUGH */ - case 12: k3 ^= tail[11] << 24; /* FALLTHROUGH */ - case 11: k3 
^= tail[10] << 16; /* FALLTHROUGH */ - case 10: k3 ^= tail[ 9] << 8; /* FALLTHROUGH */ - case 9: k3 ^= tail[ 8] << 0; /* FALLTHROUGH */ - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - /* FALLTHROUGH */ - case 8: k2 ^= tail[ 7] << 24; /* FALLTHROUGH */ - case 7: k2 ^= tail[ 6] << 16; /* FALLTHROUGH */ - case 6: k2 ^= tail[ 5] << 8; /* FALLTHROUGH */ - case 5: k2 ^= tail[ 4] << 0; /* FALLTHROUGH */ - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - /* FALLTHROUGH */ - case 4: k1 ^= tail[ 3] << 24; /* FALLTHROUGH */ - case 3: k1 ^= tail[ 2] << 16; /* FALLTHROUGH */ - case 2: k1 ^= tail[ 1] << 8; /* FALLTHROUGH */ - case 1: k1 ^= tail[ 0] << 0; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= (uint32_t)len; h2 ^= (uint32_t)len; - h3 ^= (uint32_t)len; h4 ^= (uint32_t)len; - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - h1 = fmix32(h1); - h2 = fmix32(h2); - h3 = fmix32(h3); - h4 = fmix32(h4); - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - PUT_U32(h1, (uint8_t *)out, 0); - PUT_U32(h2, (uint8_t *)out, 4); - PUT_U32(h3, (uint8_t *)out, 8); - PUT_U32(h4, (uint8_t *)out, 12); +template +static void MurmurHash3_32_128( const void * in, const size_t len, const seed_t seed, void * out ) { + const uint8_t * data = (const uint8_t *)in; + const ssize_t nblocks = len / 16; + + uint32_t h1 = (uint32_t )seed; + uint32_t h2 = (uint32_t )seed; + uint32_t h3 = (uint32_t )seed; + uint32_t h4 = (uint32_t )seed; + + const uint32_t c1 = 0x239b961b; + const uint32_t c2 = 0xab0e9789; + const uint32_t c3 = 0x38b34ae5; + const uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint8_t * blocks = data + nblocks * 16; + + for (ssize_t i = -nblocks; i; i++) { + uint32_t k1 = getblock32(blocks, i * 4 + 0); + uint32_t k2 = getblock32(blocks, i * 4 + 1); + uint32_t k3 = getblock32(blocks, i * 4 + 2); + uint32_t k4 = getblock32(blocks, i * 4 + 3); + + k1 *= c1; k1 = ROTL32(k1, 15); k1 
*= c2; h1 ^= k1; + + h1 = ROTL32(h1, 19); h1 += h2; h1 = h1 * 5 + 0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2, 16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2, 17); h2 += h3; h2 = h2 * 5 + 0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3, 17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3, 15); h3 += h4; h3 = h3 * 5 + 0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4, 18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4, 13); h4 += h1; h4 = h4 * 5 + 0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = data + nblocks * 16; + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch (len & 15) { + case 15: k4 ^= tail[14] << 16; /* FALLTHROUGH */ + case 14: k4 ^= tail[13] << 8; /* FALLTHROUGH */ + case 13: k4 ^= tail[12] << 0; /* FALLTHROUGH */ + k4 *= c4; k4 = ROTL32(k4, 18); k4 *= c1; h4 ^= k4; + /* FALLTHROUGH */ + case 12: k3 ^= tail[11] << 24; /* FALLTHROUGH */ + case 11: k3 ^= tail[10] << 16; /* FALLTHROUGH */ + case 10: k3 ^= tail[ 9] << 8; /* FALLTHROUGH */ + case 9: k3 ^= tail[ 8] << 0; /* FALLTHROUGH */ + k3 *= c3; k3 = ROTL32(k3, 17); k3 *= c4; h3 ^= k3; + /* FALLTHROUGH */ + case 8: k2 ^= tail[ 7] << 24; /* FALLTHROUGH */ + case 7: k2 ^= tail[ 6] << 16; /* FALLTHROUGH */ + case 6: k2 ^= tail[ 5] << 8; /* FALLTHROUGH */ + case 5: k2 ^= tail[ 4] << 0; /* FALLTHROUGH */ + k2 *= c2; k2 = ROTL32(k2, 16); k2 *= c3; h2 ^= k2; + /* FALLTHROUGH */ + case 4: k1 ^= tail[ 3] << 24; /* FALLTHROUGH */ + case 3: k1 ^= tail[ 2] << 16; /* FALLTHROUGH */ + case 2: k1 ^= tail[ 1] << 8; /* FALLTHROUGH */ + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1, 15); k1 *= c2; h1 ^= k1; + } + + //---------- + // finalization + + h1 ^= (uint32_t)len; h2 ^= (uint32_t)len; + h3 ^= (uint32_t)len; h4 ^= (uint32_t)len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix32(h1); + h2 = fmix32(h2); + h3 = fmix32(h3); + h4 = fmix32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + PUT_U32(h1, (uint8_t *)out, 0); + PUT_U32(h2, 
(uint8_t *)out, 4); + PUT_U32(h3, (uint8_t *)out, 8); + PUT_U32(h4, (uint8_t *)out, 12); } //----------------------------------------------------------------------------- -template < bool bswap > -static void MurmurHash3_128(const void * in, const size_t len, const seed_t seed, void * out) { - const uint8_t * data = (const uint8_t *)in; - const size_t nblocks = len / 16; +template +static void MurmurHash3_128( const void * in, const size_t len, const seed_t seed, void * out ) { + const uint8_t * data = (const uint8_t *)in; + const size_t nblocks = len / 16; - uint64_t h1 = (uint32_t)seed; - uint64_t h2 = (uint32_t)seed; + uint64_t h1 = (uint32_t )seed; + uint64_t h2 = (uint32_t )seed; - const uint64_t c1 = UINT64_C(0x87c37b91114253d5); - const uint64_t c2 = UINT64_C(0x4cf5ad432745937f); + const uint64_t c1 = UINT64_C(0x87c37b91114253d5); + const uint64_t c2 = UINT64_C(0x4cf5ad432745937f); - //---------- - // body + //---------- + // body - const uint8_t * blocks = data; + const uint8_t * blocks = data; - for (size_t i = 0; i < nblocks; i++) { - uint64_t k1 = getblock64(blocks,i*2+0); - uint64_t k2 = getblock64(blocks,i*2+1); + for (size_t i = 0; i < nblocks; i++) { + uint64_t k1 = getblock64(blocks, i * 2 + 0); + uint64_t k2 = getblock64(blocks, i * 2 + 1); - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + k1 *= c1; k1 = ROTL64(k1, 31); k1 *= c2; h1 ^= k1; - h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + h1 = ROTL64(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729; - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + k2 *= c2; k2 = ROTL64(k2, 33); k2 *= c1; h2 ^= k2; - h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; - } + h2 = ROTL64(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5; + } - //---------- - // tail + //---------- + // tail - const uint8_t * tail = data + nblocks*16; + const uint8_t * tail = data + nblocks * 16; - uint64_t k1 = 0; - uint64_t k2 = 0; + uint64_t k1 = 0; + uint64_t k2 = 0; - switch(len & 15) { - case 15: k2 ^= ((uint64_t)tail[14]) << 
48; /* FALLTHROUGH */ - case 14: k2 ^= ((uint64_t)tail[13]) << 40; /* FALLTHROUGH */ - case 13: k2 ^= ((uint64_t)tail[12]) << 32; /* FALLTHROUGH */ - case 12: k2 ^= ((uint64_t)tail[11]) << 24; /* FALLTHROUGH */ - case 11: k2 ^= ((uint64_t)tail[10]) << 16; /* FALLTHROUGH */ - case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; /* FALLTHROUGH */ - case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - /* FALLTHROUGH */ - case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; /* FALLTHROUGH */ - case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; /* FALLTHROUGH */ - case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; /* FALLTHROUGH */ - case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; /* FALLTHROUGH */ - case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; /* FALLTHROUGH */ - case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; /* FALLTHROUGH */ - case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; /* FALLTHROUGH */ - case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - }; + switch (len & 15) { + case 15: k2 ^= ((uint64_t)tail[14]) << 48; /* FALLTHROUGH */ + case 14: k2 ^= ((uint64_t)tail[13]) << 40; /* FALLTHROUGH */ + case 13: k2 ^= ((uint64_t)tail[12]) << 32; /* FALLTHROUGH */ + case 12: k2 ^= ((uint64_t)tail[11]) << 24; /* FALLTHROUGH */ + case 11: k2 ^= ((uint64_t)tail[10]) << 16; /* FALLTHROUGH */ + case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; /* FALLTHROUGH */ + case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2, 33); k2 *= c1; h2 ^= k2; + /* FALLTHROUGH */ + case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; /* FALLTHROUGH */ + case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; /* FALLTHROUGH */ + case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; /* FALLTHROUGH */ + case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; /* FALLTHROUGH */ + case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; /* FALLTHROUGH */ + case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; /* FALLTHROUGH */ + case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; /* FALLTHROUGH */ + case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; + k1 *= c1; k1 = 
ROTL64(k1, 31); k1 *= c2; h1 ^= k1; + } - //---------- - // finalization + //---------- + // finalization - h1 ^= (uint32_t)len; h2 ^= (uint32_t)len; + h1 ^= (uint32_t)len; h2 ^= (uint32_t)len; - h1 += h2; - h2 += h1; + h1 += h2; + h2 += h1; - h1 = fmix64(h1); - h2 = fmix64(h2); + h1 = fmix64(h1); + h2 = fmix64(h2); - h1 += h2; - h2 += h1; + h1 += h2; + h2 += h1; - PUT_U64(h1, (uint8_t *)out, 0); - PUT_U64(h2, (uint8_t *)out, 8); + PUT_U64(h1, (uint8_t *)out, 0); + PUT_U64(h2, (uint8_t *)out, 8); } REGISTER_FAMILY(murmur3, - $.src_url = "https://github.com/aappleby/smhasher/", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/aappleby/smhasher/", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(MurmurHash3_32, - $.desc = "MurmurHash v3, 32-bit version", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0xB0F57EE3, - $.verification_BE = 0x6213303E, - $.hashfn_native = MurmurHash3_32, - $.hashfn_bswap = MurmurHash3_32, - $.seedfixfn = excludeBadseeds, - $.badseeds = {0xfca58b2d} -); + $.desc = "MurmurHash v3, 32-bit version", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0xB0F57EE3, + $.verification_BE = 0x6213303E, + $.hashfn_native = MurmurHash3_32, + $.hashfn_bswap = MurmurHash3_32, + $.seedfixfn = excludeBadseeds, + $.badseeds = { 0xfca58b2d } + ); REGISTER_HASH(MurmurHash3_128__int32, - $.desc = "MurmurHash v3, 128-bit version using 32-bit variables", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0xB3ECE62A, - $.verification_BE = 0xDC26F009, - $.hashfn_native = MurmurHash3_32_128, - $.hashfn_bswap = MurmurHash3_32_128, - $.seedfixfn = excludeBadseeds, - $.badseeds = 
{0x239b961b} -); + $.desc = "MurmurHash v3, 128-bit version using 32-bit variables", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0xB3ECE62A, + $.verification_BE = 0xDC26F009, + $.hashfn_native = MurmurHash3_32_128, + $.hashfn_bswap = MurmurHash3_32_128, + $.seedfixfn = excludeBadseeds, + $.badseeds = { 0x239b961b } + ); REGISTER_HASH(MurmurHash3_128, - $.desc = "MurmurHash v3, 128-bit version using 64-bit variables", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x6384BA69, - $.verification_BE = 0xCC622B6F, - $.hashfn_native = MurmurHash3_128, - $.hashfn_bswap = MurmurHash3_128, - $.seedfixfn = excludeBadseeds, - $.badseeds = {0x239b961b} -); + $.desc = "MurmurHash v3, 128-bit version using 64-bit variables", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x6384BA69, + $.verification_BE = 0xCC622B6F, + $.hashfn_native = MurmurHash3_128, + $.hashfn_bswap = MurmurHash3_128, + $.seedfixfn = excludeBadseeds, + $.badseeds = { 0x239b961b } + ); diff --git a/hashes/mx3.cpp b/hashes/mx3.cpp index 99ac620d..376bcb18 100644 --- a/hashes/mx3.cpp +++ b/hashes/mx3.cpp @@ -10,8 +10,8 @@ //------------------------------------------------------------ static const uint64_t C = UINT64_C(0xbea225f9eb34556d); -template < bool v1 > -static inline uint64_t mix(uint64_t x) { +template +static inline uint64_t mix( uint64_t x ) { constexpr uint32_t R0 = v1 ? 0 : 32; constexpr uint32_t R1 = v1 ? 33 : 29; constexpr uint32_t R2 = v1 ? 
29 : 32; @@ -29,9 +29,10 @@ static inline uint64_t mix(uint64_t x) { return x; } -template < bool v1 > -static inline uint64_t mix_stream(uint64_t h, uint64_t x) { +template +static inline uint64_t mix_stream( uint64_t h, uint64_t x ) { constexpr uint32_t R1 = v1 ? 33 : 43; + x *= C; x ^= (x >> 57) ^ (x >> R1); x *= C; @@ -40,83 +41,86 @@ static inline uint64_t mix_stream(uint64_t h, uint64_t x) { return h; } -template < bool v1, bool bswap > -static inline uint64_t mx3(const uint8_t * buf, size_t len, uint64_t seed) { +template +static inline uint64_t mx3( const uint8_t * buf, size_t len, uint64_t seed ) { const uint8_t * const tail = buf + (len & ~7); uint64_t h = seed ^ len; + while (len >= 32) { len -= 32; - h = mix_stream(h, GET_U64(buf, 0)); - h = mix_stream(h, GET_U64(buf, 8)); - h = mix_stream(h, GET_U64(buf, 16)); - h = mix_stream(h, GET_U64(buf, 24)); + h = mix_stream(h, GET_U64(buf, 0)); + h = mix_stream(h, GET_U64(buf, 8)); + h = mix_stream(h, GET_U64(buf, 16)); + h = mix_stream(h, GET_U64(buf, 24)); buf += 32; } while (len >= 8) { len -= 8; - h = mix_stream(h, GET_U64(buf, 0)); + h = mix_stream(h, GET_U64(buf, 0)); buf += 8; } uint64_t v = 0; switch (len & 7) { - case 7: v |= static_cast(tail[6]) << 48; - case 6: v |= static_cast(tail[5]) << 40; - case 5: v |= static_cast(tail[4]) << 32; - case 4: v |= static_cast(tail[3]) << 24; - case 3: v |= static_cast(tail[2]) << 16; - case 2: v |= static_cast(tail[1]) << 8; - case 1: h = mix_stream(h, v | tail[0]); - default: ; + case 7: v |= static_cast(tail[6]) << 48; + case 6: v |= static_cast(tail[5]) << 40; + case 5: v |= static_cast(tail[4]) << 32; + case 4: v |= static_cast(tail[3]) << 24; + case 3: v |= static_cast(tail[2]) << 16; + case 2: v |= static_cast(tail[1]) << 8; + case 1: h = mix_stream(h, v | tail[0]); + default:; } return mix(h); } //------------------------------------------------------------ -template < bool bswap > -static void mx3_v1(const void * in, const size_t len, const seed_t seed, 
void * out) { - uint64_t h = mx3((const uint8_t *)in, len, (uint64_t) seed); +template +static void mx3_v1( const void * in, const size_t len, const seed_t seed, void * out ) { + uint64_t h = mx3((const uint8_t *)in, len, (uint64_t)seed); + PUT_U64(h, (uint8_t *)out, 0); } -template < bool bswap > -static void mx3_v2(const void * in, const size_t len, const seed_t seed, void * out) { - uint64_t h = mx3((const uint8_t *)in, len, (uint64_t) seed); +template +static void mx3_v2( const void * in, const size_t len, const seed_t seed, void * out ) { + uint64_t h = mx3((const uint8_t *)in, len, (uint64_t)seed); + PUT_U64(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(mx3, - $.src_url = "https://github.com/jonmaiga/mx3/", - $.src_status = HashFamilyInfo::SRC_ACTIVE -); + $.src_url = "https://github.com/jonmaiga/mx3/", + $.src_status = HashFamilyInfo::SRC_ACTIVE + ); REGISTER_HASH(mx3__v2, - $.desc = "mx3 (revision 2)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0x527399AD, - $.verification_BE = 0x5B6AAE8F, - $.hashfn_native = mx3_v2, - $.hashfn_bswap = mx3_v2 -); + $.desc = "mx3 (revision 2)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0x527399AD, + $.verification_BE = 0x5B6AAE8F, + $.hashfn_native = mx3_v2, + $.hashfn_bswap = mx3_v2 + ); REGISTER_HASH(mx3__v1, - $.desc = "mx3 (revision 1)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0x4DB51E5B, - $.verification_BE = 0x93E930B0, - $.hashfn_native = mx3_v1, - $.hashfn_bswap = mx3_v1 -); + $.desc = "mx3 (revision 1)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0x4DB51E5B, + 
$.verification_BE = 0x93E930B0, + $.hashfn_native = mx3_v1, + $.hashfn_bswap = mx3_v1 + ); diff --git a/hashes/nmhash.cpp b/hashes/nmhash.cpp index 91683fad..5e9ab570 100644 --- a/hashes/nmhash.cpp +++ b/hashes/nmhash.cpp @@ -33,7 +33,7 @@ #include "Hashlib.h" //------------------------------------------------------------ -//#define NMH_VERSION 2 +// #define NMH_VERSION 2 /* vector macros */ #define NMH_SCALAR 0 @@ -42,17 +42,17 @@ #define NMH_AVX512 3 #if defined(HAVE_AVX512_BW) -#define NMH_VECTOR NMH_AVX512 /* _mm512_mullo_epi16 requires AVX512BW */ + #define NMH_VECTOR NMH_AVX512 /* _mm512_mullo_epi16 requires AVX512BW */ #elif defined(HAVE_AVX2) -#define NMH_VECTOR NMH_AVX2 + #define NMH_VECTOR NMH_AVX2 #elif defined(HAVE_SSE_2) -#define NMH_VECTOR NMH_SSE2 + #define NMH_VECTOR NMH_SSE2 #else -#define NMH_VECTOR NMH_SCALAR + #define NMH_VECTOR NMH_SCALAR #endif #if NMH_VECTOR > NMH_SCALAR -#include "Intrinsics.h" + #include "Intrinsics.h" #endif //------------------------------------------------------------ @@ -102,10 +102,10 @@ alignas(16) static const uint32_t __NMH_M3_V[32] = { }; //------------------------------------------------------------ -static inline uint32_t NMHASH_mult16(uint32_t a, uint32_t b) { - uint16_t al = (uint16_t)(a); +static inline uint32_t NMHASH_mult16( uint32_t a, uint32_t b ) { + uint16_t al = (uint16_t)(a ); uint16_t ah = (uint16_t)(a >> 16); - uint16_t bl = (uint16_t)(b); + uint16_t bl = (uint16_t)(b ); uint16_t bh = (uint16_t)(b >> 16); al *= bl; @@ -114,8 +114,7 @@ static inline uint32_t NMHASH_mult16(uint32_t a, uint32_t b) { return (((uint32_t)ah) << 16) + ((uint32_t)al); } - -static inline uint32_t NMHASH32_0to8(uint32_t const x, uint32_t const seed2) { +static inline uint32_t NMHASH32_0to8( uint32_t const x, uint32_t const seed2 ) { /* base mixer: [-6 -12 776bf593 -19 11 3fb39c65 -15 -9 e9139917 -11 16] = 0.027071104091278835 */ const uint32_t m1 = UINT32_C(0x776BF593); const uint32_t m2 = UINT32_C(0x3FB39C65); @@ -125,31 
+124,31 @@ static inline uint32_t NMHASH32_0to8(uint32_t const x, uint32_t const seed2) { { uint32_t vx; vx = x; - vx ^= (vx >> 12) ^ (vx >> 6); + vx ^= (vx >> 12) ^ (vx >> 6); vx = NMHASH_mult16(vx, m1); vx ^= (vx << 11) ^ (vx >> 19); vx = NMHASH_mult16(vx, m2); vx ^= seed2; - vx ^= (vx >> 15) ^ (vx >> 9); + vx ^= (vx >> 15) ^ (vx >> 9); vx = NMHASH_mult16(vx, m3); vx ^= (vx << 16) ^ (vx >> 11); return vx; } #else /* at least NMH_SSE2 */ { - __m128i hv = _mm_setr_epi32((int)x, 0, 0, 0); + __m128i hv = _mm_setr_epi32((int)x , 0, 0, 0); const __m128i sv = _mm_setr_epi32((int)seed2, 0, 0, 0); - const uint32_t *const result = (const uint32_t*)&hv; + const uint32_t * const result = (const uint32_t *)&hv; - hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 12)), _mm_srli_epi32(hv, 6)); + hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 12)), _mm_srli_epi32(hv, 6)); hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m1, 0, 0, 0)); hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_slli_epi32(hv, 11)), _mm_srli_epi32(hv, 19)); hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m2, 0, 0, 0)); hv = _mm_xor_si128(hv, sv); - hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 15)), _mm_srli_epi32(hv, 9)); - hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m3, 0, 0, 0)); + hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 15)), _mm_srli_epi32(hv, 9)); + hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m3, 0, 0, 0)); hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_slli_epi32(hv, 16)), _mm_srli_epi32(hv, 11)); return *result; @@ -157,115 +156,115 @@ static inline uint32_t NMHASH32_0to8(uint32_t const x, uint32_t const seed2) { #endif } -template < bool gt32bytes, bool bswap > -static inline uint32_t NMHASH32_9to255(const uint8_t* const RESTRICT p, - size_t const len, uint32_t const seed) { +template +static inline uint32_t NMHASH32_9to255( const uint8_t * const RESTRICT p, size_t const len, uint32_t const seed ) { /* base mixer: [f0d9649b 5 -13 29a7935d -9 11 55d35831 -20 -10 ] = 
0.93495901789135362 */ uint32_t result = 0; #if NMH_VECTOR == NMH_SCALAR { - uint32_t x[4], y[4]; + uint32_t x[4], y[4]; uint32_t const sl = seed + (uint32_t)len; - size_t j; + size_t j; x[0] = NMH_PRIME32_1; x[1] = NMH_PRIME32_2; x[2] = NMH_PRIME32_3; x[3] = NMH_PRIME32_4; - for (j = 0; j < 4; ++j) y[j] = sl; + for (j = 0; j < 4; ++j) { y[j] = sl; } if (gt32bytes) { /* 33 to 255 bytes */ size_t const r = (len - 1) / 32; - size_t i; + size_t i; for (i = 0; i < r; ++i) { - for (j = 0; j < 4; ++j) x[j] ^= GET_U32(p, i * 32 + j * 4); - for (j = 0; j < 4; ++j) y[j] ^= GET_U32(p, i * 32 + j * 4 + 16); - for (j = 0; j < 4; ++j) x[j] += y[j]; + for (j = 0; j < 4; ++j) { x[j] ^= GET_U32(p, i * 32 + j * 4); } + for (j = 0; j < 4; ++j) { y[j] ^= GET_U32(p, i * 32 + j * 4 + 16); } + for (j = 0; j < 4; ++j) { x[j] += y[j]; } - for (j = 0; j < 4; ++j) x[j] = NMHASH_mult16(x[j], __NMH_M1); + for (j = 0; j < 4; ++j) { x[j] = NMHASH_mult16(x[j], __NMH_M1); } - for (j = 0; j < 4; ++j) x[j] ^= (x[j] << 5) ^ (x[j] >> 13); + for (j = 0; j < 4; ++j) { x[j] ^= (x[j] << 5) ^ (x[j] >> 13); } - for (j = 0; j < 4; ++j) x[j] = NMHASH_mult16(x[j], __NMH_M2); + for (j = 0; j < 4; ++j) { x[j] = NMHASH_mult16(x[j], __NMH_M2); } - for (j = 0; j < 4; ++j) x[j] ^= y[j]; + for (j = 0; j < 4; ++j) { x[j] ^= y[j]; } - for (j = 0; j < 4; ++j) x[j] ^= (x[j] << 11) ^ (x[j] >> 9); + for (j = 0; j < 4; ++j) { x[j] ^= (x[j] << 11) ^ (x[j] >> 9); } - for (j = 0; j < 4; ++j) x[j] = NMHASH_mult16(x[j], __NMH_M3); + for (j = 0; j < 4; ++j) { x[j] = NMHASH_mult16(x[j], __NMH_M3); } - for (j = 0; j < 4; ++j) x[j] ^= (x[j] >> 10) ^ (x[j] >> 20); + for (j = 0; j < 4; ++j) { x[j] ^= (x[j] >> 10) ^ (x[j] >> 20); } } - for (j = 0; j < 4; ++j) x[j] ^= GET_U32(p, len - 32 + j * 4); - for (j = 0; j < 4; ++j) y[j] ^= GET_U32(p, len - 16 + j * 4); + for (j = 0; j < 4; ++j) { x[j] ^= GET_U32(p, len - 32 + j * 4); } + for (j = 0; j < 4; ++j) { y[j] ^= GET_U32(p, len - 16 + j * 4); } } else { /* 9 to 32 bytes */ - x[0] ^= 
GET_U32(p, 0); - x[1] ^= GET_U32(p, ((len>>4)<<3)); + x[0] ^= GET_U32(p, 0 ); + x[1] ^= GET_U32(p, ( (len >> 4) << 3)); x[2] ^= GET_U32(p, len - 8); - x[3] ^= GET_U32(p, len - 8 - ((len>>4)<<3)); - y[0] ^= GET_U32(p, 4); - y[1] ^= GET_U32(p, ((len>>4)<<3) + 4); + x[3] ^= GET_U32(p, len - 8 - ((len >> 4) << 3)); + y[0] ^= GET_U32(p, 4 ); + y[1] ^= GET_U32(p, ( (len >> 4) << 3) + 4); y[2] ^= GET_U32(p, len - 8 + 4); - y[3] ^= GET_U32(p, len - 8 - ((len>>4)<<3) + 4); + y[3] ^= GET_U32(p, len - 8 - ((len >> 4) << 3) + 4); } - for (j = 0; j < 4; ++j) x[j] += y[j]; - for (j = 0; j < 4; ++j) y[j] ^= (y[j] << 17) ^ (y[j] >> 6); + for (j = 0; j < 4; ++j) { x[j] += y[j]; } + for (j = 0; j < 4; ++j) { y[j] ^= (y[j] << 17) ^ (y[j] >> 6); } - for (j = 0; j < 4; ++j) x[j] = NMHASH_mult16(x[j], __NMH_M1); - for (j = 0; j < 4; ++j) x[j] ^= (x[j] << 5) ^ (x[j] >> 13); - for (j = 0; j < 4; ++j) x[j] = NMHASH_mult16(x[j], __NMH_M2); + for (j = 0; j < 4; ++j) { x[j] = NMHASH_mult16(x[j], __NMH_M1); } + for (j = 0; j < 4; ++j) { x[j] ^= (x[j] << 5) ^ (x[j] >> 13); } + for (j = 0; j < 4; ++j) { x[j] = NMHASH_mult16(x[j], __NMH_M2); } - for (j = 0; j < 4; ++j) x[j] ^= y[j]; + for (j = 0; j < 4; ++j) { x[j] ^= y[j]; } - for (j = 0; j < 4; ++j) x[j] ^= (x[j] << 11) ^ (x[j] >> 9); - for (j = 0; j < 4; ++j) x[j] = NMHASH_mult16(x[j], __NMH_M3); - for (j = 0; j < 4; ++j) x[j] ^= (x[j] >> 10) ^ (x[j] >> 20); + for (j = 0; j < 4; ++j) { x[j] ^= (x[j] << 11) ^ (x[j] >> 9); } + for (j = 0; j < 4; ++j) { x[j] = NMHASH_mult16(x[j], __NMH_M3); } + for (j = 0; j < 4; ++j) { x[j] ^= (x[j] >> 10) ^ (x[j] >> 20); } x[0] ^= NMH_PRIME32_1; x[1] ^= NMH_PRIME32_2; x[2] ^= NMH_PRIME32_3; x[3] ^= NMH_PRIME32_4; - for (j = 1; j < 4; ++j) x[0] += x[j]; + for (j = 1; j < 4; ++j) { x[0] += x[j]; } - x[0] ^= sl + (sl >> 5); - x[0] = NMHASH_mult16(x[0], __NMH_M3); - x[0] ^= (x[0] >> 10) ^ (x[0] >> 20); + x[0] ^= sl + (sl >> 5); + x[0] = NMHASH_mult16(x[0], __NMH_M3); + x[0] ^= (x[0] >> 10) ^ (x[0] >> 20); result = 
x[0]; } #else /* at least NMH_SSE2 */ { - __m128i const h0 = _mm_setr_epi32((int)NMH_PRIME32_1, (int)NMH_PRIME32_2, (int)NMH_PRIME32_3, (int)NMH_PRIME32_4); - __m128i const sl = _mm_set1_epi32((int)seed + (int)len); - __m128i const m1 = _mm_set1_epi32((int)__NMH_M1); - __m128i const m2 = _mm_set1_epi32((int)__NMH_M2); - __m128i const m3 = _mm_set1_epi32((int)__NMH_M3); - __m128i x = h0; - __m128i y = sl; - const uint32_t *const px = (const uint32_t*)&x; + __m128i const h0 = _mm_setr_epi32((int)NMH_PRIME32_1, (int)NMH_PRIME32_2, + (int)NMH_PRIME32_3, (int)NMH_PRIME32_4); + __m128i const sl = _mm_set1_epi32((int)seed + (int)len); + __m128i const m1 = _mm_set1_epi32((int)__NMH_M1 ); + __m128i const m2 = _mm_set1_epi32((int)__NMH_M2 ); + __m128i const m3 = _mm_set1_epi32((int)__NMH_M3 ); + __m128i x = h0; + __m128i y = sl; + const uint32_t * const px = (const uint32_t *)&x; if (gt32bytes) { /* 32 to 127 bytes */ size_t const r = (len - 1) / 32; - size_t i; + size_t i; for (i = 0; i < r; ++i) { if (bswap) { - x = _mm_xor_si128(x, mm_bswap32(_mm_loadu_si128((const __m128i *)(p + i * 32)))); + x = _mm_xor_si128(x, mm_bswap32(_mm_loadu_si128((const __m128i *)(p + i * 32 )))); y = _mm_xor_si128(y, mm_bswap32(_mm_loadu_si128((const __m128i *)(p + i * 32 + 16)))); } else { - x = _mm_xor_si128(x, _mm_loadu_si128((const __m128i *)(p + i * 32))); + x = _mm_xor_si128(x, _mm_loadu_si128((const __m128i *)(p + i * 32 ))); y = _mm_xor_si128(y, _mm_loadu_si128((const __m128i *)(p + i * 32 + 16))); } x = _mm_add_epi32(x, y); x = _mm_mullo_epi16(x, m1); - x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13)); + x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13)); x = _mm_mullo_epi16(x, m2); x = _mm_xor_si128(x, y); - x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9)); + x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9)); x = _mm_mullo_epi16(x, m3); x = 
_mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20)); } @@ -278,29 +277,33 @@ static inline uint32_t NMHASH32_9to255(const uint8_t* const RESTRICT p, } } else { /* 9 to 32 bytes */ - x = _mm_xor_si128(x, _mm_setr_epi32((int)GET_U32(p, 0), (int)GET_U32(p, ((len>>4)<<3)), (int)GET_U32(p, len - 8), (int)GET_U32(p, len - 8 - ((len>>4)<<3)))); - y = _mm_xor_si128(y, _mm_setr_epi32((int)GET_U32(p, 4), (int)GET_U32(p, ((len>>4)<<3) + 4), (int)GET_U32(p, len - 8 + 4), (int)GET_U32(p, len - 8 - ((len>>4)<<3) + 4))); + x = _mm_xor_si128(x, _mm_setr_epi32((int)GET_U32(p, 0), (int)GET_U32( + p, ((len >> 4) << 3)) , (int)GET_U32(p, len - 8), (int)GET_U32( + p, len - 8 - ((len >> 4) << 3))) ); + y = _mm_xor_si128(y, _mm_setr_epi32((int)GET_U32(p, 4), (int)GET_U32( + p, ((len >> 4) << 3) + 4), (int)GET_U32(p, len - 8 + 4), (int)GET_U32( + p, len - 8 - ((len >> 4) << 3) + 4))); } - x = _mm_add_epi32(x, y); + x = _mm_add_epi32(x, y); - y = _mm_xor_si128(_mm_xor_si128(y, _mm_slli_epi32(y, 17)), _mm_srli_epi32(y, 6)); + y = _mm_xor_si128(_mm_xor_si128(y, _mm_slli_epi32(y, 17)), _mm_srli_epi32(y, 6)); - x = _mm_mullo_epi16(x, m1); - x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13)); - x = _mm_mullo_epi16(x, m2); - x = _mm_xor_si128(x, y); - x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9)); - x = _mm_mullo_epi16(x, m3); - x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20)); + x = _mm_mullo_epi16(x, m1); + x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13)); + x = _mm_mullo_epi16(x, m2); + x = _mm_xor_si128(x, y); + x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9)); + x = _mm_mullo_epi16(x, m3); + x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20)); - x = _mm_xor_si128(x, h0); - x = _mm_add_epi32(x, _mm_srli_si128(x, 4)); - x = _mm_add_epi32(x, _mm_srli_si128(x, 8)); + x = 
_mm_xor_si128(x, h0); + x = _mm_add_epi32(x, _mm_srli_si128(x, 4)); + x = _mm_add_epi32(x, _mm_srli_si128(x, 8)); - x = _mm_xor_si128(x, _mm_add_epi32(sl, _mm_srli_epi32(sl, 5))); - x = _mm_mullo_epi16(x, m3); - x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20)); + x = _mm_xor_si128(x, _mm_add_epi32(sl, _mm_srli_epi32(sl, 5))); + x = _mm_mullo_epi16(x, m3); + x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20)); result = *px; } @@ -313,28 +316,26 @@ static inline uint32_t NMHASH32_9to255(const uint8_t* const RESTRICT p, #undef __NMH_M2 #undef __NMH_M1 -template < bool bswap > -static inline uint32_t NMHASH32_9to32(const uint8_t* const RESTRICT p, - size_t const len, uint32_t const seed) { - return NMHASH32_9to255(p, len, seed); +template +static inline uint32_t NMHASH32_9to32( const uint8_t * const RESTRICT p, size_t const len, uint32_t const seed ) { + return NMHASH32_9to255(p, len, seed); } -template < bool bswap > -static inline uint32_t NMHASH32_33to255(const uint8_t* const RESTRICT p, - size_t const len, uint32_t const seed) { - return NMHASH32_9to255(p, len, seed); +template +static inline uint32_t NMHASH32_33to255( const uint8_t * const RESTRICT p, size_t const len, uint32_t const seed ) { + return NMHASH32_9to255(p, len, seed); } -template < bool bswap > -static inline void NMHASH32_long_round_scalar(uint32_t * const RESTRICT accX, - uint32_t * const RESTRICT accY, const uint8_t * const RESTRICT p) { +template +static inline void NMHASH32_long_round_scalar( uint32_t * const RESTRICT accX, uint32_t * const RESTRICT accY, + const uint8_t * const RESTRICT p ) { /* * breadth first calculation will hint some compiler to auto * vectorize the code on gcc, the performance becomes 10x than the * depth first, and about 80% of the manually vectorized code */ const size_t nbGroups = sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT); - size_t i; + size_t i; for (i = 0; i < nbGroups; ++i) { accX[i] ^= GET_U32(p, i * 
4); @@ -349,13 +350,13 @@ static inline void NMHASH32_long_round_scalar(uint32_t * const RESTRICT accX, accY[i] ^= accX[i] >> 1; } for (i = 0; i < nbGroups * 2; ++i) { - ((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M1_V)[i]; + ((uint16_t *)accX)[i] *= ((uint16_t *)__NMH_M1_V)[i]; } for (i = 0; i < nbGroups; ++i) { accX[i] ^= accX[i] << 5 ^ accX[i] >> 13; } for (i = 0; i < nbGroups * 2; ++i) { - ((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M2_V)[i]; + ((uint16_t *)accX)[i] *= ((uint16_t *)__NMH_M2_V)[i]; } for (i = 0; i < nbGroups; ++i) { accX[i] ^= accY[i]; @@ -364,7 +365,7 @@ static inline void NMHASH32_long_round_scalar(uint32_t * const RESTRICT accX, accX[i] ^= accX[i] << 11 ^ accX[i] >> 9; } for (i = 0; i < nbGroups * 2; ++i) { - ((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M3_V)[i]; + ((uint16_t *)accX)[i] *= ((uint16_t *)__NMH_M3_V)[i]; } for (i = 0; i < nbGroups; ++i) { accX[i] ^= accX[i] >> 10 ^ accX[i] >> 20; @@ -373,36 +374,37 @@ static inline void NMHASH32_long_round_scalar(uint32_t * const RESTRICT accX, #if NMH_VECTOR > NMH_SCALAR -#if NMH_VECTOR == NMH_SSE2 -# define _NMH_M_(F) mm_ ## F -# define _NMH_MM_(F) _mm_ ## F -# define _NMH_MMW_(F) _mm_ ## F ## 128 -# define _NMH_MM_T __m128i -#elif NMH_VECTOR == NMH_AVX2 -# define _NMH_M_(F) mm256_ ## F -# define _NMH_MM_(F) _mm256_ ## F -# define _NMH_MMW_(F) _mm256_ ## F ## 256 -# define _NMH_MM_T __m256i -#elif NMH_VECTOR == NMH_AVX512 -# define _NMH_M_(F) mm512_ ## F -# define _NMH_MM_(F) _mm512_ ## F -# define _NMH_MMW_(F) _mm512_ ## F ## 512 -# define _NMH_MM_T __m512i -#endif - -#define NMH_VECTOR_NB_GROUP (sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT) / (sizeof(_NMH_MM_T) / sizeof(*NMH_ACC_INIT))) - -template < bool bswap > -static inline void NMHASH32_long_round_sse(uint32_t * const RESTRICT accX, - uint32_t *const RESTRICT accY, const uint8_t* const RESTRICT p) { - const _NMH_MM_T *const RESTRICT m1 = (const _NMH_MM_T * RESTRICT)__NMH_M1_V; - const _NMH_MM_T *const RESTRICT m2 = (const _NMH_MM_T * 
RESTRICT)__NMH_M2_V; - const _NMH_MM_T *const RESTRICT m3 = (const _NMH_MM_T * RESTRICT)__NMH_M3_V; - - _NMH_MM_T *const xaccX = ( _NMH_MM_T * )accX; - _NMH_MM_T *const xaccY = ( _NMH_MM_T * )accY; - _NMH_MM_T *const xp = ( _NMH_MM_T * )p; - size_t i; + #if NMH_VECTOR == NMH_SSE2 + #define _NMH_M_(F) mm_ ## F + #define _NMH_MM_(F) _mm_ ## F + #define _NMH_MMW_(F) _mm_ ## F ## 128 + #define _NMH_MM_T __m128i + #elif NMH_VECTOR == NMH_AVX2 + #define _NMH_M_(F) mm256_ ## F + #define _NMH_MM_(F) _mm256_ ## F + #define _NMH_MMW_(F) _mm256_ ## F ## 256 + #define _NMH_MM_T __m256i + #elif NMH_VECTOR == NMH_AVX512 + #define _NMH_M_(F) mm512_ ## F + #define _NMH_MM_(F) _mm512_ ## F + #define _NMH_MMW_(F) _mm512_ ## F ## 512 + #define _NMH_MM_T __m512i + #endif + + #define NMH_VECTOR_NB_GROUP (sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT) / \ + (sizeof(_NMH_MM_T) / sizeof(*NMH_ACC_INIT))) + +template +static inline void NMHASH32_long_round_sse( uint32_t * const RESTRICT accX, uint32_t * const RESTRICT accY, + const uint8_t * const RESTRICT p ) { + const _NMH_MM_T * const RESTRICT m1 = (const _NMH_MM_T * RESTRICT) __NMH_M1_V; + const _NMH_MM_T * const RESTRICT m2 = (const _NMH_MM_T * RESTRICT) __NMH_M2_V; + const _NMH_MM_T * const RESTRICT m3 = (const _NMH_MM_T * RESTRICT) __NMH_M3_V; + + _NMH_MM_T * const xaccX = (_NMH_MM_T *)accX; + _NMH_MM_T * const xaccY = (_NMH_MM_T *)accY; + _NMH_MM_T * const xp = (_NMH_MM_T *)p; + size_t i; for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) { if (bswap) { @@ -428,7 +430,8 @@ static inline void NMHASH32_long_round_sse(uint32_t * const RESTRICT accX, xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m1); } for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) { - xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(slli_epi32)(xaccX[i], 5)), _NMH_MM_(srli_epi32)(xaccX[i], 13)); + xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_( + slli_epi32)(xaccX[i], 5)), _NMH_MM_(srli_epi32)(xaccX[i], 13)); } for (i = 0; i < NMH_VECTOR_NB_GROUP; 
++i) { xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m2); @@ -437,26 +440,28 @@ static inline void NMHASH32_long_round_sse(uint32_t * const RESTRICT accX, xaccX[i] = _NMH_MMW_(xor_si)(xaccX[i], xaccY[i]); } for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) { - xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(slli_epi32)(xaccX[i], 11)), _NMH_MM_(srli_epi32)(xaccX[i], 9)); + xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_( + slli_epi32)(xaccX[i], 11)), _NMH_MM_(srli_epi32)(xaccX[i], 9)); } for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) { xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m3); } for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) { - xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(srli_epi32)(xaccX[i], 10)), _NMH_MM_(srli_epi32)(xaccX[i], 20)); + xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_( + srli_epi32)(xaccX[i], 10)), _NMH_MM_(srli_epi32)(xaccX[i], 20)); } } -# undef _NMH_MM_ -# undef _NMH_MMW_ -# undef _NMH_MM_T -#undef NMH_VECTOR_NB_GROUP + #undef _NMH_MM_ + #undef _NMH_MMW_ + #undef _NMH_MM_T + #undef NMH_VECTOR_NB_GROUP #endif /* NMH_VECTOR > NMH_SCALAR */ -template < bool bswap > -static inline void NMHASH32_long_round(uint32_t * const RESTRICT accX, - uint32_t *const RESTRICT accY, const uint8_t* const RESTRICT p) { +template +static inline void NMHASH32_long_round( uint32_t * const RESTRICT accX, uint32_t * const RESTRICT accY, + const uint8_t * const RESTRICT p ) { #if NMH_VECTOR > NMH_SCALAR return NMHASH32_long_round_sse(accX, accY, p); #else @@ -464,18 +469,17 @@ static inline void NMHASH32_long_round(uint32_t * const RESTRICT accX, #endif } -template < bool bswap > -static uint32_t NMHASH32_long(const uint8_t* const RESTRICT p, - size_t const len, uint32_t const seed) { - alignas(16) uint32_t accX[sizeof(NMH_ACC_INIT)/sizeof(*NMH_ACC_INIT)]; - alignas(16) uint32_t accY[sizeof(accX)/sizeof(*accX)]; +template +static uint32_t NMHASH32_long( const uint8_t * const RESTRICT p, size_t const len, 
uint32_t const seed ) { + alignas(16) uint32_t accX[sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT)]; + alignas(16) uint32_t accY[sizeof(accX) / sizeof(*accX)]; size_t const nbRounds = (len - 1) / (sizeof(accX) + sizeof(accY)); - size_t i; - uint32_t sum = 0; + size_t i; + uint32_t sum = 0; /* init */ - for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) accX[i] = NMH_ACC_INIT[i]; - for (i = 0; i < sizeof(accY)/sizeof(*accY); ++i) accY[i] = seed; + for (i = 0; i < sizeof(accX) / sizeof(*accX); ++i) { accX[i] = NMH_ACC_INIT[i]; } + for (i = 0; i < sizeof(accY) / sizeof(*accY); ++i) { accY[i] = seed; } for (i = 0; i < nbRounds; ++i) { NMHASH32_long_round(accX, accY, p + i * (sizeof(accX) + sizeof(accY))); @@ -483,8 +487,8 @@ static uint32_t NMHASH32_long(const uint8_t* const RESTRICT p, NMHASH32_long_round(accX, accY, p + len - (sizeof(accX) + sizeof(accY))); /* merge acc */ - for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) accX[i] ^= NMH_ACC_INIT[i]; - for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) sum += accX[i]; + for (i = 0; i < sizeof(accX) / sizeof(*accX); ++i) { accX[i] ^= NMH_ACC_INIT[i]; } + for (i = 0; i < sizeof(accX) / sizeof(*accX); ++i) { sum += accX[i]; } if (sizeof(size_t) > sizeof(uint32_t)) { sum += (uint32_t)(len >> 32); @@ -492,29 +496,30 @@ static uint32_t NMHASH32_long(const uint8_t* const RESTRICT p, return sum ^ (uint32_t)len; } -static inline uint32_t NMHASH32_avalanche32(uint32_t const x) { +static inline uint32_t NMHASH32_avalanche32( uint32_t const x ) { /* [-21 -8 cce5196d 12 -7 464be229 -21 -8] = 3.2267098842182733 */ const uint32_t m1 = UINT32_C(0xCCE5196D); const uint32_t m2 = UINT32_C(0x464BE229); - uint32_t vx; - vx = x; - vx ^= (vx >> 8) ^ (vx >> 21); - vx = NMHASH_mult16(vx, m1); - vx ^= (vx << 12) ^ (vx >> 7); - vx = NMHASH_mult16(vx, m2); + uint32_t vx; + + vx = x; + vx ^= (vx >> 8) ^ (vx >> 21); + vx = NMHASH_mult16(vx, m1); + vx ^= (vx << 12) ^ (vx >> 7); + vx = NMHASH_mult16(vx, m2); return vx ^ (vx >> 8) ^ (vx >> 21); } 
-template < bool bswap > -static inline uint32_t NMHASH32(const void * const RESTRICT input, - size_t const len, uint32_t seed) { - const uint8_t *const p = (const uint8_t *)input; +template +static inline uint32_t NMHASH32( const void * const RESTRICT input, size_t const len, uint32_t seed ) { + const uint8_t * const p = (const uint8_t *)input; + if (likely(len <= 32)) { if (likely(len > 8)) { return NMHASH32_9to32(p, len, seed); } if (likely(len > 4)) { - uint32_t x = GET_U32(p, 0); + uint32_t x = GET_U32(p, 0 ); uint32_t y = GET_U32(p, len - 4) ^ (NMH_PRIME32_4 + 2 + seed); x += y; x ^= x << (len + 7); @@ -522,22 +527,22 @@ static inline uint32_t NMHASH32(const void * const RESTRICT input, } else { uint32_t data; switch (len) { - case 0: seed += NMH_PRIME32_2; - data = 0; + case 0: seed += NMH_PRIME32_2; + data = 0; break; - case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1); - data = p[0]; + case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1); + data = p[0]; break; - case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1); - data = GET_U16(p, 0); + case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1); + data = GET_U16(p, 0); break; - case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1); - data = GET_U16(p, 0) | (p[2] << 16); + case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1); + data = GET_U16(p, 0) | (p[2] << 16); break; - case 4: seed += NMH_PRIME32_3; - data = GET_U32(p, 0); + case 4: seed += NMH_PRIME32_3; + data = GET_U32(p, 0); break; - default: return 0; + default: return 0; } return NMHASH32_0to8(data + seed, ROTL32(seed, 5)); } @@ -549,7 +554,7 @@ static inline uint32_t NMHASH32(const void * const RESTRICT input, } //------------------------------------------------------------ -static inline uint32_t NMHASH32X_0to4(uint32_t x, uint32_t const seed) { +static inline uint32_t NMHASH32X_0to4( uint32_t x, uint32_t const seed ) { /* [bdab1ea9 18 a7896a1b 12 83796a2d 16] = 0.092922873297662509 */ 
x ^= seed; x *= UINT32_C(0xBDAB1EA9); @@ -562,15 +567,15 @@ static inline uint32_t NMHASH32X_0to4(uint32_t x, uint32_t const seed) { return x; } -template < bool bswap > -static inline uint32_t NMHASH32X_5to8(const uint8_t* const RESTRICT p, - size_t const len, uint32_t const seed) { +template +static inline uint32_t NMHASH32X_5to8( const uint8_t * const RESTRICT p, size_t const len, uint32_t const seed ) { /* * - 5 to 9 bytes * - mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246 */ - uint32_t x = GET_U32(p, 0) ^ NMH_PRIME32_3; + uint32_t x = GET_U32(p, 0 ) ^ NMH_PRIME32_3; uint32_t const y = GET_U32(p, len - 4) ^ seed; + x += y; x ^= x >> len; x *= UINT32_C(0x11049A7D); @@ -583,10 +588,10 @@ static inline uint32_t NMHASH32X_5to8(const uint8_t* const RESTRICT p, return x; } -template < bool bswap > -static inline uint32_t NMHASH32X_9to255(const uint8_t* const RESTRICT p, - size_t const len, uint32_t const seed) { - /* - at least 9 bytes +template +static inline uint32_t NMHASH32X_9to255( const uint8_t * const RESTRICT p, size_t const len, uint32_t const seed ) { + /* + * - at least 9 bytes * - base mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246 * - tail mixer: [16 a52fb2cd 15 551e4d49 16] = 0.17162579707098322 */ @@ -595,7 +600,7 @@ static inline uint32_t NMHASH32X_9to255(const uint8_t* const RESTRICT p, uint32_t y = seed; uint32_t a = NMH_PRIME32_4; uint32_t b = seed; - size_t i, r = (len - 1) / 16; + size_t i, r = (len - 1) / 16; for (i = 0; i < r; ++i) { x ^= GET_U32(p, i * 16 + 0); @@ -610,7 +615,7 @@ static inline uint32_t NMHASH32X_9to255(const uint8_t* const RESTRICT p, x *= UINT32_C(0x065E9DAD); x ^= x >> 12; - a ^= GET_U32(p, i * 16 + 8); + a ^= GET_U32(p, i * 16 + 8); b ^= GET_U32(p, i * 16 + 12); a ^= b; a *= UINT32_C(0x11049A7D); @@ -623,8 +628,8 @@ static inline uint32_t NMHASH32X_9to255(const uint8_t* const RESTRICT p, a ^= a >> 12; } - if (likely(((uint8_t)len-1) & 8)) { - if (likely(((uint8_t)len-1) & 4)) { + if 
(likely(((uint8_t)len - 1) & 8)) { + if (likely(((uint8_t)len - 1) & 4)) { a ^= GET_U32(p, r * 16 + 0); b ^= GET_U32(p, r * 16 + 4); a ^= b; @@ -652,7 +657,7 @@ static inline uint32_t NMHASH32X_9to255(const uint8_t* const RESTRICT p, x ^= x >> 12; x *= UINT32_C(0x065E9DAD); } else { - if (likely(((uint8_t)len-1) & 4)) { + if (likely(((uint8_t)len - 1) & 4)) { a ^= GET_U32(p, r * 16) + b; a ^= a >> 16; a *= UINT32_C(0xA52FB2CD); @@ -674,7 +679,7 @@ static inline uint32_t NMHASH32X_9to255(const uint8_t* const RESTRICT p, return x; } -static inline uint32_t NMHASH32X_avalanche32(uint32_t x) { +static inline uint32_t NMHASH32X_avalanche32( uint32_t x ) { /* * mixer with 2 mul from skeeto/hash-prospector: * [15 d168aaad 15 af723597 15] = 0.15983776156606694 @@ -688,10 +693,10 @@ static inline uint32_t NMHASH32X_avalanche32(uint32_t x) { } /* use 32*32->32 multiplication for short hash */ -template < bool bswap > -static inline uint32_t NMHASH32X(const void* const RESTRICT input, - size_t const len, uint32_t seed) { - const uint8_t *const p = (const uint8_t *)input; +template +static inline uint32_t NMHASH32X( const void * const RESTRICT input, size_t const len, uint32_t seed ) { + const uint8_t * const p = (const uint8_t *)input; + if (likely(len <= 8)) { if (likely(len > 4)) { return NMHASH32X_5to8(p, len, seed); @@ -699,22 +704,22 @@ static inline uint32_t NMHASH32X(const void* const RESTRICT input, /* 0-4 bytes */ uint32_t data; switch (len) { - case 0: seed += NMH_PRIME32_2; - data = 0; + case 0: seed += NMH_PRIME32_2; + data = 0; break; - case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1); - data = p[0]; + case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1); + data = p[0]; break; - case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1); - data = GET_U16(p, 0); + case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1); + data = GET_U16(p, 0); break; - case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1); - data = 
GET_U16(p, 0) | (p[2] << 16); + case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1); + data = GET_U16(p, 0) | (p[2] << 16); break; - case 4: seed += NMH_PRIME32_1; - data = GET_U32(p, 0); + case 4: seed += NMH_PRIME32_1; + data = GET_U32(p, 0); break; - default: return 0; + default: return 0; } return NMHASH32X_0to4(data, seed); } @@ -726,54 +731,56 @@ static inline uint32_t NMHASH32X(const void* const RESTRICT input, } //------------------------------------------------------------ -template < bool bswap > -static void NMhash(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void NMhash( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = NMHASH32(in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void NMhashX(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void NMhashX( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = NMHASH32X(in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(nmhash, - $.src_url = "https://github.com/gzm55/hash-garage", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/gzm55/hash-garage", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); REGISTER_HASH(NMHASH, - $.desc = "nmhash32 v2", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_TYPE_PUNNING | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_SHIFT_VARIABLE | - FLAG_IMPL_LICENSE_BSD, - $.bits = 32, - $.verification_LE = 0x12A30553, - $.verification_BE = 0xE3222AC8, - $.hashfn_native = NMhash, - $.hashfn_bswap = NMhash -); + $.desc = "nmhash32 v2", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_TYPE_PUNNING | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_SHIFT_VARIABLE | + FLAG_IMPL_LICENSE_BSD, + $.bits = 32, + 
$.verification_LE = 0x12A30553, + $.verification_BE = 0xE3222AC8, + $.hashfn_native = NMhash, + $.hashfn_bswap = NMhash + ); REGISTER_HASH(NMHASHX, - $.desc = "nmhash32x v2", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_TYPE_PUNNING | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_SHIFT_VARIABLE | - FLAG_IMPL_LICENSE_BSD, - $.bits = 32, - $.verification_LE = 0xA8580227, - $.verification_BE = 0x83B36886, - $.hashfn_native = NMhashX, - $.hashfn_bswap = NMhashX -); + $.desc = "nmhash32x v2", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_TYPE_PUNNING | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_SHIFT_VARIABLE | + FLAG_IMPL_LICENSE_BSD, + $.bits = 32, + $.verification_LE = 0xA8580227, + $.verification_BE = 0x83B36886, + $.hashfn_native = NMhashX, + $.hashfn_bswap = NMhashX + ); diff --git a/hashes/o1hash.cpp b/hashes/o1hash.cpp index 497e8086..2aa107c3 100644 --- a/hashes/o1hash.cpp +++ b/hashes/o1hash.cpp @@ -39,28 +39,29 @@ #include "Hashlib.h" /* - This is a quick and dirty hash function designed for O(1) speed. - It makes your hash table application fly in most cases. - It samples first, middle and last 4 bytes to produce the hash. - Do not use it in very serious applications as it's not secure. -*/ + * This is a quick and dirty hash function designed for O(1) speed. + * It makes your hash table application fly in most cases. + * It samples first, middle and last 4 bytes to produce the hash. + * Do not use it in very serious applications as it's not secure. 
+ */ //------------------------------------------------------------ // Includes homegrown seeding for SMHasher3 -template < bool bswap > -static void o1hash(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void o1hash( const void * in, const size_t len, const seed_t seed, void * out ) { const uint8_t * p = (const uint8_t *)in; - uint64_t h; + uint64_t h; + if (len >= 4) { - uint64_t first = GET_U32(p, 0); - uint64_t middle = GET_U32(p, ((len >> 1) - 2)); + uint64_t first = GET_U32(p, 0 ); + uint64_t middle = GET_U32(p, ((len >> 1) - 2)); uint64_t last = GET_U32(p, len - 4); h = (middle + (uint64_t)seed) * (first + last); } else if (len > 0) { uint64_t tail = seed + ( - (((uint64_t)p[ 0]) << 16) | + (((uint64_t)p[0 ]) << 16) | (((uint64_t)p[len >> 1]) << 8) | - (((uint64_t)p[ len - 1]))) ; + (((uint64_t)p[len - 1]))); h = tail * UINT64_C(0xa0761d6478bd642f); } else { h = 0; @@ -70,23 +71,23 @@ static void o1hash(const void * in, const size_t len, const seed_t seed, void * //------------------------------------------------------------ REGISTER_FAMILY(o1hash, - $.src_url = "https://github.com/wangyi-fudan/wyhash/blob/master/old_versions/o1hash.h", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/wangyi-fudan/wyhash/blob/master/old_versions/o1hash.h", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(o1hash, - $.desc = "o(1) hash, from wyhash", - $.sort_order = 45, - $.hash_flags = - FLAG_HASH_MOCK | - FLAG_HASH_NO_SEED , - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN , - $.bits = 64, - $.verification_LE = 0xAE049F09, - $.verification_BE = 0x299BD16A, - $.hashfn_native = o1hash, - $.hashfn_bswap = o1hash -); + $.desc = "o(1) hash, from wyhash", + $.sort_order = 45, + $.hash_flags = + FLAG_HASH_MOCK | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 
64, + $.verification_LE = 0xAE049F09, + $.verification_BE = 0x299BD16A, + $.hashfn_native = o1hash, + $.hashfn_bswap = o1hash + ); diff --git a/hashes/pearson.cpp b/hashes/pearson.cpp index f96c2f1c..de9b796e 100644 --- a/hashes/pearson.cpp +++ b/hashes/pearson.cpp @@ -32,506 +32,513 @@ #include "Hashlib.h" #if defined(HAVE_SSSE_3) -#include "Intrinsics.h" + #include "Intrinsics.h" #endif // AES S-Box table -- allows for eventually supported hardware accelerated look-up -static const uint8_t t[256] ={ - 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, - 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, - 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, - 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, - 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, - 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, - 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, - 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, - 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, - 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, - 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, - 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, - 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, - 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, - 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, - 0x8c, 0xa1, 0x89, 
0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; +static const uint8_t t[256] = { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +}; static uint16_t t16[65536]; -static bool pearson_hash_init (void) { +static bool pearson_hash_init( void ) { #if !defined(HAVE_SSSE_3) - size_t i; + size_t i; - for (i = 0; i < 65536; i++) - t16[i] = (t[i >> 8] << 8) + t[(uint8_t)i]; + for (i = 0; i < 65536; i++) { + t16[i] = (t[i >> 8] << 8) + t[(uint8_t)i]; + } 
#endif - return true; + return true; } -static void pearson_hash_256_portable(uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in) { - size_t i; - /* initial values - astonishingly, assembling using SHIFTs and ORs (in register) - * works faster on well pipelined CPUs than loading the 64-bit value from memory. - * however, there is one advantage to loading from memory: as we also store back to - * memory at the end, we do not need to care about endianess! */ - uint8_t upper[8] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; - uint8_t lower[8] = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }; - - uint64_t upper_hash_mask = GET_U64(upper, 0); - uint64_t lower_hash_mask = GET_U64(lower, 0); - uint64_t high_upper_hash_mask = upper_hash_mask + UINT64_C(0x1010101010101010); - uint64_t high_lower_hash_mask = lower_hash_mask + UINT64_C(0x1010101010101010); - - // The one nod to endianness is that the hash_in value needs be in - // little-endian format always, to match up with the byte ordering - // of upper[] and lower[] above. 
- hash_in = COND_BSWAP(hash_in, isBE()); - uint64_t upper_hash = hash_in; - uint64_t lower_hash = hash_in; - uint64_t high_upper_hash = hash_in; - uint64_t high_lower_hash = hash_in; - - for (i = 0; i < len; i++) { - // broadcast the character, xor into hash, make them different permutations - uint64_t c = (uint8_t)in[i]; - c |= c << 8; - c |= c << 16; - c |= c << 32; - upper_hash ^= c ^ upper_hash_mask; - lower_hash ^= c ^ lower_hash_mask; - high_upper_hash ^= c ^ high_upper_hash_mask; - high_lower_hash ^= c ^ high_lower_hash_mask; - - // table lookup - uint64_t h = 0; - uint16_t x; - x = upper_hash; x = t16[x]; upper_hash >>= 16; h = x; h = ROTR64 (h, 16); - x = upper_hash; x = t16[x]; upper_hash >>= 16; h |= x; h = ROTR64 (h, 16); - x = upper_hash; x = t16[x]; upper_hash >>= 16; h |= x; h = ROTR64 (h, 16); - x = upper_hash; x = t16[x]; ; h |= x; h = ROTR64 (h, 16); - upper_hash = h; - - h = 0; - x = lower_hash; x = t16[x]; lower_hash >>= 16; h = x; h = ROTR64 (h, 16); - x = lower_hash; x = t16[x]; lower_hash >>= 16; h |= x; h = ROTR64 (h, 16); - x = lower_hash; x = t16[x]; lower_hash >>= 16; h |= x; h = ROTR64 (h, 16); - x = lower_hash; x = t16[x]; ; h |= x; h = ROTR64 (h, 16); - lower_hash = h; - - h = 0; - x = high_upper_hash; x = t16[x]; high_upper_hash >>= 16; h = x; h = ROTR64 (h, 16); - x = high_upper_hash; x = t16[x]; high_upper_hash >>= 16; h |= x; h = ROTR64 (h, 16); - x = high_upper_hash; x = t16[x]; high_upper_hash >>= 16; h |= x; h = ROTR64 (h, 16); - x = high_upper_hash; x = t16[x]; ; h |= x; h = ROTR64 (h, 16); - high_upper_hash = h; - - h = 0; - x = high_lower_hash; x = t16[x]; high_lower_hash >>= 16; h = x; h = ROTR64 (h, 16); - x = high_lower_hash; x = t16[x]; high_lower_hash >>= 16; h |= x; h = ROTR64 (h, 16); - x = high_lower_hash; x = t16[x]; high_lower_hash >>= 16; h |= x; h = ROTR64 (h, 16); - x = high_lower_hash; x = t16[x]; ; h |= x; h = ROTR64 (h, 16); - high_lower_hash = h; - } - // store output - PUT_U64(high_upper_hash, out, 0); - 
PUT_U64(high_lower_hash, out, 8); - PUT_U64(upper_hash, out, 16); - PUT_U64(lower_hash, out, 24); +static void pearson_hash_256_portable( uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in ) { + size_t i; + /* + * initial values - astonishingly, assembling using SHIFTs and ORs (in register) + * works faster on well pipelined CPUs than loading the 64-bit value from memory. + * however, there is one advantage to loading from memory: as we also store back to + * memory at the end, we do not need to care about endianess! + */ + uint8_t upper[8] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + uint8_t lower[8] = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }; + + uint64_t upper_hash_mask = GET_U64(upper, 0); + uint64_t lower_hash_mask = GET_U64(lower, 0); + uint64_t high_upper_hash_mask = upper_hash_mask + UINT64_C(0x1010101010101010); + uint64_t high_lower_hash_mask = lower_hash_mask + UINT64_C(0x1010101010101010); + + // The one nod to endianness is that the hash_in value needs be in + // little-endian format always, to match up with the byte ordering + // of upper[] and lower[] above. 
+ hash_in = COND_BSWAP(hash_in, isBE()); + uint64_t upper_hash = hash_in; + uint64_t lower_hash = hash_in; + uint64_t high_upper_hash = hash_in; + uint64_t high_lower_hash = hash_in; + + for (i = 0; i < len; i++) { + // broadcast the character, xor into hash, make them different permutations + uint64_t c = (uint8_t)in[i]; + c |= c << 8; + c |= c << 16; + c |= c << 32; + upper_hash ^= c ^ upper_hash_mask; + lower_hash ^= c ^ lower_hash_mask; + high_upper_hash ^= c ^ high_upper_hash_mask; + high_lower_hash ^= c ^ high_lower_hash_mask; + + // table lookup + uint64_t h = 0; + uint16_t x; + x = upper_hash; x = t16[x]; upper_hash >>= 16; h = x; h = ROTR64(h, 16); + x = upper_hash; x = t16[x]; upper_hash >>= 16; h |= x; h = ROTR64(h, 16); + x = upper_hash; x = t16[x]; upper_hash >>= 16; h |= x; h = ROTR64(h, 16); + x = upper_hash; x = t16[x]; h |= x; h = ROTR64(h, 16); + upper_hash = h; + + h = 0; + x = lower_hash; x = t16[x]; lower_hash >>= 16; h = x; h = ROTR64(h, 16); + x = lower_hash; x = t16[x]; lower_hash >>= 16; h |= x; h = ROTR64(h, 16); + x = lower_hash; x = t16[x]; lower_hash >>= 16; h |= x; h = ROTR64(h, 16); + x = lower_hash; x = t16[x]; h |= x; h = ROTR64(h, 16); + lower_hash = h; + + h = 0; + x = high_upper_hash; x = t16[x]; high_upper_hash >>= 16; h = x; h = ROTR64(h, 16); + x = high_upper_hash; x = t16[x]; high_upper_hash >>= 16; h |= x; h = ROTR64(h, 16); + x = high_upper_hash; x = t16[x]; high_upper_hash >>= 16; h |= x; h = ROTR64(h, 16); + x = high_upper_hash; x = t16[x]; h |= x; h = ROTR64(h, 16); + high_upper_hash = h; + + h = 0; + x = high_lower_hash; x = t16[x]; high_lower_hash >>= 16; h = x; h = ROTR64(h, 16); + x = high_lower_hash; x = t16[x]; high_lower_hash >>= 16; h |= x; h = ROTR64(h, 16); + x = high_lower_hash; x = t16[x]; high_lower_hash >>= 16; h |= x; h = ROTR64(h, 16); + x = high_lower_hash; x = t16[x]; h |= x; h = ROTR64(h, 16); + high_lower_hash = h; + } + // store output + PUT_U64(high_upper_hash, out, 0); + PUT_U64(high_lower_hash, 
out, 8); + PUT_U64(upper_hash , out, 16); + PUT_U64(lower_hash , out, 24); } -static void pearson_hash_128_portable(uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in) { - size_t i; - /* initial values - astonishingly, assembling using SHIFTs and ORs (in register) - * works faster on well pipelined CPUs than loading the 64-bit value from memory. - * however, there is one advantage to loading from memory: as we also store back to - * memory at the end, we do not need to care about endianess! */ - uint8_t upper[8] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; - uint8_t lower[8] = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }; - - uint64_t upper_hash_mask = GET_U64(upper, 0); - uint64_t lower_hash_mask = GET_U64(lower, 0); - - // The one nod to endianness is that the hash_in value needs be in - // little-endian format always, to match up with the byte ordering - // of upper[] and lower[] above. - hash_in = COND_BSWAP(hash_in, isBE()); - uint64_t upper_hash = hash_in; - uint64_t lower_hash = hash_in; - - for (i = 0; i < len; i++) { - // broadcast the character, xor into hash, make them different permutations - uint64_t c = (uint8_t)in[i]; - c |= c << 8; - c |= c << 16; - c |= c << 32; - upper_hash ^= c ^ upper_hash_mask; - lower_hash ^= c ^ lower_hash_mask; - - // table lookup - uint64_t h = 0; - uint16_t x; - x = upper_hash; x = t16[x]; upper_hash >>= 16; h = x; h = ROTR64 (h,16); - x = upper_hash; x = t16[x]; upper_hash >>= 16; h |= x; h = ROTR64 (h,16); - x = upper_hash; x = t16[x]; upper_hash >>= 16; h |= x; h = ROTR64 (h,16); - x = upper_hash; x = t16[x]; ; h |= x; h = ROTR64 (h,16); - upper_hash = h; - - h = 0; - x = lower_hash; x = t16[x]; lower_hash >>= 16; h = x; h = ROTR64 (h, 16); - x = lower_hash; x = t16[x]; lower_hash >>= 16; h |= x; h = ROTR64 (h, 16); - x = lower_hash; x = t16[x]; lower_hash >>= 16; h |= x; h = ROTR64 (h, 16); - x = lower_hash; x = t16[x]; ; h |= x; h = ROTR64 (h, 16); - lower_hash = h; - } - // store output - 
PUT_U64(upper_hash, out, 0); - PUT_U64(lower_hash, out, 8); +static void pearson_hash_128_portable( uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in ) { + size_t i; + /* + * initial values - astonishingly, assembling using SHIFTs and ORs (in register) + * works faster on well pipelined CPUs than loading the 64-bit value from memory. + * however, there is one advantage to loading from memory: as we also store back to + * memory at the end, we do not need to care about endianess! + */ + uint8_t upper[8] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + uint8_t lower[8] = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }; + + uint64_t upper_hash_mask = GET_U64(upper, 0); + uint64_t lower_hash_mask = GET_U64(lower, 0); + + // The one nod to endianness is that the hash_in value needs be in + // little-endian format always, to match up with the byte ordering + // of upper[] and lower[] above. + hash_in = COND_BSWAP(hash_in, isBE()); + uint64_t upper_hash = hash_in; + uint64_t lower_hash = hash_in; + + for (i = 0; i < len; i++) { + // broadcast the character, xor into hash, make them different permutations + uint64_t c = (uint8_t)in[i]; + c |= c << 8; + c |= c << 16; + c |= c << 32; + upper_hash ^= c ^ upper_hash_mask; + lower_hash ^= c ^ lower_hash_mask; + + // table lookup + uint64_t h = 0; + uint16_t x; + x = upper_hash; x = t16[x]; upper_hash >>= 16; h = x; h = ROTR64(h, 16); + x = upper_hash; x = t16[x]; upper_hash >>= 16; h |= x; h = ROTR64(h, 16); + x = upper_hash; x = t16[x]; upper_hash >>= 16; h |= x; h = ROTR64(h, 16); + x = upper_hash; x = t16[x]; h |= x; h = ROTR64(h, 16); + upper_hash = h; + + h = 0; + x = lower_hash; x = t16[x]; lower_hash >>= 16; h = x; h = ROTR64(h, 16); + x = lower_hash; x = t16[x]; lower_hash >>= 16; h |= x; h = ROTR64(h, 16); + x = lower_hash; x = t16[x]; lower_hash >>= 16; h |= x; h = ROTR64(h, 16); + x = lower_hash; x = t16[x]; h |= x; h = ROTR64(h, 16); + lower_hash = h; + } + // store output + 
PUT_U64(upper_hash, out, 0); + PUT_U64(lower_hash, out, 8); } -static void pearson_hash_64_portable(uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in) { - size_t i; - uint64_t hash_mask = UINT64_C(0x0706050403020100); - uint64_t hash = hash_in; - - for (i = 0; i < len; i++) { - // broadcast the character, xor into hash, make them different permutations - uint64_t c = (uint8_t)in[i]; - c |= c << 8; - c |= c << 16; - c |= c << 32; - hash ^= c ^ hash_mask; - // table lookup - - uint64_t h = 0; - h = (t16[(uint16_t)(hash >> 16)] << 16) + t16[(uint16_t)hash]; - h <<= 32; - h |= (uint32_t)((t16[(uint16_t)(hash >> 48)] << 16)) + t16[(uint16_t)(hash >> 32)]; - hash = ROTR64(h, 32); - } - // store output - if (isBE()) { - PUT_U64(hash, out, 0); - } else { - PUT_U64(hash, out, 0); - } +static void pearson_hash_64_portable( uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in ) { + size_t i; + uint64_t hash_mask = UINT64_C(0x0706050403020100); + uint64_t hash = hash_in; + + for (i = 0; i < len; i++) { + // broadcast the character, xor into hash, make them different permutations + uint64_t c = (uint8_t)in[i]; + c |= c << 8; + c |= c << 16; + c |= c << 32; + hash ^= c ^ hash_mask; + // table lookup + + uint64_t h = 0; + h = (t16[(uint16_t)(hash >> 16)] << 16) + t16[(uint16_t)hash]; + h <<= 32; + h |= (uint32_t)((t16[(uint16_t)(hash >> 48)] << 16)) + t16[(uint16_t)(hash >> 32)]; + hash = ROTR64(h, 32); + } + // store output + if (isBE()) { + PUT_U64(hash, out, 0); + } else { + PUT_U64(hash, out, 0); + } } #if defined(HAVE_X86_64_AES) -static void pearson_hash_256_aesni(uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in) { - size_t i; +static void pearson_hash_256_aesni( uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in ) { + size_t i; - uint8_t upper[8] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; - uint8_t lower[8] = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }; + uint8_t upper[8] = { 0x0F, 0x0E, 0x0D, 0x0C, 
0x0B, 0x0A, 0x09, 0x08 }; + uint8_t lower[8] = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }; - uint64_t upper_hash_mask = GET_U64(upper, 0); - uint64_t lower_hash_mask = GET_U64(lower, 0); + uint64_t upper_hash_mask = GET_U64(upper, 0); + uint64_t lower_hash_mask = GET_U64(lower, 0); - __m128i tmp = _mm_set1_epi8(0x10); + __m128i tmp = _mm_set1_epi8(0x10); - __m128i hash_mask = _mm_set_epi64x(lower_hash_mask, upper_hash_mask); - __m128i high_hash_mask = _mm_xor_si128 (tmp, hash_mask); - __m128i hash= _mm_set_epi64x(hash_in, hash_in); - __m128i high_hash= _mm_set_epi64x(hash_in, hash_in); + __m128i hash_mask = _mm_set_epi64x(lower_hash_mask, upper_hash_mask); + __m128i high_hash_mask = _mm_xor_si128(tmp, hash_mask); + __m128i hash = _mm_set_epi64x(hash_in, hash_in); + __m128i high_hash = _mm_set_epi64x(hash_in, hash_in); - // table lookup preparation - __m128i ZERO = _mm_setzero_si128(); - __m128i ISOLATE_SBOX_MASK = _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00); + // table lookup preparation + __m128i ZERO = _mm_setzero_si128(); + __m128i ISOLATE_SBOX_MASK = _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00); - for (i = 0; i < len; i++) { - // broadcast the character, xor into hash, make them different permutations - __m128i cc = _mm_set1_epi8 (in[i]); - hash = _mm_xor_si128 (hash, cc); - high_hash = _mm_xor_si128 (high_hash, cc); - hash = _mm_xor_si128 (hash, hash_mask); - high_hash = _mm_xor_si128 (high_hash, high_hash_mask); + for (i = 0; i < len; i++) { + // broadcast the character, xor into hash, make them different permutations + __m128i cc = _mm_set1_epi8(in[i]); + hash = _mm_xor_si128(hash , cc ); + high_hash = _mm_xor_si128(high_hash, cc ); + hash = _mm_xor_si128(hash , hash_mask); + high_hash = _mm_xor_si128(high_hash, high_hash_mask); - // table lookup - hash = _mm_shuffle_epi8(hash, ISOLATE_SBOX_MASK); // re-order along AES round - high_hash = _mm_shuffle_epi8(high_hash, ISOLATE_SBOX_MASK); // re-order along AES round - 
hash = _mm_aesenclast_si128(hash, ZERO); - high_hash = _mm_aesenclast_si128(high_hash, ZERO); - } + // table lookup + hash = _mm_shuffle_epi8(hash , ISOLATE_SBOX_MASK); // re-order along AES round + high_hash = _mm_shuffle_epi8(high_hash, ISOLATE_SBOX_MASK); // re-order along AES round + hash = _mm_aesenclast_si128(hash , ZERO); + high_hash = _mm_aesenclast_si128(high_hash, ZERO); + } - // store output - _mm_store_si128 ((__m128i*)out , high_hash); - _mm_store_si128 ((__m128i*)&out[16] , hash); + // store output + _mm_store_si128((__m128i *)out , high_hash); + _mm_store_si128((__m128i *)&out[16], hash ); } -static void pearson_hash_128_aesni(uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in) { - size_t i; +static void pearson_hash_128_aesni( uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in ) { + size_t i; - uint8_t upper[8] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; - uint8_t lower[8] = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }; + uint8_t upper[8] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + uint8_t lower[8] = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }; - uint64_t upper_hash_mask = GET_U64(upper, 0); - uint64_t lower_hash_mask = GET_U64(lower, 0); + uint64_t upper_hash_mask = GET_U64(upper, 0); + uint64_t lower_hash_mask = GET_U64(lower, 0); - __m128i hash_mask = _mm_set_epi64x (lower_hash_mask, upper_hash_mask); - __m128i hash = _mm_set_epi64x(hash_in, hash_in); + __m128i hash_mask = _mm_set_epi64x(lower_hash_mask, upper_hash_mask); + __m128i hash = _mm_set_epi64x(hash_in, hash_in); - // table lookup preparation - __m128i ZERO = _mm_setzero_si128(); - __m128i ISOLATE_SBOX_MASK = _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00); + // table lookup preparation + __m128i ZERO = _mm_setzero_si128(); + __m128i ISOLATE_SBOX_MASK = _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00); - for (i = 0; i < len; i++) { - // broadcast the character, xor into hash, make them different 
permutations - __m128i cc = _mm_set1_epi8 (in[i]); - hash = _mm_xor_si128 (hash, cc); - hash = _mm_xor_si128 (hash, hash_mask); + for (i = 0; i < len; i++) { + // broadcast the character, xor into hash, make them different permutations + __m128i cc = _mm_set1_epi8(in[i]); + hash = _mm_xor_si128(hash, cc ); + hash = _mm_xor_si128(hash, hash_mask); - // table lookup - hash = _mm_shuffle_epi8(hash, ISOLATE_SBOX_MASK); // re-order along AES round - hash = _mm_aesenclast_si128(hash, ZERO); - } - // store output - _mm_store_si128 ((__m128i*)out , hash); + // table lookup + hash = _mm_shuffle_epi8(hash, ISOLATE_SBOX_MASK); // re-order along AES round + hash = _mm_aesenclast_si128(hash, ZERO); + } + // store output + _mm_store_si128((__m128i *)out, hash); } -static void pearson_hash_64_aesni(uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in) { - size_t i; - __m128i hash_mask = _mm_cvtsi64_si128(UINT64_C(0x0706050403020100)); - __m128i hash = _mm_cvtsi64_si128(hash_in); +static void pearson_hash_64_aesni( uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in ) { + size_t i; + __m128i hash_mask = _mm_cvtsi64_si128(UINT64_C(0x0706050403020100)); + __m128i hash = _mm_cvtsi64_si128(hash_in); - // table lookup preparation - __m128i ZERO = _mm_setzero_si128(); - __m128i ISOLATE_SBOX_MASK = _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00); + // table lookup preparation + __m128i ZERO = _mm_setzero_si128(); + __m128i ISOLATE_SBOX_MASK = _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00); - for (i = 0; i < len; i++) { - // broadcast the character, xor into hash, make them different permutations - __m128i cc = _mm_set1_epi8 (in[i]); - hash = _mm_xor_si128 (hash, cc); - hash = _mm_xor_si128 (hash, hash_mask); + for (i = 0; i < len; i++) { + // broadcast the character, xor into hash, make them different permutations + __m128i cc = _mm_set1_epi8(in[i]); + hash = _mm_xor_si128(hash, cc ); + hash = _mm_xor_si128(hash, hash_mask); - // table 
lookup - hash = _mm_shuffle_epi8(hash, ISOLATE_SBOX_MASK); // re-order along AES round - hash = _mm_aesenclast_si128(hash, ZERO); - } + // table lookup + hash = _mm_shuffle_epi8(hash, ISOLATE_SBOX_MASK); // re-order along AES round + hash = _mm_aesenclast_si128(hash, ZERO); + } - // store output - _mm_storel_epi64((__m128i*)out , hash); + // store output + _mm_storel_epi64((__m128i *)out, hash); } #elif defined(HAVE_SSSE_3) -static void pearson_hash_256_ssse3(uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in) { - size_t i; - - uint8_t upper[8] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; - uint8_t lower[8] = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }; - - uint64_t upper_hash_mask = GET_U64(upper, 0); - uint64_t lower_hash_mask = GET_U64(lower, 0); - - __m128i tmp = _mm_set1_epi8(0x10); - - __m128i hash_mask = _mm_set_epi64x (lower_hash_mask, upper_hash_mask); - __m128i high_hash_mask = _mm_xor_si128 (tmp, hash_mask); - __m128i hash= _mm_set_epi64x(hash_in, hash_in); - __m128i high_hash= _mm_set_epi64x(hash_in, hash_in); - - // table lookup preparation - __m128i const p16 = _mm_set1_epi8 (0x10); - __m128i lut_result = _mm_xor_si128 (lut_result, lut_result); - __m128i high_lut_result = _mm_xor_si128 (high_lut_result, high_lut_result); - __m128i selected_entries; - __m128i high_selected_entries; - __m128i table_line; - - for (i = 0; i < len; i++) { - // broadcast the character, xor into hash, make them different permutations - __m128i cc = _mm_set1_epi8 (in[i]); - hash = _mm_xor_si128 (hash, cc); - high_hash = _mm_xor_si128 (high_hash, cc); - hash = _mm_xor_si128 (hash, hash_mask); - high_hash = _mm_xor_si128 (high_hash, high_hash_mask); - - // table lookup - size_t j; - __m128i lut_index = hash; - __m128i high_lut_index = high_hash; - lut_result = _mm_xor_si128 (lut_result, lut_result); - high_lut_result = _mm_xor_si128 (lut_result, lut_result); - for (j = 0; j < 256; j += 16) { - table_line = _mm_load_si128 ((__m128i *)&t[j]); - 
selected_entries = _mm_min_epu8 (lut_index, p16); - selected_entries = _mm_cmpeq_epi8 (selected_entries, p16); - selected_entries = _mm_or_si128 (selected_entries, lut_index); - selected_entries = _mm_shuffle_epi8 (table_line, selected_entries); - high_selected_entries = _mm_min_epu8 (high_lut_index, p16); - high_selected_entries = _mm_cmpeq_epi8 (high_selected_entries, p16); - high_selected_entries = _mm_or_si128 (high_selected_entries, high_lut_index); - high_selected_entries = _mm_shuffle_epi8 (table_line, high_selected_entries); - lut_result = _mm_or_si128 (lut_result, selected_entries); - lut_index = _mm_sub_epi8 (lut_index, p16); - high_lut_result = _mm_or_si128 (high_lut_result, high_selected_entries); - high_lut_index = _mm_sub_epi8 (high_lut_index, p16); +static void pearson_hash_256_ssse3( uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in ) { + size_t i; + + uint8_t upper[8] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + uint8_t lower[8] = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }; + + uint64_t upper_hash_mask = GET_U64(upper, 0); + uint64_t lower_hash_mask = GET_U64(lower, 0); + + __m128i tmp = _mm_set1_epi8(0x10); + + __m128i hash_mask = _mm_set_epi64x(lower_hash_mask, upper_hash_mask); + __m128i high_hash_mask = _mm_xor_si128(tmp, hash_mask); + __m128i hash = _mm_set_epi64x(hash_in, hash_in); + __m128i high_hash = _mm_set_epi64x(hash_in, hash_in); + + // table lookup preparation + __m128i const p16 = _mm_set1_epi8(0x10); + __m128i lut_result = _mm_xor_si128(lut_result , lut_result); + __m128i high_lut_result = _mm_xor_si128(high_lut_result, high_lut_result); + __m128i selected_entries; + __m128i high_selected_entries; + __m128i table_line; + + for (i = 0; i < len; i++) { + // broadcast the character, xor into hash, make them different permutations + __m128i cc = _mm_set1_epi8(in[i]); + hash = _mm_xor_si128(hash , cc ); + high_hash = _mm_xor_si128(high_hash, cc ); + hash = _mm_xor_si128(hash , hash_mask); + high_hash = 
_mm_xor_si128(high_hash, high_hash_mask); + + // table lookup + size_t j; + __m128i lut_index = hash; + __m128i high_lut_index = high_hash; + lut_result = _mm_xor_si128(lut_result, lut_result); + high_lut_result = _mm_xor_si128(lut_result, lut_result); + for (j = 0; j < 256; j += 16) { + table_line = _mm_load_si128((__m128i *)&t[j]); + selected_entries = _mm_min_epu8(lut_index, p16); + selected_entries = _mm_cmpeq_epi8(selected_entries, p16); + selected_entries = _mm_or_si128(selected_entries, lut_index); + selected_entries = _mm_shuffle_epi8(table_line, selected_entries); + high_selected_entries = _mm_min_epu8(high_lut_index, p16); + high_selected_entries = _mm_cmpeq_epi8(high_selected_entries, p16); + high_selected_entries = _mm_or_si128(high_selected_entries, high_lut_index); + high_selected_entries = _mm_shuffle_epi8(table_line, high_selected_entries); + lut_result = _mm_or_si128(lut_result, selected_entries); + lut_index = _mm_sub_epi8(lut_index, p16); + high_lut_result = _mm_or_si128(high_lut_result, high_selected_entries); + high_lut_index = _mm_sub_epi8(high_lut_index, p16); + } + hash = lut_result; + high_hash = high_lut_result; } - hash = lut_result; - high_hash = high_lut_result; - } - // store output - _mm_store_si128 ((__m128i*)out , high_hash); - _mm_store_si128 ((__m128i*)&out[16] , hash); + // store output + _mm_store_si128((__m128i *)out , high_hash); + _mm_store_si128((__m128i *)&out[16], hash ); } -static void pearson_hash_128_ssse3(uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in) { - size_t i; - - uint8_t upper[8] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; - uint8_t lower[8] = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }; - - uint64_t upper_hash_mask = GET_U64(upper, 0); - uint64_t lower_hash_mask = GET_U64(lower, 0); - - __m128i hash_mask = _mm_set_epi64x (lower_hash_mask, upper_hash_mask); - __m128i hash = _mm_set_epi64x(hash_in, hash_in); - - // table lookup preparation - __m128i const p16 = _mm_set1_epi8 
(0x10); - __m128i lut_result = _mm_xor_si128 (lut_result, lut_result); - __m128i selected_entries; - __m128i table_line; - - for (i = 0; i < len; i++) { - // broadcast the character, xor into hash, make them different permutations - __m128i cc = _mm_set1_epi8 (in[i]); - hash = _mm_xor_si128 (hash, cc); - hash = _mm_xor_si128 (hash, hash_mask); - - // table lookup - size_t j; - __m128i lut_index = hash; - lut_result = _mm_xor_si128 (lut_result, lut_result); - for (j = 0; j < 256; j += 16) { - table_line = _mm_load_si128 ((__m128i *)&t[j]); - selected_entries = _mm_min_epu8 (lut_index, p16); - selected_entries = _mm_cmpeq_epi8 (selected_entries, p16); - selected_entries = _mm_or_si128 (selected_entries, lut_index); - selected_entries = _mm_shuffle_epi8 (table_line, selected_entries); - lut_result = _mm_or_si128 (lut_result, selected_entries); - lut_index = _mm_sub_epi8 (lut_index, p16); +static void pearson_hash_128_ssse3( uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in ) { + size_t i; + + uint8_t upper[8] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + uint8_t lower[8] = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }; + + uint64_t upper_hash_mask = GET_U64(upper, 0); + uint64_t lower_hash_mask = GET_U64(lower, 0); + + __m128i hash_mask = _mm_set_epi64x(lower_hash_mask, upper_hash_mask); + __m128i hash = _mm_set_epi64x(hash_in, hash_in); + + // table lookup preparation + __m128i const p16 = _mm_set1_epi8(0x10); + __m128i lut_result = _mm_xor_si128(lut_result, lut_result); + __m128i selected_entries; + __m128i table_line; + + for (i = 0; i < len; i++) { + // broadcast the character, xor into hash, make them different permutations + __m128i cc = _mm_set1_epi8(in[i]); + hash = _mm_xor_si128(hash, cc ); + hash = _mm_xor_si128(hash, hash_mask); + + // table lookup + size_t j; + __m128i lut_index = hash; + lut_result = _mm_xor_si128(lut_result, lut_result); + for (j = 0; j < 256; j += 16) { + table_line = _mm_load_si128((__m128i *)&t[j]); + 
selected_entries = _mm_min_epu8(lut_index, p16); + selected_entries = _mm_cmpeq_epi8(selected_entries, p16); + selected_entries = _mm_or_si128(selected_entries, lut_index); + selected_entries = _mm_shuffle_epi8(table_line, selected_entries); + lut_result = _mm_or_si128(lut_result, selected_entries); + lut_index = _mm_sub_epi8(lut_index, p16); + } + hash = lut_result; } - hash = lut_result; - } - // store output - _mm_store_si128 ((__m128i*)out , hash); + // store output + _mm_store_si128((__m128i *)out, hash); } -static void pearson_hash_64_ssse3(uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in) { - size_t i; - __m128i hash_mask = _mm_cvtsi64_si128(UINT64_C(0x0706050403020100)); - __m128i hash = _mm_cvtsi64_si128 (hash_in); - - // table lookup preparation - __m128i const p16 = _mm_set1_epi8 (0x10); - __m128i lut_result = _mm_xor_si128 (lut_result, lut_result); - - for (i = 0; i < len; i++) { - // broadcast the character, xor into hash, make them different permutations - __m128i cc = _mm_set1_epi8 (in[i]); - hash = _mm_xor_si128 (hash, cc); - hash = _mm_xor_si128 (hash, hash_mask); - - // table lookup - size_t j; - __m128i lut_index = hash; - lut_result = _mm_xor_si128 (lut_result, lut_result); - for (j = 0; j < 256; j += 16) { - __m128i table_line = _mm_load_si128 ((__m128i *)&t[j]); - __m128i selected_entries = _mm_min_epu8 (lut_index, p16); - selected_entries = _mm_cmpeq_epi8 (selected_entries, p16); - selected_entries = _mm_or_si128 (selected_entries, lut_index); - selected_entries = _mm_shuffle_epi8 (table_line, selected_entries); - lut_result = _mm_or_si128 (lut_result, selected_entries); - lut_index = _mm_sub_epi8 (lut_index, p16); +static void pearson_hash_64_ssse3( uint8_t * out, const uint8_t * in, size_t len, uint64_t hash_in ) { + size_t i; + __m128i hash_mask = _mm_cvtsi64_si128(UINT64_C(0x0706050403020100)); + __m128i hash = _mm_cvtsi64_si128(hash_in); + + // table lookup preparation + __m128i const p16 = _mm_set1_epi8(0x10); + __m128i 
lut_result = _mm_xor_si128(lut_result, lut_result); + + for (i = 0; i < len; i++) { + // broadcast the character, xor into hash, make them different permutations + __m128i cc = _mm_set1_epi8(in[i]); + hash = _mm_xor_si128(hash, cc ); + hash = _mm_xor_si128(hash, hash_mask); + + // table lookup + size_t j; + __m128i lut_index = hash; + lut_result = _mm_xor_si128(lut_result, lut_result); + for (j = 0; j < 256; j += 16) { + __m128i table_line = _mm_load_si128((__m128i *)&t[j]); + __m128i selected_entries = _mm_min_epu8(lut_index, p16); + selected_entries = _mm_cmpeq_epi8(selected_entries, p16); + selected_entries = _mm_or_si128(selected_entries, lut_index); + selected_entries = _mm_shuffle_epi8(table_line, selected_entries); + lut_result = _mm_or_si128(lut_result, selected_entries); + lut_index = _mm_sub_epi8(lut_index, p16); + } + hash = lut_result; } - hash = lut_result; - } - // store output - _mm_storel_epi64((__m128i*)out , hash); + // store output + _mm_storel_epi64((__m128i *)out, hash); } + #endif -static void pearson64(const void * in, const size_t len, const seed_t seed, void * out) { +static void pearson64( const void * in, const size_t len, const seed_t seed, void * out ) { #if defined(HAVE_X86_64_AES) - pearson_hash_64_aesni((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); + pearson_hash_64_aesni((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); #elif defined(HAVE_SSSE_3) - pearson_hash_64_ssse3((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); + pearson_hash_64_ssse3((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); #else - pearson_hash_64_portable((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); + pearson_hash_64_portable((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); #endif } -static void pearson128(const void * in, const size_t len, const seed_t seed, void * out) { +static void pearson128( const void * in, const size_t len, const seed_t seed, void * out ) { #if defined(HAVE_X86_64_AES) - 
pearson_hash_128_aesni((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); + pearson_hash_128_aesni((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); #elif defined(HAVE_SSSE_3) - pearson_hash_128_ssse3((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); + pearson_hash_128_ssse3((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); #else - pearson_hash_128_portable((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); + pearson_hash_128_portable((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); #endif } -static void pearson256(const void * in, const size_t len, const seed_t seed, void * out) { +static void pearson256( const void * in, const size_t len, const seed_t seed, void * out ) { #if defined(HAVE_X86_64_AES) - pearson_hash_256_aesni((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); + pearson_hash_256_aesni((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); #elif defined(HAVE_SSSE_3) - pearson_hash_256_ssse3((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); + pearson_hash_256_ssse3((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); #else - pearson_hash_256_portable((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); + pearson_hash_256_portable((uint8_t *)out, (const uint8_t *)in, len, (uint64_t)seed); #endif } REGISTER_FAMILY(pearson, - $.src_url = "https://github.com/Logan007/pearson", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/Logan007/pearson", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); REGISTER_HASH(Pearson_64, - $.desc = "Pearson hash, 8 lanes using AES sbox", - $.hash_flags = - FLAG_HASH_AES_BASED | - FLAG_HASH_LOOKUP_TABLE, - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0x12E4C8CD, - $.verification_BE = 0x12E4C8CD, - $.hashfn_native = pearson64, - $.hashfn_bswap = pearson64, - $.initfn = pearson_hash_init -); + $.desc = "Pearson hash, 8 lanes using AES sbox", + 
$.hash_flags = + FLAG_HASH_AES_BASED | + FLAG_HASH_LOOKUP_TABLE, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0x12E4C8CD, + $.verification_BE = 0x12E4C8CD, + $.hashfn_native = pearson64, + $.hashfn_bswap = pearson64, + $.initfn = pearson_hash_init + ); REGISTER_HASH(Pearson_128, - $.desc = "Pearson hash, 16 lanes using AES sbox", - $.hash_flags = - FLAG_HASH_AES_BASED | - FLAG_HASH_LOOKUP_TABLE, - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 128, - $.verification_LE = 0xDC5048A3, - $.verification_BE = 0xDC5048A3, - $.hashfn_native = pearson128, - $.hashfn_bswap = pearson128, - $.initfn = pearson_hash_init -); + $.desc = "Pearson hash, 16 lanes using AES sbox", + $.hash_flags = + FLAG_HASH_AES_BASED | + FLAG_HASH_LOOKUP_TABLE, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 128, + $.verification_LE = 0xDC5048A3, + $.verification_BE = 0xDC5048A3, + $.hashfn_native = pearson128, + $.hashfn_bswap = pearson128, + $.initfn = pearson_hash_init + ); REGISTER_HASH(Pearson_256, - $.desc = "Pearson hash, 32 lanes using AES sbox", - $.hash_flags = - FLAG_HASH_AES_BASED | - FLAG_HASH_LOOKUP_TABLE, - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 256, - $.verification_LE = 0xA9B1DE02, - $.verification_BE = 0xA9B1DE02, - $.hashfn_native = pearson256, - $.hashfn_bswap = pearson256, - $.initfn = pearson_hash_init -); + $.desc = "Pearson hash, 32 lanes using AES sbox", + $.hash_flags = + FLAG_HASH_AES_BASED | + FLAG_HASH_LOOKUP_TABLE, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 256, + $.verification_LE = 0xA9B1DE02, + $.verification_BE = 0xA9B1DE02, + $.hashfn_native = pearson256, + $.hashfn_bswap = pearson256, + $.initfn = pearson_hash_init + ); diff --git a/hashes/pengyhash.cpp b/hashes/pengyhash.cpp index 53330052..a982c04e 100644 --- a/hashes/pengyhash.cpp +++ 
b/hashes/pengyhash.cpp @@ -33,13 +33,13 @@ #include "Hashlib.h" //------------------------------------------------------------ -template < bool bswap > -static uint64_t pengyhash(const uint8_t * p, size_t size, uint64_t seed) { +template +static uint64_t pengyhash( const uint8_t * p, size_t size, uint64_t seed ) { uint64_t b[4] = { 0 }; uint64_t s[4] = { 0, 0, 0, size }; - int i; + int i; - for(; size >= 32; size -= 32, p += 32) { + for (; size >= 32; size -= 32, p += 32) { memcpy(b, p, 32); s[1] = (s[0] += s[1] + GET_U64((uint8_t *)&b[3], 0)) + (s[1] << 14 | s[1] >> 50); @@ -50,7 +50,7 @@ static uint64_t pengyhash(const uint8_t * p, size_t size, uint64_t seed) { memcpy(b, p, size); - for(i = 0; i < 6; i++) { + for (i = 0; i < 6; i++) { s[1] = (s[0] += s[1] + GET_U64((uint8_t *)&b[3], 0)) + (s[1] << 14 | s[1] >> 50) + seed; s[3] = (s[2] += s[3] + GET_U64((uint8_t *)&b[2], 0)) + (s[3] << 23 | s[3] >> 41); s[3] = (s[0] += s[3] + GET_U64((uint8_t *)&b[1], 0)) ^ (s[3] << 16 | s[3] >> 48); @@ -61,28 +61,29 @@ static uint64_t pengyhash(const uint8_t * p, size_t size, uint64_t seed) { } //------------------------------------------------------------ -template < bool bswap > -static void pengy(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void pengy( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = pengyhash((const uint8_t *)in, len, (uint64_t)seed); + PUT_U64(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(pengyhash, - $.src_url = "https://github.com/tinypeng/pengyhash", - $.src_status = HashFamilyInfo::SRC_STABLEISH -); + $.src_url = "https://github.com/tinypeng/pengyhash", + $.src_status = HashFamilyInfo::SRC_STABLEISH + ); REGISTER_HASH(pengyhash, - $.desc = "pengyhash v0.2", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_BSD, - $.bits = 64, - $.verification_LE = 0x1FC2217B, - $.verification_BE = 0x774D23AB, - 
$.hashfn_native = pengy, - $.hashfn_bswap = pengy -); + $.desc = "pengyhash v0.2", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_BSD, + $.bits = 64, + $.verification_LE = 0x1FC2217B, + $.verification_BE = 0x774D23AB, + $.hashfn_native = pengy, + $.hashfn_bswap = pengy + ); diff --git a/hashes/perlhashes.cpp b/hashes/perlhashes.cpp index 6a2b9a4f..032808fa 100644 --- a/hashes/perlhashes.cpp +++ b/hashes/perlhashes.cpp @@ -25,185 +25,195 @@ // hash value, as the perl code does. The old verification codes can // be obtained by removing "+ (uint32_t)len" from the "hash =" lines. -static uint32_t djb2(const uint8_t * str, const size_t len, const uint32_t seed) { - const uint8_t * end = str + len; - uint32_t hash = seed + (uint32_t)len; +static uint32_t djb2( const uint8_t * str, const size_t len, const uint32_t seed ) { + const uint8_t * end = str + len; + uint32_t hash = seed + (uint32_t)len; + while (str < end) { hash = ((hash << 5) + hash) + *str++; } return hash; } -static uint32_t sdbm(const uint8_t * str, const size_t len, const uint32_t seed) { - const uint8_t * end = str + len; - uint32_t hash = seed + (uint32_t)len; +static uint32_t sdbm( const uint8_t * str, const size_t len, const uint32_t seed ) { + const uint8_t * end = str + len; + uint32_t hash = seed + (uint32_t)len; + while (str < end) { hash = (hash << 6) + (hash << 16) - hash + *str++; } return hash; } -static uint32_t jenkinsOAAT(const uint8_t * str, const size_t len, const uint32_t seed) { - const uint8_t * end = str + len; - uint32_t hash = seed + (uint32_t)len; +static uint32_t jenkinsOAAT( const uint8_t * str, const size_t len, const uint32_t seed ) { + const uint8_t * end = str + len; + uint32_t hash = seed + (uint32_t)len; + while (str < end) { hash += *str++; hash += (hash << 10); - hash ^= (hash >> 6); + hash ^= (hash >> 6); } - hash += (hash << 3); + hash += (hash << 3); hash ^= (hash >> 11); hash += (hash << 15); return hash; } -static uint32_t 
jenkinsOAAT_old(const uint8_t * str, const size_t len, const uint32_t seed) { - const uint8_t * end = str + len; - uint32_t hash = seed; +static uint32_t jenkinsOAAT_old( const uint8_t * str, const size_t len, const uint32_t seed ) { + const uint8_t * end = str + len; + uint32_t hash = seed; + while (str < end) { hash += *str++; hash += (hash << 10); - hash ^= (hash >> 6); + hash ^= (hash >> 6); } - hash += (hash << 3); + hash += (hash << 3); hash ^= (hash >> 11); hash += (hash << 15); return hash; } -static uint32_t jenkinsOAAT_hard(const uint8_t * str, const size_t len, const uint64_t seed64) { - const uint8_t * end = str + len; - uint32_t hash = (uint32_t)seed64 + (uint32_t)len; +static uint32_t jenkinsOAAT_hard( const uint8_t * str, const size_t len, const uint64_t seed64 ) { + const uint8_t * end = str + len; + uint32_t hash = (uint32_t)seed64 + (uint32_t)len; + while (str < end) { hash += (hash << 10); - hash ^= (hash >> 6); + hash ^= (hash >> 6); hash += *str++; } - hash += (hash << 10); - hash ^= (hash >> 6); + hash += (hash << 10); + hash ^= (hash >> 6); hash += (seed64 >> 32) & 0xFF; - hash += (hash << 10); - hash ^= (hash >> 6); + hash += (hash << 10); + hash ^= (hash >> 6); hash += (seed64 >> 40) & 0xFF; - hash += (hash << 10); - hash ^= (hash >> 6); + hash += (hash << 10); + hash ^= (hash >> 6); hash += (seed64 >> 48) & 0xFF; - hash += (hash << 10); - hash ^= (hash >> 6); + hash += (hash << 10); + hash ^= (hash >> 6); hash += (seed64 >> 56) & 0xFF; - hash += (hash << 10); - hash ^= (hash >> 6); - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash += (hash << 10); + hash ^= (hash >> 6); + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); return hash; } //------------------------------------------------------------ -template < bool bswap > -static void perl_djb2(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void perl_djb2( const void * in, const size_t len, const seed_t 
seed, void * out ) { uint32_t h = djb2((const uint8_t *)in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void perl_sdbm(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void perl_sdbm( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = sdbm((const uint8_t *)in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void perl_jenkins(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void perl_jenkins( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = jenkinsOAAT((const uint8_t *)in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void perl_jenkins_old(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void perl_jenkins_old( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = jenkinsOAAT_old((const uint8_t *)in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void perl_jenkins_hard(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void perl_jenkins_hard( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = jenkinsOAAT_hard((const uint8_t *)in, len, (uint64_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(perloldhashes, - $.src_url = "https://github.com/Perl/perl5/blob/6b0260474df579e9412f57249519747ab8bb5c2b/hv_func.h", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/Perl/perl5/blob/6b0260474df579e9412f57249519747ab8bb5c2b/hv_func.h", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(perl_djb2, - $.desc = "djb2 OAAT hash (from old perl5 code)", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_GPL3, 
- $.bits = 32, - $.verification_LE = 0x4962CBAB, - $.verification_BE = 0xCBC1BFB3, - $.hashfn_native = perl_djb2, - $.hashfn_bswap = perl_djb2 -); + $.desc = "djb2 OAAT hash (from old perl5 code)", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_GPL3, + $.bits = 32, + $.verification_LE = 0x4962CBAB, + $.verification_BE = 0xCBC1BFB3, + $.hashfn_native = perl_djb2, + $.hashfn_bswap = perl_djb2 + ); REGISTER_HASH(perl_sdbm, - $.desc = "sdbm OAAT hash (from old perl5 code)", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_GPL3, - $.bits = 32, - $.verification_LE = 0xD973311D, - $.verification_BE = 0xA3228EF6, - $.hashfn_native = perl_sdbm, - $.hashfn_bswap = perl_sdbm -); + $.desc = "sdbm OAAT hash (from old perl5 code)", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_GPL3, + $.bits = 32, + $.verification_LE = 0xD973311D, + $.verification_BE = 0xA3228EF6, + $.hashfn_native = perl_sdbm, + $.hashfn_bswap = perl_sdbm + ); REGISTER_HASH(perl_jenkins, - $.desc = "Bob Jenkins' OAAT hash (from old perl5 code)", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_GPL3, - $.bits = 32, - $.verification_LE = 0xE3ED0E54, - $.verification_BE = 0xA83E99BF, - $.hashfn_native = perl_jenkins, - $.hashfn_bswap = perl_jenkins -); + $.desc = "Bob Jenkins' OAAT hash (from old perl5 code)", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_GPL3, + $.bits = 32, + $.verification_LE = 0xE3ED0E54, + $.verification_BE = 0xA83E99BF, + $.hashfn_native = perl_jenkins, + $.hashfn_bswap = perl_jenkins + ); REGISTER_HASH(perl_jenkins_old, - $.desc = "Bob Jenkins' OAAT hash (\"old\" version from old perl5 code)", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | - FLAG_IMPL_LICENSE_GPL3, - $.bits = 32, - $.verification_LE = 0xEE05869B, - $.verification_BE = 0x691105C0, - $.hashfn_native = perl_jenkins_old, - $.hashfn_bswap = 
perl_jenkins_old -); + $.desc = "Bob Jenkins' OAAT hash (\"old\" version from old perl5 code)", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS | + FLAG_IMPL_LICENSE_GPL3, + $.bits = 32, + $.verification_LE = 0xEE05869B, + $.verification_BE = 0x691105C0, + $.hashfn_native = perl_jenkins_old, + $.hashfn_bswap = perl_jenkins_old + ); REGISTER_HASH(perl_jenkins_hard, - $.desc = "Bob Jenkins' OAAT hash (\"hard\" version from old perl5 code)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_LICENSE_GPL3, - $.bits = 32, - $.verification_LE = 0x1C216B25, - $.verification_BE = 0x3B326068, - $.hashfn_native = perl_jenkins_hard, - $.hashfn_bswap = perl_jenkins_hard -); + $.desc = "Bob Jenkins' OAAT hash (\"hard\" version from old perl5 code)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_LICENSE_GPL3, + $.bits = 32, + $.verification_LE = 0x1C216B25, + $.verification_BE = 0x3B326068, + $.hashfn_native = perl_jenkins_hard, + $.hashfn_bswap = perl_jenkins_hard + ); diff --git a/hashes/pmp_multilinear.cpp b/hashes/pmp_multilinear.cpp index ffbe2e96..04b76b91 100644 --- a/hashes/pmp_multilinear.cpp +++ b/hashes/pmp_multilinear.cpp @@ -29,10 +29,10 @@ #include "Hashlib.h" #if defined(HAVE_AVX2) || defined(HAVE_SSE_4_1) || defined(HAVE_SSE_2) -#undef HAVE_AVX2 -#undef HAVE_SSE_4_1 -#undef HAVE_SSE_2 -//#include "Intrinsics.h" + #undef HAVE_AVX2 + #undef HAVE_SSE_4_1 + #undef HAVE_SSE_2 +// #include "Intrinsics.h" #endif #include "Mathmult.h" @@ -43,69 +43,66 @@ using namespace std; //------------------------------------------------------------- // Common typedefs #if __BYTE_ORDER == __LITTLE_ENDIAN -typedef union _ULARGE_INTEGER__XX -{ - struct { - uint32_t LowPart; - uint32_t HighPart; - }; - struct { - uint32_t LowPart; - uint32_t HighPart; - } u; - uint64_t QuadPart; +typedef union _ULARGE_INTEGER__XX { + struct { + uint32_t LowPart; + uint32_t HighPart; + }; + struct { + uint32_t LowPart; + 
uint32_t HighPart; + } u; + uint64_t QuadPart; } ULARGE_INTEGER__XX; typedef union _LARGE_INTEGER__XX { struct { - uint32_t LowPart; - int32_t HighPart; + uint32_t LowPart; + int32_t HighPart; }; struct { - uint32_t LowPart; - int32_t HighPart; - } u; - int64_t QuadPart; + uint32_t LowPart; + int32_t HighPart; + } u; + int64_t QuadPart; } LARGE_INTEGER__XX; #else -typedef union _ULARGE_INTEGER__XX -{ - struct { - uint32_t HighPart; - uint32_t LowPart; - }; - struct { - uint32_t HighPart; - uint32_t LowPart; - } u; - uint64_t QuadPart; +typedef union _ULARGE_INTEGER__XX { + struct { + uint32_t HighPart; + uint32_t LowPart; + }; + struct { + uint32_t HighPart; + uint32_t LowPart; + } u; + uint64_t QuadPart; } ULARGE_INTEGER__XX; typedef union _LARGE_INTEGER__XX { struct { - int32_t HighPart; - uint32_t LowPart; + int32_t HighPart; + uint32_t LowPart; }; struct { - int32_t HighPart; - uint32_t LowPart; - } u; - int64_t QuadPart; + int32_t HighPart; + uint32_t LowPart; + } u; + int64_t QuadPart; } LARGE_INTEGER__XX; #endif -typedef struct _ULARGELARGE_INTEGER__XX -{ - uint64_t LowPart; - uint64_t HighPart; +typedef struct _ULARGELARGE_INTEGER__XX { + uint64_t LowPart; + uint64_t HighPart; } ULARGELARGE_INTEGER__XX; #if defined(__arm__) -typedef struct { uint32_t value __attribute__((__packed__)); } unaligned_uint32; -typedef struct { uint64_t value __attribute__((__packed__)); } unaligned_uint64; +typedef struct { uint32_t value __attribute__((__packed__)); } unaligned_uint32; +typedef struct { uint64_t value __attribute__((__packed__)); } unaligned_uint64; #else -typedef struct { uint32_t value; } unaligned_uint32; -typedef struct { uint64_t value; } unaligned_uint64; +typedef struct { uint32_t value; } unaligned_uint32; +typedef struct { uint64_t value; } unaligned_uint64; #endif // __arm__ //------------------------------------------------------------- @@ -116,13 +113,13 @@ typedef struct { uint64_t value; } unaligned_uint64; #define PMPML_32_WORD_SIZE_BYTES_LOG2 2 
#define PMPML_32_LEVELS 8 // Derived constants -static const uint32_t PMPML_32_CHUNK_SIZE = (1 << PMPML_32_CHUNK_SIZE_LOG2); -static const uint32_t PMPML_32_WORD_SIZE_BYTES = (1 << PMPML_32_WORD_SIZE_BYTES_LOG2); -static const uint32_t PMPML_32_CHUNK_SIZE_BYTES = PMPML_32_CHUNK_SIZE * PMPML_32_WORD_SIZE_BYTES; +static const uint32_t PMPML_32_CHUNK_SIZE = (1 << PMPML_32_CHUNK_SIZE_LOG2 ); +static const uint32_t PMPML_32_WORD_SIZE_BYTES = (1 << PMPML_32_WORD_SIZE_BYTES_LOG2); +static const uint32_t PMPML_32_CHUNK_SIZE_BYTES = PMPML_32_CHUNK_SIZE * PMPML_32_WORD_SIZE_BYTES; static const uint32_t PMPML_32_CHUNK_SIZE_BYTES_LOG2 = PMPML_32_CHUNK_SIZE_LOG2 + PMPML_32_WORD_SIZE_BYTES_LOG2; // container for coefficients -typedef struct alignas(32) _random_data_for_PMPML_32 { +typedef struct alignas( 32 ) _random_data_for_PMPML_32 { uint64_t const_term; uint64_t cachedSum; uint64_t dummy[2]; @@ -130,184 +127,184 @@ typedef struct alignas(32) _random_data_for_PMPML_32 { } random_data_for_PMPML_32; static thread_local random_data_for_PMPML_32 rd_for_PMPML_32[PMPML_32_LEVELS] = { - // Level 0 - { - UINT64_C(0xb5ae35fa), UINT64_C(0x45dfdab824), {UINT64_C(0), UINT64_C(0)}, // dummy + // Level 0 { - 0x801841bb, 0x5ef2b6fc, 0xcc5a24e2, 0x1b6c5dd5, 0xeb07483b, 0xef894c5b, 0x02213973, 0x2d34d946, - 0x11af1a4d, 0xd0a96734, 0xf39454a6, 0x58574f85, 0x08bc3780, 0x3d5e4d6e, 0x72302724, 0x89d2f7d4, - 0x97d9459e, 0xba75d6d3, 0x69efa09d, 0x56f8f06a, 0x7345e990, 0x8ac230e9, 0xd21f3d0c, 0x3fffba8a, - 0xd6dd6772, 0xd8c69c6b, 0x77a68e52, 0xde17020d, 0xf969ac45, 0x4ec4e3fb, 0x66e1eaae, 0x8c3e2c33, - 0xd031a884, 0x5942d1f7, 0x355157a1, 0x79e517ce, 0x6f6e67c9, 0xdbeb2ce9, 0xaf4c5195, 0x1d72b4ce, - 0x2214d9f3, 0xdab836c3, 0x94a54c8d, 0xa259587e, 0x8e5a6bd6, 0x75d23672, 0xf08fcd74, 0x59297837, - 0xc1f093c7, 0xb1e14572, 0x84e25787, 0xfa18cbdd, 0xc0a8efe1, 0x8f746f29, 0xd1dfea17, 0xd17d1d65, - 0x99c0334e, 0xc200ce59, 0xbac039b7, 0xaa8da145, 0x91787415, 0x7478d0e6, 0xd4fcb135, 0x76c4ce66, - 0xdf1d9e9b, 
0xe6a6640f, 0x94dd9b8e, 0x7f530896, 0xd5a76dff, 0xda99ae01, 0x2830dcad, 0x18421917, - 0xc98aeb4f, 0x0048fdda, 0xd5ae8cba, 0xe9d27a3f, 0xc51ba04d, 0x8f1403e7, 0x2cbc94bd, 0x2c47c847, - 0xbf127785, 0x54d2a15b, 0x6a818544, 0x993ca700, 0x31f529ed, 0x4cf30c4c, 0x386af44a, 0x1378d4c0, - 0x3c40ac83, 0x3d27aaa4, 0x9b1c685e, 0x61dbbba6, 0xe5fbbd87, 0x800c57fd, 0xccd49830, 0x1ee12d69, - 0x84868385, 0xbaf5679f, 0xd0417045, 0x4f5c30f0, 0x70558f08, 0x7c1e281d, 0xfe17014e, 0x56404d7c, - 0x77dcfdd3, 0xf0d53161, 0xf9914927, 0x69bc0362, 0x609759cb, 0xfc9afc53, 0xc5f28ba8, 0x9cbe677d, - 0x8b8311e5, 0x40a1fbde, 0x500ef7fc, 0xd51ceaa4, 0x2c666e8f, 0xbf81662b, 0xa0922fe9, 0x65a75374, - 0xc744184e, 0x1fad7a1a, 0xbc3678c2, 0xde23fbbc, 0x0403fd45, 0x69cd23ae, 0xf3dc2f19, 0x31416e93, + UINT64_C(0xb5ae35fa), UINT64_C(0x45dfdab824), { UINT64_C(0), UINT64_C(0) }, // dummy + { + 0x801841bb, 0x5ef2b6fc, 0xcc5a24e2, 0x1b6c5dd5, 0xeb07483b, 0xef894c5b, 0x02213973, 0x2d34d946, + 0x11af1a4d, 0xd0a96734, 0xf39454a6, 0x58574f85, 0x08bc3780, 0x3d5e4d6e, 0x72302724, 0x89d2f7d4, + 0x97d9459e, 0xba75d6d3, 0x69efa09d, 0x56f8f06a, 0x7345e990, 0x8ac230e9, 0xd21f3d0c, 0x3fffba8a, + 0xd6dd6772, 0xd8c69c6b, 0x77a68e52, 0xde17020d, 0xf969ac45, 0x4ec4e3fb, 0x66e1eaae, 0x8c3e2c33, + 0xd031a884, 0x5942d1f7, 0x355157a1, 0x79e517ce, 0x6f6e67c9, 0xdbeb2ce9, 0xaf4c5195, 0x1d72b4ce, + 0x2214d9f3, 0xdab836c3, 0x94a54c8d, 0xa259587e, 0x8e5a6bd6, 0x75d23672, 0xf08fcd74, 0x59297837, + 0xc1f093c7, 0xb1e14572, 0x84e25787, 0xfa18cbdd, 0xc0a8efe1, 0x8f746f29, 0xd1dfea17, 0xd17d1d65, + 0x99c0334e, 0xc200ce59, 0xbac039b7, 0xaa8da145, 0x91787415, 0x7478d0e6, 0xd4fcb135, 0x76c4ce66, + 0xdf1d9e9b, 0xe6a6640f, 0x94dd9b8e, 0x7f530896, 0xd5a76dff, 0xda99ae01, 0x2830dcad, 0x18421917, + 0xc98aeb4f, 0x0048fdda, 0xd5ae8cba, 0xe9d27a3f, 0xc51ba04d, 0x8f1403e7, 0x2cbc94bd, 0x2c47c847, + 0xbf127785, 0x54d2a15b, 0x6a818544, 0x993ca700, 0x31f529ed, 0x4cf30c4c, 0x386af44a, 0x1378d4c0, + 0x3c40ac83, 0x3d27aaa4, 0x9b1c685e, 0x61dbbba6, 
0xe5fbbd87, 0x800c57fd, 0xccd49830, 0x1ee12d69, + 0x84868385, 0xbaf5679f, 0xd0417045, 0x4f5c30f0, 0x70558f08, 0x7c1e281d, 0xfe17014e, 0x56404d7c, + 0x77dcfdd3, 0xf0d53161, 0xf9914927, 0x69bc0362, 0x609759cb, 0xfc9afc53, 0xc5f28ba8, 0x9cbe677d, + 0x8b8311e5, 0x40a1fbde, 0x500ef7fc, 0xd51ceaa4, 0x2c666e8f, 0xbf81662b, 0xa0922fe9, 0x65a75374, + 0xc744184e, 0x1fad7a1a, 0xbc3678c2, 0xde23fbbc, 0x0403fd45, 0x69cd23ae, 0xf3dc2f19, 0x31416e93, + }, }, - }, - // Level 1 - { - UINT64_C(0xc3dbb82), UINT64_C(0x3c33d12213), {UINT64_C(0), UINT64_C(0)}, // dummy + // Level 1 { - 0xd233467b, 0x72a70d41, 0x8bd6cb67, 0x2e954d02, 0x08142b46, 0xb9613249, 0x8136a81d, 0x3cdab6cf, - 0x70433dfc, 0x984d385b, 0x66f13c63, 0x392a028c, 0x84b10a87, 0xb54b7873, 0x7af58609, 0xbe835997, - 0x09878350, 0x2702ed23, 0x940ffe4b, 0x073982e4, 0x4b565486, 0xc1872a1b, 0xcb9af7a0, 0xd8a84f81, - 0xd8234048, 0x3d9a44b4, 0xfcecd1d5, 0x114fe193, 0x7e848584, 0x0082760d, 0x0ede3da7, 0x0040762c, - 0xe522397a, 0x44ec8715, 0x422bc161, 0x0764c174, 0x3c511482, 0xd7dea424, 0xa12ec3c0, 0x66d33ec0, - 0x0aaa55ce, 0x65f93ec0, 0xadaaaf7f, 0x647e772d, 0xa6b0a4fa, 0x88a72a0d, 0x1cfa03b4, 0x4f28c0c6, - 0xa7c64b56, 0xedd8af5e, 0xa47e7242, 0x99f8d210, 0x8ad70f5f, 0xa8e3cdfb, 0x0a1db865, 0x56b2e1b0, - 0x0dd7b307, 0x564a191f, 0xca38b54f, 0x61567b67, 0xd50c9644, 0x7671637e, 0x92d511cc, 0x25057afc, - 0xd286cba4, 0x71f8dda9, 0x2ad9996c, 0x75ad65f0, 0x9418c0e9, 0xe6d0066b, 0xf1d15419, 0x264afe8b, - 0x98c932e2, 0x3a6d5f8d, 0x289a7d0c, 0x3d18290d, 0xb9ecee8d, 0xdff7a79b, 0x7ecc3cde, 0x583e06a0, - 0x8e29d297, 0xdc8650cb, 0x30f7861d, 0xf2de5cf9, 0x924dc8bc, 0x5afb46e9, 0xb997b1d9, 0x463d84a2, - 0xfb8e2e7e, 0x043418b8, 0xa94e6a05, 0xae5c1efa, 0x7c7e4583, 0xcb6755ac, 0xf3359dba, 0xf05fdf94, - 0x79db25ea, 0xed490569, 0x993d8da0, 0x6593ce5a, 0x03e3ed39, 0x044f74a3, 0x84777814, 0xcb2848d7, - 0x41881b64, 0xf52d206e, 0x1fb1ebaf, 0x07a3d4b3, 0x63a5924f, 0x35c21005, 0xc981c63c, 0x9e3fdbaa, - 0x89b64b0d, 0x0f2aba74, 0x512f3cfe, 0xb053e5d0, 
0x59a69c4a, 0x400c442f, 0x28afebd0, 0x4540c190, - 0xc7f5e757, 0x7d40152b, 0x321fa235, 0xb6309529, 0x021c71e1, 0x7474f524, 0xc4f2e22e, 0x778b9371, + UINT64_C(0xc3dbb82), UINT64_C(0x3c33d12213), { UINT64_C(0), UINT64_C(0) }, // dummy + { + 0xd233467b, 0x72a70d41, 0x8bd6cb67, 0x2e954d02, 0x08142b46, 0xb9613249, 0x8136a81d, 0x3cdab6cf, + 0x70433dfc, 0x984d385b, 0x66f13c63, 0x392a028c, 0x84b10a87, 0xb54b7873, 0x7af58609, 0xbe835997, + 0x09878350, 0x2702ed23, 0x940ffe4b, 0x073982e4, 0x4b565486, 0xc1872a1b, 0xcb9af7a0, 0xd8a84f81, + 0xd8234048, 0x3d9a44b4, 0xfcecd1d5, 0x114fe193, 0x7e848584, 0x0082760d, 0x0ede3da7, 0x0040762c, + 0xe522397a, 0x44ec8715, 0x422bc161, 0x0764c174, 0x3c511482, 0xd7dea424, 0xa12ec3c0, 0x66d33ec0, + 0x0aaa55ce, 0x65f93ec0, 0xadaaaf7f, 0x647e772d, 0xa6b0a4fa, 0x88a72a0d, 0x1cfa03b4, 0x4f28c0c6, + 0xa7c64b56, 0xedd8af5e, 0xa47e7242, 0x99f8d210, 0x8ad70f5f, 0xa8e3cdfb, 0x0a1db865, 0x56b2e1b0, + 0x0dd7b307, 0x564a191f, 0xca38b54f, 0x61567b67, 0xd50c9644, 0x7671637e, 0x92d511cc, 0x25057afc, + 0xd286cba4, 0x71f8dda9, 0x2ad9996c, 0x75ad65f0, 0x9418c0e9, 0xe6d0066b, 0xf1d15419, 0x264afe8b, + 0x98c932e2, 0x3a6d5f8d, 0x289a7d0c, 0x3d18290d, 0xb9ecee8d, 0xdff7a79b, 0x7ecc3cde, 0x583e06a0, + 0x8e29d297, 0xdc8650cb, 0x30f7861d, 0xf2de5cf9, 0x924dc8bc, 0x5afb46e9, 0xb997b1d9, 0x463d84a2, + 0xfb8e2e7e, 0x043418b8, 0xa94e6a05, 0xae5c1efa, 0x7c7e4583, 0xcb6755ac, 0xf3359dba, 0xf05fdf94, + 0x79db25ea, 0xed490569, 0x993d8da0, 0x6593ce5a, 0x03e3ed39, 0x044f74a3, 0x84777814, 0xcb2848d7, + 0x41881b64, 0xf52d206e, 0x1fb1ebaf, 0x07a3d4b3, 0x63a5924f, 0x35c21005, 0xc981c63c, 0x9e3fdbaa, + 0x89b64b0d, 0x0f2aba74, 0x512f3cfe, 0xb053e5d0, 0x59a69c4a, 0x400c442f, 0x28afebd0, 0x4540c190, + 0xc7f5e757, 0x7d40152b, 0x321fa235, 0xb6309529, 0x021c71e1, 0x7474f524, 0xc4f2e22e, 0x778b9371, + }, }, - }, - // Level 2 - { - UINT64_C(0x4ae2b467), UINT64_C(0x41b6700d41), {UINT64_C(0), UINT64_C(0)}, // dummy + // Level 2 { - 0xf8898c22, 0x863868bc, 0xd35470e9, 0x58d21ad6, 0xa2fce702, 
0xe4f58530, 0x0225c8a9, 0x9b29b401, - 0xf4f6d3eb, 0xf751b2ce, 0x2afa3d7a, 0xc1edf3e9, 0x4c57e2d1, 0xc2ef970d, 0x8a70aa25, 0x887d0102, - 0xcc09e169, 0xeb5b75e2, 0x760b047e, 0xa2d21874, 0xc2bf310a, 0x8f030e02, 0x4b97fa22, 0x6a413ddb, - 0x708062b4, 0x58cc67d3, 0x52459895, 0x78d345e3, 0x2b7a9415, 0xbaf4d1fe, 0x83462969, 0x923fa257, - 0x91617494, 0xedf8d2f5, 0xc3d41302, 0xdf1934ff, 0x78a27863, 0xe7bf06a2, 0xc21b996d, 0x1e72411e, - 0x98da3053, 0x0c2195ad, 0xf984dd09, 0x4b30dac8, 0xf3a03a7a, 0xee6540ec, 0x966dffb7, 0xb463fdbe, - 0xbec26037, 0xcc9adad0, 0xdb71b8ef, 0x57341ca0, 0xa742ec7b, 0xe86321e9, 0x7a9d9f15, 0x7809e2a6, - 0x2cb6a0a0, 0x344756d0, 0x6e8e8c88, 0x7ecf3ff7, 0x129d18a0, 0x0965dc6a, 0xf6a2cad1, 0xd938681b, - 0xa1d07081, 0x4253df74, 0x774a5200, 0x59e1356d, 0x7aad36b5, 0x7dd6414a, 0x4700a70e, 0xd0da811c, - 0x1fd2a8b8, 0x1dee15ad, 0x7f15ae5a, 0xc1f74f27, 0xfd8bfb7f, 0x16815bb9, 0x64d29007, 0xc8919e9f, - 0x0b8c7e82, 0xfd5e92c2, 0x6e073fb7, 0xd52df9c2, 0x0c5c519d, 0x3ad86cb4, 0xfde300c8, 0x674c4dac, - 0x54899a0a, 0xbf9a9be5, 0xe198c073, 0x6025af27, 0x433bac50, 0x669d3281, 0xee3838b3, 0x0df3a048, - 0x2d0de6cd, 0xd289c8eb, 0x6b1c9eb1, 0x1634922b, 0x61917d41, 0x8b8bdeec, 0x12b73dcf, 0x96353517, - 0x20e29858, 0xecc04cb9, 0x0074a2ca, 0x58a0f1ba, 0x6ed4e71f, 0x063fec8e, 0xc5bc30c2, 0x77af6d46, - 0x078a6a93, 0x8c8da7a2, 0x1d02b1cc, 0x96b659f9, 0x8d8b4fbd, 0x521b2964, 0x990235f7, 0x55c63419, - 0x1ad869a5, 0x51987dbd, 0x99e7a3ff, 0xf584d99a, 0xc11c3506, 0xb1adca80, 0x55007e41, 0x09efa72b, + UINT64_C(0x4ae2b467), UINT64_C(0x41b6700d41), { UINT64_C(0), UINT64_C(0) }, // dummy + { + 0xf8898c22, 0x863868bc, 0xd35470e9, 0x58d21ad6, 0xa2fce702, 0xe4f58530, 0x0225c8a9, 0x9b29b401, + 0xf4f6d3eb, 0xf751b2ce, 0x2afa3d7a, 0xc1edf3e9, 0x4c57e2d1, 0xc2ef970d, 0x8a70aa25, 0x887d0102, + 0xcc09e169, 0xeb5b75e2, 0x760b047e, 0xa2d21874, 0xc2bf310a, 0x8f030e02, 0x4b97fa22, 0x6a413ddb, + 0x708062b4, 0x58cc67d3, 0x52459895, 0x78d345e3, 0x2b7a9415, 0xbaf4d1fe, 0x83462969, 0x923fa257, + 
0x91617494, 0xedf8d2f5, 0xc3d41302, 0xdf1934ff, 0x78a27863, 0xe7bf06a2, 0xc21b996d, 0x1e72411e, + 0x98da3053, 0x0c2195ad, 0xf984dd09, 0x4b30dac8, 0xf3a03a7a, 0xee6540ec, 0x966dffb7, 0xb463fdbe, + 0xbec26037, 0xcc9adad0, 0xdb71b8ef, 0x57341ca0, 0xa742ec7b, 0xe86321e9, 0x7a9d9f15, 0x7809e2a6, + 0x2cb6a0a0, 0x344756d0, 0x6e8e8c88, 0x7ecf3ff7, 0x129d18a0, 0x0965dc6a, 0xf6a2cad1, 0xd938681b, + 0xa1d07081, 0x4253df74, 0x774a5200, 0x59e1356d, 0x7aad36b5, 0x7dd6414a, 0x4700a70e, 0xd0da811c, + 0x1fd2a8b8, 0x1dee15ad, 0x7f15ae5a, 0xc1f74f27, 0xfd8bfb7f, 0x16815bb9, 0x64d29007, 0xc8919e9f, + 0x0b8c7e82, 0xfd5e92c2, 0x6e073fb7, 0xd52df9c2, 0x0c5c519d, 0x3ad86cb4, 0xfde300c8, 0x674c4dac, + 0x54899a0a, 0xbf9a9be5, 0xe198c073, 0x6025af27, 0x433bac50, 0x669d3281, 0xee3838b3, 0x0df3a048, + 0x2d0de6cd, 0xd289c8eb, 0x6b1c9eb1, 0x1634922b, 0x61917d41, 0x8b8bdeec, 0x12b73dcf, 0x96353517, + 0x20e29858, 0xecc04cb9, 0x0074a2ca, 0x58a0f1ba, 0x6ed4e71f, 0x063fec8e, 0xc5bc30c2, 0x77af6d46, + 0x078a6a93, 0x8c8da7a2, 0x1d02b1cc, 0x96b659f9, 0x8d8b4fbd, 0x521b2964, 0x990235f7, 0x55c63419, + 0x1ad869a5, 0x51987dbd, 0x99e7a3ff, 0xf584d99a, 0xc11c3506, 0xb1adca80, 0x55007e41, 0x09efa72b, + }, }, - }, - // Level 3 - { - UINT64_C(0xae82fd43), UINT64_C(0x4358e7ef21), {UINT64_C(0), UINT64_C(0)}, // dummy + // Level 3 { - 0x9e6c8a0f, 0x9107b963, 0xdc39a0eb, 0x9fb2328d, 0xd4f03812, 0xce7ff238, 0x99710f09, 0x90b5a0ba, - 0x53cb9654, 0xdca51386, 0x5a03c91d, 0x542e4280, 0x92d368ff, 0x6769cd0b, 0xacad27d0, 0x3947f94b, - 0xf33a3265, 0x2f298054, 0x5094d047, 0x962591a6, 0x89c1de39, 0x0ef43de4, 0xe87f5576, 0xb342b1dc, - 0xffb893e3, 0x08a96d7d, 0xe1023f0d, 0x054ac7ea, 0xeb0a8934, 0xe1558e68, 0xce76025c, 0x47c0a61f, - 0x9d476622, 0xee83acc6, 0x5fb7a3fd, 0xa1798b06, 0x97cfbc96, 0x341dc4f8, 0x079d4d68, 0x85811d0d, - 0xe81cd930, 0x83f55707, 0x7cd3da51, 0xe504fcf6, 0x5afed439, 0x35677002, 0x40d755aa, 0xcea876c6, - 0x1c8a9953, 0x9a7d47c1, 0x9343c019, 0x60ffafe4, 0x7c12e1c5, 0xa64b2499, 0x9e13587f, 0x6e690d98, - 
0x24a0dcfe, 0xfc4c35a6, 0x66eca52a, 0xe9e0315f, 0xa208fe48, 0x16d7bd81, 0xd5c9b0fb, 0xe7337bf9, - 0x2d3ad9dc, 0x6924c3f3, 0x8e7174f8, 0x01f7e499, 0x2e3edfb8, 0x8dfe2b6a, 0x40f43c09, 0xcf51dafc, - 0xafe98c70, 0x31b3d859, 0x07f28e34, 0x6527d100, 0x5274484e, 0x92fa82fe, 0xf059d18a, 0x55e4c67c, - 0x51e5d061, 0xaa4408e9, 0xbd7463cc, 0xb587505f, 0xfc88d42e, 0x70b3e921, 0xeabb6770, 0xfb3a060b, - 0xd675527a, 0xb8d6153f, 0xbd1763ad, 0x6f1a2573, 0xf96490be, 0xce99095f, 0x966d1090, 0x65e2a371, - 0x3a81e7f8, 0x769315db, 0xaa973861, 0x8d6d798c, 0xa935a7ae, 0x194de67a, 0x402f5da2, 0x58a7f932, - 0xa1eb519c, 0x65125c5b, 0x961b4b6c, 0x518c8dab, 0x47233e7f, 0x1b19109b, 0x46a1b3c1, 0x5dc3dd6c, - 0x709b63af, 0x3e43e71c, 0x7b997703, 0xa2259145, 0x81f87a1c, 0xa6c8a082, 0xa12ef053, 0x412e7f0e, - 0x29bef6e8, 0xcc8fca68, 0xf521167a, 0x203c0e84, 0xe92d5cd7, 0x9589c2d1, 0x208e2f28, 0x906bd537, + UINT64_C(0xae82fd43), UINT64_C(0x4358e7ef21), { UINT64_C(0), UINT64_C(0) }, // dummy + { + 0x9e6c8a0f, 0x9107b963, 0xdc39a0eb, 0x9fb2328d, 0xd4f03812, 0xce7ff238, 0x99710f09, 0x90b5a0ba, + 0x53cb9654, 0xdca51386, 0x5a03c91d, 0x542e4280, 0x92d368ff, 0x6769cd0b, 0xacad27d0, 0x3947f94b, + 0xf33a3265, 0x2f298054, 0x5094d047, 0x962591a6, 0x89c1de39, 0x0ef43de4, 0xe87f5576, 0xb342b1dc, + 0xffb893e3, 0x08a96d7d, 0xe1023f0d, 0x054ac7ea, 0xeb0a8934, 0xe1558e68, 0xce76025c, 0x47c0a61f, + 0x9d476622, 0xee83acc6, 0x5fb7a3fd, 0xa1798b06, 0x97cfbc96, 0x341dc4f8, 0x079d4d68, 0x85811d0d, + 0xe81cd930, 0x83f55707, 0x7cd3da51, 0xe504fcf6, 0x5afed439, 0x35677002, 0x40d755aa, 0xcea876c6, + 0x1c8a9953, 0x9a7d47c1, 0x9343c019, 0x60ffafe4, 0x7c12e1c5, 0xa64b2499, 0x9e13587f, 0x6e690d98, + 0x24a0dcfe, 0xfc4c35a6, 0x66eca52a, 0xe9e0315f, 0xa208fe48, 0x16d7bd81, 0xd5c9b0fb, 0xe7337bf9, + 0x2d3ad9dc, 0x6924c3f3, 0x8e7174f8, 0x01f7e499, 0x2e3edfb8, 0x8dfe2b6a, 0x40f43c09, 0xcf51dafc, + 0xafe98c70, 0x31b3d859, 0x07f28e34, 0x6527d100, 0x5274484e, 0x92fa82fe, 0xf059d18a, 0x55e4c67c, + 0x51e5d061, 0xaa4408e9, 0xbd7463cc, 
0xb587505f, 0xfc88d42e, 0x70b3e921, 0xeabb6770, 0xfb3a060b, + 0xd675527a, 0xb8d6153f, 0xbd1763ad, 0x6f1a2573, 0xf96490be, 0xce99095f, 0x966d1090, 0x65e2a371, + 0x3a81e7f8, 0x769315db, 0xaa973861, 0x8d6d798c, 0xa935a7ae, 0x194de67a, 0x402f5da2, 0x58a7f932, + 0xa1eb519c, 0x65125c5b, 0x961b4b6c, 0x518c8dab, 0x47233e7f, 0x1b19109b, 0x46a1b3c1, 0x5dc3dd6c, + 0x709b63af, 0x3e43e71c, 0x7b997703, 0xa2259145, 0x81f87a1c, 0xa6c8a082, 0xa12ef053, 0x412e7f0e, + 0x29bef6e8, 0xcc8fca68, 0xf521167a, 0x203c0e84, 0xe92d5cd7, 0x9589c2d1, 0x208e2f28, 0x906bd537, + }, }, - }, - // Level 4 - { - UINT64_C(0xc3b9656e), UINT64_C(0x3f969c7ed3), {UINT64_C(0), UINT64_C(0)}, // dummy + // Level 4 { - 0x60731d8f, 0x2e17b1b7, 0xb808f3c7, 0xf20f223c, 0xb964bc3c, 0xaa61a231, 0x3d84cd54, 0x94f006d6, - 0x684e8f60, 0xb64adf58, 0x7033ff6c, 0x01ea1b40, 0xbcaf2776, 0x70250562, 0x342ec517, 0x1e280438, - 0xaeaa96ba, 0x802391c2, 0x35a7f213, 0x8d0f57aa, 0xf8a1153b, 0x917a692a, 0xbac0385c, 0x6dc2f7dd, - 0xc573a21b, 0x0469558c, 0xf206c551, 0xfe683c17, 0x54d0c3bc, 0x80734381, 0xc4eef75c, 0x22648b9e, - 0xede23e78, 0x8823f123, 0xd687c6a7, 0x85b6752b, 0xb8cf5160, 0x8109a1c8, 0x1b4c7ceb, 0xaa8b17a6, - 0xeda3fcbf, 0xb6d65214, 0xe6171214, 0x98f4ee28, 0xc1ac9d91, 0x0810d22e, 0x1ccec281, 0xd1911b8a, - 0x272b7696, 0x860fc01d, 0x903c0029, 0xf3308e35, 0x8c2021ef, 0x52ebae93, 0x6ece3f90, 0x2d01f59f, - 0x15cf87c9, 0x79c113fd, 0xcee953e9, 0x6152456a, 0x82d25ea1, 0x743316c4, 0x351f50d1, 0x06e3708f, - 0x45060a80, 0x4c13c59a, 0x0a737387, 0x3eaa3672, 0xe5176942, 0x8431098a, 0x0cd55f05, 0x9d5c2eda, - 0x6df6d514, 0x41a412ea, 0x67606dd0, 0xdec02567, 0xaebddaad, 0xf48d85d8, 0x7f41af4b, 0xbb8b03b7, - 0x29bb612f, 0xc96546c9, 0xb04dfcc9, 0x2ee6c830, 0xafb0bc9e, 0x08e0ef18, 0xea81d1fc, 0xa58be897, - 0xee996482, 0xb7ee4493, 0x0c561cd5, 0x7695207b, 0x763a34f3, 0x7093196a, 0xecf527bd, 0xb3037632, - 0x40fdbc46, 0x72a3f33d, 0xb09e2e73, 0x1b41ab32, 0x32c280f4, 0x865d6444, 0xa998ef38, 0xe1f097de, - 0x5f6c5d4f, 0xfebdf03d, 0xc569ef53, 
0xec6decf1, 0x03de6003, 0x0e3063d7, 0x8dd9c0a0, 0x062c97a4, - 0xa45c835e, 0xd167187d, 0xfe55e66e, 0x6b24b6df, 0x572c5189, 0x30c18b20, 0x3c0346f8, 0x5982a13e, - 0xbf491b0f, 0x248df32c, 0x6f572546, 0x51296aff, 0x1a8c0702, 0x94a21284, 0x371e69c8, 0x2298720e, + UINT64_C(0xc3b9656e), UINT64_C(0x3f969c7ed3), { UINT64_C(0), UINT64_C(0) }, // dummy + { + 0x60731d8f, 0x2e17b1b7, 0xb808f3c7, 0xf20f223c, 0xb964bc3c, 0xaa61a231, 0x3d84cd54, 0x94f006d6, + 0x684e8f60, 0xb64adf58, 0x7033ff6c, 0x01ea1b40, 0xbcaf2776, 0x70250562, 0x342ec517, 0x1e280438, + 0xaeaa96ba, 0x802391c2, 0x35a7f213, 0x8d0f57aa, 0xf8a1153b, 0x917a692a, 0xbac0385c, 0x6dc2f7dd, + 0xc573a21b, 0x0469558c, 0xf206c551, 0xfe683c17, 0x54d0c3bc, 0x80734381, 0xc4eef75c, 0x22648b9e, + 0xede23e78, 0x8823f123, 0xd687c6a7, 0x85b6752b, 0xb8cf5160, 0x8109a1c8, 0x1b4c7ceb, 0xaa8b17a6, + 0xeda3fcbf, 0xb6d65214, 0xe6171214, 0x98f4ee28, 0xc1ac9d91, 0x0810d22e, 0x1ccec281, 0xd1911b8a, + 0x272b7696, 0x860fc01d, 0x903c0029, 0xf3308e35, 0x8c2021ef, 0x52ebae93, 0x6ece3f90, 0x2d01f59f, + 0x15cf87c9, 0x79c113fd, 0xcee953e9, 0x6152456a, 0x82d25ea1, 0x743316c4, 0x351f50d1, 0x06e3708f, + 0x45060a80, 0x4c13c59a, 0x0a737387, 0x3eaa3672, 0xe5176942, 0x8431098a, 0x0cd55f05, 0x9d5c2eda, + 0x6df6d514, 0x41a412ea, 0x67606dd0, 0xdec02567, 0xaebddaad, 0xf48d85d8, 0x7f41af4b, 0xbb8b03b7, + 0x29bb612f, 0xc96546c9, 0xb04dfcc9, 0x2ee6c830, 0xafb0bc9e, 0x08e0ef18, 0xea81d1fc, 0xa58be897, + 0xee996482, 0xb7ee4493, 0x0c561cd5, 0x7695207b, 0x763a34f3, 0x7093196a, 0xecf527bd, 0xb3037632, + 0x40fdbc46, 0x72a3f33d, 0xb09e2e73, 0x1b41ab32, 0x32c280f4, 0x865d6444, 0xa998ef38, 0xe1f097de, + 0x5f6c5d4f, 0xfebdf03d, 0xc569ef53, 0xec6decf1, 0x03de6003, 0x0e3063d7, 0x8dd9c0a0, 0x062c97a4, + 0xa45c835e, 0xd167187d, 0xfe55e66e, 0x6b24b6df, 0x572c5189, 0x30c18b20, 0x3c0346f8, 0x5982a13e, + 0xbf491b0f, 0x248df32c, 0x6f572546, 0x51296aff, 0x1a8c0702, 0x94a21284, 0x371e69c8, 0x2298720e, + }, }, - }, - // Level 5 - { - UINT64_C(0xe3c9939c), UINT64_C(0x3d848fecbb), 
{UINT64_C(0), UINT64_C(0)}, // dummy + // Level 5 { - 0x78bb7f84, 0xc6a18ac7, 0xeb321f90, 0x35d4f871, 0x61a5f4a7, 0x6d591ba2, 0x7f93ad57, 0x96841919, - 0xea7890a9, 0x0fa2f69c, 0x1866af58, 0x7f257346, 0xdcc51cd9, 0x92e78656, 0xc4628292, 0x42e01b49, - 0x40541662, 0x37af7888, 0x4faa39af, 0xa3207d98, 0x63750fda, 0x2767c143, 0xf11a2916, 0x618ceb9b, - 0x9d684ce0, 0x69088033, 0x1ab5a1c7, 0x0f0a4f86, 0x4e49f893, 0x0ca32464, 0x90a7c38e, 0x5a0aded0, - 0x2dae1926, 0x0d935a0e, 0xde592a69, 0x085299b2, 0x4977a3a0, 0x7e82d9bc, 0x399e6a95, 0xdb9f1b90, - 0xe1dfe431, 0xbac5a72d, 0x168fe9ef, 0x9727301e, 0x76cd1ddb, 0x2bcd89e0, 0x45b7de13, 0xf239f2ad, - 0xae66187d, 0xb92a6f32, 0xf0fb1c7f, 0xb77384f2, 0x6e405312, 0x6616a82e, 0x9bdca728, 0x1b5e6782, - 0xdd243a3f, 0xf148d161, 0xfe0e7b47, 0x0fdadcf7, 0x9f21d59d, 0x5057328f, 0x22f944b9, 0x7e68d807, - 0x46de914d, 0x2d351dad, 0x6b0f3436, 0x6d6a8943, 0xcd18923c, 0x2e8fa891, 0x33f1ed84, 0x30e3a20a, - 0xa15f52a0, 0x3162fa56, 0xa60d4a72, 0x3e9fab64, 0x0a584673, 0x99d08542, 0x5ce99b5a, 0xcf1be8b0, - 0xe83225e3, 0xad522e70, 0xb17e0c87, 0x5b081b14, 0xc4c71a48, 0xb430a70b, 0xf38673cd, 0x1aad3b26, - 0x0e50ca70, 0xa1aeb568, 0x4140ea0c, 0xdabeee2d, 0x2779c11b, 0x5e06c86e, 0x12803b8f, 0xa46fd322, - 0x7de67db9, 0x7d1ee355, 0xbea94742, 0xf529e572, 0x5374fffc, 0xf9037c7a, 0x1010523f, 0xb1a96f9c, - 0x89b49bfc, 0xf2469dc2, 0x1692f9e1, 0x95ec9a68, 0x09426ab7, 0x0bc30953, 0x8628bd58, 0xa28375f2, - 0xd9d4c2bf, 0xaae40027, 0x2b56df1b, 0x9d9fbc50, 0x14bf937d, 0xe7b0fb0a, 0xa5e40995, 0xfae90145, - 0x1ea68371, 0x671f2f40, 0xc654778c, 0x477cf3fd, 0x6aa5cbda, 0x8f9960c8, 0xc08542ef, 0x88bbddc8, + UINT64_C(0xe3c9939c), UINT64_C(0x3d848fecbb), { UINT64_C(0), UINT64_C(0) }, // dummy + { + 0x78bb7f84, 0xc6a18ac7, 0xeb321f90, 0x35d4f871, 0x61a5f4a7, 0x6d591ba2, 0x7f93ad57, 0x96841919, + 0xea7890a9, 0x0fa2f69c, 0x1866af58, 0x7f257346, 0xdcc51cd9, 0x92e78656, 0xc4628292, 0x42e01b49, + 0x40541662, 0x37af7888, 0x4faa39af, 0xa3207d98, 0x63750fda, 0x2767c143, 0xf11a2916, 
0x618ceb9b, + 0x9d684ce0, 0x69088033, 0x1ab5a1c7, 0x0f0a4f86, 0x4e49f893, 0x0ca32464, 0x90a7c38e, 0x5a0aded0, + 0x2dae1926, 0x0d935a0e, 0xde592a69, 0x085299b2, 0x4977a3a0, 0x7e82d9bc, 0x399e6a95, 0xdb9f1b90, + 0xe1dfe431, 0xbac5a72d, 0x168fe9ef, 0x9727301e, 0x76cd1ddb, 0x2bcd89e0, 0x45b7de13, 0xf239f2ad, + 0xae66187d, 0xb92a6f32, 0xf0fb1c7f, 0xb77384f2, 0x6e405312, 0x6616a82e, 0x9bdca728, 0x1b5e6782, + 0xdd243a3f, 0xf148d161, 0xfe0e7b47, 0x0fdadcf7, 0x9f21d59d, 0x5057328f, 0x22f944b9, 0x7e68d807, + 0x46de914d, 0x2d351dad, 0x6b0f3436, 0x6d6a8943, 0xcd18923c, 0x2e8fa891, 0x33f1ed84, 0x30e3a20a, + 0xa15f52a0, 0x3162fa56, 0xa60d4a72, 0x3e9fab64, 0x0a584673, 0x99d08542, 0x5ce99b5a, 0xcf1be8b0, + 0xe83225e3, 0xad522e70, 0xb17e0c87, 0x5b081b14, 0xc4c71a48, 0xb430a70b, 0xf38673cd, 0x1aad3b26, + 0x0e50ca70, 0xa1aeb568, 0x4140ea0c, 0xdabeee2d, 0x2779c11b, 0x5e06c86e, 0x12803b8f, 0xa46fd322, + 0x7de67db9, 0x7d1ee355, 0xbea94742, 0xf529e572, 0x5374fffc, 0xf9037c7a, 0x1010523f, 0xb1a96f9c, + 0x89b49bfc, 0xf2469dc2, 0x1692f9e1, 0x95ec9a68, 0x09426ab7, 0x0bc30953, 0x8628bd58, 0xa28375f2, + 0xd9d4c2bf, 0xaae40027, 0x2b56df1b, 0x9d9fbc50, 0x14bf937d, 0xe7b0fb0a, 0xa5e40995, 0xfae90145, + 0x1ea68371, 0x671f2f40, 0xc654778c, 0x477cf3fd, 0x6aa5cbda, 0x8f9960c8, 0xc08542ef, 0x88bbddc8, + }, }, - }, - // Level 6 - { - UINT64_C(0xf33fe2d4), UINT64_C(0x3be3330adb), {UINT64_C(0), UINT64_C(0)}, // dummy + // Level 6 { - 0x413faa9b, 0x1a3a2814, 0x957ff066, 0xfc5c55ec, 0x7898f40d, 0x30d71b62, 0xab1f1b9a, 0x5c93c31a, - 0x27e1bf84, 0x277fd4f4, 0xc8de8b61, 0x619ec0a3, 0xcc3106c9, 0x7e07e8c7, 0xadbbff04, 0x986f8050, - 0x26cd3f0a, 0xe7dcfd5a, 0xed3be524, 0x4a1e0f2b, 0xe0888023, 0x24d0c5eb, 0x476e89ae, 0x1a222b82, - 0xb3d0cd98, 0x8856e275, 0x95ac5c19, 0xbbf334b5, 0x1a346ac4, 0x9f9ed27d, 0xe64567c6, 0xfc52f176, - 0x98c8223c, 0xc09233fb, 0x078e98a4, 0xa36a369a, 0x89dfd3f0, 0x10a40ad1, 0xd14f4f1f, 0xe8ec2908, - 0xb9af0bd3, 0x4d55c288, 0xc235e430, 0x77564268, 0x42c4877e, 0x00baab49, 0xd79bda2b, 
0x490fcfc2, - 0x225bfa4b, 0x216af042, 0xac221547, 0x6d8d84e0, 0x17dc383c, 0x49dcb049, 0x46d29882, 0x6661b4ed, - 0x77b0becd, 0xf7a52591, 0x70c7256d, 0x0872d1fd, 0x2940fad9, 0x2c857e39, 0x358bf808, 0x0081180c, - 0x01ec2a40, 0x3b7e716d, 0x2e0da024, 0xb77c9d9f, 0x725b6a35, 0x42d22b0c, 0x30fe2079, 0x8b72db40, - 0xba80de6a, 0x03fb3689, 0x0557ad42, 0x7237cc5d, 0x792b74ae, 0x3bd5a870, 0x136749ef, 0x81c9ddf5, - 0x95b80aa7, 0x7e885861, 0xc797839c, 0x667083b5, 0xe8e9b2d7, 0x9b282b8e, 0x8e7a7db0, 0x79d39fea, - 0x1f9cea00, 0xf7c5c4f1, 0x9e669399, 0x136a5889, 0x680d40a6, 0xea6ba4fa, 0xf7660f4b, 0xfd9af075, - 0xf242ad0c, 0xcf89799a, 0x1173b431, 0x8b3b0aa0, 0xd8e862ff, 0x6ee0e93e, 0x482772e0, 0x6f382985, - 0x995506f1, 0x5f1c3b7f, 0xc54d0f78, 0x5ba663aa, 0x91e7cc43, 0x07295028, 0xe1f9640d, 0x5e0d49cb, - 0xd1d6d96a, 0x7e602d59, 0xc8a376ac, 0x15ddcff4, 0x90481328, 0x543e0eb7, 0x07d297e4, 0xddfb2d18, - 0x94a578aa, 0x9a39368e, 0x6aab286e, 0x0a39debd, 0x8ee5e818, 0x5c30655e, 0x661772e5, 0x527b25c1, + UINT64_C(0xf33fe2d4), UINT64_C(0x3be3330adb), { UINT64_C(0), UINT64_C(0) }, // dummy + { + 0x413faa9b, 0x1a3a2814, 0x957ff066, 0xfc5c55ec, 0x7898f40d, 0x30d71b62, 0xab1f1b9a, 0x5c93c31a, + 0x27e1bf84, 0x277fd4f4, 0xc8de8b61, 0x619ec0a3, 0xcc3106c9, 0x7e07e8c7, 0xadbbff04, 0x986f8050, + 0x26cd3f0a, 0xe7dcfd5a, 0xed3be524, 0x4a1e0f2b, 0xe0888023, 0x24d0c5eb, 0x476e89ae, 0x1a222b82, + 0xb3d0cd98, 0x8856e275, 0x95ac5c19, 0xbbf334b5, 0x1a346ac4, 0x9f9ed27d, 0xe64567c6, 0xfc52f176, + 0x98c8223c, 0xc09233fb, 0x078e98a4, 0xa36a369a, 0x89dfd3f0, 0x10a40ad1, 0xd14f4f1f, 0xe8ec2908, + 0xb9af0bd3, 0x4d55c288, 0xc235e430, 0x77564268, 0x42c4877e, 0x00baab49, 0xd79bda2b, 0x490fcfc2, + 0x225bfa4b, 0x216af042, 0xac221547, 0x6d8d84e0, 0x17dc383c, 0x49dcb049, 0x46d29882, 0x6661b4ed, + 0x77b0becd, 0xf7a52591, 0x70c7256d, 0x0872d1fd, 0x2940fad9, 0x2c857e39, 0x358bf808, 0x0081180c, + 0x01ec2a40, 0x3b7e716d, 0x2e0da024, 0xb77c9d9f, 0x725b6a35, 0x42d22b0c, 0x30fe2079, 0x8b72db40, + 0xba80de6a, 0x03fb3689, 
0x0557ad42, 0x7237cc5d, 0x792b74ae, 0x3bd5a870, 0x136749ef, 0x81c9ddf5, + 0x95b80aa7, 0x7e885861, 0xc797839c, 0x667083b5, 0xe8e9b2d7, 0x9b282b8e, 0x8e7a7db0, 0x79d39fea, + 0x1f9cea00, 0xf7c5c4f1, 0x9e669399, 0x136a5889, 0x680d40a6, 0xea6ba4fa, 0xf7660f4b, 0xfd9af075, + 0xf242ad0c, 0xcf89799a, 0x1173b431, 0x8b3b0aa0, 0xd8e862ff, 0x6ee0e93e, 0x482772e0, 0x6f382985, + 0x995506f1, 0x5f1c3b7f, 0xc54d0f78, 0x5ba663aa, 0x91e7cc43, 0x07295028, 0xe1f9640d, 0x5e0d49cb, + 0xd1d6d96a, 0x7e602d59, 0xc8a376ac, 0x15ddcff4, 0x90481328, 0x543e0eb7, 0x07d297e4, 0xddfb2d18, + 0x94a578aa, 0x9a39368e, 0x6aab286e, 0x0a39debd, 0x8ee5e818, 0x5c30655e, 0x661772e5, 0x527b25c1, + }, }, - }, - // Level 7 - { - UINT64_C(0x6d983dad), UINT64_C(0x3e435b56e5), {UINT64_C(0), UINT64_C(0)}, // dummy + // Level 7 { - 0x4014ee95, 0xfdbe07f6, 0x27a2c5d7, 0x497ae9f0, 0x18a372d5, 0x375c55ae, 0x4aab4110, 0x2d554d43, - 0x9504cbcd, 0xfbaedcce, 0x758c4326, 0xfafbba66, 0x9bda2b02, 0x1d955954, 0xe4bb3e12, 0xd558ed02, - 0x770c3bec, 0x6fcf284d, 0x7142cbb0, 0xefe84369, 0x9516d833, 0x097022c9, 0x8572785a, 0xcc866071, - 0x11084cac, 0x15707ce6, 0xc8a05f69, 0xf15c7b38, 0x3607b067, 0xa8f646b2, 0x62949620, 0x0e013130, - 0xe73a8f37, 0x853e3bd2, 0x4ad40839, 0x961fff58, 0x5b9a291e, 0x4df678ae, 0x9e49ab57, 0x12c0823b, - 0x804a15b9, 0xedbe4a7f, 0x3f65fe91, 0x0aca6940, 0xa14a7dc6, 0xd9a78895, 0x4c90b7fa, 0x90443c6a, - 0xc1325ada, 0x48876a7b, 0x091df649, 0x7ae46bc8, 0xdcfdc695, 0xc398dd91, 0xe6a24f20, 0x333f496b, - 0xe08413da, 0xbd197fa0, 0x55abc5e6, 0xa1abe124, 0x1cfdeee2, 0x48732fff, 0xdb2f1a4a, 0x192de0ae, - 0x87a288b7, 0x406f0062, 0xc4358b22, 0x19ccdeba, 0xa30cd0c5, 0x848d1e9a, 0x2fd31932, 0x7b78238e, - 0x9e9a208e, 0x517f5394, 0x8b689859, 0xe2202a00, 0x7d82aa8d, 0x736d2f4c, 0x8a5c630a, 0xaf1857bf, - 0xd56d5b1f, 0x3416feea, 0x6b16d737, 0xf61f0747, 0x359f0963, 0x6044d7c6, 0xedcdcafd, 0xa53ff8c5, - 0x09c7732a, 0x7f1b4137, 0x9d63e5c0, 0x776c5120, 0x0b0d231e, 0x57e54da1, 0x3b5e1e5e, 0x63069af7, - 0xa44a600c, 0x3d5a02fb, 
0x2387039e, 0xf32214b4, 0x95707014, 0x65ae19ab, 0xa906bfd3, 0x41083458, - 0x106bdfd4, 0x41a3efe8, 0xb58bee3f, 0xaa70953c, 0x01cf2485, 0x40e5bdb9, 0xc94b2765, 0xc79cd151, - 0xad2d9daa, 0x62b40b60, 0x02800b32, 0x97d69686, 0xa9f0efdb, 0x24952809, 0x48694c4f, 0x630104fe, - 0x24f26b53, 0xc94d2a0f, 0x8635b8db, 0xb6822421, 0xe53c26dd, 0x9286330f, 0xf5a431ec, 0xacbb86b4, + UINT64_C(0x6d983dad), UINT64_C(0x3e435b56e5), { UINT64_C(0), UINT64_C(0) }, // dummy + { + 0x4014ee95, 0xfdbe07f6, 0x27a2c5d7, 0x497ae9f0, 0x18a372d5, 0x375c55ae, 0x4aab4110, 0x2d554d43, + 0x9504cbcd, 0xfbaedcce, 0x758c4326, 0xfafbba66, 0x9bda2b02, 0x1d955954, 0xe4bb3e12, 0xd558ed02, + 0x770c3bec, 0x6fcf284d, 0x7142cbb0, 0xefe84369, 0x9516d833, 0x097022c9, 0x8572785a, 0xcc866071, + 0x11084cac, 0x15707ce6, 0xc8a05f69, 0xf15c7b38, 0x3607b067, 0xa8f646b2, 0x62949620, 0x0e013130, + 0xe73a8f37, 0x853e3bd2, 0x4ad40839, 0x961fff58, 0x5b9a291e, 0x4df678ae, 0x9e49ab57, 0x12c0823b, + 0x804a15b9, 0xedbe4a7f, 0x3f65fe91, 0x0aca6940, 0xa14a7dc6, 0xd9a78895, 0x4c90b7fa, 0x90443c6a, + 0xc1325ada, 0x48876a7b, 0x091df649, 0x7ae46bc8, 0xdcfdc695, 0xc398dd91, 0xe6a24f20, 0x333f496b, + 0xe08413da, 0xbd197fa0, 0x55abc5e6, 0xa1abe124, 0x1cfdeee2, 0x48732fff, 0xdb2f1a4a, 0x192de0ae, + 0x87a288b7, 0x406f0062, 0xc4358b22, 0x19ccdeba, 0xa30cd0c5, 0x848d1e9a, 0x2fd31932, 0x7b78238e, + 0x9e9a208e, 0x517f5394, 0x8b689859, 0xe2202a00, 0x7d82aa8d, 0x736d2f4c, 0x8a5c630a, 0xaf1857bf, + 0xd56d5b1f, 0x3416feea, 0x6b16d737, 0xf61f0747, 0x359f0963, 0x6044d7c6, 0xedcdcafd, 0xa53ff8c5, + 0x09c7732a, 0x7f1b4137, 0x9d63e5c0, 0x776c5120, 0x0b0d231e, 0x57e54da1, 0x3b5e1e5e, 0x63069af7, + 0xa44a600c, 0x3d5a02fb, 0x2387039e, 0xf32214b4, 0x95707014, 0x65ae19ab, 0xa906bfd3, 0x41083458, + 0x106bdfd4, 0x41a3efe8, 0xb58bee3f, 0xaa70953c, 0x01cf2485, 0x40e5bdb9, 0xc94b2765, 0xc79cd151, + 0xad2d9daa, 0x62b40b60, 0x02800b32, 0x97d69686, 0xa9f0efdb, 0x24952809, 0x48694c4f, 0x630104fe, + 0x24f26b53, 0xc94d2a0f, 0x8635b8db, 0xb6822421, 0xe53c26dd, 
0x9286330f, 0xf5a431ec, 0xacbb86b4, + }, }, - }, }; -//STATIC_ASSERT(PMPML_32_LEVELS <= 8, "Only 8 levels of data currently exist"); +// STATIC_ASSERT(PMPML_32_LEVELS <= 8, "Only 8 levels of data currently exist"); //------------------------------------------------------------- // 64-bit constants @@ -317,345 +314,345 @@ static thread_local random_data_for_PMPML_32 rd_for_PMPML_32[PMPML_32_LEVELS] = #define PMPML_64_WORD_SIZE_BYTES_LOG2 3 #define PMPML_64_LEVELS 8 // Derived constants -static const uint32_t PMPML_64_CHUNK_SIZE = (1 << PMPML_64_CHUNK_SIZE_LOG2); -static const uint32_t PMPML_64_WORD_SIZE_BYTES = (1 << PMPML_64_WORD_SIZE_BYTES_LOG2); -static const uint32_t PMPML_64_CHUNK_SIZE_BYTES = PMPML_64_CHUNK_SIZE * PMPML_64_WORD_SIZE_BYTES; +static const uint32_t PMPML_64_CHUNK_SIZE = (1 << PMPML_64_CHUNK_SIZE_LOG2 ); +static const uint32_t PMPML_64_WORD_SIZE_BYTES = (1 << PMPML_64_WORD_SIZE_BYTES_LOG2); +static const uint32_t PMPML_64_CHUNK_SIZE_BYTES = PMPML_64_CHUNK_SIZE * PMPML_64_WORD_SIZE_BYTES; static const uint32_t PMPML_64_CHUNK_SIZE_BYTES_LOG2 = PMPML_64_CHUNK_SIZE_LOG2 + PMPML_64_WORD_SIZE_BYTES_LOG2; // container for coefficients -typedef struct alignas(32) _random_data_for_PMPML_64 { - uint64_t const_term; - uint64_t cachedSumLow; - uint64_t cachedSumHigh; - uint64_t dummy; - uint64_t random_coeff[1 << PMPML_64_CHUNK_SIZE_LOG2]; +typedef struct alignas( 32 ) _random_data_for_PMPML_64 { + uint64_t const_term; + uint64_t cachedSumLow; + uint64_t cachedSumHigh; + uint64_t dummy; + uint64_t random_coeff[1 << PMPML_64_CHUNK_SIZE_LOG2]; } random_data_for_PMPML_64; static thread_local random_data_for_PMPML_64 rd_for_PMPML_64[PMPML_64_LEVELS] = { - // Level 0 - { - UINT64_C(0x4a29bfabe82f3abe), UINT64_C(0x2ccb0e578cfa99b), UINT64_C(0x000000041), 0, // sum of coeff and dummy + // Level 0 { - UINT64_C(0x2f129e0f017dff36), UINT64_C(0xb42c52ed219ac8ce), UINT64_C(0xd3324e2b5efdfa21), UINT64_C(0xc830746c5019f1de), - UINT64_C(0x57b1306026904f72), 
UINT64_C(0x0ec3ffd84539cf3d), UINT64_C(0x95664d4564b54986), UINT64_C(0xe0ee74349c002680), - UINT64_C(0x5a365b98971ff939), UINT64_C(0xf6bcac95513c540e), UINT64_C(0x49567d345ab6b3cf), UINT64_C(0x526ab3f6dee0def3), - UINT64_C(0x1d6fb9cf7dc2f089), UINT64_C(0xaeff1dbeb93f0749), UINT64_C(0xd4e05404a7eecac8), UINT64_C(0x5175e11e90cf1a69), - UINT64_C(0x29aac3810d90cf44), UINT64_C(0xe9930a671d8aab37), UINT64_C(0x00eded5ac8eeb924), UINT64_C(0xdb4820639e005b34), - UINT64_C(0x12debc35a3054ea7), UINT64_C(0x5a9dccd55b94986f), UINT64_C(0x666773be4be48027), UINT64_C(0xf9a45b94c9c5ce42), - UINT64_C(0xf3f018ccd958cf92), UINT64_C(0x473c23beeb584939), UINT64_C(0xc5e4f821ec00cd5b), UINT64_C(0x1d61cf5079c28b1c), - UINT64_C(0xf46643c7b0c9427b), UINT64_C(0x34d7177b30a2a078), UINT64_C(0x5279d153b2ab790a), UINT64_C(0xeaf18c48a1791f4c), - UINT64_C(0x90a13cb0c7ccb5b1), UINT64_C(0x2900f5242f23c3e6), UINT64_C(0x0975f1f8a1f6800f), UINT64_C(0xa53f1a9605cce7f2), - UINT64_C(0x0b396087cda51e60), UINT64_C(0x842e287b1fc29d36), UINT64_C(0x4556b0258878e52d), UINT64_C(0x546c60312887a3f0), - UINT64_C(0xdc13b1bb35399672), UINT64_C(0x32f18c1aa7a4697c), UINT64_C(0xc9223ebe2ebe5810), UINT64_C(0xeb845691d3f028e8), - UINT64_C(0xa21337280cc34732), UINT64_C(0x94d78e46776a29e2), UINT64_C(0x6cba9535a7c4c9a8), UINT64_C(0x9758fe18e1fb3d08), - UINT64_C(0x92478227db728e63), UINT64_C(0xa782477118744c90), UINT64_C(0xb1e0b74044f53769), UINT64_C(0x7b3a58b416f2474f), - UINT64_C(0xea041c911fc2991f), UINT64_C(0x4515562dfb118051), UINT64_C(0x36133ab6715ff0bd), UINT64_C(0xb0d107f4c74bcfc7), - UINT64_C(0xef47885bb62db5b8), UINT64_C(0xb2060330e33f5951), UINT64_C(0x96758e992ce56ba6), UINT64_C(0xe6ca7568b7f6a8ec), - UINT64_C(0xd6fd9b1a7b29fb71), UINT64_C(0x2e95d6aaa1593907), UINT64_C(0xf1abe303bdda6758), UINT64_C(0x1eb12f0ed0f91332), - UINT64_C(0xf593589b9ff39cbb), UINT64_C(0x110e67013362cf26), UINT64_C(0x671ca6801c7f9d57), UINT64_C(0x0aa55c338ed83b64), - UINT64_C(0x627d00690f3f465d), UINT64_C(0xff97bfbba48e8524), 
UINT64_C(0x9c3f5a0387919b50), UINT64_C(0x25f1e1efb7f91c48), - UINT64_C(0x7114cada956a53ae), UINT64_C(0x626a4e2ff89c39af), UINT64_C(0x86540186b2e391cc), UINT64_C(0x82d5f935e9a90bcd), - UINT64_C(0xe2d4d3059b6f5dc1), UINT64_C(0xbb3cc83e6478dd2e), UINT64_C(0x59b9b400b166ed62), UINT64_C(0xf04b9b209bb113b1), - UINT64_C(0xb27be3c3397ac130), UINT64_C(0xf619002cc54ac417), UINT64_C(0x46a8c23f12907210), UINT64_C(0x54fc42e7d99aa54f), - UINT64_C(0x2b264e8ea68323e7), UINT64_C(0x0e0b0f627257dfb9), UINT64_C(0xadc098de597949e8), UINT64_C(0xe2ba17b10bd5401a), - UINT64_C(0x7fa49be97f34ca1a), UINT64_C(0x8817b0a7e7d981cf), UINT64_C(0x3bede65042860a1f), UINT64_C(0xae569b2aafd241eb), - UINT64_C(0x5f1cc5a3059aa744), UINT64_C(0x762409219323dae9), UINT64_C(0x64d5aac875461b4e), UINT64_C(0x62147c9101655025), - UINT64_C(0xbde2c420826c8ddd), UINT64_C(0xde6d7e2be12d0797), UINT64_C(0x8338ac734c823357), UINT64_C(0x419b2aa58f1b985a), - UINT64_C(0x39ed88775355ae2d), UINT64_C(0x7a2e8cc72c7f3bce), UINT64_C(0x97935746814fa944), UINT64_C(0x828331abf2018ef4), - UINT64_C(0xd6b9060cd1d0ba56), UINT64_C(0x5548e64ac7626ff2), UINT64_C(0xe4635461f9175d23), UINT64_C(0x566d5d69d40cd206), - UINT64_C(0x65ffaf0c83ae838f), UINT64_C(0x5a585c800a52de9e), UINT64_C(0x64a121bc55d0b7a2), UINT64_C(0x661ef9d5b90d6e53), - UINT64_C(0xb298bfcff8afba20), UINT64_C(0x2a60665850d1a5e8), UINT64_C(0x61aba7a90d9ae6eb), UINT64_C(0x083667e22ffdf423), - UINT64_C(0xd5efe61f9bd9a79c), UINT64_C(0x582a3cf851cafad0), UINT64_C(0x1989365a301ef819), UINT64_C(0xe2778e8aee7b917e), - UINT64_C(0x4bd139ea2fc74066), UINT64_C(0x2716bfaa4b18912a), UINT64_C(0x1a477a7687dbbe34), UINT64_C(0x90127b1d8835c6e1), - UINT64_C(0x44651dc23bfac77d), UINT64_C(0xb030740966562609), UINT64_C(0xb295d4733127a190), UINT64_C(0xf022c66dc7b74382), + UINT64_C(0x4a29bfabe82f3abe), UINT64_C(0x2ccb0e578cfa99b), UINT64_C(0x000000041), 0, // sum of coeff and dummy + { + UINT64_C(0x2f129e0f017dff36), UINT64_C(0xb42c52ed219ac8ce), UINT64_C(0xd3324e2b5efdfa21), 
UINT64_C(0xc830746c5019f1de), + UINT64_C(0x57b1306026904f72), UINT64_C(0x0ec3ffd84539cf3d), UINT64_C(0x95664d4564b54986), UINT64_C(0xe0ee74349c002680), + UINT64_C(0x5a365b98971ff939), UINT64_C(0xf6bcac95513c540e), UINT64_C(0x49567d345ab6b3cf), UINT64_C(0x526ab3f6dee0def3), + UINT64_C(0x1d6fb9cf7dc2f089), UINT64_C(0xaeff1dbeb93f0749), UINT64_C(0xd4e05404a7eecac8), UINT64_C(0x5175e11e90cf1a69), + UINT64_C(0x29aac3810d90cf44), UINT64_C(0xe9930a671d8aab37), UINT64_C(0x00eded5ac8eeb924), UINT64_C(0xdb4820639e005b34), + UINT64_C(0x12debc35a3054ea7), UINT64_C(0x5a9dccd55b94986f), UINT64_C(0x666773be4be48027), UINT64_C(0xf9a45b94c9c5ce42), + UINT64_C(0xf3f018ccd958cf92), UINT64_C(0x473c23beeb584939), UINT64_C(0xc5e4f821ec00cd5b), UINT64_C(0x1d61cf5079c28b1c), + UINT64_C(0xf46643c7b0c9427b), UINT64_C(0x34d7177b30a2a078), UINT64_C(0x5279d153b2ab790a), UINT64_C(0xeaf18c48a1791f4c), + UINT64_C(0x90a13cb0c7ccb5b1), UINT64_C(0x2900f5242f23c3e6), UINT64_C(0x0975f1f8a1f6800f), UINT64_C(0xa53f1a9605cce7f2), + UINT64_C(0x0b396087cda51e60), UINT64_C(0x842e287b1fc29d36), UINT64_C(0x4556b0258878e52d), UINT64_C(0x546c60312887a3f0), + UINT64_C(0xdc13b1bb35399672), UINT64_C(0x32f18c1aa7a4697c), UINT64_C(0xc9223ebe2ebe5810), UINT64_C(0xeb845691d3f028e8), + UINT64_C(0xa21337280cc34732), UINT64_C(0x94d78e46776a29e2), UINT64_C(0x6cba9535a7c4c9a8), UINT64_C(0x9758fe18e1fb3d08), + UINT64_C(0x92478227db728e63), UINT64_C(0xa782477118744c90), UINT64_C(0xb1e0b74044f53769), UINT64_C(0x7b3a58b416f2474f), + UINT64_C(0xea041c911fc2991f), UINT64_C(0x4515562dfb118051), UINT64_C(0x36133ab6715ff0bd), UINT64_C(0xb0d107f4c74bcfc7), + UINT64_C(0xef47885bb62db5b8), UINT64_C(0xb2060330e33f5951), UINT64_C(0x96758e992ce56ba6), UINT64_C(0xe6ca7568b7f6a8ec), + UINT64_C(0xd6fd9b1a7b29fb71), UINT64_C(0x2e95d6aaa1593907), UINT64_C(0xf1abe303bdda6758), UINT64_C(0x1eb12f0ed0f91332), + UINT64_C(0xf593589b9ff39cbb), UINT64_C(0x110e67013362cf26), UINT64_C(0x671ca6801c7f9d57), UINT64_C(0x0aa55c338ed83b64), + 
UINT64_C(0x627d00690f3f465d), UINT64_C(0xff97bfbba48e8524), UINT64_C(0x9c3f5a0387919b50), UINT64_C(0x25f1e1efb7f91c48), + UINT64_C(0x7114cada956a53ae), UINT64_C(0x626a4e2ff89c39af), UINT64_C(0x86540186b2e391cc), UINT64_C(0x82d5f935e9a90bcd), + UINT64_C(0xe2d4d3059b6f5dc1), UINT64_C(0xbb3cc83e6478dd2e), UINT64_C(0x59b9b400b166ed62), UINT64_C(0xf04b9b209bb113b1), + UINT64_C(0xb27be3c3397ac130), UINT64_C(0xf619002cc54ac417), UINT64_C(0x46a8c23f12907210), UINT64_C(0x54fc42e7d99aa54f), + UINT64_C(0x2b264e8ea68323e7), UINT64_C(0x0e0b0f627257dfb9), UINT64_C(0xadc098de597949e8), UINT64_C(0xe2ba17b10bd5401a), + UINT64_C(0x7fa49be97f34ca1a), UINT64_C(0x8817b0a7e7d981cf), UINT64_C(0x3bede65042860a1f), UINT64_C(0xae569b2aafd241eb), + UINT64_C(0x5f1cc5a3059aa744), UINT64_C(0x762409219323dae9), UINT64_C(0x64d5aac875461b4e), UINT64_C(0x62147c9101655025), + UINT64_C(0xbde2c420826c8ddd), UINT64_C(0xde6d7e2be12d0797), UINT64_C(0x8338ac734c823357), UINT64_C(0x419b2aa58f1b985a), + UINT64_C(0x39ed88775355ae2d), UINT64_C(0x7a2e8cc72c7f3bce), UINT64_C(0x97935746814fa944), UINT64_C(0x828331abf2018ef4), + UINT64_C(0xd6b9060cd1d0ba56), UINT64_C(0x5548e64ac7626ff2), UINT64_C(0xe4635461f9175d23), UINT64_C(0x566d5d69d40cd206), + UINT64_C(0x65ffaf0c83ae838f), UINT64_C(0x5a585c800a52de9e), UINT64_C(0x64a121bc55d0b7a2), UINT64_C(0x661ef9d5b90d6e53), + UINT64_C(0xb298bfcff8afba20), UINT64_C(0x2a60665850d1a5e8), UINT64_C(0x61aba7a90d9ae6eb), UINT64_C(0x083667e22ffdf423), + UINT64_C(0xd5efe61f9bd9a79c), UINT64_C(0x582a3cf851cafad0), UINT64_C(0x1989365a301ef819), UINT64_C(0xe2778e8aee7b917e), + UINT64_C(0x4bd139ea2fc74066), UINT64_C(0x2716bfaa4b18912a), UINT64_C(0x1a477a7687dbbe34), UINT64_C(0x90127b1d8835c6e1), + UINT64_C(0x44651dc23bfac77d), UINT64_C(0xb030740966562609), UINT64_C(0xb295d4733127a190), UINT64_C(0xf022c66dc7b74382), + }, }, - }, // Level 1 - { - UINT64_C(0x39cd7650ff4f752a), UINT64_C(0xe9b49347770073e9), UINT64_C(0x00000003f), 0, // sum of coeff and dummy { - 
UINT64_C(0x6a22166c40f87e99), UINT64_C(0xff7e13387c337404), UINT64_C(0xd15f0f4dd5de05be), UINT64_C(0x825bb897d6ad1ef4), - UINT64_C(0x77b045691a63a8ec), UINT64_C(0x0a49df4370eb4048), UINT64_C(0xf6c80d9827e7043b), UINT64_C(0x1628979784f8c50d), - UINT64_C(0xd1a3e1f52402e01b), UINT64_C(0x6cfa2849efd5bc7f), UINT64_C(0xc6416ba240b063ec), UINT64_C(0x772d9ac4e43b2707), - UINT64_C(0x8cc9c4735bea20c5), UINT64_C(0xede4a423d10791b3), UINT64_C(0xc75eb6c16dbb96eb), UINT64_C(0x2df99f5f3ac91794), - UINT64_C(0x31be65ba10763ed5), UINT64_C(0xe89ce26b47440bc2), UINT64_C(0xe537526e59ddafdf), UINT64_C(0x16ae378ed0ef349c), - UINT64_C(0x747c11f0403b290e), UINT64_C(0xc1ada5226937ff10), UINT64_C(0x91886c173226bd6f), UINT64_C(0x7e0002e3c3aaeee3), - UINT64_C(0x65c329b5ce3ffac3), UINT64_C(0xd01f1343a37cc2f7), UINT64_C(0x366e7896927020e8), UINT64_C(0x84327c9993246a19), - UINT64_C(0x2c08dcf57f5487d1), UINT64_C(0x9981f7143c3f09bf), UINT64_C(0xe413c704e8ac8b14), UINT64_C(0x6c1354b6a416b3fb), - UINT64_C(0xaf14a970a5db32a3), UINT64_C(0x37428eb1cbdf20a8), UINT64_C(0x9b3a2f48a45999fc), UINT64_C(0x894d39e47aad1efa), - UINT64_C(0x662abdc6b0bb17e8), UINT64_C(0xd449820255e4bc4a), UINT64_C(0x5fc5d5a18389fa01), UINT64_C(0xf76102aa2484326e), - UINT64_C(0x08c4308c96b8ef43), UINT64_C(0x5c3a562402cee74c), UINT64_C(0xcf896705837e6c8c), UINT64_C(0xe069655ea3c1a067), - UINT64_C(0x3478c1c88ef76c15), UINT64_C(0x8f97330dff9ff33b), UINT64_C(0xba8c150f3fa32e41), UINT64_C(0x1f9be6e624480693), - UINT64_C(0x65d39bd613016d2c), UINT64_C(0x8d4504cb5be46d10), UINT64_C(0xf8b9f2f1685ce679), UINT64_C(0x023c59373ff7edc6), - UINT64_C(0x86283f83c707e5fa), UINT64_C(0xd7c3eebedd1a109b), UINT64_C(0x942b2786ea139167), UINT64_C(0xf54a2b229a268134), - UINT64_C(0x85d175f335d21fa1), UINT64_C(0xce39abb9d7e787e0), UINT64_C(0x3290b3797c71b62d), UINT64_C(0x954aebd35bc2d445), - UINT64_C(0xfb24c9a40287bbea), UINT64_C(0x7c50d2bef8066d38), UINT64_C(0xf8614d3fa751b1d1), UINT64_C(0x0ed6bd1b203b43b9), - UINT64_C(0x7444a688119fc803), 
UINT64_C(0xaafc0cf7a8f588a3), UINT64_C(0x86790f357d28efc6), UINT64_C(0xbc6d006ea2a48c65), - UINT64_C(0x192cd81c89e62897), UINT64_C(0x144a15fa87c09aa8), UINT64_C(0xc9466727de209085), UINT64_C(0xeaf453256eda97d1), - UINT64_C(0x2f0baafb5017bc8e), UINT64_C(0x1871e4808c0438bd), UINT64_C(0x1e78e125290b3e64), UINT64_C(0xb85bef6ba39ebc7d), - UINT64_C(0xc4487e3cabd4bf9e), UINT64_C(0x2ec0963510ce4901), UINT64_C(0x3b760a55c2ffc8aa), UINT64_C(0x0538bff351c74590), - UINT64_C(0xa2720fb707bf396d), UINT64_C(0xbca7ae2418758cc9), UINT64_C(0x6080c33057e68c8d), UINT64_C(0x0ce8e54cf677833c), - UINT64_C(0xc08644e5a40fa1ec), UINT64_C(0x143ce206cebb6352), UINT64_C(0x9842eb597773bb9a), UINT64_C(0xf9a01484a87d6b12), - UINT64_C(0x734da10581a35732), UINT64_C(0x1c5817613ea17f8d), UINT64_C(0xfbeb5bf815f12eb3), UINT64_C(0x0879175b1d28ed23), - UINT64_C(0xc470ffc0a1ce0cfd), UINT64_C(0x0b4b4e44b3d0b5d8), UINT64_C(0x2cd5a8501f56ac9a), UINT64_C(0xf2dfcf44a1689892), - UINT64_C(0x3bf38a66c6b001a2), UINT64_C(0xabfe0c1ce71d4829), UINT64_C(0xde1916f0d7565ad1), UINT64_C(0x97d66cfacf3df802), - UINT64_C(0x0e28348769858002), UINT64_C(0xefed65d521df30e9), UINT64_C(0x33abb8c0116b7721), UINT64_C(0xb21b1751d4a13405), - UINT64_C(0x3c445b844cb809e8), UINT64_C(0x48fe0d52ba18de8c), UINT64_C(0x88206dc4b93a7829), UINT64_C(0x2543fca442fe076b), - UINT64_C(0x4c6b6b567a3571d3), UINT64_C(0x47d9c2f551c39ba7), UINT64_C(0x2c6e0a4ebba24ac4), UINT64_C(0xb0a1c2f16942e728), - UINT64_C(0x536ca9a81adc2f15), UINT64_C(0xd84840af846d8115), UINT64_C(0x6a85aa0fa3159219), UINT64_C(0x4c167b95be156d20), - UINT64_C(0xcd3f7f07382d52cb), UINT64_C(0x000020e3a8604961), UINT64_C(0x0889912d52e797ba), UINT64_C(0x19eca83144939b12), - UINT64_C(0xb746c4bc57d2b80d), UINT64_C(0x5f19680e72e9ae82), UINT64_C(0xc8d7c655d341f90e), UINT64_C(0xd5d17f24f8e76882), - UINT64_C(0x111bc49d022a5575), UINT64_C(0xd6c434f7739424b9), UINT64_C(0x5d56d36b4ded16fe), UINT64_C(0x910276b4a008443f), + UINT64_C(0x39cd7650ff4f752a), UINT64_C(0xe9b49347770073e9), 
UINT64_C(0x00000003f), 0, // sum of coeff and dummy + { + UINT64_C(0x6a22166c40f87e99), UINT64_C(0xff7e13387c337404), UINT64_C(0xd15f0f4dd5de05be), UINT64_C(0x825bb897d6ad1ef4), + UINT64_C(0x77b045691a63a8ec), UINT64_C(0x0a49df4370eb4048), UINT64_C(0xf6c80d9827e7043b), UINT64_C(0x1628979784f8c50d), + UINT64_C(0xd1a3e1f52402e01b), UINT64_C(0x6cfa2849efd5bc7f), UINT64_C(0xc6416ba240b063ec), UINT64_C(0x772d9ac4e43b2707), + UINT64_C(0x8cc9c4735bea20c5), UINT64_C(0xede4a423d10791b3), UINT64_C(0xc75eb6c16dbb96eb), UINT64_C(0x2df99f5f3ac91794), + UINT64_C(0x31be65ba10763ed5), UINT64_C(0xe89ce26b47440bc2), UINT64_C(0xe537526e59ddafdf), UINT64_C(0x16ae378ed0ef349c), + UINT64_C(0x747c11f0403b290e), UINT64_C(0xc1ada5226937ff10), UINT64_C(0x91886c173226bd6f), UINT64_C(0x7e0002e3c3aaeee3), + UINT64_C(0x65c329b5ce3ffac3), UINT64_C(0xd01f1343a37cc2f7), UINT64_C(0x366e7896927020e8), UINT64_C(0x84327c9993246a19), + UINT64_C(0x2c08dcf57f5487d1), UINT64_C(0x9981f7143c3f09bf), UINT64_C(0xe413c704e8ac8b14), UINT64_C(0x6c1354b6a416b3fb), + UINT64_C(0xaf14a970a5db32a3), UINT64_C(0x37428eb1cbdf20a8), UINT64_C(0x9b3a2f48a45999fc), UINT64_C(0x894d39e47aad1efa), + UINT64_C(0x662abdc6b0bb17e8), UINT64_C(0xd449820255e4bc4a), UINT64_C(0x5fc5d5a18389fa01), UINT64_C(0xf76102aa2484326e), + UINT64_C(0x08c4308c96b8ef43), UINT64_C(0x5c3a562402cee74c), UINT64_C(0xcf896705837e6c8c), UINT64_C(0xe069655ea3c1a067), + UINT64_C(0x3478c1c88ef76c15), UINT64_C(0x8f97330dff9ff33b), UINT64_C(0xba8c150f3fa32e41), UINT64_C(0x1f9be6e624480693), + UINT64_C(0x65d39bd613016d2c), UINT64_C(0x8d4504cb5be46d10), UINT64_C(0xf8b9f2f1685ce679), UINT64_C(0x023c59373ff7edc6), + UINT64_C(0x86283f83c707e5fa), UINT64_C(0xd7c3eebedd1a109b), UINT64_C(0x942b2786ea139167), UINT64_C(0xf54a2b229a268134), + UINT64_C(0x85d175f335d21fa1), UINT64_C(0xce39abb9d7e787e0), UINT64_C(0x3290b3797c71b62d), UINT64_C(0x954aebd35bc2d445), + UINT64_C(0xfb24c9a40287bbea), UINT64_C(0x7c50d2bef8066d38), UINT64_C(0xf8614d3fa751b1d1), 
UINT64_C(0x0ed6bd1b203b43b9), + UINT64_C(0x7444a688119fc803), UINT64_C(0xaafc0cf7a8f588a3), UINT64_C(0x86790f357d28efc6), UINT64_C(0xbc6d006ea2a48c65), + UINT64_C(0x192cd81c89e62897), UINT64_C(0x144a15fa87c09aa8), UINT64_C(0xc9466727de209085), UINT64_C(0xeaf453256eda97d1), + UINT64_C(0x2f0baafb5017bc8e), UINT64_C(0x1871e4808c0438bd), UINT64_C(0x1e78e125290b3e64), UINT64_C(0xb85bef6ba39ebc7d), + UINT64_C(0xc4487e3cabd4bf9e), UINT64_C(0x2ec0963510ce4901), UINT64_C(0x3b760a55c2ffc8aa), UINT64_C(0x0538bff351c74590), + UINT64_C(0xa2720fb707bf396d), UINT64_C(0xbca7ae2418758cc9), UINT64_C(0x6080c33057e68c8d), UINT64_C(0x0ce8e54cf677833c), + UINT64_C(0xc08644e5a40fa1ec), UINT64_C(0x143ce206cebb6352), UINT64_C(0x9842eb597773bb9a), UINT64_C(0xf9a01484a87d6b12), + UINT64_C(0x734da10581a35732), UINT64_C(0x1c5817613ea17f8d), UINT64_C(0xfbeb5bf815f12eb3), UINT64_C(0x0879175b1d28ed23), + UINT64_C(0xc470ffc0a1ce0cfd), UINT64_C(0x0b4b4e44b3d0b5d8), UINT64_C(0x2cd5a8501f56ac9a), UINT64_C(0xf2dfcf44a1689892), + UINT64_C(0x3bf38a66c6b001a2), UINT64_C(0xabfe0c1ce71d4829), UINT64_C(0xde1916f0d7565ad1), UINT64_C(0x97d66cfacf3df802), + UINT64_C(0x0e28348769858002), UINT64_C(0xefed65d521df30e9), UINT64_C(0x33abb8c0116b7721), UINT64_C(0xb21b1751d4a13405), + UINT64_C(0x3c445b844cb809e8), UINT64_C(0x48fe0d52ba18de8c), UINT64_C(0x88206dc4b93a7829), UINT64_C(0x2543fca442fe076b), + UINT64_C(0x4c6b6b567a3571d3), UINT64_C(0x47d9c2f551c39ba7), UINT64_C(0x2c6e0a4ebba24ac4), UINT64_C(0xb0a1c2f16942e728), + UINT64_C(0x536ca9a81adc2f15), UINT64_C(0xd84840af846d8115), UINT64_C(0x6a85aa0fa3159219), UINT64_C(0x4c167b95be156d20), + UINT64_C(0xcd3f7f07382d52cb), UINT64_C(0x000020e3a8604961), UINT64_C(0x0889912d52e797ba), UINT64_C(0x19eca83144939b12), + UINT64_C(0xb746c4bc57d2b80d), UINT64_C(0x5f19680e72e9ae82), UINT64_C(0xc8d7c655d341f90e), UINT64_C(0xd5d17f24f8e76882), + UINT64_C(0x111bc49d022a5575), UINT64_C(0xd6c434f7739424b9), UINT64_C(0x5d56d36b4ded16fe), UINT64_C(0x910276b4a008443f), + }, }, - }, // 
Level 2 - { - UINT64_C(0x8d88b6de8694f9bd), UINT64_C(0xab3746b512cf0a0e), UINT64_C(0x00000003d), 0, // sum of coeff and dummy { - UINT64_C(0x8c35afea7008c707), UINT64_C(0x41ead554cfccdc94), UINT64_C(0x2efb2ec168e3bffc), UINT64_C(0xe7c3a0bbddc63920), - UINT64_C(0x4dce9e2b34302387), UINT64_C(0xfaf035fd5624990c), UINT64_C(0xccd919a786ba8213), UINT64_C(0x9a18857bdb2be4c1), - UINT64_C(0x001d03ba509647b6), UINT64_C(0x7e331694b4f66982), UINT64_C(0xb478c5a41317d762), UINT64_C(0xe717e226317c1144), - UINT64_C(0x022ffa0a2f15f66e), UINT64_C(0x6519929c261c063c), UINT64_C(0xff2060eae017d4e0), UINT64_C(0xefff6af725b87556), - UINT64_C(0x5d4d573a24be5312), UINT64_C(0xc07e9f4f495eb740), UINT64_C(0x5257032ed4c0e657), UINT64_C(0x2841f8526903c4ce), - UINT64_C(0xa5deee0ffb84873b), UINT64_C(0x45ce5d741491bbb2), UINT64_C(0x9c2b70601078ed64), UINT64_C(0x43837fdef168a0b0), - UINT64_C(0xf2ac139bf0bef9e8), UINT64_C(0x31f63ea0f89c8f29), UINT64_C(0x566268e5d7e2b1a7), UINT64_C(0x90a1dcf90070c039), - UINT64_C(0xb656b46da32098f3), UINT64_C(0x932e618f2bf02ff5), UINT64_C(0x6567346814e558c3), UINT64_C(0x6fee0aa9bbcd1aab), - UINT64_C(0x55a497a53ecf775d), UINT64_C(0xcce903fab3ead90d), UINT64_C(0x7fe3e530e9d3eaa0), UINT64_C(0x4dde47c8e75c1597), - UINT64_C(0x9d487b4725819ca5), UINT64_C(0x5893db2002678a18), UINT64_C(0x75f4da89918d8bff), UINT64_C(0x46736d07b2f80ed6), - UINT64_C(0x2b6e79c066e45341), UINT64_C(0xce708ef399b937cb), UINT64_C(0xa63749ae5d4f1767), UINT64_C(0x635d830a136e0563), - UINT64_C(0x55eea54f48f48df6), UINT64_C(0x68a076896b939688), UINT64_C(0x6e980d43ce7b11e9), UINT64_C(0x199065b551f0a7da), - UINT64_C(0x5d42faee0cb91d94), UINT64_C(0xa1770f53043c2107), UINT64_C(0x35c1ac46c4e4a748), UINT64_C(0xff43f86b0cd6ab3b), - UINT64_C(0x279dbad410c06a67), UINT64_C(0x40017b35ed84446a), UINT64_C(0xa73172134f9c5e8f), UINT64_C(0xfcff1de2975b0043), - UINT64_C(0xae0dd9ae2cfa364f), UINT64_C(0x52129c7818987b00), UINT64_C(0xaa0e91dae1a89606), UINT64_C(0x91dc4cbfdbb14973), - UINT64_C(0xb0ab9a3a7281965c), 
UINT64_C(0x9a8e2941fc1696a4), UINT64_C(0x6c76a89ed0a78b2c), UINT64_C(0xaa2539208db7d79a), - UINT64_C(0xcd5a73ca1b8ad462), UINT64_C(0xd2844afcfff68b7a), UINT64_C(0x808b81ab58a3c11e), UINT64_C(0x2003a1d79ee96e7e), - UINT64_C(0x87b236e5742b42d7), UINT64_C(0x3a3610e8bad3b373), UINT64_C(0xb481ca092e54fd87), UINT64_C(0xaf8adee08b5326e7), - UINT64_C(0x3ee2e6130ab53ef6), UINT64_C(0xbf7427af75a7c2d1), UINT64_C(0x4d7a6067dbeed20f), UINT64_C(0xcbdb5568d804ef3f), - UINT64_C(0x508ff58236e7a6f9), UINT64_C(0xacf7eac3c3037dab), UINT64_C(0x482b277d6928bddc), UINT64_C(0x538974760ddc6f83), - UINT64_C(0x6c3b990a1194ebe4), UINT64_C(0xeb3dfeda259aae19), UINT64_C(0x1043b1e32e6a609c), UINT64_C(0xe29853f3b731712a), - UINT64_C(0x725474cd1469a035), UINT64_C(0x08cc37d08547e287), UINT64_C(0x0de8c6d9ae66fe36), UINT64_C(0xaaef7eb47eb75f52), - UINT64_C(0xa29a69722b3bf66b), UINT64_C(0xd44d96ca50981b64), UINT64_C(0x0952a0827ec5b006), UINT64_C(0xaeced6c30c1fff4a), - UINT64_C(0xcf8551b4584c0c46), UINT64_C(0x2611b04aafedc71c), UINT64_C(0xd927dc8e6de6164f), UINT64_C(0x1fd5e2029d572551), - UINT64_C(0x45ad5bcd4bf72122), UINT64_C(0x54a3c4b12c343b21), UINT64_C(0x96156949c3f32a47), UINT64_C(0xa81023ef8e94e51b), - UINT64_C(0x26d335efc1d4efde), UINT64_C(0x669c4846e9284067), UINT64_C(0xcabd41a53335f6e1), UINT64_C(0x4f517812e06a917f), - UINT64_C(0xcdd989ce6aa55626), UINT64_C(0x5ca882c756fe4999), UINT64_C(0x639d8b99c6477c42), UINT64_C(0x2716a772911dca49), - UINT64_C(0x4374400157dc3d13), UINT64_C(0x1d0a512182a280f5), UINT64_C(0xd822a4f87a0ad77c), UINT64_C(0x0a0ab212f142db2b), - UINT64_C(0xe80fb8a935595883), UINT64_C(0x7568eec35a490b83), UINT64_C(0x09abdb9e114df5fc), UINT64_C(0x55137c447d1bca41), - UINT64_C(0x0de593a7acafcc85), UINT64_C(0xb975febcee3ca728), UINT64_C(0x63bef68e44fea1d5), UINT64_C(0xb013be7092b2a894), - UINT64_C(0xeba8c75d166e19d9), UINT64_C(0x224ad7936de628b9), UINT64_C(0x42b55663e6da91c0), UINT64_C(0x68f73c834d3b02a8), - UINT64_C(0x0bd2a1b0f697dc42), UINT64_C(0x89fc577d065f571a), 
UINT64_C(0xdc714c2c16925d8d), UINT64_C(0x5f94692fe9a6b2eb), + UINT64_C(0x8d88b6de8694f9bd), UINT64_C(0xab3746b512cf0a0e), UINT64_C(0x00000003d), 0, // sum of coeff and dummy + { + UINT64_C(0x8c35afea7008c707), UINT64_C(0x41ead554cfccdc94), UINT64_C(0x2efb2ec168e3bffc), UINT64_C(0xe7c3a0bbddc63920), + UINT64_C(0x4dce9e2b34302387), UINT64_C(0xfaf035fd5624990c), UINT64_C(0xccd919a786ba8213), UINT64_C(0x9a18857bdb2be4c1), + UINT64_C(0x001d03ba509647b6), UINT64_C(0x7e331694b4f66982), UINT64_C(0xb478c5a41317d762), UINT64_C(0xe717e226317c1144), + UINT64_C(0x022ffa0a2f15f66e), UINT64_C(0x6519929c261c063c), UINT64_C(0xff2060eae017d4e0), UINT64_C(0xefff6af725b87556), + UINT64_C(0x5d4d573a24be5312), UINT64_C(0xc07e9f4f495eb740), UINT64_C(0x5257032ed4c0e657), UINT64_C(0x2841f8526903c4ce), + UINT64_C(0xa5deee0ffb84873b), UINT64_C(0x45ce5d741491bbb2), UINT64_C(0x9c2b70601078ed64), UINT64_C(0x43837fdef168a0b0), + UINT64_C(0xf2ac139bf0bef9e8), UINT64_C(0x31f63ea0f89c8f29), UINT64_C(0x566268e5d7e2b1a7), UINT64_C(0x90a1dcf90070c039), + UINT64_C(0xb656b46da32098f3), UINT64_C(0x932e618f2bf02ff5), UINT64_C(0x6567346814e558c3), UINT64_C(0x6fee0aa9bbcd1aab), + UINT64_C(0x55a497a53ecf775d), UINT64_C(0xcce903fab3ead90d), UINT64_C(0x7fe3e530e9d3eaa0), UINT64_C(0x4dde47c8e75c1597), + UINT64_C(0x9d487b4725819ca5), UINT64_C(0x5893db2002678a18), UINT64_C(0x75f4da89918d8bff), UINT64_C(0x46736d07b2f80ed6), + UINT64_C(0x2b6e79c066e45341), UINT64_C(0xce708ef399b937cb), UINT64_C(0xa63749ae5d4f1767), UINT64_C(0x635d830a136e0563), + UINT64_C(0x55eea54f48f48df6), UINT64_C(0x68a076896b939688), UINT64_C(0x6e980d43ce7b11e9), UINT64_C(0x199065b551f0a7da), + UINT64_C(0x5d42faee0cb91d94), UINT64_C(0xa1770f53043c2107), UINT64_C(0x35c1ac46c4e4a748), UINT64_C(0xff43f86b0cd6ab3b), + UINT64_C(0x279dbad410c06a67), UINT64_C(0x40017b35ed84446a), UINT64_C(0xa73172134f9c5e8f), UINT64_C(0xfcff1de2975b0043), + UINT64_C(0xae0dd9ae2cfa364f), UINT64_C(0x52129c7818987b00), UINT64_C(0xaa0e91dae1a89606), 
UINT64_C(0x91dc4cbfdbb14973), + UINT64_C(0xb0ab9a3a7281965c), UINT64_C(0x9a8e2941fc1696a4), UINT64_C(0x6c76a89ed0a78b2c), UINT64_C(0xaa2539208db7d79a), + UINT64_C(0xcd5a73ca1b8ad462), UINT64_C(0xd2844afcfff68b7a), UINT64_C(0x808b81ab58a3c11e), UINT64_C(0x2003a1d79ee96e7e), + UINT64_C(0x87b236e5742b42d7), UINT64_C(0x3a3610e8bad3b373), UINT64_C(0xb481ca092e54fd87), UINT64_C(0xaf8adee08b5326e7), + UINT64_C(0x3ee2e6130ab53ef6), UINT64_C(0xbf7427af75a7c2d1), UINT64_C(0x4d7a6067dbeed20f), UINT64_C(0xcbdb5568d804ef3f), + UINT64_C(0x508ff58236e7a6f9), UINT64_C(0xacf7eac3c3037dab), UINT64_C(0x482b277d6928bddc), UINT64_C(0x538974760ddc6f83), + UINT64_C(0x6c3b990a1194ebe4), UINT64_C(0xeb3dfeda259aae19), UINT64_C(0x1043b1e32e6a609c), UINT64_C(0xe29853f3b731712a), + UINT64_C(0x725474cd1469a035), UINT64_C(0x08cc37d08547e287), UINT64_C(0x0de8c6d9ae66fe36), UINT64_C(0xaaef7eb47eb75f52), + UINT64_C(0xa29a69722b3bf66b), UINT64_C(0xd44d96ca50981b64), UINT64_C(0x0952a0827ec5b006), UINT64_C(0xaeced6c30c1fff4a), + UINT64_C(0xcf8551b4584c0c46), UINT64_C(0x2611b04aafedc71c), UINT64_C(0xd927dc8e6de6164f), UINT64_C(0x1fd5e2029d572551), + UINT64_C(0x45ad5bcd4bf72122), UINT64_C(0x54a3c4b12c343b21), UINT64_C(0x96156949c3f32a47), UINT64_C(0xa81023ef8e94e51b), + UINT64_C(0x26d335efc1d4efde), UINT64_C(0x669c4846e9284067), UINT64_C(0xcabd41a53335f6e1), UINT64_C(0x4f517812e06a917f), + UINT64_C(0xcdd989ce6aa55626), UINT64_C(0x5ca882c756fe4999), UINT64_C(0x639d8b99c6477c42), UINT64_C(0x2716a772911dca49), + UINT64_C(0x4374400157dc3d13), UINT64_C(0x1d0a512182a280f5), UINT64_C(0xd822a4f87a0ad77c), UINT64_C(0x0a0ab212f142db2b), + UINT64_C(0xe80fb8a935595883), UINT64_C(0x7568eec35a490b83), UINT64_C(0x09abdb9e114df5fc), UINT64_C(0x55137c447d1bca41), + UINT64_C(0x0de593a7acafcc85), UINT64_C(0xb975febcee3ca728), UINT64_C(0x63bef68e44fea1d5), UINT64_C(0xb013be7092b2a894), + UINT64_C(0xeba8c75d166e19d9), UINT64_C(0x224ad7936de628b9), UINT64_C(0x42b55663e6da91c0), UINT64_C(0x68f73c834d3b02a8), + 
UINT64_C(0x0bd2a1b0f697dc42), UINT64_C(0x89fc577d065f571a), UINT64_C(0xdc714c2c16925d8d), UINT64_C(0x5f94692fe9a6b2eb), + }, }, - }, // Level 3 - { - UINT64_C(0x8370e3dd2dd7e740), UINT64_C(0x4ac7a23650afaa5d), UINT64_C(0x00000003c), 0, // sum of coeff and dummy { - UINT64_C(0x141a416e635e3008), UINT64_C(0xe59e5696300fc54e), UINT64_C(0x3ac6afaf368cd3a6), UINT64_C(0x1c4d7641d7192768), - UINT64_C(0xaae556230b19cb19), UINT64_C(0x09fe3e074ade9f7e), UINT64_C(0xcc11adbd55ed21af), UINT64_C(0x862d3632edce6066), - UINT64_C(0x83200725a18ecf18), UINT64_C(0xef8a88f410ebfffa), UINT64_C(0x8f32ade56cc5cd11), UINT64_C(0x68601c8acb3b697b), - UINT64_C(0x3f7bc460e435c5be), UINT64_C(0xead87aaff097bf77), UINT64_C(0x5d35b160f1047863), UINT64_C(0x3c7c707d1decebe3), - UINT64_C(0xffab7fcb4b288977), UINT64_C(0xbb30bf67ea8078d4), UINT64_C(0x08c14f33079c0375), UINT64_C(0xc34be6df85f4e084), - UINT64_C(0xc5d61545239490a8), UINT64_C(0xc206111b5df05780), UINT64_C(0xb40b9d277b5eb1a6), UINT64_C(0x61f772ed20991bd7), - UINT64_C(0xa423cf9ee644f9b9), UINT64_C(0x63a281c7fb30afbe), UINT64_C(0x33dd3deb21ee47f3), UINT64_C(0x3d882a465f6520e0), - UINT64_C(0xd8f44673c67ff2c6), UINT64_C(0x159cafea157a4f90), UINT64_C(0x38a18e681a48e2a0), UINT64_C(0xb9ebf2a06fe035b4), - UINT64_C(0xdd504b49fd3e67bb), UINT64_C(0xae67fb542747c488), UINT64_C(0x7416c312f3387e02), UINT64_C(0xa5bebc6a0bc34dd0), - UINT64_C(0x89a98f212c21c94a), UINT64_C(0xd377d8c55c6c78c8), UINT64_C(0x23f194d2e59b81d0), UINT64_C(0xc0efd26a5d0ed051), - UINT64_C(0x0112146515113ef8), UINT64_C(0x2031a3cd82ce8702), UINT64_C(0x7ec8e3c87ce50a07), UINT64_C(0x47a142fc6fcd89c7), - UINT64_C(0x2bcb63e57f0cae2f), UINT64_C(0x8664c6f962a87b24), UINT64_C(0xe6d174ff007b2c34), UINT64_C(0x87e09c902d073b32), - UINT64_C(0xb543d64ed7dfb009), UINT64_C(0x7c31c340b3dae313), UINT64_C(0x562ba6cf0b4713cc), UINT64_C(0x957f23822221316e), - UINT64_C(0x9612164e43a7d75e), UINT64_C(0x66088836498298a7), UINT64_C(0x2277a69befc583cd), UINT64_C(0xc6a74c6baecd220d), - 
UINT64_C(0xc3df4a454eaf882f), UINT64_C(0x4c70af7cee8f0bbc), UINT64_C(0x2ba3590fd97517d4), UINT64_C(0xbb00a28e752d346c), - UINT64_C(0xebfa174a39681974), UINT64_C(0x033d8678eca2890b), UINT64_C(0xede2c5142f49827c), UINT64_C(0x614d56f55dde9f8b), - UINT64_C(0x72e2e9d5582a0a08), UINT64_C(0x9d1f6238ddac882b), UINT64_C(0xfcd3682c3bd70286), UINT64_C(0x8958816740699ee2), - UINT64_C(0xa5c7a3559d07b917), UINT64_C(0x4d8e82254c5a70e4), UINT64_C(0x291f69d4c89e5c45), UINT64_C(0x9c94a14902c4b249), - UINT64_C(0xd9bcf68e0f055258), UINT64_C(0x3a0cc6dcfffd05b7), UINT64_C(0xf0a22a2d6b06d03a), UINT64_C(0xeb9a2918852926aa), - UINT64_C(0x37915f797a6675f7), UINT64_C(0x98cdbb4e1686b742), UINT64_C(0x7007270bff4fcbe1), UINT64_C(0xc458d4068dc6c70f), - UINT64_C(0x073bbe0965ce93f3), UINT64_C(0xe7f2df0297e091e6), UINT64_C(0x3bf1a925fb9e6d1c), UINT64_C(0x48af31eef7b34f4b), - UINT64_C(0x00e92e127962fa5e), UINT64_C(0x0f8fc920466f3cd3), UINT64_C(0x25a21a02222a64b5), UINT64_C(0xb9853aa495decb46), - UINT64_C(0x262dc131bb0c35bb), UINT64_C(0xaf519c96fb0e9f68), UINT64_C(0x755849eedbb94ff2), UINT64_C(0x13a3d660e45f77b0), - UINT64_C(0x9f5d4268c5d69a64), UINT64_C(0x8c8a5e806938377c), UINT64_C(0x5bd34bfb54b64524), UINT64_C(0x6b5f1db574ecfaa9), - UINT64_C(0x37f725e56c1e9dc3), UINT64_C(0xc7fe10ac9904f90f), UINT64_C(0x879ae4eff04c0ab8), UINT64_C(0x76aea0675622e495), - UINT64_C(0xe29e3a0ebbe40dba), UINT64_C(0x157ffad6ff36b56f), UINT64_C(0x5466d89bca624434), UINT64_C(0x5449470d65bc5b35), - UINT64_C(0x7f6c99db52e6348a), UINT64_C(0x776d4dff2abd85c7), UINT64_C(0xb010a7f1beffcc1a), UINT64_C(0xad74603f4c6d9ab6), - UINT64_C(0x0599c30e3b018f16), UINT64_C(0x127a45fdeef28abd), UINT64_C(0x4cf790e8928575a0), UINT64_C(0x58fa1edd4caa9a51), - UINT64_C(0x5f3e8dd37e04eb51), UINT64_C(0xac131e1aea11807f), UINT64_C(0xf46fd7f990fb8cca), UINT64_C(0x73963b93ad4b9bb2), - UINT64_C(0x004c15e2478e8c36), UINT64_C(0xc79d966848c52c68), UINT64_C(0x827091c5d5309f35), UINT64_C(0x8e6290b4ecb7be34), - UINT64_C(0x4a2a701831915090), 
UINT64_C(0xb9ed682c26ae8721), UINT64_C(0x06c94a32c3f063b5), UINT64_C(0x11946415f289d8b4), - UINT64_C(0x4e6d4a3b505cd181), UINT64_C(0x7ad8e06beddabbeb), UINT64_C(0x272e050758ccfa94), UINT64_C(0x1a38a7703463de87), + UINT64_C(0x8370e3dd2dd7e740), UINT64_C(0x4ac7a23650afaa5d), UINT64_C(0x00000003c), 0, // sum of coeff and dummy + { + UINT64_C(0x141a416e635e3008), UINT64_C(0xe59e5696300fc54e), UINT64_C(0x3ac6afaf368cd3a6), UINT64_C(0x1c4d7641d7192768), + UINT64_C(0xaae556230b19cb19), UINT64_C(0x09fe3e074ade9f7e), UINT64_C(0xcc11adbd55ed21af), UINT64_C(0x862d3632edce6066), + UINT64_C(0x83200725a18ecf18), UINT64_C(0xef8a88f410ebfffa), UINT64_C(0x8f32ade56cc5cd11), UINT64_C(0x68601c8acb3b697b), + UINT64_C(0x3f7bc460e435c5be), UINT64_C(0xead87aaff097bf77), UINT64_C(0x5d35b160f1047863), UINT64_C(0x3c7c707d1decebe3), + UINT64_C(0xffab7fcb4b288977), UINT64_C(0xbb30bf67ea8078d4), UINT64_C(0x08c14f33079c0375), UINT64_C(0xc34be6df85f4e084), + UINT64_C(0xc5d61545239490a8), UINT64_C(0xc206111b5df05780), UINT64_C(0xb40b9d277b5eb1a6), UINT64_C(0x61f772ed20991bd7), + UINT64_C(0xa423cf9ee644f9b9), UINT64_C(0x63a281c7fb30afbe), UINT64_C(0x33dd3deb21ee47f3), UINT64_C(0x3d882a465f6520e0), + UINT64_C(0xd8f44673c67ff2c6), UINT64_C(0x159cafea157a4f90), UINT64_C(0x38a18e681a48e2a0), UINT64_C(0xb9ebf2a06fe035b4), + UINT64_C(0xdd504b49fd3e67bb), UINT64_C(0xae67fb542747c488), UINT64_C(0x7416c312f3387e02), UINT64_C(0xa5bebc6a0bc34dd0), + UINT64_C(0x89a98f212c21c94a), UINT64_C(0xd377d8c55c6c78c8), UINT64_C(0x23f194d2e59b81d0), UINT64_C(0xc0efd26a5d0ed051), + UINT64_C(0x0112146515113ef8), UINT64_C(0x2031a3cd82ce8702), UINT64_C(0x7ec8e3c87ce50a07), UINT64_C(0x47a142fc6fcd89c7), + UINT64_C(0x2bcb63e57f0cae2f), UINT64_C(0x8664c6f962a87b24), UINT64_C(0xe6d174ff007b2c34), UINT64_C(0x87e09c902d073b32), + UINT64_C(0xb543d64ed7dfb009), UINT64_C(0x7c31c340b3dae313), UINT64_C(0x562ba6cf0b4713cc), UINT64_C(0x957f23822221316e), + UINT64_C(0x9612164e43a7d75e), UINT64_C(0x66088836498298a7), 
UINT64_C(0x2277a69befc583cd), UINT64_C(0xc6a74c6baecd220d), + UINT64_C(0xc3df4a454eaf882f), UINT64_C(0x4c70af7cee8f0bbc), UINT64_C(0x2ba3590fd97517d4), UINT64_C(0xbb00a28e752d346c), + UINT64_C(0xebfa174a39681974), UINT64_C(0x033d8678eca2890b), UINT64_C(0xede2c5142f49827c), UINT64_C(0x614d56f55dde9f8b), + UINT64_C(0x72e2e9d5582a0a08), UINT64_C(0x9d1f6238ddac882b), UINT64_C(0xfcd3682c3bd70286), UINT64_C(0x8958816740699ee2), + UINT64_C(0xa5c7a3559d07b917), UINT64_C(0x4d8e82254c5a70e4), UINT64_C(0x291f69d4c89e5c45), UINT64_C(0x9c94a14902c4b249), + UINT64_C(0xd9bcf68e0f055258), UINT64_C(0x3a0cc6dcfffd05b7), UINT64_C(0xf0a22a2d6b06d03a), UINT64_C(0xeb9a2918852926aa), + UINT64_C(0x37915f797a6675f7), UINT64_C(0x98cdbb4e1686b742), UINT64_C(0x7007270bff4fcbe1), UINT64_C(0xc458d4068dc6c70f), + UINT64_C(0x073bbe0965ce93f3), UINT64_C(0xe7f2df0297e091e6), UINT64_C(0x3bf1a925fb9e6d1c), UINT64_C(0x48af31eef7b34f4b), + UINT64_C(0x00e92e127962fa5e), UINT64_C(0x0f8fc920466f3cd3), UINT64_C(0x25a21a02222a64b5), UINT64_C(0xb9853aa495decb46), + UINT64_C(0x262dc131bb0c35bb), UINT64_C(0xaf519c96fb0e9f68), UINT64_C(0x755849eedbb94ff2), UINT64_C(0x13a3d660e45f77b0), + UINT64_C(0x9f5d4268c5d69a64), UINT64_C(0x8c8a5e806938377c), UINT64_C(0x5bd34bfb54b64524), UINT64_C(0x6b5f1db574ecfaa9), + UINT64_C(0x37f725e56c1e9dc3), UINT64_C(0xc7fe10ac9904f90f), UINT64_C(0x879ae4eff04c0ab8), UINT64_C(0x76aea0675622e495), + UINT64_C(0xe29e3a0ebbe40dba), UINT64_C(0x157ffad6ff36b56f), UINT64_C(0x5466d89bca624434), UINT64_C(0x5449470d65bc5b35), + UINT64_C(0x7f6c99db52e6348a), UINT64_C(0x776d4dff2abd85c7), UINT64_C(0xb010a7f1beffcc1a), UINT64_C(0xad74603f4c6d9ab6), + UINT64_C(0x0599c30e3b018f16), UINT64_C(0x127a45fdeef28abd), UINT64_C(0x4cf790e8928575a0), UINT64_C(0x58fa1edd4caa9a51), + UINT64_C(0x5f3e8dd37e04eb51), UINT64_C(0xac131e1aea11807f), UINT64_C(0xf46fd7f990fb8cca), UINT64_C(0x73963b93ad4b9bb2), + UINT64_C(0x004c15e2478e8c36), UINT64_C(0xc79d966848c52c68), UINT64_C(0x827091c5d5309f35), 
UINT64_C(0x8e6290b4ecb7be34), + UINT64_C(0x4a2a701831915090), UINT64_C(0xb9ed682c26ae8721), UINT64_C(0x06c94a32c3f063b5), UINT64_C(0x11946415f289d8b4), + UINT64_C(0x4e6d4a3b505cd181), UINT64_C(0x7ad8e06beddabbeb), UINT64_C(0x272e050758ccfa94), UINT64_C(0x1a38a7703463de87), + }, }, - }, // Level 4 - { - UINT64_C(0x7c024d493240fd81), UINT64_C(0xcbedce790be4d6b), UINT64_C(0x000000041), 0, // sum of coeff and dummy { - UINT64_C(0xc385e890cdafa370), UINT64_C(0x72af2ae52cda3c0c), UINT64_C(0x377cc48ad117edce), UINT64_C(0xf3724d905f5cdc46), - UINT64_C(0xf51e0db646e04641), UINT64_C(0xb3ef041173b95e50), UINT64_C(0x483d8f190412d741), UINT64_C(0x9565fe70636fe7d1), - UINT64_C(0x7b5497f93bca30f2), UINT64_C(0xf7aa697c1f31e835), UINT64_C(0x26b9b332c5097919), UINT64_C(0x609c027c0e94be94), - UINT64_C(0xa4a77bf651dff968), UINT64_C(0xd3e952f9477aa964), UINT64_C(0xb6eb6ba84eafa8c3), UINT64_C(0xecc3cb66b4f9e264), - UINT64_C(0x6f7de149b48c42d2), UINT64_C(0xef38e08b77c94c8b), UINT64_C(0xd6a178affe73a087), UINT64_C(0xba01cfe6a8b0bfaf), - UINT64_C(0x771821ab27b1d361), UINT64_C(0x7b5e6b3e68a80c08), UINT64_C(0xd53c33bab8faf82f), UINT64_C(0x81e128821c9b5835), - UINT64_C(0x6968851cd767ecb8), UINT64_C(0x539510f090361d02), UINT64_C(0xee243a481fed197e), UINT64_C(0x57a7a6f5c2d4a423), - UINT64_C(0x7afc981eebfd0da8), UINT64_C(0xca100d08037f88e1), UINT64_C(0x7caf7e30e051e2f3), UINT64_C(0x09c6f692bb7e0c5e), - UINT64_C(0xff97c9f9213491a7), UINT64_C(0x3c7f06f4da8b68a8), UINT64_C(0xcc22969e12b0c521), UINT64_C(0xd3c246d637dc486c), - UINT64_C(0x645c098f230c482c), UINT64_C(0x7be14df33d02c990), UINT64_C(0xea99f1bc32cc189f), UINT64_C(0x8b776c2437b66a29), - UINT64_C(0xb6975830b26d1bcb), UINT64_C(0x3c24c07fb12dedfb), UINT64_C(0x939403d4624cb460), UINT64_C(0x0b4f454217f1f947), - UINT64_C(0x1ba0c284e2ac36c2), UINT64_C(0x25cfdc661fa02193), UINT64_C(0x661dc556bc51ede9), UINT64_C(0x8e4e8f1996c5b04f), - UINT64_C(0x6196e065ebbfc052), UINT64_C(0xbc1f2b573fcaf323), UINT64_C(0x74b0be15966126bc), 
UINT64_C(0xb61922dc3648b491), - UINT64_C(0x7528e5507af25415), UINT64_C(0xa03fee7cecbf5a92), UINT64_C(0x28f080a17abcdbf4), UINT64_C(0xf558e58265b50247), - UINT64_C(0x48946bc6b781b231), UINT64_C(0x1d3f9268ece51d01), UINT64_C(0x64cfd592583cd6d1), UINT64_C(0x33227252dde03dcc), - UINT64_C(0xfe487eba451edd0e), UINT64_C(0x1554136d4e0da4f8), UINT64_C(0x5446eb38aa369ed4), UINT64_C(0x5b46c4ce910d2ab6), - UINT64_C(0x5ca4f4ee4346e6f3), UINT64_C(0xb8a0111cf306801f), UINT64_C(0x4f96aae6581da78e), UINT64_C(0x6245d9523980b137), - UINT64_C(0x5e6efad77dd317ba), UINT64_C(0x7eb8de8eb617c7f4), UINT64_C(0x84e4d9ed06dce648), UINT64_C(0x24ed663bd6ce99fd), - UINT64_C(0xdf0ba8713d3bd076), UINT64_C(0xc11063b88172e67a), UINT64_C(0xb173e8e756868535), UINT64_C(0x6f9b72467e93008f), - UINT64_C(0x0c7ab90fa88aa8b2), UINT64_C(0x3deb22d963a56bcf), UINT64_C(0xa56348ee35314bb8), UINT64_C(0x9881a7a2129cebdb), - UINT64_C(0xc160ec1b18ecaeb6), UINT64_C(0x358f2bd362310528), UINT64_C(0xa92ccae5ed750d12), UINT64_C(0xdce6d5d94a23845d), - UINT64_C(0xf50e3e4e30ac79f4), UINT64_C(0x308e35ff0a5c199f), UINT64_C(0x9843f1db5c0f0066), UINT64_C(0x21e31f7ea490ff33), - UINT64_C(0x180b0bd32ae3dc81), UINT64_C(0x64067fc5626d1cd9), UINT64_C(0x10803e502f4b4eef), UINT64_C(0x64f3d35137338ceb), - UINT64_C(0x12f3445e0c9d7641), UINT64_C(0x7be6720939744b5c), UINT64_C(0xe85e4cc174c166e2), UINT64_C(0x9468eb4ab9946aed), - UINT64_C(0xa8bb2b2d4df63a32), UINT64_C(0xb2f95c382e934037), UINT64_C(0x3e902ed369fbbb44), UINT64_C(0x185a9eade1869dd0), - UINT64_C(0xd240a5734d051bf1), UINT64_C(0x92faec8652bea745), UINT64_C(0x8996ab0aec688aba), UINT64_C(0xbcac5f2824c8daef), - UINT64_C(0x5881daacfc329969), UINT64_C(0x55364eaf990b3b21), UINT64_C(0xe5de0bd0d06f1120), UINT64_C(0xd6a6fb94a44fbf1a), - UINT64_C(0x4e10e2dcf9e9aa49), UINT64_C(0xfe401a3e5cdb41ae), UINT64_C(0x81a4db50e11a295f), UINT64_C(0xfcc87dd6a04da032), - UINT64_C(0x6c5f6fa90c36ccb6), UINT64_C(0xf7fa702ef53bd5bd), UINT64_C(0x37345651f635ded5), UINT64_C(0x9650ac0acc8b0f11), - 
UINT64_C(0xfb1fc5e6a46f6c48), UINT64_C(0x75fbd67a4f588024), UINT64_C(0xbcf48525891fbf4e), UINT64_C(0x076fdfe68cb57efc), - UINT64_C(0x9ff4fdeb562abe4d), UINT64_C(0x363686dcec66ee6f), UINT64_C(0x3ed3c65e6660e857), UINT64_C(0x555629fb07677f9c), - UINT64_C(0x0b9e59e5e2dc63f0), UINT64_C(0x3dd204d3c272f8e8), UINT64_C(0x0a5e2bc12753cc6f), UINT64_C(0x261571527dae8627), + UINT64_C(0x7c024d493240fd81), UINT64_C(0xcbedce790be4d6b), UINT64_C(0x000000041), 0, // sum of coeff and dummy + { + UINT64_C(0xc385e890cdafa370), UINT64_C(0x72af2ae52cda3c0c), UINT64_C(0x377cc48ad117edce), UINT64_C(0xf3724d905f5cdc46), + UINT64_C(0xf51e0db646e04641), UINT64_C(0xb3ef041173b95e50), UINT64_C(0x483d8f190412d741), UINT64_C(0x9565fe70636fe7d1), + UINT64_C(0x7b5497f93bca30f2), UINT64_C(0xf7aa697c1f31e835), UINT64_C(0x26b9b332c5097919), UINT64_C(0x609c027c0e94be94), + UINT64_C(0xa4a77bf651dff968), UINT64_C(0xd3e952f9477aa964), UINT64_C(0xb6eb6ba84eafa8c3), UINT64_C(0xecc3cb66b4f9e264), + UINT64_C(0x6f7de149b48c42d2), UINT64_C(0xef38e08b77c94c8b), UINT64_C(0xd6a178affe73a087), UINT64_C(0xba01cfe6a8b0bfaf), + UINT64_C(0x771821ab27b1d361), UINT64_C(0x7b5e6b3e68a80c08), UINT64_C(0xd53c33bab8faf82f), UINT64_C(0x81e128821c9b5835), + UINT64_C(0x6968851cd767ecb8), UINT64_C(0x539510f090361d02), UINT64_C(0xee243a481fed197e), UINT64_C(0x57a7a6f5c2d4a423), + UINT64_C(0x7afc981eebfd0da8), UINT64_C(0xca100d08037f88e1), UINT64_C(0x7caf7e30e051e2f3), UINT64_C(0x09c6f692bb7e0c5e), + UINT64_C(0xff97c9f9213491a7), UINT64_C(0x3c7f06f4da8b68a8), UINT64_C(0xcc22969e12b0c521), UINT64_C(0xd3c246d637dc486c), + UINT64_C(0x645c098f230c482c), UINT64_C(0x7be14df33d02c990), UINT64_C(0xea99f1bc32cc189f), UINT64_C(0x8b776c2437b66a29), + UINT64_C(0xb6975830b26d1bcb), UINT64_C(0x3c24c07fb12dedfb), UINT64_C(0x939403d4624cb460), UINT64_C(0x0b4f454217f1f947), + UINT64_C(0x1ba0c284e2ac36c2), UINT64_C(0x25cfdc661fa02193), UINT64_C(0x661dc556bc51ede9), UINT64_C(0x8e4e8f1996c5b04f), + UINT64_C(0x6196e065ebbfc052), 
UINT64_C(0xbc1f2b573fcaf323), UINT64_C(0x74b0be15966126bc), UINT64_C(0xb61922dc3648b491), + UINT64_C(0x7528e5507af25415), UINT64_C(0xa03fee7cecbf5a92), UINT64_C(0x28f080a17abcdbf4), UINT64_C(0xf558e58265b50247), + UINT64_C(0x48946bc6b781b231), UINT64_C(0x1d3f9268ece51d01), UINT64_C(0x64cfd592583cd6d1), UINT64_C(0x33227252dde03dcc), + UINT64_C(0xfe487eba451edd0e), UINT64_C(0x1554136d4e0da4f8), UINT64_C(0x5446eb38aa369ed4), UINT64_C(0x5b46c4ce910d2ab6), + UINT64_C(0x5ca4f4ee4346e6f3), UINT64_C(0xb8a0111cf306801f), UINT64_C(0x4f96aae6581da78e), UINT64_C(0x6245d9523980b137), + UINT64_C(0x5e6efad77dd317ba), UINT64_C(0x7eb8de8eb617c7f4), UINT64_C(0x84e4d9ed06dce648), UINT64_C(0x24ed663bd6ce99fd), + UINT64_C(0xdf0ba8713d3bd076), UINT64_C(0xc11063b88172e67a), UINT64_C(0xb173e8e756868535), UINT64_C(0x6f9b72467e93008f), + UINT64_C(0x0c7ab90fa88aa8b2), UINT64_C(0x3deb22d963a56bcf), UINT64_C(0xa56348ee35314bb8), UINT64_C(0x9881a7a2129cebdb), + UINT64_C(0xc160ec1b18ecaeb6), UINT64_C(0x358f2bd362310528), UINT64_C(0xa92ccae5ed750d12), UINT64_C(0xdce6d5d94a23845d), + UINT64_C(0xf50e3e4e30ac79f4), UINT64_C(0x308e35ff0a5c199f), UINT64_C(0x9843f1db5c0f0066), UINT64_C(0x21e31f7ea490ff33), + UINT64_C(0x180b0bd32ae3dc81), UINT64_C(0x64067fc5626d1cd9), UINT64_C(0x10803e502f4b4eef), UINT64_C(0x64f3d35137338ceb), + UINT64_C(0x12f3445e0c9d7641), UINT64_C(0x7be6720939744b5c), UINT64_C(0xe85e4cc174c166e2), UINT64_C(0x9468eb4ab9946aed), + UINT64_C(0xa8bb2b2d4df63a32), UINT64_C(0xb2f95c382e934037), UINT64_C(0x3e902ed369fbbb44), UINT64_C(0x185a9eade1869dd0), + UINT64_C(0xd240a5734d051bf1), UINT64_C(0x92faec8652bea745), UINT64_C(0x8996ab0aec688aba), UINT64_C(0xbcac5f2824c8daef), + UINT64_C(0x5881daacfc329969), UINT64_C(0x55364eaf990b3b21), UINT64_C(0xe5de0bd0d06f1120), UINT64_C(0xd6a6fb94a44fbf1a), + UINT64_C(0x4e10e2dcf9e9aa49), UINT64_C(0xfe401a3e5cdb41ae), UINT64_C(0x81a4db50e11a295f), UINT64_C(0xfcc87dd6a04da032), + UINT64_C(0x6c5f6fa90c36ccb6), UINT64_C(0xf7fa702ef53bd5bd), 
UINT64_C(0x37345651f635ded5), UINT64_C(0x9650ac0acc8b0f11), + UINT64_C(0xfb1fc5e6a46f6c48), UINT64_C(0x75fbd67a4f588024), UINT64_C(0xbcf48525891fbf4e), UINT64_C(0x076fdfe68cb57efc), + UINT64_C(0x9ff4fdeb562abe4d), UINT64_C(0x363686dcec66ee6f), UINT64_C(0x3ed3c65e6660e857), UINT64_C(0x555629fb07677f9c), + UINT64_C(0x0b9e59e5e2dc63f0), UINT64_C(0x3dd204d3c272f8e8), UINT64_C(0x0a5e2bc12753cc6f), UINT64_C(0x261571527dae8627), + }, }, - }, // Level 5 - { - UINT64_C(0x742b91e91dcfb0a6), UINT64_C(0xcfeca6a967921914), UINT64_C(0x00000003c), 0, // sum of coeff and dummy { - UINT64_C(0x6edee5be930ba5a3), UINT64_C(0x7da756c8a9d5865f), UINT64_C(0x979d7286e9ec6a3a), UINT64_C(0xb5f53e73c1075910), - UINT64_C(0xac17c48f4a6369d1), UINT64_C(0xe59c869b50f242b8), UINT64_C(0xd82f2c4debbd7a92), UINT64_C(0x2f480ab7fcef8c2a), - UINT64_C(0x5455617627c7967c), UINT64_C(0x391f4653479cd148), UINT64_C(0x93816a1fe3fe659f), UINT64_C(0x750610cc458f0e83), - UINT64_C(0xaea9ec84538ba181), UINT64_C(0x07f69ef23331d201), UINT64_C(0x1154b8671a7e21a6), UINT64_C(0x44f2b2a5e705dccd), - UINT64_C(0xf4137114642bd756), UINT64_C(0x0d9fdd5c26862aa0), UINT64_C(0x24252072220e87e6), UINT64_C(0x40c56b66c01c20f4), - UINT64_C(0x3d1246932d66f5fb), UINT64_C(0x549be143f5ad841a), UINT64_C(0xf5a694fd849975f9), UINT64_C(0xab3a75807839e2ae), - UINT64_C(0xdbc151ec40a63d29), UINT64_C(0x252d86d9b6ff7885), UINT64_C(0xd848fb1e2a170064), UINT64_C(0x8dbfbaa7e285d213), - UINT64_C(0x48c5c1a431e6a390), UINT64_C(0x4ea411a44607dc21), UINT64_C(0xbb8535f2c692910e), UINT64_C(0x6d8c5388d2aed8b2), - UINT64_C(0x2fddc57f1a7b1cc8), UINT64_C(0x3a2c8bd7ea3f25ab), UINT64_C(0x87708e34be0fb414), UINT64_C(0x8543e5d4e9f7c34e), - UINT64_C(0x2c349130b9d62f31), UINT64_C(0x8589d21285426c0c), UINT64_C(0x5b2a39baebaad52f), UINT64_C(0x03f8700c91cd5413), - UINT64_C(0xcc00c06be9d784fb), UINT64_C(0x70a78056b4c5b930), UINT64_C(0x4a2aa9811bbd47a3), UINT64_C(0x4a878b1e922c6304), - UINT64_C(0x2443f15ef107a70f), UINT64_C(0xf64b29a8f4069376), 
UINT64_C(0xfc309fa9086da268), UINT64_C(0xffeedab78f765ff4), - UINT64_C(0xa99a216b423fac77), UINT64_C(0x3b9c309929d6991e), UINT64_C(0x113fe1aa6ba4c211), UINT64_C(0x2f214dea6f758f36), - UINT64_C(0x519806a4ba5b5ca8), UINT64_C(0xef203bc2948dda9e), UINT64_C(0xaa83a59110f3a193), UINT64_C(0xebdef286170eb7ef), - UINT64_C(0x9bd44760cd090ead), UINT64_C(0x234b9dde9fd14ab3), UINT64_C(0xee6e9c107305b2f4), UINT64_C(0x5eae7639d8a2b0ab), - UINT64_C(0x63d30ff6c83a7320), UINT64_C(0x3ded1e0f42fa1cb2), UINT64_C(0xd386b3b3b19d708e), UINT64_C(0x34d5016669fe449a), - UINT64_C(0xb9f91d66682b7278), UINT64_C(0x817659853e4e435e), UINT64_C(0xfc2e6483c3048759), UINT64_C(0xb261e03ffbd9519e), - UINT64_C(0xb49de284f5cf5d02), UINT64_C(0x02387c87bbbf7445), UINT64_C(0x6d937def7be53a83), UINT64_C(0x08526f8ae49dbd0f), - UINT64_C(0x615ef3f5af7fd5ab), UINT64_C(0x54cb4d9e528c1d79), UINT64_C(0x3cb713ba05a67835), UINT64_C(0xf592fb2d4d2af2db), - UINT64_C(0x86ec6601e42b2456), UINT64_C(0x0e857a59e7439d0d), UINT64_C(0x8326414cd1f6874f), UINT64_C(0xa92dad5f5d9a106a), - UINT64_C(0x58793e150f7ff874), UINT64_C(0x519bc1ed4913c3c5), UINT64_C(0x4f3b0da10be83d82), UINT64_C(0xd82c561b6f18a264), - UINT64_C(0xa47f8878009a1815), UINT64_C(0x0673feb8c6083dd6), UINT64_C(0x343ac4c37efb4d08), UINT64_C(0x4847b3364092fa4a), - UINT64_C(0x1a30098e32c503a0), UINT64_C(0x7f242c4cb083e69b), UINT64_C(0x08e69e6c3b1070ec), UINT64_C(0x0711fa2b404a9684), - UINT64_C(0xfc24e0a982ae39fa), UINT64_C(0x02ff5ca0bd974db5), UINT64_C(0x2777845db37d0e98), UINT64_C(0x5555b5942327e543), - UINT64_C(0x7717c93942df84b7), UINT64_C(0x2a661b86ad2dcdde), UINT64_C(0x61c93d7746664b20), UINT64_C(0x514090cc1a87d06b), - UINT64_C(0x7aa2f5f8bcf987ad), UINT64_C(0x2898047ec7fa8778), UINT64_C(0xe5cf2d9a08d8927c), UINT64_C(0xecde6d34e5c3fe5a), - UINT64_C(0x5589c848adaebaf8), UINT64_C(0xedac4b9343975aa2), UINT64_C(0x48503cf321ad26b2), UINT64_C(0x4e7f1530c16f8941), - UINT64_C(0x6a9fe4e56715fa4e), UINT64_C(0xefa9aec821c89e4b), UINT64_C(0xc23b542018927c97), 
UINT64_C(0xeedb11ae93481c6f), - UINT64_C(0x35f45dab8618f030), UINT64_C(0x2a5eb24e550fcb99), UINT64_C(0x5c6d2d61242cf3a8), UINT64_C(0x96058fee3f9becb0), - UINT64_C(0x811ed70d6e6cd756), UINT64_C(0x93642e8381c4a6a0), UINT64_C(0xc81e05bef85ad62b), UINT64_C(0xd12ce5cee02edeae), - UINT64_C(0x0a00b676c5f25868), UINT64_C(0xc5c91383914e9732), UINT64_C(0xd9e4fbd6c7a78695), UINT64_C(0x24741bcd3aab63f3), - UINT64_C(0xa86f85bc7932add8), UINT64_C(0xd851daaea4ade651), UINT64_C(0xc1b2a4b765bd4ee2), UINT64_C(0xd648f4971ef524f7), + UINT64_C(0x742b91e91dcfb0a6), UINT64_C(0xcfeca6a967921914), UINT64_C(0x00000003c), 0, // sum of coeff and dummy + { + UINT64_C(0x6edee5be930ba5a3), UINT64_C(0x7da756c8a9d5865f), UINT64_C(0x979d7286e9ec6a3a), UINT64_C(0xb5f53e73c1075910), + UINT64_C(0xac17c48f4a6369d1), UINT64_C(0xe59c869b50f242b8), UINT64_C(0xd82f2c4debbd7a92), UINT64_C(0x2f480ab7fcef8c2a), + UINT64_C(0x5455617627c7967c), UINT64_C(0x391f4653479cd148), UINT64_C(0x93816a1fe3fe659f), UINT64_C(0x750610cc458f0e83), + UINT64_C(0xaea9ec84538ba181), UINT64_C(0x07f69ef23331d201), UINT64_C(0x1154b8671a7e21a6), UINT64_C(0x44f2b2a5e705dccd), + UINT64_C(0xf4137114642bd756), UINT64_C(0x0d9fdd5c26862aa0), UINT64_C(0x24252072220e87e6), UINT64_C(0x40c56b66c01c20f4), + UINT64_C(0x3d1246932d66f5fb), UINT64_C(0x549be143f5ad841a), UINT64_C(0xf5a694fd849975f9), UINT64_C(0xab3a75807839e2ae), + UINT64_C(0xdbc151ec40a63d29), UINT64_C(0x252d86d9b6ff7885), UINT64_C(0xd848fb1e2a170064), UINT64_C(0x8dbfbaa7e285d213), + UINT64_C(0x48c5c1a431e6a390), UINT64_C(0x4ea411a44607dc21), UINT64_C(0xbb8535f2c692910e), UINT64_C(0x6d8c5388d2aed8b2), + UINT64_C(0x2fddc57f1a7b1cc8), UINT64_C(0x3a2c8bd7ea3f25ab), UINT64_C(0x87708e34be0fb414), UINT64_C(0x8543e5d4e9f7c34e), + UINT64_C(0x2c349130b9d62f31), UINT64_C(0x8589d21285426c0c), UINT64_C(0x5b2a39baebaad52f), UINT64_C(0x03f8700c91cd5413), + UINT64_C(0xcc00c06be9d784fb), UINT64_C(0x70a78056b4c5b930), UINT64_C(0x4a2aa9811bbd47a3), UINT64_C(0x4a878b1e922c6304), + 
UINT64_C(0x2443f15ef107a70f), UINT64_C(0xf64b29a8f4069376), UINT64_C(0xfc309fa9086da268), UINT64_C(0xffeedab78f765ff4), + UINT64_C(0xa99a216b423fac77), UINT64_C(0x3b9c309929d6991e), UINT64_C(0x113fe1aa6ba4c211), UINT64_C(0x2f214dea6f758f36), + UINT64_C(0x519806a4ba5b5ca8), UINT64_C(0xef203bc2948dda9e), UINT64_C(0xaa83a59110f3a193), UINT64_C(0xebdef286170eb7ef), + UINT64_C(0x9bd44760cd090ead), UINT64_C(0x234b9dde9fd14ab3), UINT64_C(0xee6e9c107305b2f4), UINT64_C(0x5eae7639d8a2b0ab), + UINT64_C(0x63d30ff6c83a7320), UINT64_C(0x3ded1e0f42fa1cb2), UINT64_C(0xd386b3b3b19d708e), UINT64_C(0x34d5016669fe449a), + UINT64_C(0xb9f91d66682b7278), UINT64_C(0x817659853e4e435e), UINT64_C(0xfc2e6483c3048759), UINT64_C(0xb261e03ffbd9519e), + UINT64_C(0xb49de284f5cf5d02), UINT64_C(0x02387c87bbbf7445), UINT64_C(0x6d937def7be53a83), UINT64_C(0x08526f8ae49dbd0f), + UINT64_C(0x615ef3f5af7fd5ab), UINT64_C(0x54cb4d9e528c1d79), UINT64_C(0x3cb713ba05a67835), UINT64_C(0xf592fb2d4d2af2db), + UINT64_C(0x86ec6601e42b2456), UINT64_C(0x0e857a59e7439d0d), UINT64_C(0x8326414cd1f6874f), UINT64_C(0xa92dad5f5d9a106a), + UINT64_C(0x58793e150f7ff874), UINT64_C(0x519bc1ed4913c3c5), UINT64_C(0x4f3b0da10be83d82), UINT64_C(0xd82c561b6f18a264), + UINT64_C(0xa47f8878009a1815), UINT64_C(0x0673feb8c6083dd6), UINT64_C(0x343ac4c37efb4d08), UINT64_C(0x4847b3364092fa4a), + UINT64_C(0x1a30098e32c503a0), UINT64_C(0x7f242c4cb083e69b), UINT64_C(0x08e69e6c3b1070ec), UINT64_C(0x0711fa2b404a9684), + UINT64_C(0xfc24e0a982ae39fa), UINT64_C(0x02ff5ca0bd974db5), UINT64_C(0x2777845db37d0e98), UINT64_C(0x5555b5942327e543), + UINT64_C(0x7717c93942df84b7), UINT64_C(0x2a661b86ad2dcdde), UINT64_C(0x61c93d7746664b20), UINT64_C(0x514090cc1a87d06b), + UINT64_C(0x7aa2f5f8bcf987ad), UINT64_C(0x2898047ec7fa8778), UINT64_C(0xe5cf2d9a08d8927c), UINT64_C(0xecde6d34e5c3fe5a), + UINT64_C(0x5589c848adaebaf8), UINT64_C(0xedac4b9343975aa2), UINT64_C(0x48503cf321ad26b2), UINT64_C(0x4e7f1530c16f8941), + UINT64_C(0x6a9fe4e56715fa4e), 
UINT64_C(0xefa9aec821c89e4b), UINT64_C(0xc23b542018927c97), UINT64_C(0xeedb11ae93481c6f), + UINT64_C(0x35f45dab8618f030), UINT64_C(0x2a5eb24e550fcb99), UINT64_C(0x5c6d2d61242cf3a8), UINT64_C(0x96058fee3f9becb0), + UINT64_C(0x811ed70d6e6cd756), UINT64_C(0x93642e8381c4a6a0), UINT64_C(0xc81e05bef85ad62b), UINT64_C(0xd12ce5cee02edeae), + UINT64_C(0x0a00b676c5f25868), UINT64_C(0xc5c91383914e9732), UINT64_C(0xd9e4fbd6c7a78695), UINT64_C(0x24741bcd3aab63f3), + UINT64_C(0xa86f85bc7932add8), UINT64_C(0xd851daaea4ade651), UINT64_C(0xc1b2a4b765bd4ee2), UINT64_C(0xd648f4971ef524f7), + }, }, - }, // Level 6 - { - UINT64_C(0xaf62ce594afbb378), UINT64_C(0x248e65d01cba3e0b), UINT64_C(0x00000003f), 0, // sum of coeff and dummy { - UINT64_C(0x6ce36b80768d6e7f), UINT64_C(0xa397920aa6626e5a), UINT64_C(0x04de32bd5633745d), UINT64_C(0xe699be0bb8411b1f), - UINT64_C(0xd06b3da1042ffeff), UINT64_C(0xc8c12f5678dbc1fe), UINT64_C(0x5f1c5df4786ec543), UINT64_C(0xc64eed21fe2dab71), - UINT64_C(0x43083efd3ab83bc9), UINT64_C(0xfbd27f38b364bb80), UINT64_C(0x948701fc4ed5f457), UINT64_C(0xb26d9d8304db31a5), - UINT64_C(0x18ec7952e4e525a9), UINT64_C(0x0a81dbd330204a9d), UINT64_C(0x033c520def3d2101), UINT64_C(0x73a6c045c701aadd), - UINT64_C(0xd7d19f80a027afec), UINT64_C(0x8bf3f0c57c2fe429), UINT64_C(0xb8344463c59719e3), UINT64_C(0xf76ffe54b2fd1d64), - UINT64_C(0xf3358f8c810dda81), UINT64_C(0x8049af80eb93f21f), UINT64_C(0x5ff59a51e9dafd79), UINT64_C(0xb3f6e7835814a5e9), - UINT64_C(0xbd127322c2e4b16c), UINT64_C(0x7bc601b6ef92afa3), UINT64_C(0x00b5e1e97c28a598), UINT64_C(0x38d94a15139b608e), - UINT64_C(0x39737d09f0035403), UINT64_C(0x65337848d976c3a2), UINT64_C(0x91c04f2a6a9ec21f), UINT64_C(0x02548b83235c115f), - UINT64_C(0x430e4ec854acc042), UINT64_C(0x0b0d27ee05bcd498), UINT64_C(0xf669534441242d11), UINT64_C(0x02cbaa107829c390), - UINT64_C(0x35b4d683817b903c), UINT64_C(0x31834f7142d5cfa0), UINT64_C(0x77fd19567cb1ffea), UINT64_C(0x0911558876310281), - UINT64_C(0xeaaef1c301d92167), 
UINT64_C(0xf1c746401671b4d3), UINT64_C(0x7d1888c23b2447e9), UINT64_C(0x72c44c19bde5d380), - UINT64_C(0x7a6156a99377bf58), UINT64_C(0xeafd8cb3722b6aa4), UINT64_C(0xa4b21df76c4ae4a6), UINT64_C(0xa612df347cb132bf), - UINT64_C(0x2f8331da53e4651f), UINT64_C(0x498baa43072061aa), UINT64_C(0x669cd34bdf522223), UINT64_C(0x611a32f117b489e3), - UINT64_C(0xb1d08c016e277a67), UINT64_C(0xb1d4d0937395b21f), UINT64_C(0x9d3e7447db71fd3d), UINT64_C(0x8d61714b54616249), - UINT64_C(0x91cfe6cad3939afb), UINT64_C(0x785efcfc1fbed3f8), UINT64_C(0xc7270e86e752b71a), UINT64_C(0xe91bc93a14e678c4), - UINT64_C(0x9bf095b9662cf95d), UINT64_C(0xa82d8d1309df2256), UINT64_C(0x41abc3fa674c6a06), UINT64_C(0x0e38a88b0398547e), - UINT64_C(0x6fe82427e8c24696), UINT64_C(0x0f20ed4a9e8e02c2), UINT64_C(0x5df70b3c4784b7e1), UINT64_C(0x000b2deddde9963c), - UINT64_C(0xc8929e6367803b53), UINT64_C(0xb28033a4c174c86d), UINT64_C(0x3a666b4c18406801), UINT64_C(0xbd8b5791ba056136), - UINT64_C(0x715ed0ae7c79e816), UINT64_C(0x577c1b256c64436a), UINT64_C(0x54a4f8d1b535e02d), UINT64_C(0xc8d7f16769d38240), - UINT64_C(0xb707839b15b0d3fc), UINT64_C(0x255def6be6755b91), UINT64_C(0x9bb54bbffd57d21f), UINT64_C(0xd882bcc3caa155e7), - UINT64_C(0x32706a042f57ab60), UINT64_C(0xf2f38aa7f8c31e8b), UINT64_C(0xa1e84cfff8dc3cae), UINT64_C(0xa703b9fc24c2e1db), - UINT64_C(0x8c3bd99cdd77d160), UINT64_C(0x4d4692d129444836), UINT64_C(0xef4b1c7cd501fd7d), UINT64_C(0xde07e34df48421ab), - UINT64_C(0xae4083dd864c910d), UINT64_C(0xfa4ba5e1a2d58460), UINT64_C(0x6f0068aa4e75a5ec), UINT64_C(0x0a9e07133b5a2abe), - UINT64_C(0x337739bfa36cecc8), UINT64_C(0xe3591f5cc97b787c), UINT64_C(0xf2bbe16b3ec41399), UINT64_C(0xf3dcc6246a758716), - UINT64_C(0xc73351933e7e2417), UINT64_C(0x0e1f947d867b0bdd), UINT64_C(0xe48bf8efb1f572a0), UINT64_C(0xd5b209d89f09fa2a), - UINT64_C(0x27478ae42843f9f1), UINT64_C(0x01b30ed80db664a5), UINT64_C(0x0181e5ed5e84cd8b), UINT64_C(0xf6318c19349acefb), - UINT64_C(0x69c8492982778f4b), UINT64_C(0x4af6702966bca750), 
UINT64_C(0xa8b4d353631e2482), UINT64_C(0x5ce04a70f584d238), - UINT64_C(0xfbf5b2cdc0394772), UINT64_C(0x104d44c77b80b6ae), UINT64_C(0xbe8e5a49d6ee3335), UINT64_C(0x5bf8f3f9a05f36f9), - UINT64_C(0x4be7aeb57af4a56a), UINT64_C(0xa09e9cd11d6ef9a7), UINT64_C(0x091ecc28674a929a), UINT64_C(0xad2c90bc1f89d87f), - UINT64_C(0xbf25df5f95456364), UINT64_C(0x7b104f2289b28c07), UINT64_C(0x902272c148ddc16d), UINT64_C(0x3285c7b614a096f3), - UINT64_C(0x6491973c285a2f0f), UINT64_C(0x31f84ba2ce5e3755), UINT64_C(0x3300c615947fd40c), UINT64_C(0x3c4747adf437f115), - UINT64_C(0x04fa56d556527742), UINT64_C(0xd7b45d6644b42059), UINT64_C(0x4cdea756d6091a28), UINT64_C(0x2431ed986745785b), + UINT64_C(0xaf62ce594afbb378), UINT64_C(0x248e65d01cba3e0b), UINT64_C(0x00000003f), 0, // sum of coeff and dummy + { + UINT64_C(0x6ce36b80768d6e7f), UINT64_C(0xa397920aa6626e5a), UINT64_C(0x04de32bd5633745d), UINT64_C(0xe699be0bb8411b1f), + UINT64_C(0xd06b3da1042ffeff), UINT64_C(0xc8c12f5678dbc1fe), UINT64_C(0x5f1c5df4786ec543), UINT64_C(0xc64eed21fe2dab71), + UINT64_C(0x43083efd3ab83bc9), UINT64_C(0xfbd27f38b364bb80), UINT64_C(0x948701fc4ed5f457), UINT64_C(0xb26d9d8304db31a5), + UINT64_C(0x18ec7952e4e525a9), UINT64_C(0x0a81dbd330204a9d), UINT64_C(0x033c520def3d2101), UINT64_C(0x73a6c045c701aadd), + UINT64_C(0xd7d19f80a027afec), UINT64_C(0x8bf3f0c57c2fe429), UINT64_C(0xb8344463c59719e3), UINT64_C(0xf76ffe54b2fd1d64), + UINT64_C(0xf3358f8c810dda81), UINT64_C(0x8049af80eb93f21f), UINT64_C(0x5ff59a51e9dafd79), UINT64_C(0xb3f6e7835814a5e9), + UINT64_C(0xbd127322c2e4b16c), UINT64_C(0x7bc601b6ef92afa3), UINT64_C(0x00b5e1e97c28a598), UINT64_C(0x38d94a15139b608e), + UINT64_C(0x39737d09f0035403), UINT64_C(0x65337848d976c3a2), UINT64_C(0x91c04f2a6a9ec21f), UINT64_C(0x02548b83235c115f), + UINT64_C(0x430e4ec854acc042), UINT64_C(0x0b0d27ee05bcd498), UINT64_C(0xf669534441242d11), UINT64_C(0x02cbaa107829c390), + UINT64_C(0x35b4d683817b903c), UINT64_C(0x31834f7142d5cfa0), UINT64_C(0x77fd19567cb1ffea), 
UINT64_C(0x0911558876310281), + UINT64_C(0xeaaef1c301d92167), UINT64_C(0xf1c746401671b4d3), UINT64_C(0x7d1888c23b2447e9), UINT64_C(0x72c44c19bde5d380), + UINT64_C(0x7a6156a99377bf58), UINT64_C(0xeafd8cb3722b6aa4), UINT64_C(0xa4b21df76c4ae4a6), UINT64_C(0xa612df347cb132bf), + UINT64_C(0x2f8331da53e4651f), UINT64_C(0x498baa43072061aa), UINT64_C(0x669cd34bdf522223), UINT64_C(0x611a32f117b489e3), + UINT64_C(0xb1d08c016e277a67), UINT64_C(0xb1d4d0937395b21f), UINT64_C(0x9d3e7447db71fd3d), UINT64_C(0x8d61714b54616249), + UINT64_C(0x91cfe6cad3939afb), UINT64_C(0x785efcfc1fbed3f8), UINT64_C(0xc7270e86e752b71a), UINT64_C(0xe91bc93a14e678c4), + UINT64_C(0x9bf095b9662cf95d), UINT64_C(0xa82d8d1309df2256), UINT64_C(0x41abc3fa674c6a06), UINT64_C(0x0e38a88b0398547e), + UINT64_C(0x6fe82427e8c24696), UINT64_C(0x0f20ed4a9e8e02c2), UINT64_C(0x5df70b3c4784b7e1), UINT64_C(0x000b2deddde9963c), + UINT64_C(0xc8929e6367803b53), UINT64_C(0xb28033a4c174c86d), UINT64_C(0x3a666b4c18406801), UINT64_C(0xbd8b5791ba056136), + UINT64_C(0x715ed0ae7c79e816), UINT64_C(0x577c1b256c64436a), UINT64_C(0x54a4f8d1b535e02d), UINT64_C(0xc8d7f16769d38240), + UINT64_C(0xb707839b15b0d3fc), UINT64_C(0x255def6be6755b91), UINT64_C(0x9bb54bbffd57d21f), UINT64_C(0xd882bcc3caa155e7), + UINT64_C(0x32706a042f57ab60), UINT64_C(0xf2f38aa7f8c31e8b), UINT64_C(0xa1e84cfff8dc3cae), UINT64_C(0xa703b9fc24c2e1db), + UINT64_C(0x8c3bd99cdd77d160), UINT64_C(0x4d4692d129444836), UINT64_C(0xef4b1c7cd501fd7d), UINT64_C(0xde07e34df48421ab), + UINT64_C(0xae4083dd864c910d), UINT64_C(0xfa4ba5e1a2d58460), UINT64_C(0x6f0068aa4e75a5ec), UINT64_C(0x0a9e07133b5a2abe), + UINT64_C(0x337739bfa36cecc8), UINT64_C(0xe3591f5cc97b787c), UINT64_C(0xf2bbe16b3ec41399), UINT64_C(0xf3dcc6246a758716), + UINT64_C(0xc73351933e7e2417), UINT64_C(0x0e1f947d867b0bdd), UINT64_C(0xe48bf8efb1f572a0), UINT64_C(0xd5b209d89f09fa2a), + UINT64_C(0x27478ae42843f9f1), UINT64_C(0x01b30ed80db664a5), UINT64_C(0x0181e5ed5e84cd8b), UINT64_C(0xf6318c19349acefb), + 
UINT64_C(0x69c8492982778f4b), UINT64_C(0x4af6702966bca750), UINT64_C(0xa8b4d353631e2482), UINT64_C(0x5ce04a70f584d238), + UINT64_C(0xfbf5b2cdc0394772), UINT64_C(0x104d44c77b80b6ae), UINT64_C(0xbe8e5a49d6ee3335), UINT64_C(0x5bf8f3f9a05f36f9), + UINT64_C(0x4be7aeb57af4a56a), UINT64_C(0xa09e9cd11d6ef9a7), UINT64_C(0x091ecc28674a929a), UINT64_C(0xad2c90bc1f89d87f), + UINT64_C(0xbf25df5f95456364), UINT64_C(0x7b104f2289b28c07), UINT64_C(0x902272c148ddc16d), UINT64_C(0x3285c7b614a096f3), + UINT64_C(0x6491973c285a2f0f), UINT64_C(0x31f84ba2ce5e3755), UINT64_C(0x3300c615947fd40c), UINT64_C(0x3c4747adf437f115), + UINT64_C(0x04fa56d556527742), UINT64_C(0xd7b45d6644b42059), UINT64_C(0x4cdea756d6091a28), UINT64_C(0x2431ed986745785b), + }, }, - }, // Level 7 - { - UINT64_C(0x1249b1f513689151), UINT64_C(0xc658fcfbfabe77d5), UINT64_C(0x000000042), 0, // sum of coeff and dummy { - UINT64_C(0xabaaefde77273dcd), UINT64_C(0xe737f9d4fba6ee5b), UINT64_C(0xc2c8521e524e50e7), UINT64_C(0xb6347dd4ecff2e08), - UINT64_C(0x81cc14e56b826c78), UINT64_C(0x7e96733438db219f), UINT64_C(0x93f66e8959ad9a5d), UINT64_C(0xad77e6ffafdfa01b), - UINT64_C(0x79842c77afd94c9a), UINT64_C(0xb2fe351094030a32), UINT64_C(0x04f00838dc236276), UINT64_C(0x1064827c937cd78b), - UINT64_C(0xa914296fc9de0469), UINT64_C(0x4a87b2d1971b2b6e), UINT64_C(0x1ef28858c6e99de6), UINT64_C(0x23429a77bea42f46), - UINT64_C(0xf771817be7a38b16), UINT64_C(0xcc348f7a13deb19a), UINT64_C(0x0a91d46fb1ae97e8), UINT64_C(0x753cdb5468c83c10), - UINT64_C(0x65cc613edbcd3f84), UINT64_C(0xcb157fac042d9ab2), UINT64_C(0x18e6a31aed525487), UINT64_C(0x5924230b1281b56d), - UINT64_C(0xb828c042782945ba), UINT64_C(0x2decd50526005abe), UINT64_C(0x05caa6f761c5857a), UINT64_C(0x4c93892d66de5320), - UINT64_C(0xac796b30f48a75b3), UINT64_C(0xe11728c76eab1822), UINT64_C(0xa59ec090b0f3ed2e), UINT64_C(0xada9c2e74edc137b), - UINT64_C(0x4ca60d77ed9f8e0d), UINT64_C(0x6304a44de4bc4219), UINT64_C(0x361436da34a05f49), UINT64_C(0x097fcaec609fd08f), - 
UINT64_C(0xf9f9ae511316dcce), UINT64_C(0xa62ca6c22fa94122), UINT64_C(0xb32ebc94594cf9c8), UINT64_C(0x1b673219068f53f7), - UINT64_C(0x28a8f7de358ea82b), UINT64_C(0x7d3e002bee6f572f), UINT64_C(0xbe24c789f9ddb580), UINT64_C(0x0257b24167d83acd), - UINT64_C(0x5651f9ac1cfa5113), UINT64_C(0x225aaaa55c5d72d4), UINT64_C(0x1bb9759abf1d08b0), UINT64_C(0x7c36896386d4f50c), - UINT64_C(0xdd4ceaf465f970eb), UINT64_C(0xf349d378bfd4beb9), UINT64_C(0xf2d9ea03c79109d8), UINT64_C(0xe915c84fab4efd66), - UINT64_C(0xe401bb6a403813b6), UINT64_C(0x2171265710c01426), UINT64_C(0x6542b43cba6a4d08), UINT64_C(0x58591c6e1250104f), - UINT64_C(0x77bc044ed6c4a7a0), UINT64_C(0x73b1a5f682fd2d52), UINT64_C(0x6c2b7083b26b9976), UINT64_C(0xf9e3b1347ceaaaca), - UINT64_C(0xa709263b9c304a96), UINT64_C(0x6c6fedc1e78481dc), UINT64_C(0xbec268cc818190e0), UINT64_C(0xbafa9271d75b733b), - UINT64_C(0xeace12cbb37fc677), UINT64_C(0x1176816b69b51d98), UINT64_C(0x62d28bbf94c2762d), UINT64_C(0x142b7d89bcc06043), - UINT64_C(0x8e166c13e205cc00), UINT64_C(0xac3dcf9c75177f8e), UINT64_C(0xc75695f82b7f6c46), UINT64_C(0xdff44c46fe5e7b6d), - UINT64_C(0x932846955828d471), UINT64_C(0x7593c5e733dca4d6), UINT64_C(0xf1efc8ad9718ca14), UINT64_C(0x93a618cb5b6aff34), - UINT64_C(0x1d89f5253c2f819f), UINT64_C(0x419744eb9c63d0b2), UINT64_C(0x2b07ff7747ed7c29), UINT64_C(0x617be6e4454749a0), - UINT64_C(0xaa24d8e4142c5bf4), UINT64_C(0xe25d6c2fe999691d), UINT64_C(0xf78965d974e8e076), UINT64_C(0x8e6203aa0037ae8e), - UINT64_C(0x732c3a3a561c6d79), UINT64_C(0xd61a9622b0da5c93), UINT64_C(0xfc1c73c6152a141b), UINT64_C(0x03a4694838529e5b), - UINT64_C(0x686cb297afba7101), UINT64_C(0xbee9f55d5260fbe2), UINT64_C(0xd53a374387aa4f2a), UINT64_C(0xc6b2494c1a96d781), - UINT64_C(0xbe8aa945ac411c10), UINT64_C(0xbfc814fa4da90048), UINT64_C(0xb46847e8ecaca5f4), UINT64_C(0x83466ccfb2037365), - UINT64_C(0x39bfd895a4917200), UINT64_C(0xfd6106ab889f9c14), UINT64_C(0x87d80fcd94875b38), UINT64_C(0xd05a5e75bdd29067), - UINT64_C(0xc8fbbb4d3e850e9d), 
UINT64_C(0xef2dc9eb5228f1ae), UINT64_C(0xc3775c3e9ac4da44), UINT64_C(0x12004ef1609624ed), - UINT64_C(0x43ec24f8c096ee25), UINT64_C(0xeb207061723522ad), UINT64_C(0xbd3767314ad773e4), UINT64_C(0x4b2059a2964d28f4), - UINT64_C(0xcd4522a02ed66868), UINT64_C(0x74c6b45b4b5b5657), UINT64_C(0x48bcc161232e14b1), UINT64_C(0x958c3b741a54bd75), - UINT64_C(0x2f64940639fedc7d), UINT64_C(0xc1321efa1c279cc3), UINT64_C(0x0680b3866e485f15), UINT64_C(0x5633b30c0c7c4a96), - UINT64_C(0xb5c9b8539fa9ea3c), UINT64_C(0x1fd67c7175c87172), UINT64_C(0xe03ed40e88bcdf23), UINT64_C(0x81a69e0147fbb776), - UINT64_C(0x244e2bf676590e87), UINT64_C(0x8a86357137c0d611), UINT64_C(0x4fcaad51eba3720f), UINT64_C(0x2b8b7b933f76e019), - UINT64_C(0xecff900b265d06f4), UINT64_C(0xbc3b359d2e438bbc), UINT64_C(0x086c671b288776d9), UINT64_C(0x652c4a2d18d847ba), + UINT64_C(0x1249b1f513689151), UINT64_C(0xc658fcfbfabe77d5), UINT64_C(0x000000042), 0, // sum of coeff and dummy + { + UINT64_C(0xabaaefde77273dcd), UINT64_C(0xe737f9d4fba6ee5b), UINT64_C(0xc2c8521e524e50e7), UINT64_C(0xb6347dd4ecff2e08), + UINT64_C(0x81cc14e56b826c78), UINT64_C(0x7e96733438db219f), UINT64_C(0x93f66e8959ad9a5d), UINT64_C(0xad77e6ffafdfa01b), + UINT64_C(0x79842c77afd94c9a), UINT64_C(0xb2fe351094030a32), UINT64_C(0x04f00838dc236276), UINT64_C(0x1064827c937cd78b), + UINT64_C(0xa914296fc9de0469), UINT64_C(0x4a87b2d1971b2b6e), UINT64_C(0x1ef28858c6e99de6), UINT64_C(0x23429a77bea42f46), + UINT64_C(0xf771817be7a38b16), UINT64_C(0xcc348f7a13deb19a), UINT64_C(0x0a91d46fb1ae97e8), UINT64_C(0x753cdb5468c83c10), + UINT64_C(0x65cc613edbcd3f84), UINT64_C(0xcb157fac042d9ab2), UINT64_C(0x18e6a31aed525487), UINT64_C(0x5924230b1281b56d), + UINT64_C(0xb828c042782945ba), UINT64_C(0x2decd50526005abe), UINT64_C(0x05caa6f761c5857a), UINT64_C(0x4c93892d66de5320), + UINT64_C(0xac796b30f48a75b3), UINT64_C(0xe11728c76eab1822), UINT64_C(0xa59ec090b0f3ed2e), UINT64_C(0xada9c2e74edc137b), + UINT64_C(0x4ca60d77ed9f8e0d), UINT64_C(0x6304a44de4bc4219), 
UINT64_C(0x361436da34a05f49), UINT64_C(0x097fcaec609fd08f), + UINT64_C(0xf9f9ae511316dcce), UINT64_C(0xa62ca6c22fa94122), UINT64_C(0xb32ebc94594cf9c8), UINT64_C(0x1b673219068f53f7), + UINT64_C(0x28a8f7de358ea82b), UINT64_C(0x7d3e002bee6f572f), UINT64_C(0xbe24c789f9ddb580), UINT64_C(0x0257b24167d83acd), + UINT64_C(0x5651f9ac1cfa5113), UINT64_C(0x225aaaa55c5d72d4), UINT64_C(0x1bb9759abf1d08b0), UINT64_C(0x7c36896386d4f50c), + UINT64_C(0xdd4ceaf465f970eb), UINT64_C(0xf349d378bfd4beb9), UINT64_C(0xf2d9ea03c79109d8), UINT64_C(0xe915c84fab4efd66), + UINT64_C(0xe401bb6a403813b6), UINT64_C(0x2171265710c01426), UINT64_C(0x6542b43cba6a4d08), UINT64_C(0x58591c6e1250104f), + UINT64_C(0x77bc044ed6c4a7a0), UINT64_C(0x73b1a5f682fd2d52), UINT64_C(0x6c2b7083b26b9976), UINT64_C(0xf9e3b1347ceaaaca), + UINT64_C(0xa709263b9c304a96), UINT64_C(0x6c6fedc1e78481dc), UINT64_C(0xbec268cc818190e0), UINT64_C(0xbafa9271d75b733b), + UINT64_C(0xeace12cbb37fc677), UINT64_C(0x1176816b69b51d98), UINT64_C(0x62d28bbf94c2762d), UINT64_C(0x142b7d89bcc06043), + UINT64_C(0x8e166c13e205cc00), UINT64_C(0xac3dcf9c75177f8e), UINT64_C(0xc75695f82b7f6c46), UINT64_C(0xdff44c46fe5e7b6d), + UINT64_C(0x932846955828d471), UINT64_C(0x7593c5e733dca4d6), UINT64_C(0xf1efc8ad9718ca14), UINT64_C(0x93a618cb5b6aff34), + UINT64_C(0x1d89f5253c2f819f), UINT64_C(0x419744eb9c63d0b2), UINT64_C(0x2b07ff7747ed7c29), UINT64_C(0x617be6e4454749a0), + UINT64_C(0xaa24d8e4142c5bf4), UINT64_C(0xe25d6c2fe999691d), UINT64_C(0xf78965d974e8e076), UINT64_C(0x8e6203aa0037ae8e), + UINT64_C(0x732c3a3a561c6d79), UINT64_C(0xd61a9622b0da5c93), UINT64_C(0xfc1c73c6152a141b), UINT64_C(0x03a4694838529e5b), + UINT64_C(0x686cb297afba7101), UINT64_C(0xbee9f55d5260fbe2), UINT64_C(0xd53a374387aa4f2a), UINT64_C(0xc6b2494c1a96d781), + UINT64_C(0xbe8aa945ac411c10), UINT64_C(0xbfc814fa4da90048), UINT64_C(0xb46847e8ecaca5f4), UINT64_C(0x83466ccfb2037365), + UINT64_C(0x39bfd895a4917200), UINT64_C(0xfd6106ab889f9c14), UINT64_C(0x87d80fcd94875b38), 
UINT64_C(0xd05a5e75bdd29067), + UINT64_C(0xc8fbbb4d3e850e9d), UINT64_C(0xef2dc9eb5228f1ae), UINT64_C(0xc3775c3e9ac4da44), UINT64_C(0x12004ef1609624ed), + UINT64_C(0x43ec24f8c096ee25), UINT64_C(0xeb207061723522ad), UINT64_C(0xbd3767314ad773e4), UINT64_C(0x4b2059a2964d28f4), + UINT64_C(0xcd4522a02ed66868), UINT64_C(0x74c6b45b4b5b5657), UINT64_C(0x48bcc161232e14b1), UINT64_C(0x958c3b741a54bd75), + UINT64_C(0x2f64940639fedc7d), UINT64_C(0xc1321efa1c279cc3), UINT64_C(0x0680b3866e485f15), UINT64_C(0x5633b30c0c7c4a96), + UINT64_C(0xb5c9b8539fa9ea3c), UINT64_C(0x1fd67c7175c87172), UINT64_C(0xe03ed40e88bcdf23), UINT64_C(0x81a69e0147fbb776), + UINT64_C(0x244e2bf676590e87), UINT64_C(0x8a86357137c0d611), UINT64_C(0x4fcaad51eba3720f), UINT64_C(0x2b8b7b933f76e019), + UINT64_C(0xecff900b265d06f4), UINT64_C(0xbc3b359d2e438bbc), UINT64_C(0x086c671b288776d9), UINT64_C(0x652c4a2d18d847ba), + }, }, - }, }; -//STATIC_ASSERT(PMPML_64_LEVELS <= 8, "Only 8 levels of data currently exist"); +// STATIC_ASSERT(PMPML_64_LEVELS <= 8, "Only 8 levels of data currently exist"); //------------------------------------------------------------- // Common math routines -static inline uint32_t fmix32_short(uint32_t h) { - h ^= h >> 13; - h *= 0xab3be54f; - h ^= h >> 16; +static inline uint32_t fmix32_short( uint32_t h ) { + h ^= h >> 13; + h *= 0xab3be54f; + h ^= h >> 16; - return h; + return h; } -static inline uint64_t fmix64_short(uint64_t k) { - k ^= k >> 33; - k *= UINT64_C(0xc4ceb9fe1a85ec53 ); - k ^= k >> 33; +static inline uint64_t fmix64_short( uint64_t k ) { + k ^= k >> 33; + k *= UINT64_C(0xc4ceb9fe1a85ec53); + k ^= k >> 33; - return k; + return k; } #define UInt32x32To64(a, b) ((uint64_t)(((uint64_t)((uint32_t)(a))) * ((uint32_t)(b)))) @@ -664,1158 +661,1253 @@ static inline uint64_t fmix64_short(uint64_t k) { // 32-bit hash static inline -void multiply32x32to64(uint32_t& rhi, uint32_t& rlo, uint32_t a, uint32_t b) { +void multiply32x32to64( uint32_t & rhi, uint32_t & rlo, uint32_t a, 
uint32_t b ) { mult32_64(rlo, rhi, a, b); } static inline -void add64(uint32_t& loWord, uint32_t& hiWord, uint32_t& hhWord, uint32_t& loAdd, uint32_t& hiAdd, uint32_t& hhAdd) { +void add64( uint32_t & loWord, uint32_t & hiWord, uint32_t & hhWord, uint32_t & loAdd, uint32_t & hiAdd, uint32_t & hhAdd ) { add96(loWord, hiWord, hhWord, loAdd, hiAdd, hhAdd); } static FORCE_INLINE -void mul32x32to64addto96(uint32_t& loWord, uint32_t& hiWord, uint32_t& hhWord, uint32_t a, uint32_t b) { +void mul32x32to64addto96( uint32_t & loWord, uint32_t & hiWord, uint32_t & hhWord, uint32_t a, uint32_t b ) { fma32_96(loWord, hiWord, hhWord, a, b); } #define PMPML_CHUNK_LOOP_INTRO_L0 \ - uint32_t ctr; \ - ctr = 0; \ + uint32_t ctr; \ + ctr = 0; \ ULARGE_INTEGER__XX mul; // Input data is read in 32-bit chunks. -#define PMPML_CHUNK_LOOP_BODY_ULI_T1( i ) \ - /*multiply32x32to64(mul.HighPart, mul.LowPart, x[i], coeff[ i ]); \ +#define PMPML_CHUNK_LOOP_BODY_ULI_T1( i ) \ + /*multiply32x32to64(mul.HighPart, mul.LowPart, x[i], coeff[ i ]); \ add64(constTerm.LowPart, constTerm.HighPart, ctr, mul.LowPart, mul.HighPart, zero);*/ \ mul32x32to64addto96(constTerm.LowPart, constTerm.HighPart, ctr, GET_U32((const uint8_t*)x, (i)*sizeof(x[0])), coeff[ i ]); // Hash data from previous blocks is read in 64-bit chunks, and always // in native endian format. 
-#define PMPML_CHUNK_LOOP_BODY_ULI_T1_64( i ) \ - /*multiply32x32to64(mul.HighPart, mul.LowPart, x[i], coeff[ i ]); \ +#define PMPML_CHUNK_LOOP_BODY_ULI_T1_64( i ) \ + /*multiply32x32to64(mul.HighPart, mul.LowPart, x[i], coeff[ i ]); \ add64(constTerm.LowPart, constTerm.HighPart, ctr, mul.LowPart, mul.HighPart, zero);*/ \ mul32x32to64addto96(constTerm.LowPart, constTerm.HighPart, ctr, GET_U64((const uint8_t*)x, (i)*sizeof(x[0])), coeff[ i ]); -#define PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST \ - /*multiply32x32to64(mul.HighPart, mul.LowPart, xLast, coeff[ size ]); \ - add64(constTerm.LowPart, constTerm.HighPart, ctr, mul.LowPart, mul.HighPart);*/ \ +#define PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST \ + /*multiply32x32to64(mul.HighPart, mul.LowPart, xLast, coeff[ size ]); \ + add64(constTerm.LowPart, constTerm.HighPart, ctr, mul.LowPart, mul.HighPart);*/ \ mul32x32to64addto96(constTerm.LowPart, constTerm.HighPart, ctr, xLast, coeff[ size ]); \ #define PMPML_CHUNK_LOOP_PRE_REDUCE_L0 /* -#define PMPML_MOD_2_32_PLUS_15( x, y ) \ - x = (uint32_t)x + UINT64_C(0xF000000E1) - (( (uint64_t)x >> 32 ) << 4) + ( x >> 32 ); \ - y = (uint32_t)x; \ - y -= ((uint32_t)(x >> 32 )) * 15; \ - if ( y < 0 ) y += PMPML_MAIN_PRIME; // y += PMPML_MAIN_PRIME * ( y < 0 ); - */ + #define PMPML_MOD_2_32_PLUS_15( x, y ) \ + * x = (uint32_t)x + UINT64_C(0xF000000E1) - (( (uint64_t)x >> 32 ) << 4) + ( x >> 32 ); \ + * y = (uint32_t)x; \ + * y -= ((uint32_t)(x >> 32 )) * 15; \ + * if ( y < 0 ) y += PMPML_MAIN_PRIME; // y += PMPML_MAIN_PRIME * ( y < 0 ); + */ #define PMPML_CHUNK_REDUCE_96_TO_64 -#define PMPML_CHUNK_REDUCE_64_TO_32 \ -{ \ - uint32_t lo, hi; \ - multiply32x32to64(hi, lo, constTerm.HighPart, 15); \ - uint32_t part = ctr * 225 + (hi << 4) - hi + 15; \ - constTerm.LowPart += part; \ - constTerm.HighPart = 1 + (constTerm.LowPart < part); \ - constTerm.HighPart -= (constTerm.LowPart < lo); \ - constTerm.LowPart -= lo; \ - if ( likely( constTerm.LowPart >= 30) ) { constTerm.LowPart -= constTerm.HighPart 
* 15; constTerm.HighPart = 0; } \ - else \ - { \ - if ( constTerm.HighPart ) \ - { \ - constTerm.LowPart -= constTerm.HighPart * 15; \ - constTerm.HighPart = 1; \ - if ( likely( constTerm.LowPart >= 15)) { constTerm.LowPart -= 15; constTerm.HighPart = 0; } \ - else \ - { \ - constTerm.LowPart -= 15; \ - constTerm.HighPart = 0; \ - } \ - } \ - } \ +#define PMPML_CHUNK_REDUCE_64_TO_32 \ +{ \ + uint32_t lo, hi; \ + multiply32x32to64(hi, lo, constTerm.HighPart, 15);\ + uint32_t part = ctr * 225 + (hi << 4) - hi + 15;\ + constTerm.LowPart += part; \ + constTerm.HighPart = 1 + (constTerm.LowPart < part);\ + constTerm.HighPart -= (constTerm.LowPart < lo);\ + constTerm.LowPart -= lo; \ + if ( likely( constTerm.LowPart >= 30) ) { \ + constTerm.LowPart -= constTerm.HighPart * 15; \ + constTerm.HighPart = 0; \ + } else { \ + if ( constTerm.HighPart ) { \ + constTerm.LowPart -= constTerm.HighPart * 15;\ + constTerm.HighPart = 1; \ + if ( likely( constTerm.LowPart >= 15)) {\ + constTerm.LowPart -= 15; \ + constTerm.HighPart = 0; \ + } else { \ + constTerm.LowPart -= 15;\ + constTerm.HighPart = 0;\ + } \ + } \ + } \ } #define PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN \ - PMPML_CHUNK_REDUCE_96_TO_64 \ - PMPML_CHUNK_REDUCE_64_TO_32 \ + PMPML_CHUNK_REDUCE_96_TO_64 \ + PMPML_CHUNK_REDUCE_64_TO_32 \ return constTerm.QuadPart; -#define PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN_32x32_ONLY \ -{ \ - constTerm.QuadPart = constTerm.LowPart + PMPML_MAIN_PRIME - constTerm.HighPart * UINT64_C( 15 ); \ - if ( likely( constTerm.LowPart >= 30) ) { constTerm.LowPart -= (constTerm.HighPart << 4) - constTerm.HighPart; return fmix32_short( constTerm.LowPart ); } \ - else \ - { \ - constTerm.LowPart -= constTerm.HighPart * 15; \ - if ( constTerm.LowPart < 30 ) return fmix32_short( constTerm.LowPart ); \ - else \ - { \ - constTerm.LowPart += 15; \ - return fmix32_short( constTerm.LowPart ); \ - } \ - } \ +#define PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN_32x32_ONLY \ +{ \ + 
constTerm.QuadPart = constTerm.LowPart + PMPML_MAIN_PRIME - constTerm.HighPart * UINT64_C( 15 );\ + if ( likely( constTerm.LowPart >= 30) ) { \ + constTerm.LowPart -= (constTerm.HighPart << 4) - constTerm.HighPart; \ + return fmix32_short( constTerm.LowPart ); \ + } else { \ + constTerm.LowPart -= constTerm.HighPart * 15; \ + if ( constTerm.LowPart < 30 ) { \ + return fmix32_short( constTerm.LowPart ); \ + } else { \ + constTerm.LowPart += 15; \ + return fmix32_short( constTerm.LowPart ); \ + } \ + } \ } -#define PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN \ -{ \ - uint32_t lo, hi; \ - multiply32x32to64(hi, lo, constTerm.HighPart, 15); \ - uint32_t part = ctr * 225 + (hi << 4) - hi + 15; \ - constTerm.LowPart += part; \ - constTerm.HighPart = 1 + (constTerm.LowPart < part); \ - constTerm.HighPart -= (constTerm.LowPart < lo); \ - constTerm.LowPart -= lo; \ - if ( likely( constTerm.LowPart >= 30) ) { constTerm.LowPart -= (constTerm.HighPart << 4) - constTerm.HighPart/*constTerm.HighPart * 15*/; return fmix32_short( constTerm.LowPart ); } \ - else \ - { \ - if ( constTerm.HighPart ) \ - { \ - constTerm.LowPart -= constTerm.HighPart * 15 - 15; \ - constTerm.HighPart = 1; \ - if ( likely( constTerm.LowPart >= 15)) { constTerm.LowPart -= 15; return fmix32_short( constTerm.LowPart ); } \ - else \ - { \ - return constTerm.LowPart; \ - } \ - } \ - else \ - return fmix32_short( constTerm.LowPart ); \ - } \ +#define PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN \ +{ \ + uint32_t lo, hi; \ + multiply32x32to64(hi, lo, constTerm.HighPart, 15); \ + uint32_t part = ctr * 225 + (hi << 4) - hi + 15; \ + constTerm.LowPart += part; \ + constTerm.HighPart = 1 + (constTerm.LowPart < part); \ + constTerm.HighPart -= (constTerm.LowPart < lo); \ + constTerm.LowPart -= lo; \ + if ( likely( constTerm.LowPart >= 30) ) { \ + constTerm.LowPart -= (constTerm.HighPart << 4) - constTerm.HighPart/*constTerm.HighPart * 15*/; \ + return fmix32_short( constTerm.LowPart ); \ + } else { 
\ + if ( constTerm.HighPart ) { \ + constTerm.LowPart -= constTerm.HighPart * 15 - 15; \ + constTerm.HighPart = 1; \ + if ( likely( constTerm.LowPart >= 15)) { \ + constTerm.LowPart -= 15; \ + return fmix32_short( constTerm.LowPart ); \ + } else { \ + return constTerm.LowPart; \ + } \ + } else { \ + return fmix32_short( constTerm.LowPart ); \ + } \ + } \ } -class PMP_Multilinear_Hasher_32 -{ +class PMP_Multilinear_Hasher_32 { private: - random_data_for_PMPML_32* curr_rd; - uint64_t coeff0; + random_data_for_PMPML_32 * curr_rd; + uint64_t coeff0; - // calls to be done from LEVEL=0 - template < bool bswap > - FORCE_INLINE uint64_t hash_of_string_chunk_compact( const uint32_t* coeff, ULARGE_INTEGER__XX constTerm, const uint32_t* x ) const { - PMPML_CHUNK_LOOP_INTRO_L0 + // calls to be done from LEVEL=0 + template + FORCE_INLINE uint64_t hash_of_string_chunk_compact( const uint32_t * coeff, + ULARGE_INTEGER__XX constTerm, const uint32_t * x ) const { + PMPML_CHUNK_LOOP_INTRO_L0 #if defined(HAVE_AVX2) && (PMPML_32_CHUNK_SIZE_LOG2 >= 3) __m256i ctr0, ctr1, mask_low; - __m256i a, data, product, temp; - uint64_t temp_fin; - int i; - - ctr0 = _mm256_setzero_si256 (); // Sets the 128-bit value to zero. 
- ctr1 = _mm256_setzero_si256 (); - mask_low = _mm256_set_epi32 ( 0, -1, 0 , -1, 0, -1, 0 , -1 ); - - uint32_t *x1, *x2, *x3, *c1, *c2, *c3; - -#if (PMPML_32_CHUNK_SIZE_LOG2 >= 6) - for ( i=0; i 3) - - a = _mm256_load_si256 ((__m256i *)(coeff+i+8)); - data = _mm256_loadu_si256 ((__m256i *)(x+i+8)); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp ); - - a = _mm256_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm256_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp ); - -#endif -#if (PMPML_32_CHUNK_SIZE_LOG2 > 4) - - a = _mm256_load_si256 ((__m256i *)(coeff+i+16)); - data = _mm256_loadu_si256 ((__m256i *)(x+i+16)); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp ); - - a = _mm256_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm256_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp ); - - a = _mm256_load_si256 ((__m256i *)(coeff+i+24)); - data = _mm256_loadu_si256 ((__m256i *)(x+i+24)); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, 
product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp ); - - a = _mm256_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm256_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp ); - -#endif -#if (PMPML_32_CHUNK_SIZE_LOG2 > 5) - - a = _mm256_load_si256 ((__m256i *)(coeff+i+32)); - data = _mm256_loadu_si256 ((__m256i *)(x+i+32)); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp ); - - a = _mm256_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm256_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp ); - - a = _mm256_load_si256 ((__m256i *)(coeff+i+40)); - data = _mm256_loadu_si256 ((__m256i *)(x+i+40)); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp ); - - a = _mm256_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm256_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp 
); - - a = _mm256_load_si256 ((__m256i *)(coeff+i+48)); - data = _mm256_loadu_si256 ((__m256i *)(x+i+48)); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp ); - - a = _mm256_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm256_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp ); - - a = _mm256_load_si256 ((__m256i *)(coeff+i+56)); - data = _mm256_loadu_si256 ((__m256i *)(x+i+56)); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp ); - - a = _mm256_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm256_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm256_mul_epu32 ( data, a); - temp = _mm256_srli_epi64( product, 32 ); - ctr1 = _mm256_add_epi64 ( ctr1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - ctr0 = _mm256_add_epi64 ( ctr0, product );//ctr0 = _mm256_add_epi64 ( ctr0, temp ); -#endif - } - - temp = _mm256_unpackhi_epi64 ( ctr0, ctr1 ); // Interleaves the upper signed or unsigned 64-bit integer in a with the upper signed or unsigned 64-bit integer in b. r0 := a1 ; r1 := b1 ; ... - data = _mm256_unpacklo_epi64 ( ctr0, ctr1 ); // Interleaves the lower signed or unsigned 64-bit integer in a with the lower signed or unsigned 64-bit integer in b. r0 := a0 ; r1 := b0 ; ... 
- ctr1 = _mm256_add_epi64 ( data, temp ); - - uint64_t lo = *(uint64_t*)(&ctr1) + ((uint64_t*)(&ctr1))[2]; - uint64_t hi = ((uint64_t*)(&ctr1))[1] + ((uint64_t*)(&ctr1))[3]; - uint32_t lohi = lo >> 32; - uint32_t hilo = hi; - uint32_t diff = lohi - hilo; - hi += diff; - lo = (uint32_t)lo + (((uint64_t)(uint32_t)hi) << 32 ); - constTerm.QuadPart += lo; - ctr += constTerm.QuadPart < lo; - ctr += hi >> 32; + __m256i a, data, product, temp; + uint64_t temp_fin; + int i; + + ctr0 = _mm256_setzero_si256(); // Sets the 128-bit value to zero. + ctr1 = _mm256_setzero_si256(); + mask_low = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + + uint32_t * x1, * x2, * x3, * c1, * c2, * c3; + + #if (PMPML_32_CHUNK_SIZE_LOG2 >= 6) + for (i = 0; i < PMPML_32_CHUNK_SIZE; i += 64) + #elif (PMPML_32_CHUNK_SIZE_LOG2 == 5) + for (i = 0; i < PMPML_32_CHUNK_SIZE; i += 32) + #elif (PMPML_32_CHUNK_SIZE_LOG2 == 4) + for (i = 0; i < PMPML_32_CHUNK_SIZE; i += 16) + #else + for (i = 0; i < PMPML_32_CHUNK_SIZE; i += 8) + #endif + { + a = _mm256_load_si256((__m256i * )(coeff + i)); // Loads 256-bit value. Address p must be 32-byte + // aligned. + data = _mm256_loadu_si256((__m256i *)(x + i)); // Loads 256-bit value. Address p does not need be + // 32-byte aligned. + product = _mm256_mul_epu32(data, a); // A 256-bit value that contains four 64-bit unsigned + // integers. The result can be expressed by the + // following equations. r0 := a0 * b0; r1 := a2 * b2; + // ... + temp = _mm256_srli_epi64(product, 32); // Shifts the 4 signed or unsigned 64-bit integers in + // a right by count bits while shifting in zeros. 
+ ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + a = _mm256_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm256_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + #if (PMPML_32_CHUNK_SIZE_LOG2 > 3) + + a = _mm256_load_si256((__m256i * )(coeff + i + 8)); + data = _mm256_loadu_si256((__m256i *)(x + i + 8)); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + a = _mm256_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm256_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + #endif + #if (PMPML_32_CHUNK_SIZE_LOG2 > 4) + + a = _mm256_load_si256((__m256i * )(coeff + i + 16)); + data = _mm256_loadu_si256((__m256i *)(x + i + 16)); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + a = _mm256_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm256_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 
32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + a = _mm256_load_si256((__m256i * )(coeff + i + 24)); + data = _mm256_loadu_si256((__m256i *)(x + i + 24)); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + a = _mm256_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm256_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + #endif + #if (PMPML_32_CHUNK_SIZE_LOG2 > 5) + + a = _mm256_load_si256((__m256i * )(coeff + i + 32)); + data = _mm256_loadu_si256((__m256i *)(x + i + 32)); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + a = _mm256_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm256_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + a = _mm256_load_si256((__m256i * )(coeff + i + 40)); + data = _mm256_loadu_si256((__m256i *)(x + i + 40)); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = 
_mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + a = _mm256_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm256_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + a = _mm256_load_si256((__m256i * )(coeff + i + 48)); + data = _mm256_loadu_si256((__m256i *)(x + i + 48)); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + a = _mm256_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm256_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + a = _mm256_load_si256((__m256i * )(coeff + i + 56)); + data = _mm256_loadu_si256((__m256i *)(x + i + 56)); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); // ctr0 = _mm256_add_epi64 ( ctr0, temp ); + + a = _mm256_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm256_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm256_mul_epu32(data, a); + temp = _mm256_srli_epi64(product, 32); + ctr1 = _mm256_add_epi64(ctr1, temp ); + // temp = _mm256_and_si256 ( mask_low, product ); + ctr0 = _mm256_add_epi64(ctr0, product); 
// ctr0 = _mm256_add_epi64 ( ctr0, temp ); + #endif + } + + temp = _mm256_unpackhi_epi64(ctr0, ctr1); // Interleaves the upper signed or unsigned 64-bit integer in a with + // the upper signed or unsigned 64-bit integer in b. r0 := a1 ; r1 := + // b1 ; ... + data = _mm256_unpacklo_epi64(ctr0, ctr1); // Interleaves the lower signed or unsigned 64-bit integer in a with + // the lower signed or unsigned 64-bit integer in b. r0 := a0 ; r1 := + // b0 ; ... + ctr1 = _mm256_add_epi64(data, temp); + + uint64_t lo = *(uint64_t *)(&ctr1) + ((uint64_t *)(&ctr1))[2]; + uint64_t hi = ((uint64_t *)(&ctr1))[1] + ((uint64_t *)(&ctr1))[3]; + uint32_t lohi = lo >> 32; + uint32_t hilo = hi; + uint32_t diff = lohi - hilo; + hi += diff; + lo = (uint32_t)lo + (((uint64_t)(uint32_t)hi) << 32); + constTerm.QuadPart += lo; + ctr += constTerm.QuadPart < lo; + ctr += hi >> 32; #elif defined(HAVE_SSE_2) && (PMPML_32_CHUNK_SIZE_LOG2 >= 2) - __m128i ctr0, ctr1, mask_low; - __m128i a, data, product, temp; - uint64_t temp_fin; - int i; - - ctr0 = _mm_setzero_si128 (); // Sets the 128-bit value to zero. 
- ctr1 = _mm_setzero_si128 (); - mask_low = _mm_set_epi32 ( 0, -1, 0 , -1 ); - - uint32_t *x1, *x2, *x3, *c1, *c2, *c3; - -#if (PMPML_32_CHUNK_SIZE_LOG2 >= 6) - for ( i=0; i 2) - - a = _mm_load_si128 ((__m128i *)(coeff+i+4)); - data = _mm_loadu_si128 ((__m128i *)(x+i+4)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - -#endif -#if (PMPML_32_CHUNK_SIZE_LOG2 > 3) - - a = _mm_load_si128 ((__m128i *)(coeff+i+8)); - data = _mm_loadu_si128 ((__m128i *)(x+i+8)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - - a = _mm_load_si128 ((__m128i *)(coeff+i+12)); - data = _mm_loadu_si128 ((__m128i *)(x+i+12)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 
); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - -#endif -#if (PMPML_32_CHUNK_SIZE_LOG2 > 4) - - a = _mm_load_si128 ((__m128i *)(coeff+i+16)); - data = _mm_loadu_si128 ((__m128i *)(x+i+16)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - - a = _mm_load_si128 ((__m128i *)(coeff+i+20)); - data = _mm_loadu_si128 ((__m128i *)(x+i+20)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - - a = _mm_load_si128 ((__m128i *)(coeff+i+24)); - data = _mm_loadu_si128 ((__m128i *)(x+i+24)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, 
product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - - a = _mm_load_si128 ((__m128i *)(coeff+i+28)); - data = _mm_loadu_si128 ((__m128i *)(x+i+28)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - -#endif -#if (PMPML_32_CHUNK_SIZE_LOG2 > 5) - - x1 = const_cast( x+i+36 ); - x2 = const_cast( x+i+40 ); - x3 = const_cast( x+i+44 ); - c1 = const_cast( coeff+i+36 ); - c2 = const_cast( coeff+i+40 ); - c3 = const_cast( coeff+i+44 ); - a = _mm_load_si128 ((__m128i *)(coeff+i+32)); - data = _mm_loadu_si128 ((__m128i *)(x+i+32)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = 
_mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - - a = _mm_load_si128 ((__m128i *)(c1)); - data = _mm_loadu_si128 ((__m128i *)(x1)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - - a = _mm_load_si128 ((__m128i *)(c2)); - data = _mm_loadu_si128 ((__m128i *)(x2)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - - a = _mm_load_si128 ((__m128i *)(c3)); - data = _mm_loadu_si128 ((__m128i *)(x3)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - 
//temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - - x1 = const_cast( x+i+52 ); - x2 = const_cast( x+i+56 ); - x3 = const_cast( x+i+60 ); - c1 = const_cast( coeff+i+52 ); - c2 = const_cast( coeff+i+56 ); - c3 = const_cast( coeff+i+60 ); - a = _mm_load_si128 ((__m128i *)(coeff+i+48)); - data = _mm_loadu_si128 ((__m128i *)(x+i+48)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - - a = _mm_load_si128 ((__m128i *)(c1)); - data = _mm_loadu_si128 ((__m128i *)(x1)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - - a = _mm_load_si128 ((__m128i *)(c2)); - data = _mm_loadu_si128 ((__m128i *)(x2)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = 
_mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - - a = _mm_load_si128 ((__m128i *)(c3)); - data = _mm_loadu_si128 ((__m128i *)(x3)); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); - - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); - temp = _mm_srli_epi64( product, 32 ); - ctr1 = _mm_add_epi64 ( ctr1, temp ); - //temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, product );//ctr0 = _mm_add_epi64 ( ctr0, temp ); -#endif - } - - temp = _mm_unpackhi_epi64 ( ctr0, ctr1 ); // Interleaves the upper signed or unsigned 64-bit integer in a with the upper signed or unsigned 64-bit integer in b. r0 := a1 ; r1 := b1 - data = _mm_unpacklo_epi64 ( ctr0, ctr1 ); // Interleaves the lower signed or unsigned 64-bit integer in a with the lower signed or unsigned 64-bit integer in b. r0 := a0 ; r1 := b0 - ctr1 = _mm_add_epi64 ( data, temp ); - -#if defined(_MSC_VER) - constTerm.QuadPart += ctr1.m128i_u32[0]; // Microsoft specific - ctr.QuadPart += ctr1.m128i_u64[1] + ctr1.m128i_u32[1]; -#elif defined(HAVE_SSE_4_1) - constTer.QuadPart += _mm_extract_epi32(ctr1,0); - ctr.QuadPart += _mm_extract_epi64(ctr1,0) + _mm_extract_epi32(ctr1,1); -#elif (defined __arm__ || defined __aarch64__) + __m128i ctr0, ctr1, mask_low; + __m128i a, data, product, temp; + uint64_t temp_fin; + int i; + + ctr0 = _mm_setzero_si128(); // Sets the 128-bit value to zero. 
+ ctr1 = _mm_setzero_si128(); + mask_low = _mm_set_epi32(0, -1, 0, -1); + + uint32_t * x1, * x2, * x3, * c1, * c2, * c3; + + #if (PMPML_32_CHUNK_SIZE_LOG2 >= 6) + for (i = 0; i < PMPML_32_CHUNK_SIZE; i += 64) + #elif (PMPML_32_CHUNK_SIZE_LOG2 == 5) + for (i = 0; i < PMPML_32_CHUNK_SIZE; i += 32) + #elif (PMPML_32_CHUNK_SIZE_LOG2 == 4) + for (i = 0; i < PMPML_32_CHUNK_SIZE; i += 16) + #elif (PMPML_32_CHUNK_SIZE_LOG2 == 3) + for (i = 0; i < PMPML_32_CHUNK_SIZE; i += 8) + #else + for (i = 0; i < PMPML_32_CHUNK_SIZE; i += 4) + #endif + { + a = _mm_load_si128((__m128i * )(coeff + i)); // Loads 128-bit value. Address p must be 16-byte + // aligned. + data = _mm_loadu_si128((__m128i *)(x + i)); // Loads 128-bit value. Address p does not need be + // 16-byte aligned. + product = _mm_mul_epu32(data, a); // A 128-bit value that contains two 64-bit unsigned + // integers. The result can be expressed by the following + // equations. r0 := a0 * b0; r1 := a2 * b2 + temp = _mm_srli_epi64(product, 32); // Shifts the 2 signed or unsigned 64-bit integers in a + // right by count bits while shifting in zeros. 
+ ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + #if (PMPML_32_CHUNK_SIZE_LOG2 > 2) + + a = _mm_load_si128((__m128i * )(coeff + i + 4)); + data = _mm_loadu_si128((__m128i *)(x + i + 4)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + #endif + #if (PMPML_32_CHUNK_SIZE_LOG2 > 3) + + a = _mm_load_si128((__m128i * )(coeff + i + 8)); + data = _mm_loadu_si128((__m128i *)(x + i + 8)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, 
product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_load_si128((__m128i * )(coeff + i + 12)); + data = _mm_loadu_si128((__m128i *)(x + i + 12)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + #endif + #if (PMPML_32_CHUNK_SIZE_LOG2 > 4) + + a = _mm_load_si128((__m128i * )(coeff + i + 16)); + data = _mm_loadu_si128((__m128i *)(x + i + 16)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_load_si128((__m128i * )(coeff + i + 20)); + data = _mm_loadu_si128((__m128i *)(x + i + 20)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 
64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_load_si128((__m128i * )(coeff + i + 24)); + data = _mm_loadu_si128((__m128i *)(x + i + 24)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_load_si128((__m128i * )(coeff + i + 28)); + data = _mm_loadu_si128((__m128i *)(x + i + 28)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + #endif + #if (PMPML_32_CHUNK_SIZE_LOG2 > 5) + + x1 = const_cast(x + i + 36); + x2 = const_cast(x + i + 40); + x3 = const_cast(x + i + 44); + c1 = const_cast(coeff + i + 36); + c2 = const_cast(coeff + i + 40); + c3 = const_cast(coeff + i + 44); + a = _mm_load_si128((__m128i * )(coeff + i + 32)); + data = 
_mm_loadu_si128((__m128i *)(x + i + 32)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_load_si128((__m128i * )(c1)); + data = _mm_loadu_si128((__m128i *)(x1)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_load_si128((__m128i * )(c2)); + data = _mm_loadu_si128((__m128i *)(x2)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); 
// ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_load_si128((__m128i * )(c3)); + data = _mm_loadu_si128((__m128i *)(x3)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + x1 = const_cast(x + i + 52); + x2 = const_cast(x + i + 56); + x3 = const_cast(x + i + 60); + c1 = const_cast(coeff + i + 52); + c2 = const_cast(coeff + i + 56); + c3 = const_cast(coeff + i + 60); + a = _mm_load_si128((__m128i * )(coeff + i + 48)); + data = _mm_loadu_si128((__m128i *)(x + i + 48)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_load_si128((__m128i * )(c1)); + data = _mm_loadu_si128((__m128i *)(x1)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 
1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_load_si128((__m128i * )(c2)); + data = _mm_loadu_si128((__m128i *)(x2)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_load_si128((__m128i * )(c3)); + data = _mm_loadu_si128((__m128i *)(x3)); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); + temp = _mm_srli_epi64(product, 32); + ctr1 = _mm_add_epi64(ctr1, temp ); + // temp = _mm_and_si128 ( mask_low, product ); + ctr0 = _mm_add_epi64(ctr0, product); // ctr0 = _mm_add_epi64 ( ctr0, temp ); + #endif + } + + temp = _mm_unpackhi_epi64(ctr0, ctr1); // Interleaves the upper signed or unsigned 64-bit integer in a with the + // upper signed or unsigned 64-bit integer in b. 
r0 := a1 ; r1 := b1 + data = _mm_unpacklo_epi64(ctr0, ctr1); // Interleaves the lower signed or unsigned 64-bit integer in a with the + // lower signed or unsigned 64-bit integer in b. r0 := a0 ; r1 := b0 + ctr1 = _mm_add_epi64(data, temp); + + #if defined(_MSC_VER) + constTerm.QuadPart += ctr1.m128i_u32[0]; // Microsoft specific + ctr.QuadPart += ctr1.m128i_u64[1] + ctr1.m128i_u32[1]; + #elif defined(HAVE_SSE_4_1) + constTer.QuadPart += _mm_extract_epi32(ctr1, 0); + ctr.QuadPart += _mm_extract_epi64(ctr1, 0) + _mm_extract_epi32(ctr1, 1); + #elif (defined __arm__ || defined __aarch64__) uint32_t b[4]; - _mm_storeu_si128((__m128i *)b,ctr1); - constTerm.QuadPart += b[0]; - ctr.QuadPart += b[1] + b[2] + ((uint64_t) b[3] <<32); -#else - uint64_t lo = ((uint64_t*)(&ctr1))[0]; - uint64_t hi = ((uint64_t*)(&ctr1))[1]; -/* constTerm.QuadPart += lo; - ctr += constTerm.QuadPart < lo; - constTerm.HighPart += ((uint32_t*)(&ctr1))[2]; - ctr += constTerm.HighPart < ((uint32_t*)(&ctr1))[2]; - ctr += ((uint32_t*)(&ctr1))[3];*/ - uint32_t lohi = lo >> 32; - uint32_t hilo = hi; - uint32_t diff = lohi - hilo; - hi += diff; - lo = (uint32_t)lo + (((uint64_t)(uint32_t)hi) << 32 ); - constTerm.QuadPart += lo; - ctr += constTerm.QuadPart < lo; - ctr += hi >> 32; -#endif + _mm_storeu_si128((__m128i *)b, ctr1); + constTerm.QuadPart += b[0]; + ctr.QuadPart += b[1] + b[2] + ((uint64_t)b[3] << 32); + #else + uint64_t lo = ((uint64_t *)(&ctr1))[0]; + uint64_t hi = ((uint64_t *)(&ctr1))[1]; +/* + * constTerm.QuadPart += lo; + * ctr += constTerm.QuadPart < lo; + * constTerm.HighPart += ((uint32_t*)(&ctr1))[2]; + * ctr += constTerm.HighPart < ((uint32_t*)(&ctr1))[2]; + * ctr += ((uint32_t*)(&ctr1))[3]; + */ + uint32_t lohi = lo >> 32; + uint32_t hilo = hi; + uint32_t diff = lohi - hilo; + hi += diff; + lo = (uint32_t)lo + (((uint64_t)(uint32_t)hi) << 32); + constTerm.QuadPart += lo; + ctr += constTerm.QuadPart < lo; + ctr += hi >> 32; + #endif #else // No AVX2 and no SSE - for ( uint32_t i=0; i 
2) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 7 + i ) -#endif - } + for (uint32_t i = 0; i < PMPML_32_CHUNK_SIZE; i += 8) { + PMPML_CHUNK_LOOP_BODY_ULI_T1(0 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1(1 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1(2 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3 + i) + #if (PMPML_32_CHUNK_SIZE_LOG2 > 2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(4 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1(5 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1(6 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1(7 + i) + #endif + } #endif // PMPML_USE_SSE - PMPML_CHUNK_LOOP_PRE_REDUCE_L0 + PMPML_CHUNK_LOOP_PRE_REDUCE_L0 - PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN - } + PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN + } - template < bool bswap > - FORCE_INLINE uint64_t hash_of_beginning_of_string_chunk_type2( const uint32_t* coeff, ULARGE_INTEGER__XX constTerm, const unsigned char* tail, unsigned int tail_size ) const - { - PMPML_CHUNK_LOOP_INTRO_L0 - uint32_t size = tail_size >> PMPML_32_WORD_SIZE_BYTES_LOG2; - const uint32_t* x = (const uint32_t*)tail; + template + FORCE_INLINE uint64_t hash_of_beginning_of_string_chunk_type2( const uint32_t * coeff, ULARGE_INTEGER__XX constTerm, + const unsigned char * tail, unsigned int tail_size ) const { + PMPML_CHUNK_LOOP_INTRO_L0 + uint32_t size = tail_size >> PMPML_32_WORD_SIZE_BYTES_LOG2; + const uint32_t * x = (const uint32_t *)tail; #if defined(HAVE_SSE_2) - __m128i ctr0, ctr1, a, data, product, temp, mask_low; - int i; - - ctr0 = _mm_setzero_si128 (); // Sets the 128-bit value to zero. - ctr1 = _mm_setzero_si128 (); - mask_low = _mm_set_epi32 ( 0, -1, 0 , -1 ); - - for ( i=0; i<(size&0xFFFFFFF8); i+=4 ) - { - a = _mm_load_si128 ((__m128i *)(coeff+i)); // Loads 128-bit value. Address p must be 16-byte aligned. - data = _mm_loadu_si128 ((__m128i *)(x+i)); // Loads 128-bit value. Address p does not need be 16-byte aligned. 
- product = _mm_mul_epu32 ( data, a); // A 128-bit value that contains two 64-bit unsigned integers. The result can be expressed by the following equations. r0 := a0 * b0; r1 := a2 * b2 - temp = _mm_srli_epi64( product, 32 ); // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. - ctr1 = _mm_add_epi64 ( ctr1, temp ); - temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, temp ); - -// a = _mm_srli_epi64 ( a, 32 ); -// data = _mm_srli_epi64 ( data, 32 ); - a = _mm_shuffle_epi32( a, 1*1+0*4+3*16+2*64 ); - data = _mm_shuffle_epi32( data, 1*1+0*4+3*16+2*64 ); - product = _mm_mul_epu32 ( data, a); // A 128-bit value that contains two 64-bit unsigned integers. The result can be expressed by the following equations. r0 := a0 * b0; r1 := a2 * b2 - temp = _mm_srli_epi64( product, 32 ); // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. - ctr1 = _mm_add_epi64 ( ctr1, temp ); - temp = _mm_and_si128 ( mask_low, product ); - ctr0 = _mm_add_epi64 ( ctr0, temp ); - } - - temp = _mm_unpackhi_epi64 ( ctr0, ctr1 ); // Interleaves the upper signed or unsigned 64-bit integer in a with the upper signed or unsigned 64-bit integer in b. r0 := a1 ; r1 := b1 - data = _mm_unpacklo_epi64 ( ctr0, ctr1 ); // Interleaves the lower signed or unsigned 64-bit integer in a with the lower signed or unsigned 64-bit integer in b. r0 := a0 ; r1 := b0 - ctr1 = _mm_add_epi64 ( data, temp ); - -#if defined(_MSC_VER) + __m128i ctr0, ctr1, a, data, product, temp, mask_low; + int i; + + ctr0 = _mm_setzero_si128(); // Sets the 128-bit value to zero. + ctr1 = _mm_setzero_si128(); + mask_low = _mm_set_epi32(0, -1, 0, -1); + + for (i = 0; i < (size & 0xFFFFFFF8); i += 4) { + a = _mm_load_si128((__m128i * )(coeff + i)); // Loads 128-bit value. Address p must be 16-byte + // aligned. + data = _mm_loadu_si128((__m128i *)(x + i)); // Loads 128-bit value. 
Address p does not need be + // 16-byte aligned. + product = _mm_mul_epu32(data, a); // A 128-bit value that contains two 64-bit unsigned + // integers. The result can be expressed by the following + // equations. r0 := a0 * b0; r1 := a2 * b2 + temp = _mm_srli_epi64(product, 32); // Shifts the 2 signed or unsigned 64-bit integers in a + // right by count bits while shifting in zeros. + ctr1 = _mm_add_epi64(ctr1, temp); + temp = _mm_and_si128(mask_low, product); + ctr0 = _mm_add_epi64(ctr0, temp); + +// a = _mm_srli_epi64 ( a, 32 ); +// data = _mm_srli_epi64 ( data, 32 ); + a = _mm_shuffle_epi32(a , 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + data = _mm_shuffle_epi32(data, 1 * 1 + 0 * 4 + 3 * 16 + 2 * 64); + product = _mm_mul_epu32(data, a); // A 128-bit value that contains two 64-bit unsigned integers. The + // result can be expressed by the following equations. r0 := a0 * b0; + // r1 := a2 * b2 + temp = _mm_srli_epi64(product, 32); // Shifts the 2 signed or unsigned 64-bit integers in a right by + // count bits while shifting in zeros. + ctr1 = _mm_add_epi64(ctr1, temp); + temp = _mm_and_si128(mask_low, product); + ctr0 = _mm_add_epi64(ctr0, temp); + } + + temp = _mm_unpackhi_epi64(ctr0, ctr1); // Interleaves the upper signed or unsigned 64-bit integer in a with the + // upper signed or unsigned 64-bit integer in b. r0 := a1 ; r1 := b1 + data = _mm_unpacklo_epi64(ctr0, ctr1); // Interleaves the lower signed or unsigned 64-bit integer in a with the + // lower signed or unsigned 64-bit integer in b. 
r0 := a0 ; r1 := b0 + ctr1 = _mm_add_epi64(data, temp); + + #if defined(_MSC_VER) constTerm.QuadPart += ctr1.m128i_u32[0]; // Microsoft specific - ctr.QuadPart += ctr1.m128i_u64[1] + ctr1.m128i_u32[1]; -#elif 0 && defined( __SSE4_1__) - constTerm.QuadPart += _mm_extract_epi32(ctr1,0); - ctr.QuadPart += _mm_extract_epi64(ctr1,0) + _mm_extract_epi32(ctr1,1); -#elif 0 && defined(IDEK) + ctr.QuadPart += ctr1.m128i_u64[1] + ctr1.m128i_u32[1]; + #elif 0 && defined(__SSE4_1__) + constTerm.QuadPart += _mm_extract_epi32(ctr1, 0); + ctr.QuadPart += _mm_extract_epi64(ctr1, 0) + _mm_extract_epi32(ctr1, 1); + #elif 0 && defined(IDEK) uint32_t b[4]; - _mm_storeu_si128((__m128i *)b,ctr1); + _mm_storeu_si128((__m128i *)b, ctr1); constTerm.QuadPart += b[0]; - ctr.QuadPart += b[1] + b[2] + ((uint64_t) b[3] <<32); -#else - constTerm.QuadPart += *(uint64_t*)(&ctr1); - ctr += constTerm.QuadPart < *(uint64_t*)(&ctr1); - constTerm.HighPart += ((uint32_t*)(&ctr1))[2]; - ctr += constTerm.HighPart < ((uint32_t*)(&ctr1))[2]; - ctr += ((uint32_t*)(&ctr1))[3]; -#endif + ctr.QuadPart += b[1] + b[2] + ((uint64_t)b[3] << 32); + #else + constTerm.QuadPart += * (uint64_t *)(&ctr1); + ctr += constTerm.QuadPart < *(uint64_t *)(&ctr1); + constTerm.HighPart += ((uint32_t *)(&ctr1))[2]; + ctr += constTerm.HighPart < ((uint32_t *)(&ctr1))[2]; + ctr += ((uint32_t *)(&ctr1))[3]; + #endif #else // HAVE_SSE_2 - for ( uint32_t i=0; i<(size&0xFFFFFFF8); i+=8 ) - { - PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 + i ) -#if (PMPML_32_CHUNK_SIZE_LOG2 > 2) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 7 + i ) -#endif - } + for (uint32_t i = 0; i < (size & 0xFFFFFFF8); i += 8) { + PMPML_CHUNK_LOOP_BODY_ULI_T1(0 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1(1 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1(2 + i) + 
PMPML_CHUNK_LOOP_BODY_ULI_T1(3 + i) + #if (PMPML_32_CHUNK_SIZE_LOG2 > 2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(4 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1(5 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1(6 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1(7 + i) + #endif + } #endif // HAVE_SSE_2 - uint32_t offset = size & 0xFFFFFFF8; - - switch( size & 0x7 ) - { - case 0: { break; } - case 1: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) } break; - case 2: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 + offset ) } break; - case 3: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 + offset ) } break; - case 4: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 + offset ) } break; - case 5: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 + offset ) } break; - case 6: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 + offset ) } break; - case 7: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 + offset ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 + offset ) } break; - } - - uint32_t xLast; - switch ( tail_size & ( PMPML_32_WORD_SIZE_BYTES - 1 ) ) - { - case 0: { xLast = 0x1; break;} - case 1: { xLast = 0x100 | tail[tail_size-1]; break;} - case 2: { xLast = GET_U16(tail + tail_size - 2, 0) | 0x10000; break; } - case 3: { xLast = tail[ tail_size - 1 ]; xLast = ( 
xLast << 16 ) | GET_U16(tail + tail_size - 3, 0) | 0x1000000; break;} - } - - PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST - - PMPML_CHUNK_LOOP_PRE_REDUCE_L0 - - PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN - } + uint32_t offset = size & 0xFFFFFFF8; + + switch (size & 0x7) { + case 0: { break; } + case 1: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0 + offset) } + break; + case 2: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(1 + offset) } + break; + case 3: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(1 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1( + 2 + offset) } + break; + case 4: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(1 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1( + 2 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(3 + offset) } + break; + case 5: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(1 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1( + 2 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(3 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(4 + offset) } + break; + case 6: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(1 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1( + 2 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(3 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(4 + + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(5 + offset) } + break; + case 7: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(1 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1( + 2 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(3 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(4 + + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(5 + offset) PMPML_CHUNK_LOOP_BODY_ULI_T1(6 + offset) } + break; + } + + uint32_t xLast; + switch (tail_size & (PMPML_32_WORD_SIZE_BYTES - 1)) { + case 0: { xLast = 0x1; break; } + case 1: { xLast = 0x100 | tail[tail_size - 1]; break; } + case 2: { xLast = GET_U16(tail + tail_size - 2, 0) | 0x10000; break; } + case 3: { xLast = tail[tail_size - 1]; + xLast = (xLast << 16) | GET_U16(tail + tail_size - 3, 0) | 0x1000000; break; } + } + + 
PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST + + PMPML_CHUNK_LOOP_PRE_REDUCE_L0 + + PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN + } - // a call to be done from subsequent levels - FORCE_INLINE uint64_t hash_of_num_chunk( const uint32_t* coeff, ULARGE_INTEGER__XX constTerm, const uint64_t* x ) const - { - PMPML_CHUNK_LOOP_INTRO_L0 - - for ( uint32_t i=0; i 2) - PMPML_CHUNK_LOOP_BODY_ULI_T1_64( 4 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1_64( 5 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1_64( 6 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1_64( 7 + i ) + PMPML_CHUNK_LOOP_BODY_ULI_T1_64(4 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1_64(5 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1_64(6 + i) + PMPML_CHUNK_LOOP_BODY_ULI_T1_64(7 + i) #endif - } + } - PMPML_CHUNK_LOOP_PRE_REDUCE_L0 + PMPML_CHUNK_LOOP_PRE_REDUCE_L0 - PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN - } + PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN + } - // a call to be done from subsequent levels - FORCE_INLINE uint64_t hash_of_num_chunk_incomplete( const uint32_t* coeff, ULARGE_INTEGER__XX constTerm, ULARGE_INTEGER__XX prevConstTerm, ULARGE_INTEGER__XX coeffSum, const uint64_t* x, size_t count ) const - { - PMPML_CHUNK_LOOP_INTRO_L0 - - ULARGE_INTEGER__XX c_ctr; - c_ctr.QuadPart = 0; - - uint32_t i; - - if ( count < ( PMPML_32_CHUNK_SIZE >> 1 ) ) - { - for ( i=0; i 4 ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 + i ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 7 + i ) -#endif - }*/ + // a call to be done from subsequent levels + FORCE_INLINE uint64_t hash_of_num_chunk_incomplete( const uint32_t * coeff, ULARGE_INTEGER__XX constTerm, + ULARGE_INTEGER__XX prevConstTerm, ULARGE_INTEGER__XX coeffSum, const uint64_t * x, size_t count ) const { + PMPML_CHUNK_LOOP_INTRO_L0 + + ULARGE_INTEGER__XX c_ctr; + + c_ctr.QuadPart = 0; + + uint32_t i; + + if (count < (PMPML_32_CHUNK_SIZE >> 1)) { + for (i = 0; i < count; i++) { + PMPML_CHUNK_LOOP_BODY_ULI_T1_64(0 + i) + c_ctr.QuadPart += coeff[i]; + } + 
c_ctr.QuadPart = coeffSum.QuadPart - c_ctr.QuadPart; + } else { + for (i = 0; i < count; i++) { + PMPML_CHUNK_LOOP_BODY_ULI_T1_64(0 + i) + for (; i < PMPML_32_CHUNK_SIZE; i++) { + c_ctr.QuadPart += coeff[i]; + } + } + } + + ULARGE_INTEGER__XX lowProduct; + lowProduct.QuadPart = UInt32x32To64(c_ctr.LowPart, prevConstTerm.LowPart ); + ULARGE_INTEGER__XX midProduct; + midProduct.QuadPart = UInt32x32To64(c_ctr.LowPart, prevConstTerm.HighPart) + UInt32x32To64( + c_ctr.HighPart, prevConstTerm.LowPart); + midProduct.QuadPart += lowProduct.HighPart; + lowProduct.HighPart = midProduct.LowPart; + uint32_t hiProduct = c_ctr.HighPart * prevConstTerm.HighPart + midProduct.HighPart; + + constTerm.QuadPart += lowProduct.QuadPart; + ctr += hiProduct + (constTerm.QuadPart < lowProduct.QuadPart); - PMPML_CHUNK_LOOP_PRE_REDUCE_L0 +/* + * for ( uint32_t i=0; i 4 ) + * PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 + i ) + * PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 + i ) + * PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 + i ) + * PMPML_CHUNK_LOOP_BODY_ULI_T1( 7 + i ) + #endif + * } + */ - PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN - } + PMPML_CHUNK_LOOP_PRE_REDUCE_L0 - template < bool bswap > - FORCE_INLINE void procesNextValue( int level, uint64_t value, uint64_t * allValues, unsigned int * cnts, unsigned int& flag ) const - { - for ( int i=level;;i++ ) - { - // NOTE: it's not necessary to check whether ( i < PMPML_LEVELS ), - // if it is guaranteed that the string size is less than 1 << USHF_MACHINE_WORD_SIZE_BITS - allValues[ ( i << PMPML_32_CHUNK_SIZE_LOG2 ) + cnts[ i ] ] = value; - (cnts[ i ]) ++; - if ( cnts[ i ] != PMPML_32_CHUNK_SIZE ) - break; - cnts[ i ] = 0; - value = hash_of_num_chunk( curr_rd[ i ].random_coeff, *(ULARGE_INTEGER__XX*)(&(curr_rd[i].const_term)), allValues + ( i << PMPML_32_CHUNK_SIZE_LOG2 ) ); - if ( ( flag & ( 1 << i ) ) == 0 ) - { - cnts[ i + 1] = 0; - flag |= 1 << i; - } - } - } + PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN + } - template < bool bswap > - FORCE_INLINE uint64_t finalize( 
int level, uint64_t * allValues, unsigned int * cnts, unsigned int& flag ) const - { - for ( int i=level;;i++ ) - { -// assert ( level != PMPML_LEVELS ) - if ( ( ( flag & ( 1 << i ) ) == 0 ) && cnts[ i ] == 1 ) - { - return allValues[ i << PMPML_32_CHUNK_SIZE_LOG2 ]; - } - if ( cnts[ i ] ) - { -/* for ( int j=cnts[ i ]; j( i + 1, -/* hash_of_num_chunk( curr_rd[ i ].random_coeff, - *(ULARGE_INTEGER__XX*)(&(curr_rd[i].const_term)), - allValues + ( i << PMPML_CHUNK_SIZE_LOG2 ) ), */ - hash_of_num_chunk_incomplete( curr_rd[ i ].random_coeff, - *(ULARGE_INTEGER__XX*)(&(curr_rd[i].const_term)), - *(ULARGE_INTEGER__XX*)(&(curr_rd[i-1].const_term)), - *(ULARGE_INTEGER__XX*)(&(curr_rd[i].cachedSum)), - allValues + ( i << PMPML_32_CHUNK_SIZE_LOG2 ), - cnts[ i ]), - allValues, cnts, flag ); - } - } - } + template + FORCE_INLINE void procesNextValue( int level, uint64_t value, uint64_t * allValues, + unsigned int * cnts, unsigned int & flag ) const { + for (int i = level;; i++) { + // NOTE: it's not necessary to check whether ( i < PMPML_LEVELS ), + // if it is guaranteed that the string size is less than 1 << USHF_MACHINE_WORD_SIZE_BITS + allValues[(i << PMPML_32_CHUNK_SIZE_LOG2) + cnts[i]] = value; + (cnts[i])++; + if (cnts[i] != PMPML_32_CHUNK_SIZE) { + break; + } + cnts[i] = 0; + value = hash_of_num_chunk(curr_rd[i].random_coeff, *(ULARGE_INTEGER__XX *)(&(curr_rd[i].const_term)), + allValues + (i << PMPML_32_CHUNK_SIZE_LOG2)); + if ((flag & (1 << i)) == 0) { + cnts[i + 1] = 0; + flag |= 1 << i; + } + } + } + + template + FORCE_INLINE uint64_t finalize( int level, uint64_t * allValues, unsigned int * cnts, unsigned int & flag ) const { + for (int i = level;; i++) { +// assert ( level != PMPML_LEVELS ) + if (((flag & (1 << i)) == 0) && (cnts[i] == 1)) { + return allValues[i << PMPML_32_CHUNK_SIZE_LOG2]; + } + if (cnts[i]) { +/* + * for ( int j=cnts[ i ]; j(i + 1, +/* + * hash_of_num_chunk( curr_rd[ i ].random_coeff, + *(ULARGE_INTEGER__XX*)(&(curr_rd[i].const_term)), + * 
allValues + ( i << PMPML_CHUNK_SIZE_LOG2 ) ), + */ + hash_of_num_chunk_incomplete(curr_rd[i].random_coeff, *(ULARGE_INTEGER__XX *)(&(curr_rd[i].const_term)), + *(ULARGE_INTEGER__XX *)(&(curr_rd[i - 1].const_term)), *(ULARGE_INTEGER__XX *)(&(curr_rd[i].cachedSum)), + allValues + (i << PMPML_32_CHUNK_SIZE_LOG2), cnts[i]), allValues, cnts, flag); + } + } + } #if defined(_MSC_VER) && defined(HAVE_32BIT_PLATFORM) - template < uint32_t N, bool bswap > - static FORCE_INLINE uint32_t hash_size_SMALL_N( const unsigned char* chars ) const - { - const uint32_t* coeff = curr_rd[0].random_coeff; - ULARGE_INTEGER__XX constTerm = *(ULARGE_INTEGER__XX*)(&(curr_rd[0].const_term)); - uint32_t xLast; - - switch(N) { - case 0: break; - case 1: xLast = 0x100 + chars[0]; break - case 2: xLast = GET_U16(chars,0) + 0x10000; break; - case 3: xLast = chars[ 2 ]; xLast = ( xLast << 16 ) + GET_U16(chars,0) + 0x1000000; break; - case 4: xLast = GET_U32(chars, 0) + coeff[ 1 ]; break; - } - - if (N != 0) { - constTerm.QuadPart += UInt32x32To64( coeff[ 0 ], xLast ); - } else { - constTerm.QuadPart += coeff[ 0 ]; - } - - PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN_32x32_ONLY; - } -#define HASH_SIZE_XXX_BEGIN( XXX ) \ + template + static FORCE_INLINE uint32_t hash_size_SMALL_N( const unsigned char * chars ) const { + const uint32_t * coeff = curr_rd[0].random_coeff; + ULARGE_INTEGER__XX constTerm = *(ULARGE_INTEGER__XX *)(&(curr_rd[0].const_term)); + uint32_t xLast; + + switch (N) { + case 0: break; + case 1: xLast = 0x100 + chars[0]; break + case 2: xLast = GET_U16(chars, 0) + 0x10000; break; + case 3: xLast = chars[2]; xLast = (xLast << 16) + GET_U16(chars, 0) + 0x1000000; break; + case 4: xLast = GET_U32(chars, 0) + coeff[1]; break; + } + + if (N != 0) { + constTerm.QuadPart += UInt32x32To64(coeff[0], xLast); + } else { + constTerm.QuadPart += coeff[0]; + } + + PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN_32x32_ONLY; + } + +#define HASH_SIZE_XXX_BEGIN( XXX ) \ static 
FORCE_INLINE uint32_t hash_size_##XXX( const unsigned char* chars ) const \ - { \ - const uint32_t* coeff = curr_rd[0].random_coeff; \ - const uint32_t* x = (const uint32_t*)chars; \ - ULARGE_INTEGER__XX constTerm = *(ULARGE_INTEGER__XX*)(&(curr_rd[0].const_term)); \ - uint32_t xLast; \ - PMPML_CHUNK_LOOP_INTRO_L0 \ - uint32_t size = XXX >> PMPML_WORD_SIZE_BYTES_LOG2; - -#define HASH_SIZE_XXX_END \ - PMPML_CHUNK_LOOP_PRE_REDUCE_L0 \ - PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN \ + { \ + const uint32_t* coeff = curr_rd[0].random_coeff; \ + const uint32_t* x = (const uint32_t*)chars; \ + ULARGE_INTEGER__XX constTerm = *(ULARGE_INTEGER__XX*)(&(curr_rd[0].const_term));\ + uint32_t xLast; \ + PMPML_CHUNK_LOOP_INTRO_L0 \ + uint32_t size = XXX >> PMPML_WORD_SIZE_BYTES_LOG2; + +#define HASH_SIZE_XXX_END \ + PMPML_CHUNK_LOOP_PRE_REDUCE_L0 \ + PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN\ } -HASH_SIZE_XXX_BEGIN(28 ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 ) xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; -HASH_SIZE_XXX_END - -HASH_SIZE_XXX_BEGIN(29 ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 ) xLast = 0x100 + chars[28]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; -HASH_SIZE_XXX_END - -HASH_SIZE_XXX_BEGIN(30 ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 ) xLast = *((const unsigned short*)(chars + 28 )) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST -HASH_SIZE_XXX_END - 
-HASH_SIZE_XXX_BEGIN(31 ) - PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 ) xLast = chars[ 30 ]; xLast = ( xLast << 16 ) + *((const unsigned short*)(chars + 28 )) + 0x1000000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; -HASH_SIZE_XXX_END + HASH_SIZE_XXX_BEGIN(28) + PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) PMPML_CHUNK_LOOP_BODY_ULI_T1(3) + PMPML_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_CHUNK_LOOP_BODY_ULI_T1(5) PMPML_CHUNK_LOOP_BODY_ULI_T1(6) xLast = + 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; + HASH_SIZE_XXX_END + + HASH_SIZE_XXX_BEGIN( 29 ) + PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) PMPML_CHUNK_LOOP_BODY_ULI_T1(3) + PMPML_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_CHUNK_LOOP_BODY_ULI_T1(5) PMPML_CHUNK_LOOP_BODY_ULI_T1(6) xLast = + 0x100 + chars[28]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; + HASH_SIZE_XXX_END + + HASH_SIZE_XXX_BEGIN( 30 ) + PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) PMPML_CHUNK_LOOP_BODY_ULI_T1(3) + PMPML_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_CHUNK_LOOP_BODY_ULI_T1(5) PMPML_CHUNK_LOOP_BODY_ULI_T1(6) xLast = + *((const unsigned short *)(chars + 28)) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST + HASH_SIZE_XXX_END + + HASH_SIZE_XXX_BEGIN( 31 ) + PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) PMPML_CHUNK_LOOP_BODY_ULI_T1(3) + PMPML_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_CHUNK_LOOP_BODY_ULI_T1(5) PMPML_CHUNK_LOOP_BODY_ULI_T1(6) xLast = chars[30]; + xLast = (xLast << 16) + *((const unsigned short *)(chars + 28)) + 0x1000000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; + HASH_SIZE_XXX_END #endif // PMPML_MSC_32_WORKAROUND - template < bool bswap > - NEVER_INLINE uint32_t 
_hash_noRecursionNoInline_forLessThanChunk(const unsigned char* chars, unsigned int cnt) const - { - unsigned int i; - ULARGE_INTEGER__XX tmp_hash; - - tmp_hash.QuadPart = hash_of_beginning_of_string_chunk_type2( curr_rd[0].random_coeff, *(ULARGE_INTEGER__XX*)(&(curr_rd[0].const_term)), chars, cnt ); - if ( tmp_hash.HighPart == 0 ) //LIKELY - { - return fmix32_short( tmp_hash.LowPart ); - } - return tmp_hash.LowPart; - } + template + NEVER_INLINE uint32_t _hash_noRecursionNoInline_forLessThanChunk( const unsigned char * chars, unsigned int cnt ) const { + unsigned int i; + ULARGE_INTEGER__XX tmp_hash; + + tmp_hash.QuadPart = hash_of_beginning_of_string_chunk_type2(curr_rd[0].random_coeff, + *(ULARGE_INTEGER__XX *)(&(curr_rd[0].const_term)), chars, cnt); + if (tmp_hash.HighPart == 0) { // LIKELY + return fmix32_short(tmp_hash.LowPart); + } + return tmp_hash.LowPart; + } - template < bool bswap > - NEVER_INLINE uint32_t _hash_noRecursionNoInline_type2(const unsigned char* chars, unsigned int cnt) const - { - uint64_t allValues[ PMPML_32_LEVELS * PMPML_32_CHUNK_SIZE ]; - unsigned int cnts[ PMPML_32_LEVELS ]; - unsigned int flag; - cnts[ 1 ] = 0; - flag = 0; - - unsigned int i; - ULARGE_INTEGER__XX tmp_hash; - - // process full chunks - for ( i=0; i<(cnt>>PMPML_32_CHUNK_SIZE_BYTES_LOG2); i++ ) - { - tmp_hash.QuadPart = hash_of_string_chunk_compact( curr_rd[0].random_coeff, *(ULARGE_INTEGER__XX*)(&(curr_rd[0].const_term)), ((const uint32_t*)(chars)) + ( i << PMPML_32_CHUNK_SIZE_LOG2 ) ); - procesNextValue( 1, tmp_hash.QuadPart, allValues, cnts, flag ); - } - - // process remaining incomplete chunk(s) - // note: if string size is a multiple of chunk size, we create a new chunk (1,0,0,...0), - // so THIS PROCESSING IS ALWAYS PERFORMED - unsigned int tailCnt = cnt & ( PMPML_32_CHUNK_SIZE_BYTES - 1 ); - const unsigned char* tail = chars + ( (cnt>>PMPML_32_CHUNK_SIZE_BYTES_LOG2) << PMPML_32_CHUNK_SIZE_BYTES_LOG2 ); - - tmp_hash.QuadPart = 
hash_of_beginning_of_string_chunk_type2( curr_rd[0].random_coeff, *(ULARGE_INTEGER__XX*)(&(curr_rd[0].const_term)), tail, tailCnt ); - procesNextValue( 1, tmp_hash.QuadPart, allValues, cnts, flag ); - ULARGE_INTEGER__XX ret64; - ret64.QuadPart = finalize( 1, allValues, cnts, flag ); - if ( ret64.HighPart == 0 ) //LIKELY - { - return fmix32_short( ret64.LowPart ); - } - return ret64.LowPart; - } + template + NEVER_INLINE uint32_t _hash_noRecursionNoInline_type2( const unsigned char * chars, unsigned int cnt ) const { + uint64_t allValues[PMPML_32_LEVELS * PMPML_32_CHUNK_SIZE]; + unsigned int cnts[PMPML_32_LEVELS]; + unsigned int flag; + + cnts[1] = 0; + flag = 0; + + unsigned int i; + ULARGE_INTEGER__XX tmp_hash; + + // process full chunks + for (i = 0; i < (cnt >> PMPML_32_CHUNK_SIZE_BYTES_LOG2); i++) { + tmp_hash.QuadPart = hash_of_string_chunk_compact(curr_rd[0].random_coeff, + *(ULARGE_INTEGER__XX *)(&(curr_rd[0].const_term)), + ((const uint32_t *)(chars)) + (i << PMPML_32_CHUNK_SIZE_LOG2)); + procesNextValue(1, tmp_hash.QuadPart, allValues, cnts, flag); + } + + // process remaining incomplete chunk(s) + // note: if string size is a multiple of chunk size, we create a new chunk (1,0,0,...0), + // so THIS PROCESSING IS ALWAYS PERFORMED + unsigned int tailCnt = cnt & (PMPML_32_CHUNK_SIZE_BYTES - 1); + const unsigned char * tail = chars + ((cnt >> PMPML_32_CHUNK_SIZE_BYTES_LOG2) << PMPML_32_CHUNK_SIZE_BYTES_LOG2); + + tmp_hash.QuadPart = hash_of_beginning_of_string_chunk_type2(curr_rd[0].random_coeff, + *(ULARGE_INTEGER__XX *)(&(curr_rd[0].const_term)), tail, tailCnt); + procesNextValue(1, tmp_hash.QuadPart, allValues, cnts, flag); + ULARGE_INTEGER__XX ret64; + ret64.QuadPart = finalize(1, allValues, cnts, flag); + if (ret64.HighPart == 0) { // LIKELY + return fmix32_short(ret64.LowPart); + } + return ret64.LowPart; + } + + public: -public: - template < bool bswap > - FORCE_INLINE uint32_t hash( const unsigned char* chars, unsigned int cnt ) const - { - if ( 
likely(cnt < 32) ) - { - const uint32_t* coeff = curr_rd[0].random_coeff; - ULARGE_INTEGER__XX constTerm = *(ULARGE_INTEGER__XX*)(&(curr_rd[0].const_term)); - PMPML_CHUNK_LOOP_INTRO_L0 - uint32_t size = cnt >> PMPML_32_WORD_SIZE_BYTES_LOG2; - uint32_t xLast; + template + FORCE_INLINE uint32_t hash( const unsigned char * chars, unsigned int cnt ) const { + if (likely(cnt < 32)) { + const uint32_t * coeff = curr_rd[0].random_coeff; + ULARGE_INTEGER__XX constTerm = *(ULARGE_INTEGER__XX *)(&(curr_rd[0].const_term)); + PMPML_CHUNK_LOOP_INTRO_L0 + uint32_t size = cnt >> PMPML_32_WORD_SIZE_BYTES_LOG2; + uint32_t xLast; - const uint32_t* x = (const uint32_t*)chars; + const uint32_t * x = (const uint32_t *)chars; #if defined(_MSC_VER) && defined(HAVE_32BIT_PLATFORM) // enables MSVC-specific code that appears to be more efficient than a regular one; comment out, if not desired - switch ( cnt ) - { -/* case 0: { xLast = 0x1; constTerm.QuadPart += coeff[ 0 ]; PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN_32x32_ONLY; } - case 1: { xLast = 0x100 + chars[cnt-1]; constTerm.QuadPart += UInt32x32To64( coeff[ 0 ], xLast ); PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN_32x32_ONLY; } - case 2: { xLast = *((const unsigned short*)(chars + cnt - 2 )) + 0x10000; constTerm.QuadPart += UInt32x32To64( coeff[ 0 ], xLast ); PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN_32x32_ONLY; } - case 3: { xLast = chars[ cnt - 1 ]; xLast = ( xLast << 16 ) + *((const unsigned short*)(chars + cnt - 3 )) + 0x1000000; constTerm.QuadPart += UInt32x32To64( coeff[ 0 ], xLast ); PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN_32x32_ONLY; } - - case 0: { xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 1: { xLast = 0x100 + chars[cnt-1]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 2: { xLast = *((const unsigned short*)(chars + cnt - 2 )) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 3: { xLast = chars[ cnt - 1 ]; xLast = ( xLast << 16 ) + *((const 
unsigned short*)(chars + cnt - 3 )) + 0x1000000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } -*/ - case 0: { return hash_size_SMALL_N<0, bswap>( chars ); } - case 1: { return hash_size_SMALL_N<1, bswap>( chars ); } - case 2: { return hash_size_SMALL_N<2, bswap>( chars ); } - case 3: { return hash_size_SMALL_N<3, bswap>( chars ); } - case 4: { return hash_size_SMALL_N<4, bswap>( chars ); } -// case 4: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; break; } - case 5: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) xLast = 0x100 + chars[4]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 6: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) xLast = GET_U16(chars, 4) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 7: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) xLast = chars[ 6 ]; xLast = ( xLast << 16 ) + GET_U16(chars, 4) + 0x1000000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 8: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; break; } - case 9: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) xLast = 0x100 + chars[8]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 10: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) xLast = GET_U16(chars, 8) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 11: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) xLast = chars[ 10 ]; xLast = ( xLast << 16 ) + GET_U16(chars, 8) + 0x1000000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 12: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; break; } - case 13: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) xLast = 0x100 + chars[12]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 14: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) 
PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) xLast = GET_U16(chars, 12) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 15: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) xLast = chars[ 14 ]; xLast = ( xLast << 16 ) + GET_U16(chars, 12) + 0x1000000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 16: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; break; } - case 17: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) xLast = 0x100 + chars[16]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 18: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) xLast = GET_U16(chars, 16) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 19: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) xLast = chars[ 18 ]; xLast = ( xLast << 16 ) + GET_U16(chars, 16) + 0x1000000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 20: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; break; } - case 21: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) xLast = 0x100 + chars[20]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 22: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) 
PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) xLast = GET_U16(chars, 20) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 23: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) xLast = chars[ 22 ]; xLast = ( xLast << 16 ) + GET_U16(chars, 20) + 0x1000000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 24: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; break; } - case 25: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) xLast = 0x100 + chars[24]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 26: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) xLast = GET_U16(chars, 24) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 27: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) xLast = chars[ 26 ]; xLast = ( xLast << 16 ) + GET_U16(chars, 24) + 0x1000000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - -/* case 28: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 ) xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; break; } - case 29: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 
0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 ) xLast = 0x100 + chars[28]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - case 30: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 ) xLast = *((const unsigned short*)(chars + 28 )) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } - default: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 ) xLast = chars[ 30 ]; xLast = ( xLast << 16 ) + *((const unsigned short*)(chars + 28 )) + 0x1000000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } -*/ - case 28: { return hash_size_28( chars ); } - case 29: { return hash_size_29( chars ); } - case 30: { return hash_size_30( chars ); } - default: { return hash_size_31( chars ); } - } + switch (cnt) { +/* + * case 0: { xLast = 0x1; constTerm.QuadPart += coeff[ 0 ]; PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN_32x32_ONLY; } + * case 1: { xLast = 0x100 + chars[cnt-1]; constTerm.QuadPart += UInt32x32To64( coeff[ 0 ], + * xLast ); PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN_32x32_ONLY; } + * case 2: { xLast = *((const unsigned short*)(chars + cnt - 2 )) + 0x10000; constTerm.QuadPart + * += UInt32x32To64( coeff[ 0 ], xLast ); PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN_32x32_ONLY; } + * case 3: { xLast = chars[ cnt - 1 ]; xLast = ( xLast << 16 ) + *((const unsigned + * short*)(chars + cnt - 3 )) + 0x1000000; constTerm.QuadPart += UInt32x32To64( coeff[ 0 ], xLast ); PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN_32x32_ONLY; } + 
* + * case 0: { xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + * case 1: { xLast = 0x100 + chars[cnt-1]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + * case 2: { xLast = *((const unsigned short*)(chars + cnt - 2 )) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + * case 3: { xLast = chars[ cnt - 1 ]; xLast = ( xLast << 16 ) + *((const unsigned short*)(chars + + * cnt - 3 )) + 0x1000000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + */ + case 0 : { return hash_size_SMALL_N<0, bswap>(chars); } + case 1 : { return hash_size_SMALL_N<1, bswap>(chars); } + case 2 : { return hash_size_SMALL_N<2, bswap>(chars); } + case 3 : { return hash_size_SMALL_N<3, bswap>(chars); } + case 4 : { return hash_size_SMALL_N<4, bswap>(chars); } +// case 4: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; break; } + case 5 : { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) xLast = 0x100 + + chars[4]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 6 : { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) xLast = + GET_U16(chars, 4) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 7 : { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) xLast = chars[6]; + xLast = (xLast << 16) + GET_U16(chars, 4) + 0x1000000; + PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 8: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) xLast = + 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; break; } + case 9: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) xLast = + 0x100 + chars[8]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 10: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) xLast = + GET_U16(chars, 8) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 11: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) xLast = chars[10]; + xLast = (xLast << 16) + GET_U16(chars, 8) + 0x1000000; + PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 12: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) 
PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) xLast = + 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; break; } + case 13: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) xLast = + 0x100 + chars[12]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 14: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) xLast = + GET_U16(chars, 12) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 15: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) xLast = + chars[14]; + xLast = (xLast << 16) + GET_U16(chars, 12) + 0x1000000; + PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 16: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; + break; } + case 17: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) xLast = 0x100 + chars[16]; + PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 18: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) xLast = + GET_U16(chars, 16) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 19: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) xLast = chars[18]; xLast = (xLast << 16) + GET_U16( + chars, 16) + 0x1000000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 20: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) PMPML_CHUNK_LOOP_BODY_ULI_T1(4) xLast = + 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; break; } + case 21: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) 
PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) PMPML_CHUNK_LOOP_BODY_ULI_T1(4) xLast = + 0x100 + chars[20]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 22: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) PMPML_CHUNK_LOOP_BODY_ULI_T1(4) xLast = + GET_U16(chars, 20) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 23: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) PMPML_CHUNK_LOOP_BODY_ULI_T1(4) xLast = chars[22]; + xLast = (xLast << 16) + GET_U16(chars, 20) + 0x1000000; + PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 24: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) PMPML_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_CHUNK_LOOP_BODY_ULI_T1(5) + xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; break; } + case 25: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) PMPML_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_CHUNK_LOOP_BODY_ULI_T1(5) + xLast = 0x100 + chars[24]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 26: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) PMPML_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_CHUNK_LOOP_BODY_ULI_T1(5) + xLast = GET_U16(chars, 24) + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + case 27: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) PMPML_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_CHUNK_LOOP_BODY_ULI_T1(5) + xLast = chars[26]; + xLast = (xLast << 16) + GET_U16(chars, 24) + 0x1000000; + PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + +/* + * case 28: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( + * 1 
) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) + * PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 ) xLast = 0x1; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST_FOR_JUST_1; break; } + * case 29: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) + * PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) + * PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 ) xLast = 0x100 + chars[28]; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + * case 30: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) + * PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) + * PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 ) xLast = *((const unsigned short*)(chars + 28 )) + * + 0x10000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + * default: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) + * PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) + * PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 ) xLast = chars[ 30 ]; xLast = ( xLast << 16 ) + + * *((const unsigned short*)(chars + 28 )) + 0x1000000; PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST; break; } + */ + case 28: { return hash_size_28(chars); } + case 29: { return hash_size_29(chars); } + case 30: { return hash_size_30(chars); } + default: { return hash_size_31(chars); } + } #else - switch( size ) - { - case 0: { break; } - case 1: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) } break; - case 2: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) } break; - case 3: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) } break; - case 4: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) } 
break; - case 5: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) } break; - case 6: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) } break; - default: { PMPML_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 1 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 3 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 5 ) PMPML_CHUNK_LOOP_BODY_ULI_T1( 6 ) } break; - } - - switch ( cnt & ( PMPML_32_WORD_SIZE_BYTES - 1 ) ) - { - case 0: { xLast = 0x1; break;} - case 1: { xLast = 0x100 + chars[cnt-1]; break;} - case 2: { xLast = GET_U16(chars + cnt - 2, 0) + 0x10000; break; } - default: { xLast = chars[ cnt - 1 ]; xLast = ( xLast << 16 ) + GET_U16(chars + cnt - 3, 0) + 0x1000000; break;} - } - - PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST + switch (size) { + case 0: { break; } + case 1: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) } + break; + case 2: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) } + break; + case 3: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) } + break; + case 4: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) } + break; + case 5: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) PMPML_CHUNK_LOOP_BODY_ULI_T1(4) } + break; + case 6: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) PMPML_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_CHUNK_LOOP_BODY_ULI_T1(5) } + break; + default: { PMPML_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_CHUNK_LOOP_BODY_ULI_T1(1) 
PMPML_CHUNK_LOOP_BODY_ULI_T1(2) + PMPML_CHUNK_LOOP_BODY_ULI_T1(3) PMPML_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_CHUNK_LOOP_BODY_ULI_T1(5) + PMPML_CHUNK_LOOP_BODY_ULI_T1(6) } + break; + } + + switch (cnt & (PMPML_32_WORD_SIZE_BYTES - 1)) { + case 0: { xLast = 0x1; break; } + case 1: { xLast = 0x100 + chars[cnt - 1]; break; } + case 2: { xLast = GET_U16(chars + cnt - 2, 0) + 0x10000; break; } + default: { xLast = chars[cnt - 1]; xLast = (xLast << 16) + GET_U16(chars + cnt - 3, 0) + 0x1000000; break; } + } + + PMPML_CHUNK_LOOP_BODY_ULI_T1_LAST #endif // PMPML_MSC_32_WORKAROUND - PMPML_CHUNK_LOOP_PRE_REDUCE_L0 - - PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN - } - else if ( cnt < PMPML_32_CHUNK_SIZE_BYTES ) - { - return _hash_noRecursionNoInline_forLessThanChunk( chars, cnt ); - } - else - { - return _hash_noRecursionNoInline_type2( chars, cnt ); - } - } + PMPML_CHUNK_LOOP_PRE_REDUCE_L0 - PMP_Multilinear_Hasher_32() - { - curr_rd = (random_data_for_PMPML_32*)rd_for_PMPML_32; - coeff0 = curr_rd[0].const_term; - } - void seed( uint64_t seed ) - { - curr_rd[0].const_term = coeff0 ^ seed; - } -}; + PMPML_FULL_REDUCE_MOD_2_32_PLUS_15_AND_RETURN_RETURN + } else if (cnt < PMPML_32_CHUNK_SIZE_BYTES) { + return _hash_noRecursionNoInline_forLessThanChunk(chars, cnt); + } else { + return _hash_noRecursionNoInline_type2(chars, cnt); + } + } + + PMP_Multilinear_Hasher_32() { + curr_rd = (random_data_for_PMPML_32 *)rd_for_PMPML_32; + coeff0 = curr_rd[0].const_term; + } + + void seed( uint64_t seed ) { + curr_rd[0].const_term = coeff0 ^ seed; + } +}; // class PMP_Multilinear_Hasher_32 //------------------------------------------------------------- // 64-bit hash -static FORCE_INLINE void MultiplyWordLoHi(uint64_t& rlo, uint64_t& rhi, uint64_t a, uint64_t b) { +static FORCE_INLINE void MultiplyWordLoHi( uint64_t & rlo, uint64_t & rhi, uint64_t a, uint64_t b ) { mult64_128(rlo, rhi, a, b); } @@ -1823,7 +1915,7 @@ static FORCE_INLINE void MultiplyWordLoHi(uint64_t& rlo, uint64_t& rhi, 
uint64_t * Adds the 64-bit value in alo into the 128-bit * value spread across rhi:rlo. */ -static FORCE_INLINE void AccumulateLoHi(uint64_t& rlo, uint64_t& rhi, uint64_t alo) { +static FORCE_INLINE void AccumulateLoHi( uint64_t & rlo, uint64_t & rhi, uint64_t alo ) { add128(rlo, rhi, alo); } @@ -1831,7 +1923,8 @@ static FORCE_INLINE void AccumulateLoHi(uint64_t& rlo, uint64_t& rhi, uint64_t a * Adds the 192-bit value spread across ahi:ami:alo into the 192-bit * value spread across rhi:rmi:rlo. */ -static FORCE_INLINE void AccumulateLoMidHi(uint64_t& rlo, uint64_t& rmi, uint64_t& rhi, uint64_t alo, uint64_t ami, uint64_t ahi) { +static FORCE_INLINE void AccumulateLoMidHi( uint64_t & rlo, uint64_t & rmi, + uint64_t & rhi, uint64_t alo, uint64_t ami, uint64_t ahi ) { add192(rlo, rmi, rhi, alo, ami, ahi); } @@ -1839,7 +1932,7 @@ static FORCE_INLINE void AccumulateLoMidHi(uint64_t& rlo, uint64_t& rmi, uint64_ * Does a 64x64->128 multiply on a and b, and adds the result into the * 192-bit value spread across rhi:rmi:rlo. */ -static FORCE_INLINE void MultiplyAccumulateWordLoMidHi(uint64_t& rlo, uint64_t& rmi, uint64_t& rhi, uint64_t a, uint64_t b) { +static FORCE_INLINE void MultiplyAccumulateWordLoMidHi( uint64_t & rlo, uint64_t & rmi, uint64_t & rhi, uint64_t a, uint64_t b ) { fma64_192(rlo, rmi, rhi, a, b); } @@ -1847,713 +1940,775 @@ static FORCE_INLINE void MultiplyAccumulateWordLoMidHi(uint64_t& rlo, uint64_t& * Does a 64x64->128 multiply on a and b, and adds the result into the * 128-bit value spread across rhi:rlo. 
*/ -static FORCE_INLINE void MultiplyAccumulateWordLoHi(uint64_t& rlo, uint64_t& rhi, uint64_t a, uint64_t b) { +static FORCE_INLINE void MultiplyAccumulateWordLoHi( uint64_t & rlo, uint64_t & rhi, uint64_t a, uint64_t b ) { fma64_128(rlo, rhi, a, b); } -#define ADD_SHIFT_ADD_NORMALIZE( lo, hi ) { \ - uint32_t lohi = lo >> 32; \ - uint32_t hilo = hi; \ - uint32_t diff = lohi - hilo; \ - hi += diff; \ - lo = (uint32_t)lo + (((uint64_t)(uint32_t)hi) << 32 ); \ - hi >>= 32; \ +#define ADD_SHIFT_ADD_NORMALIZE( lo, hi ) { \ + uint32_t lohi = lo >> 32; \ + uint32_t hilo = hi; \ + uint32_t diff = lohi - hilo; \ + hi += diff; \ + lo = (uint32_t)lo + (((uint64_t)(uint32_t)hi) << 32 );\ + hi >>= 32; \ } -#define ADD_SHIFT_ADD_NORMALIZE_TO_UPPER( lo, hi ) {\ - uint32_t lohi = lo >> 32; \ - uint32_t hilo = hi; \ - uint32_t diff = lohi - hilo; \ - hi += diff; \ - lo = (uint32_t)lo; \ +#define ADD_SHIFT_ADD_NORMALIZE_TO_UPPER( lo, hi ) { \ + uint32_t lohi = lo >> 32; \ + uint32_t hilo = hi; \ + uint32_t diff = lohi - hilo; \ + hi += diff; \ + lo = (uint32_t)lo; \ } -#define PMPML_CHUNK_LOOP_INTRO_L0_64 \ - ULARGE_INTEGER__XX ctr0, ctr1, ctr2; \ - ctr0.QuadPart = constTerm.QuadPart; \ - ctr1.QuadPart = 0; \ - ctr2.QuadPart = 0;\ - ULARGE_INTEGER__XX ctr2_0, ctr2_1, ctr2_2, ctr2_3; \ - ctr2_0.QuadPart = 0; \ - ctr2_1.QuadPart = 0;\ - ctr2_2.QuadPart = 0; \ - ctr2_3.QuadPart = 0;\ +#define PMPML_CHUNK_LOOP_INTRO_L0_64 \ + ULARGE_INTEGER__XX ctr0, ctr1, ctr2; \ + ctr0.QuadPart = constTerm.QuadPart; \ + ctr1.QuadPart = 0; \ + ctr2.QuadPart = 0; \ + ULARGE_INTEGER__XX ctr2_0, ctr2_1, ctr2_2, ctr2_3;\ + ctr2_0.QuadPart = 0; \ + ctr2_1.QuadPart = 0; \ + ctr2_2.QuadPart = 0; \ + ctr2_3.QuadPart = 0; \ ULARGE_INTEGER__XX mulLow, mulHigh; -#define PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( i ) { \ - uint64_t xi = GET_U64((const uint8_t*)x,(i)*8); \ +#define PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( i ) { \ + uint64_t xi = GET_U64((const uint8_t*)x,(i)*8); \ 
MultiplyAccumulateWordLoMidHi(ctr0.QuadPart, ctr1.QuadPart, ctr2.QuadPart, xi, coeff[i]); \ } -#define PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( ii ) { \ - uint64_t xii = GET_U64((const uint8_t*)x,(ii)*8); \ +#define PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( ii ) { \ + uint64_t xii = GET_U64((const uint8_t*)x,(ii)*8); \ MultiplyAccumulateWordLoMidHi(ctr2_0.QuadPart, ctr2_1.QuadPart, ctr2_2.QuadPart, xii, coeff[ii]); \ } #define PMPML_64_CHUNK_LOOP_BODY_ULI_T1(i) PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(i) -#define _compensate_ { \ +#define _compensate_ { \ AccumulateLoMidHi(ctr0.QuadPart, ctr1.QuadPart, ctr2.QuadPart, ctr2_0.QuadPart, ctr2_1.QuadPart, ctr2_2.QuadPart); \ } -#define PMPML_64_CHUNK_LOOP_BODY_ULI_T1_LAST( size ) { \ +#define PMPML_64_CHUNK_LOOP_BODY_ULI_T1_LAST( size ) { \ MultiplyAccumulateWordLoMidHi(ctr0.QuadPart, ctr1.QuadPart, ctr2.QuadPart, xLast, coeff[size]); \ } -#define PMPML_64_CHUNK_LOOP_BODY_ULI_T2( i ) { \ - if (likely(x[i].HighPart == 0)) { \ - MultiplyAccumulateWordLoMidHi(ctr0.QuadPart, ctr1.QuadPart, ctr2.QuadPart, x[i].LowPart, coeff[i]); \ - } else { \ - MultiplyWordLoHi(mulLow.QuadPart, mulHigh.QuadPart, x[i].LowPart, coeff[i]); \ - mulHigh.QuadPart += x[i].HighPart * coeff[i]; \ +#define PMPML_64_CHUNK_LOOP_BODY_ULI_T2( i ) { \ + if (likely(x[i].HighPart == 0)) { \ + MultiplyAccumulateWordLoMidHi(ctr0.QuadPart, ctr1.QuadPart, ctr2.QuadPart, x[i].LowPart, coeff[i]); \ + } else { \ + MultiplyWordLoHi(mulLow.QuadPart, mulHigh.QuadPart, x[i].LowPart, coeff[i]); \ + mulHigh.QuadPart += x[i].HighPart * coeff[i]; \ MultiplyAccumulateWordLoMidHi(ctr0.QuadPart, ctr1.QuadPart, ctr2.QuadPart, mulLow.QuadPart, mulHigh.QuadPart); \ - } \ + } \ } -#define PMPML_64_CHUNK_LOOP_BODY_ULI_ADD_COEFF( i ) { \ - AccumulateLoHi(c_ctr0.QuadPart, c_ctr1.QuadPart, coeff[i]); \ +#define PMPML_64_CHUNK_LOOP_BODY_ULI_ADD_COEFF( i ) { \ + AccumulateLoHi(c_ctr0.QuadPart, c_ctr1.QuadPart, coeff[i]); \ } -#define PMPML_CHUNK_LOOP_BODY_ULI_T2_AND_ADD_COEFF_64( i ) { \ - 
AccumulateLoHi(c_ctr0.QuadPart, c_ctr1.QuadPart, coeff[i]); \ - if (likely(x[i].HighPart == 0)) { \ - MultiplyAccumulateWordLoMidHi(ctr0.QuadPart, ctr1.QuadPart, ctr2.QuadPart, x[i].LowPart, coeff[i]); \ - } else { \ - MultiplyWordLoHi(mulLow.QuadPart, mulHigh.QuadPart, x[i].LowPart, coeff[i]); \ - mulHigh.QuadPart += x[i].HighPart * coeff[i]; \ +#define PMPML_CHUNK_LOOP_BODY_ULI_T2_AND_ADD_COEFF_64( i ) { \ + AccumulateLoHi(c_ctr0.QuadPart, c_ctr1.QuadPart, coeff[i]); \ + if (likely(x[i].HighPart == 0)) { \ + MultiplyAccumulateWordLoMidHi(ctr0.QuadPart, ctr1.QuadPart, ctr2.QuadPart, x[i].LowPart, coeff[i]); \ + } else { \ + MultiplyWordLoHi(mulLow.QuadPart, mulHigh.QuadPart, x[i].LowPart, coeff[i]); \ + mulHigh.QuadPart += x[i].HighPart * coeff[i]; \ MultiplyAccumulateWordLoMidHi(ctr0.QuadPart, ctr1.QuadPart, ctr2.QuadPart, mulLow.QuadPart, mulHigh.QuadPart); \ - } \ + } \ } -#define PMPML_CHUNK_LOOP_BODY_ULI_T2_AND_ADD_SUM_OF_COEFF_64 { \ +#define PMPML_CHUNK_LOOP_BODY_ULI_T2_AND_ADD_SUM_OF_COEFF_64 { \ MultiplyAccumulateWordLoMidHi(ctr0.QuadPart, ctr1.QuadPart, ctr2.QuadPart, c_ctr0.QuadPart, prevConstTerm); \ - MultiplyAccumulateWordLoHi(ctr1.QuadPart, ctr2.QuadPart, c_ctr1.QuadPart, prevConstTerm); \ + MultiplyAccumulateWordLoHi(ctr1.QuadPart, ctr2.QuadPart, c_ctr1.QuadPart, prevConstTerm); \ } #define PMPML_CHUNK_LOOP_PRE_REDUCE_L0_64 -#define PMPML_CHUNK_REDUCE_128_TO_64 \ -{ \ - uint64_t hi, lo; \ +#define PMPML_CHUNK_REDUCE_128_TO_64 \ +{ \ + uint64_t hi, lo; \ MultiplyWordLoHi(lo, hi, ctr1.QuadPart, 13); \ - uint64_t part = ctr2.QuadPart * 169 + hi * 13 + 13; \ - ctr0.QuadPart += part; \ - ctr1.QuadPart = 1 + (ctr0.QuadPart < part); \ - ctr1.QuadPart -= (ctr0.QuadPart < lo); \ - ctr0.QuadPart -= lo; \ - if ( likely( ctr0.QuadPart >= 26) ) { ctr0.QuadPart -= ctr1.QuadPart * 13; ctr1.QuadPart = 0; } \ - else \ - { \ + uint64_t part = ctr2.QuadPart * 169 + hi * 13 + 13;\ + ctr0.QuadPart += part; \ + ctr1.QuadPart = 1 + (ctr0.QuadPart < part); \ + 
ctr1.QuadPart -= (ctr0.QuadPart < lo); \ + ctr0.QuadPart -= lo; \ + if ( likely( ctr0.QuadPart >= 26) ) { \ + ctr0.QuadPart -= ctr1.QuadPart * 13; \ + ctr1.QuadPart = 0; \ + } else { \ ctr0.QuadPart -= ctr1.QuadPart * 13; \ - if ( ctr0.QuadPart < 26 ) ctr1.QuadPart = 0; \ - else \ - { \ - ctr0.QuadPart += 13; \ - if ( ctr0.QuadPart < 13 ) ctr1.QuadPart = 1; \ - else ctr1.QuadPart = 0; \ - } \ - } \ + if ( ctr0.QuadPart < 26 ) { \ + ctr1.QuadPart = 0; \ + } else { \ + ctr0.QuadPart += 13; \ + if ( ctr0.QuadPart < 13 ) { \ + ctr1.QuadPart = 1; \ + } else { \ + ctr1.QuadPart = 0; \ + } \ + } \ + } \ } -#define PMPML_CHUNK_REDUCE_128_TO_64____ \ -{ \ - _compensate_ \ - uint64_t hi, lo; \ +#define PMPML_CHUNK_REDUCE_128_TO_64____ \ +{ \ + _compensate_ \ + uint64_t hi, lo; \ MultiplyWordLoHi(lo, hi, ctr1.QuadPart, 13); \ - uint64_t part = ctr2.QuadPart * 169 + hi * 13 + 13; \ - ctr0.QuadPart += part; \ - ctr1.QuadPart = 1 + (ctr0.QuadPart < part); \ - ctr1.QuadPart -= (ctr0.QuadPart < lo); \ - ctr0.QuadPart -= lo; \ - if ( likely( ctr0.QuadPart >= 26) ) { ctr0.QuadPart -= ctr1.QuadPart * 13; ctr1.QuadPart = 0; } \ - else \ - { \ + uint64_t part = ctr2.QuadPart * 169 + hi * 13 + 13;\ + ctr0.QuadPart += part; \ + ctr1.QuadPart = 1 + (ctr0.QuadPart < part); \ + ctr1.QuadPart -= (ctr0.QuadPart < lo); \ + ctr0.QuadPart -= lo; \ + if ( likely( ctr0.QuadPart >= 26) ) { \ + ctr0.QuadPart -= ctr1.QuadPart * 13; \ + ctr1.QuadPart = 0; \ + } else { \ ctr0.QuadPart -= ctr1.QuadPart * 13; \ - if ( ctr0.QuadPart < 26 ) ctr1.QuadPart = 0; \ - else \ - { \ - ctr0.QuadPart += 13; \ - if ( ctr0.QuadPart < 13 ) ctr1.QuadPart = 1; \ - else ctr1.QuadPart = 0; \ - } \ - } \ + if ( ctr0.QuadPart < 26 ) { \ + ctr1.QuadPart = 0; \ + } else { \ + ctr0.QuadPart += 13; \ + if ( ctr0.QuadPart < 13 ) { \ + ctr1.QuadPart = 1; \ + } else { \ + ctr1.QuadPart = 0; \ + } \ + } \ + } \ } -#define PMPML_CHUNK_REDUCE_128_TO_64_AND_RETURN \ -{ \ - uint64_t hi, lo; \ +#define 
PMPML_CHUNK_REDUCE_128_TO_64_AND_RETURN \ +{ \ + uint64_t hi, lo; \ MultiplyWordLoHi(lo, hi, ctr1.QuadPart, 13); \ - uint64_t part = ctr2.QuadPart * 169 + hi * 13 + 13; \ - ctr0.QuadPart += part; \ - ctr1.QuadPart = 1 + (ctr0.QuadPart < part); \ - ctr1.QuadPart -= (ctr0.QuadPart < lo); \ - ctr0.QuadPart -= lo; \ - if ( likely( ctr0.QuadPart >= 26) ) { ctr0.QuadPart -= ctr1.QuadPart * 13; return fmix64_short( ctr0.QuadPart ); } \ - else \ - { \ + uint64_t part = ctr2.QuadPart * 169 + hi * 13 + 13;\ + ctr0.QuadPart += part; \ + ctr1.QuadPart = 1 + (ctr0.QuadPart < part); \ + ctr1.QuadPart -= (ctr0.QuadPart < lo); \ + ctr0.QuadPart -= lo; \ + if ( likely( ctr0.QuadPart >= 26) ) { \ + ctr0.QuadPart -= ctr1.QuadPart * 13; \ + return fmix64_short( ctr0.QuadPart ); \ + } else { \ ctr0.QuadPart -= ctr1.QuadPart * 13; \ - if ( ctr0.QuadPart < 26 ) return fmix64_short( ctr0.QuadPart ); \ - else \ - { \ - ctr0.QuadPart += 13; \ - return fmix64_short( ctr0.QuadPart ); \ - } \ - } \ + if ( ctr0.QuadPart < 26 ) { \ + return fmix64_short( ctr0.QuadPart ); \ + } else { \ + ctr0.QuadPart += 13; \ + return fmix64_short( ctr0.QuadPart );\ + } \ + } \ } -template < bool bswap > -static uint64_t ReadTail(const uint8_t * tail, uint64_t tail_size) { - uint64_t xLast; - - switch (tail_size & (PMPML_64_WORD_SIZE_BYTES - 1)) { - case 0: { xLast = 0x1; break;} - case 1: { xLast = 0x100 + tail[tail_size-1]; break;} - case 2: { xLast = GET_U16(tail + tail_size - 2, 0) + 0x10000; break; } - case 3: { xLast = tail[ tail_size - 1 ]; xLast = ( xLast << 16 ) + GET_U16(tail + tail_size - 3, 0) + 0x1000000; break;} - case 4: { xLast = GET_U32(tail + tail_size - 4, 0) + UINT64_C( 0x100000000 ); break; } - case 5: { xLast = tail[ tail_size - 1 ]; xLast = ( xLast << 32 ) + UINT64_C( 0x10000000000 ) + GET_U32(tail + tail_size - 5, 0); break;} - case 6: { xLast = GET_U16(tail + tail_size - 2, 0); xLast = ( xLast << 32 ) + UINT64_C( 0x1000000000000 ) + GET_U32(tail + tail_size - 6, 0); break;} - default: { 
xLast = tail[ tail_size - 1 ]; xLast <<= 48; uint64_t xLast1 = GET_U16(tail + tail_size - 3, 0); xLast += (xLast1<<32) + UINT64_C( 0x100000000000000 ) + GET_U32(tail + tail_size - 7, 0); break;} - } +template +static uint64_t ReadTail( const uint8_t * tail, uint64_t tail_size ) { + uint64_t xLast; + + switch (tail_size & (PMPML_64_WORD_SIZE_BYTES - 1)) { + case 0: { xLast = 0x1; break; } + case 1: { xLast = 0x100 + tail[tail_size - 1]; break; } + case 2: { xLast = GET_U16(tail + tail_size - 2, 0) + 0x10000; break; } + case 3: { xLast = tail[tail_size - 1]; xLast = (xLast << 16) + GET_U16(tail + tail_size - 3, 0) + 0x1000000; break; } + case 4: { xLast = GET_U32(tail + tail_size - 4, 0) + UINT64_C(0x100000000); break; } + case 5: { xLast = tail[tail_size - 1]; xLast = (xLast << 32) + UINT64_C(0x10000000000) + GET_U32( + tail + tail_size - 5, 0); break; } + case 6: { xLast = GET_U16(tail + tail_size - 2, 0); xLast = (xLast << 32) + UINT64_C(0x1000000000000) + GET_U32( + tail + tail_size - 6, 0); break; } + default: { xLast = tail[tail_size - 1]; xLast <<= 48; + uint64_t xLast1 = GET_U16(tail + tail_size - 3, 0); + xLast += (xLast1 << 32) + UINT64_C(0x100000000000000) + GET_U32(tail + tail_size - 7, 0); break; } + } - return xLast; + return xLast; } -class PMP_Multilinear_Hasher_64 -{ +class PMP_Multilinear_Hasher_64 { private: - random_data_for_PMPML_64* curr_rd; - uint64_t coeff0; + random_data_for_PMPML_64 * curr_rd; + uint64_t coeff0; - // calls to be done from LEVEL=0 - template < bool bswap > - FORCE_INLINE void hash_of_string_chunk_compact( const uint64_t* coeff, ULARGE_INTEGER__XX constTerm, const uint64_t* x, ULARGELARGE_INTEGER__XX& ret ) const - { - PMPML_CHUNK_LOOP_INTRO_L0_64 + // calls to be done from LEVEL=0 + template + FORCE_INLINE void hash_of_string_chunk_compact( const uint64_t * coeff, ULARGE_INTEGER__XX constTerm, + const uint64_t * x, ULARGELARGE_INTEGER__XX & ret ) const { + PMPML_CHUNK_LOOP_INTRO_L0_64 #if defined(HAVE_AVX2) && 
(PMPML_64_CHUNK_SIZE_LOG2 >= 3) - __m256i sse_ctr0_0, sse_ctr0_1, sse_ctr1, sse_ctr2, sse_ctr3_0, sse_ctr3_1, a, a_shifted, a_low, data, data_low, product, temp, mask_low; - sse_ctr0_0 = _mm256_setzero_si256 (); // Sets the 128-bit value to zero. - sse_ctr0_1 = _mm256_setzero_si256 (); // Sets the 128-bit value to zero. - sse_ctr1 = _mm256_setzero_si256 (); - sse_ctr2 = _mm256_setzero_si256 (); - sse_ctr3_0 = _mm256_setzero_si256 (); - sse_ctr3_1 = _mm256_setzero_si256 (); - mask_low = _mm256_set_epi32 ( 0, -1, 0 , -1, 0, -1, 0 , -1 ); - -#if (PMPML_64_CHUNK_SIZE_LOG2 >= 4) - for (uint64_t i=0; i<(PMPML_64_CHUNK_SIZE); i+=16) -#else - for (uint64_t i=0; i<(PMPML_64_CHUNK_SIZE); i+=8) -#endif - { - a = _mm256_load_si256 ((__m256i *)(coeff+i)); // Loads 128-bit value. Address p must be 16-byte aligned. - data = _mm256_loadu_si256 ((__m256i *)(x+i)); // Loads 128-bit value. Address p does not need be 16-byte aligned. - - // lower 32 bits - a_low = _mm256_and_si256 ( mask_low, a ); - data_low = _mm256_and_si256 ( mask_low, data ); - product = _mm256_mul_epu32 ( data_low, a_low); // A 128-bit value that contains two 64-bit unsigned integers. The result can be expressed by the following equations. r0 := a0 * b0; r1 := a2 * b2 - sse_ctr0_0 = _mm256_add_epi64 ( sse_ctr0_0, product );//sse_ctr0 = _mm256_add_epi64 ( sse_ctr0, temp ); - temp = _mm256_srli_epi64( product, 32 ); // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. - sse_ctr0_1 = _mm256_add_epi64 ( sse_ctr0_1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 4 + i ) - - // first cross - a_shifted = _mm256_srli_epi64( a, 32 ); - product = _mm256_mul_epu32 ( data_low, a_shifted ); // A 128-bit value that contains two 64-bit unsigned integers. The result can be expressed by the following equations. 
r0 := a0 * b0; r1 := a2 * b2 - sse_ctr1 = _mm256_add_epi64 ( sse_ctr1, product );//sse_ctr1 = _mm256_add_epi64 ( sse_ctr1, temp ); - temp = _mm256_srli_epi64( product, 32 ); // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. - sse_ctr2 = _mm256_add_epi64 ( sse_ctr2, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 5 + i ) - // second cross - data = _mm256_srli_epi64( data, 32 ); - product = _mm256_mul_epu32 ( data, a_low ); // A 128-bit value that contains two 64-bit unsigned integers. The result can be expressed by the following equations. r0 := a0 * b0; r1 := a2 * b2 - sse_ctr1 = _mm256_add_epi64 ( sse_ctr1, product );//sse_ctr1 = _mm256_add_epi64 ( sse_ctr1, temp ); - temp = _mm256_srli_epi64( product, 32 ); // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. - sse_ctr2 = _mm256_add_epi64 ( sse_ctr2, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 6 + i ) - // upper 32 bits - product = _mm256_mul_epu32 ( data, a_shifted ); // A 128-bit value that contains two 64-bit unsigned integers. The result can be expressed by the following equations. r0 := a0 * b0; r1 := a2 * b2 - sse_ctr3_0 = _mm256_add_epi64 ( sse_ctr3_0, product );//sse_ctr2 = _mm256_add_epi64 ( sse_ctr2, temp ); - temp = _mm256_srli_epi64( product, 32 ); // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. - sse_ctr3_1 = _mm256_add_epi64 ( sse_ctr3_1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 7 + i ) - -#if (PMPML_64_CHUNK_SIZE_LOG2 >= 4) - a = _mm256_load_si256 ((__m256i *)(coeff+i+8)); // Loads 128-bit value. Address p must be 16-byte aligned. - data = _mm256_loadu_si256 ((__m256i *)(x+i+8)); // Loads 128-bit value. Address p does not need be 16-byte aligned. 
- - // lower 32 bits - a_low = _mm256_and_si256 ( mask_low, a ); - data_low = _mm256_and_si256 ( mask_low, data ); - product = _mm256_mul_epu32 ( data_low, a_low); // A 128-bit value that contains two 64-bit unsigned integers. The result can be expressed by the following equations. r0 := a0 * b0; r1 := a2 * b2 - sse_ctr0_0 = _mm256_add_epi64 ( sse_ctr0_0, product );//sse_ctr0 = _mm256_add_epi64 ( sse_ctr0, temp ); - temp = _mm256_srli_epi64( product, 32 ); // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. - sse_ctr0_1 = _mm256_add_epi64 ( sse_ctr0_1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 12 + i ) - - // first cross - a_shifted = _mm256_srli_epi64( a, 32 ); - product = _mm256_mul_epu32 ( data_low, a_shifted ); // A 128-bit value that contains two 64-bit unsigned integers. The result can be expressed by the following equations. r0 := a0 * b0; r1 := a2 * b2 - sse_ctr1 = _mm256_add_epi64 ( sse_ctr1, product );//sse_ctr1 = _mm256_add_epi64 ( sse_ctr1, temp ); - temp = _mm256_srli_epi64( product, 32 ); // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. - sse_ctr2 = _mm256_add_epi64 ( sse_ctr2, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 13 + i ) - - // second cross - data = _mm256_srli_epi64( data, 32 ); - product = _mm256_mul_epu32 ( data, a_low ); // A 128-bit value that contains two 64-bit unsigned integers. The result can be expressed by the following equations. r0 := a0 * b0; r1 := a2 * b2 - sse_ctr1 = _mm256_add_epi64 ( sse_ctr1, product );//sse_ctr1 = _mm256_add_epi64 ( sse_ctr1, temp ); - temp = _mm256_srli_epi64( product, 32 ); // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. 
- sse_ctr2 = _mm256_add_epi64 ( sse_ctr2, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 14 + i ) - // upper 32 bits - product = _mm256_mul_epu32 ( data, a_shifted ); // A 128-bit value that contains two 64-bit unsigned integers. The result can be expressed by the following equations. r0 := a0 * b0; r1 := a2 * b2 - sse_ctr3_0 = _mm256_add_epi64 ( sse_ctr3_0, product );//sse_ctr2 = _mm256_add_epi64 ( sse_ctr2, temp ); - temp = _mm256_srli_epi64( product, 32 ); // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. - sse_ctr3_1 = _mm256_add_epi64 ( sse_ctr3_1, temp ); - //temp = _mm256_and_si256 ( mask_low, product ); - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 15 + i ) -#endif - } - - uint64_t t0_0, t0_1, t1, t2, t3_0, t3_1; - t0_0 = ((uint64_t*)(&sse_ctr0_0))[0] + ((uint64_t*)(&sse_ctr0_0))[1] + ((uint64_t*)(&sse_ctr0_0))[2] + ((uint64_t*)(&sse_ctr0_0))[3]; - t0_1 = ((uint64_t*)(&sse_ctr0_1))[0] + ((uint64_t*)(&sse_ctr0_1))[1] + ((uint64_t*)(&sse_ctr0_1))[2] + ((uint64_t*)(&sse_ctr0_1))[3]; - t1 = ((uint64_t*)(&sse_ctr1))[0] + ((uint64_t*)(&sse_ctr1))[1] + ((uint64_t*)(&sse_ctr1))[2] + ((uint64_t*)(&sse_ctr1))[3]; - t2 = ((uint64_t*)(&sse_ctr2))[0] + ((uint64_t*)(&sse_ctr2))[1] + ((uint64_t*)(&sse_ctr2))[2] + ((uint64_t*)(&sse_ctr2))[3]; - t3_0 = ((uint64_t*)(&sse_ctr3_0))[0] + ((uint64_t*)(&sse_ctr3_0))[1] + ((uint64_t*)(&sse_ctr3_0))[2] + ((uint64_t*)(&sse_ctr3_0))[3]; - t3_1 = ((uint64_t*)(&sse_ctr3_1))[0] + ((uint64_t*)(&sse_ctr3_1))[1] + ((uint64_t*)(&sse_ctr3_1))[2] + ((uint64_t*)(&sse_ctr3_1))[3]; - - ADD_SHIFT_ADD_NORMALIZE_TO_UPPER( t0_0, t0_1 ) - ADD_SHIFT_ADD_NORMALIZE_TO_UPPER( t1, t2 ) - ADD_SHIFT_ADD_NORMALIZE_TO_UPPER( t3_0, t3_1 ) - - uint64_t add_sse1, add_sse2; - - t1 += t0_1; - add_sse1 = t0_0 + ( ((uint64_t)(uint32_t)t1) << 32 ); - ctr0.QuadPart += add_sse1; - add_sse2 = ctr0.QuadPart < add_sse1; - - t2 += t3_0 + (t1>>32); - t3_1 += t2>>32; - - 
add_sse2 += (uint32_t)t2 + ( ( (uint64_t)(uint32_t)t3_1 ) << 32 ); - ctr1.QuadPart += add_sse2; - - ctr2.QuadPart += (t3_1 >> 32) + (ctr1.QuadPart < add_sse2); + __m256i sse_ctr0_0, sse_ctr0_1, sse_ctr1, sse_ctr2, sse_ctr3_0, sse_ctr3_1, + a, a_shifted, a_low, data, data_low, product, temp, mask_low; + sse_ctr0_0 = _mm256_setzero_si256(); // Sets the 128-bit value to zero. + sse_ctr0_1 = _mm256_setzero_si256(); // Sets the 128-bit value to zero. + sse_ctr1 = _mm256_setzero_si256(); + sse_ctr2 = _mm256_setzero_si256(); + sse_ctr3_0 = _mm256_setzero_si256(); + sse_ctr3_1 = _mm256_setzero_si256(); + mask_low = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + + #if (PMPML_64_CHUNK_SIZE_LOG2 >= 4) + for (uint64_t i = 0; i < (PMPML_64_CHUNK_SIZE); i += 16) + #else + for (uint64_t i = 0; i < (PMPML_64_CHUNK_SIZE); i += 8) + #endif + { + a = _mm256_load_si256((__m256i * )(coeff + i)); // Loads 128-bit value. Address p must be 16-byte + // aligned. + data = _mm256_loadu_si256((__m256i *)(x + i)); // Loads 128-bit value. Address p does not need be + // 16-byte aligned. + + // lower 32 bits + a_low = _mm256_and_si256(mask_low, a ); + data_low = _mm256_and_si256(mask_low, data); + product = _mm256_mul_epu32(data_low, a_low); // A 128-bit value that contains two 64-bit unsigned + // integers. The result can be expressed by the following + // equations. r0 := a0 * b0; r1 := a2 * b2 + sse_ctr0_0 = _mm256_add_epi64(sse_ctr0_0, product); // sse_ctr0 = _mm256_add_epi64 ( sse_ctr0, temp ); + temp = _mm256_srli_epi64(product, 32); // Shifts the 2 signed or unsigned 64-bit integers in a + // right by count bits while shifting in zeros. + sse_ctr0_1 = _mm256_add_epi64(sse_ctr0_1, temp); + // temp = _mm256_and_si256 ( mask_low, product ); + + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(4 + i) + + // first cross + a_shifted = _mm256_srli_epi64(a, 32); + product = _mm256_mul_epu32(data_low, a_shifted); // A 128-bit value that contains two 64-bit unsigned + // integers. 
The result can be expressed by the following + // equations. r0 := a0 * b0; r1 := a2 * b2 + sse_ctr1 = _mm256_add_epi64(sse_ctr1, product); // sse_ctr1 = _mm256_add_epi64 ( sse_ctr1, temp ); + temp = _mm256_srli_epi64(product, 32); // Shifts the 2 signed or unsigned 64-bit integers in a + // right by count bits while shifting in zeros. + sse_ctr2 = _mm256_add_epi64(sse_ctr2, temp); + // temp = _mm256_and_si256 ( mask_low, product ); + + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(5 + i) + // second cross + data = _mm256_srli_epi64(data, 32); + product = _mm256_mul_epu32(data, a_low); // A 128-bit value that contains two 64-bit unsigned integers. + // The result can be expressed by the following equations. r0 := + // a0 * b0; r1 := a2 * b2 + sse_ctr1 = _mm256_add_epi64(sse_ctr1, product); // sse_ctr1 = _mm256_add_epi64 ( sse_ctr1, temp ); + temp = _mm256_srli_epi64(product, 32); // Shifts the 2 signed or unsigned 64-bit integers in a right by + // count bits while shifting in zeros. + sse_ctr2 = _mm256_add_epi64(sse_ctr2, temp); + // temp = _mm256_and_si256 ( mask_low, product ); + + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(6 + i) + // upper 32 bits + product = _mm256_mul_epu32(data, a_shifted); // A 128-bit value that contains two 64-bit unsigned + // integers. The result can be expressed by the following + // equations. r0 := a0 * b0; r1 := a2 * b2 + sse_ctr3_0 = _mm256_add_epi64(sse_ctr3_0, product); // sse_ctr2 = _mm256_add_epi64 ( sse_ctr2, temp ); + temp = _mm256_srli_epi64(product, 32); // Shifts the 2 signed or unsigned 64-bit integers in a + // right by count bits while shifting in zeros. + sse_ctr3_1 = _mm256_add_epi64(sse_ctr3_1, temp); + // temp = _mm256_and_si256 ( mask_low, product ); + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(7 + i) + + #if (PMPML_64_CHUNK_SIZE_LOG2 >= 4) + a = _mm256_load_si256((__m256i * )(coeff + i + 8)); // Loads 128-bit value. Address p must be 16-byte + // aligned. 
+ data = _mm256_loadu_si256((__m256i *)(x + i + 8)); // Loads 128-bit value. Address p does not need be + // 16-byte aligned. + + // lower 32 bits + a_low = _mm256_and_si256(mask_low, a ); + data_low = _mm256_and_si256(mask_low, data); + product = _mm256_mul_epu32(data_low, a_low); // A 128-bit value that contains two 64-bit unsigned + // integers. The result can be expressed by the following + // equations. r0 := a0 * b0; r1 := a2 * b2 + sse_ctr0_0 = _mm256_add_epi64(sse_ctr0_0, product); // sse_ctr0 = _mm256_add_epi64 ( sse_ctr0, temp ); + temp = _mm256_srli_epi64(product, 32); // Shifts the 2 signed or unsigned 64-bit integers in a + // right by count bits while shifting in zeros. + sse_ctr0_1 = _mm256_add_epi64(sse_ctr0_1, temp); + // temp = _mm256_and_si256 ( mask_low, product ); + + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(12 + i) + + // first cross + a_shifted = _mm256_srli_epi64(a, 32); + product = _mm256_mul_epu32(data_low, a_shifted); // A 128-bit value that contains two 64-bit unsigned + // integers. The result can be expressed by the following + // equations. r0 := a0 * b0; r1 := a2 * b2 + sse_ctr1 = _mm256_add_epi64(sse_ctr1, product); // sse_ctr1 = _mm256_add_epi64 ( sse_ctr1, temp ); + temp = _mm256_srli_epi64(product, 32); // Shifts the 2 signed or unsigned 64-bit integers in a + // right by count bits while shifting in zeros. + sse_ctr2 = _mm256_add_epi64(sse_ctr2, temp); + // temp = _mm256_and_si256 ( mask_low, product ); + + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(13 + i) + + // second cross + data = _mm256_srli_epi64(data, 32); + product = _mm256_mul_epu32(data, a_low); // A 128-bit value that contains two 64-bit unsigned integers. + // The result can be expressed by the following equations. 
r0 := + // a0 * b0; r1 := a2 * b2 + sse_ctr1 = _mm256_add_epi64(sse_ctr1, product); // sse_ctr1 = _mm256_add_epi64 ( sse_ctr1, temp ); + temp = _mm256_srli_epi64(product, 32); // Shifts the 2 signed or unsigned 64-bit integers in a right by + // count bits while shifting in zeros. + sse_ctr2 = _mm256_add_epi64(sse_ctr2, temp); + // temp = _mm256_and_si256 ( mask_low, product ); + + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(14 + i) + // upper 32 bits + product = _mm256_mul_epu32(data, a_shifted); // A 128-bit value that contains two 64-bit unsigned + // integers. The result can be expressed by the following + // equations. r0 := a0 * b0; r1 := a2 * b2 + sse_ctr3_0 = _mm256_add_epi64(sse_ctr3_0, product); // sse_ctr2 = _mm256_add_epi64 ( sse_ctr2, temp ); + temp = _mm256_srli_epi64(product, 32); // Shifts the 2 signed or unsigned 64-bit integers in a + // right by count bits while shifting in zeros. + sse_ctr3_1 = _mm256_add_epi64(sse_ctr3_1, temp); + // temp = _mm256_and_si256 ( mask_low, product ); + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(15 + i) + #endif + } + + uint64_t t0_0, t0_1, t1, t2, t3_0, t3_1; + t0_0 = ((uint64_t *)(&sse_ctr0_0))[0] + ((uint64_t *)(&sse_ctr0_0))[1] + + ((uint64_t *)(&sse_ctr0_0))[2] + ((uint64_t *)(&sse_ctr0_0))[3]; + t0_1 = ((uint64_t *)(&sse_ctr0_1))[0] + ((uint64_t *)(&sse_ctr0_1))[1] + + ((uint64_t *)(&sse_ctr0_1))[2] + ((uint64_t *)(&sse_ctr0_1))[3]; + t1 = ((uint64_t *)(&sse_ctr1 ))[0] + ((uint64_t *)(&sse_ctr1 ))[1] + + ((uint64_t *)(&sse_ctr1 ))[2] + ((uint64_t *)(&sse_ctr1 ))[3]; + t2 = ((uint64_t *)(&sse_ctr2 ))[0] + ((uint64_t *)(&sse_ctr2 ))[1] + + ((uint64_t *)(&sse_ctr2 ))[2] + ((uint64_t *)(&sse_ctr2 ))[3]; + t3_0 = ((uint64_t *)(&sse_ctr3_0))[0] + ((uint64_t *)(&sse_ctr3_0))[1] + + ((uint64_t *)(&sse_ctr3_0))[2] + ((uint64_t *)(&sse_ctr3_0))[3]; + t3_1 = ((uint64_t *)(&sse_ctr3_1))[0] + ((uint64_t *)(&sse_ctr3_1))[1] + + ((uint64_t *)(&sse_ctr3_1))[2] + ((uint64_t *)(&sse_ctr3_1))[3]; + + ADD_SHIFT_ADD_NORMALIZE_TO_UPPER(t0_0, 
t0_1) + ADD_SHIFT_ADD_NORMALIZE_TO_UPPER(t1 , t2 ) + ADD_SHIFT_ADD_NORMALIZE_TO_UPPER(t3_0, t3_1) + + uint64_t add_sse1, add_sse2; + + t1 += t0_1; + add_sse1 = t0_0 + (((uint64_t)(uint32_t)t1 ) << 32); + ctr0.QuadPart += add_sse1; + add_sse2 = ctr0.QuadPart < add_sse1; + + t2 += t3_0 + (t1 >> 32); + t3_1 += t2 >> 32; + + add_sse2 += (uint32_t)t2 + (((uint64_t)(uint32_t)t3_1) << 32); + ctr1.QuadPart += add_sse2; + + ctr2.QuadPart += (t3_1 >> 32) + (ctr1.QuadPart < add_sse2); - -/* ctr0.LowPart = (uint32_t)t0_0; - uint64_t upper64 = t0_1 + (t0_0>>32) + (uint64_t)(uint32_t)t1; - ctr0.HighPart = (uint32_t)upper64; - - upper64 = (upper64>>32) + (t1>>32) + t2 + (uint32_t)t3_0; - ctr1.LowPart = (uint32_t)upper64; - - upper64 = (upper64>>32) + (t3_0>>32) + (uint32_t)t3_1; - ctr1.HighPart += (uint32_t)upper64; - - ctr2.QuadPart = (upper64>>32) + (t3_1>>32);*/ +/* + * ctr0.LowPart = (uint32_t)t0_0; + * uint64_t upper64 = t0_1 + (t0_0>>32) + (uint64_t)(uint32_t)t1; + * ctr0.HighPart = (uint32_t)upper64; + * + * upper64 = (upper64>>32) + (t1>>32) + t2 + (uint32_t)t3_0; + * ctr1.LowPart = (uint32_t)upper64; + * + * upper64 = (upper64>>32) + (t3_0>>32) + (uint32_t)t3_1; + * ctr1.HighPart += (uint32_t)upper64; + * + * ctr2.QuadPart = (upper64>>32) + (t3_1>>32); + */ #else // defined(HAVE_AVX2) && (PMPML_64_CHUNK_SIZE_LOG2 >= 3) - for (uint64_t i=0; i<(PMPML_64_CHUNK_SIZE); i+=32) { - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 0 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 1 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 2 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 3 + i ) -#if (PMPML_64_CHUNK_SIZE_LOG2 > 2) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 4 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 5 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 6 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 7 + i ) -#endif -#if (PMPML_64_CHUNK_SIZE_LOG2 > 3) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 8 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 9 + i ) - 
PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 10 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 11 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 12 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 13 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 14 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 15 + i ) -#endif -#if (PMPML_64_CHUNK_SIZE_LOG2 > 4) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 16 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 17 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 18 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 19 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 20 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 21 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 22 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 23 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 24 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 25 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 26 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 27 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 28 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 29 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST( 30 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND( 31 + i ) -#endif - } + for (uint64_t i = 0; i < (PMPML_64_CHUNK_SIZE); i += 32) { + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(0 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(1 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(2 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(3 + i) + #if (PMPML_64_CHUNK_SIZE_LOG2 > 2) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(4 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(5 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(6 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(7 + i) + #endif + #if (PMPML_64_CHUNK_SIZE_LOG2 > 3) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(8 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(9 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(10 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(11 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(12 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(13 + i) + 
PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(14 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(15 + i) + #endif + #if (PMPML_64_CHUNK_SIZE_LOG2 > 4) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(16 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(17 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(18 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(19 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(20 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(21 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(22 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(23 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(24 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(25 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(26 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(27 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(28 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(29 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_FIRST(30 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_SECOND(31 + i) + #endif + } #endif // defined(HAVE_AVX2) && (PMPML_64_CHUNK_SIZE_LOG2 >= 3) - PMPML_CHUNK_LOOP_PRE_REDUCE_L0_64 + PMPML_CHUNK_LOOP_PRE_REDUCE_L0_64 - PMPML_CHUNK_REDUCE_128_TO_64____ - ret.LowPart = ctr0.QuadPart; - ret.HighPart = ctr1.QuadPart; - } + PMPML_CHUNK_REDUCE_128_TO_64____ + ret.LowPart = ctr0.QuadPart; + ret.HighPart = ctr1.QuadPart; + } - template < bool bswap > - FORCE_INLINE void hash_of_beginning_of_string_chunk_short_type2( const uint64_t* coeff, ULARGE_INTEGER__XX constTerm, const uint8_t* tail, std::size_t tail_size, ULARGELARGE_INTEGER__XX& ret ) const - { - PMPML_CHUNK_LOOP_INTRO_L0_64 - std::size_t size = tail_size >> PMPML_64_WORD_SIZE_BYTES_LOG2; - const uint64_t* x = (const uint64_t*)tail; - - switch (size) { - case 1: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 ) } break; - case 2: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 ) } break; - case 3: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 ) } break; - case 4: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 ) 
PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 3 ) } break; - case 5: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 3 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 4 ) } break; - case 6: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 3 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 5 ) } break; - case 7: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 3 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 5 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 6 ) } break; - } - - uint64_t xLast = ReadTail(tail, tail_size); - - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_LAST(size) - - PMPML_CHUNK_LOOP_PRE_REDUCE_L0_64 - PMPML_CHUNK_REDUCE_128_TO_64 - ret.LowPart = ctr0.QuadPart; - ret.HighPart = ctr1.QuadPart; - } + template + FORCE_INLINE void hash_of_beginning_of_string_chunk_short_type2( const uint64_t * coeff, ULARGE_INTEGER__XX constTerm, + const uint8_t * tail, std::size_t tail_size, ULARGELARGE_INTEGER__XX & ret ) const { + PMPML_CHUNK_LOOP_INTRO_L0_64 + std::size_t size = tail_size >> PMPML_64_WORD_SIZE_BYTES_LOG2; + const uint64_t * x = (const uint64_t *)tail; + + switch (size) { + case 1: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) } + break; + case 2: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1) } + break; + case 3: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2) } + break; + case 4: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(3) } + break; + case 5: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) 
PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(3) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(4) } + break; + case 6: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(3) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(5) } + break; + case 7: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(3) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(5) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(6) } + break; + } + + uint64_t xLast = ReadTail(tail, tail_size); + + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_LAST(size) + + PMPML_CHUNK_LOOP_PRE_REDUCE_L0_64 + PMPML_CHUNK_REDUCE_128_TO_64 + ret.LowPart = ctr0.QuadPart; + ret.HighPart = ctr1.QuadPart; + } - template < bool bswap > - FORCE_INLINE void hash_of_beginning_of_string_chunk_type2( const uint64_t* coeff, ULARGE_INTEGER__XX constTerm, const uint8_t* tail, std::size_t tail_size, ULARGELARGE_INTEGER__XX& ret ) const - { - PMPML_CHUNK_LOOP_INTRO_L0_64 - std::size_t size = tail_size >> PMPML_64_WORD_SIZE_BYTES_LOG2; - const uint64_t* x = (const uint64_t*)tail; - - for ( uint32_t i=0; i<(size>>3); i++ ) - { - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 + ( i << 3 ) ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 + ( i << 3 ) ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 + ( i << 3 ) ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 3 + ( i << 3 ) ) + template + FORCE_INLINE void hash_of_beginning_of_string_chunk_type2( const uint64_t * coeff, ULARGE_INTEGER__XX constTerm, + const uint8_t * tail, std::size_t tail_size, ULARGELARGE_INTEGER__XX & ret ) const { + PMPML_CHUNK_LOOP_INTRO_L0_64 + std::size_t size = tail_size >> PMPML_64_WORD_SIZE_BYTES_LOG2; + const uint64_t * x = (const uint64_t *)tail; + + for (uint32_t i = 0; i < (size >> 3); i++) { + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0 + (i << 3)) + 
PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1 + (i << 3)) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2 + (i << 3)) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(3 + (i << 3)) #if (PMPML_64_CHUNK_SIZE_LOG2 > 2) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 4 + ( i << 3 ) ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 5 + ( i << 3 ) ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 6 + ( i << 3 ) ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 7 + ( i << 3 ) ) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(4 + (i << 3)) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(5 + (i << 3)) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(6 + (i << 3)) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(7 + (i << 3)) #endif - } - - uint64_t offset = size & 0xFFFFFFF8; - - switch (size & 0x7) { - case 0: { break; } - case 1: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) } break; - case 2: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 + offset ) } break; - case 3: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 + offset ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 + offset ) } break; - case 4: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 + offset ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 + offset ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 3 + offset ) } break; - case 5: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 + offset ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 + offset ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 3 + offset ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 4 + offset ) } break; - case 6: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 + offset ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 + offset ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 3 + offset ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 4 + offset ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 5 + offset ) } break; - case 7: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 + offset ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 + offset ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 + offset ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 3 + offset ) - 
PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 4 + offset ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 5 + offset ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 6 + offset ) } break; - } - - uint64_t xLast = ReadTail(tail, tail_size); - - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_LAST(size) - - PMPML_CHUNK_LOOP_PRE_REDUCE_L0_64 - PMPML_CHUNK_REDUCE_128_TO_64 - ret.LowPart = ctr0.QuadPart; - ret.HighPart = ctr1.QuadPart; - } + } + + uint64_t offset = size & 0xFFFFFFF8; + + switch (size & 0x7) { + case 0: { break; } + case 1: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0 + offset) } + break; + case 2: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0 + offset) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1 + offset) } + break; + case 3: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0 + offset) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1 + offset) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2 + offset) } + break; + case 4: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0 + offset) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1 + offset) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2 + offset) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(3 + offset) } + break; + case 5: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0 + offset) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1 + offset) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2 + offset) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(3 + offset) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(4 + offset) } + break; + case 6: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0 + offset) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1 + offset) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2 + offset) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(3 + offset) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(4 + offset) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(5 + offset) } + break; + case 7: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0 + offset) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1 + offset) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2 + offset) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(3 + offset) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(4 + offset) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(5 + offset) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(6 + offset) } + break; + } + + uint64_t xLast = ReadTail(tail, tail_size); + + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_LAST(size) + + 
PMPML_CHUNK_LOOP_PRE_REDUCE_L0_64 + PMPML_CHUNK_REDUCE_128_TO_64 + ret.LowPart = ctr0.QuadPart; + ret.HighPart = ctr1.QuadPart; + } - // a call to be done from subsequent levels - FORCE_INLINE void hash_of_num_chunk( const uint64_t* coeff, ULARGE_INTEGER__XX constTerm, const ULARGELARGE_INTEGER__XX* x, ULARGELARGE_INTEGER__XX& ret ) const - { - ULARGE_INTEGER__XX ctr0, ctr1, ctr2; - ctr0.QuadPart = constTerm.QuadPart; - ctr1.QuadPart = 0; - ctr2.QuadPart = 0; - ULARGE_INTEGER__XX mulLow, mulHigh; - - for ( uint64_t i=0; i<(PMPML_64_CHUNK_SIZE); i+=32 ) - { - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 0 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 1 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 2 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 3 + i ) + // a call to be done from subsequent levels + FORCE_INLINE void hash_of_num_chunk( const uint64_t * coeff, ULARGE_INTEGER__XX constTerm, + const ULARGELARGE_INTEGER__XX * x, ULARGELARGE_INTEGER__XX & ret ) const { + ULARGE_INTEGER__XX ctr0, ctr1, ctr2; + + ctr0.QuadPart = constTerm.QuadPart; + ctr1.QuadPart = 0; + ctr2.QuadPart = 0; + ULARGE_INTEGER__XX mulLow, mulHigh; + + for (uint64_t i = 0; i < (PMPML_64_CHUNK_SIZE); i += 32) { + PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 0 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 1 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 2 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 3 + i) #if (PMPML_64_CHUNK_SIZE_LOG2 > 2) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 4 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 5 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 6 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 7 + i ) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 4 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 5 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 6 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 7 + i) #endif #if (PMPML_64_CHUNK_SIZE_LOG2 > 3) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 8 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 9 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 10 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 11 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 12 + i ) - 
PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 13 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 14 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 15 + i ) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 8 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 9 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(10 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(11 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(12 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(13 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(14 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(15 + i) #endif #if (PMPML_64_CHUNK_SIZE_LOG2 > 4) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 16 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 17 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 18 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 19 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 20 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 21 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 22 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 23 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 24 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 25 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 26 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 27 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 28 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 29 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 30 + i ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T2( 31 + i ) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(16 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(17 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(18 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(19 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(20 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(21 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(22 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(23 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(24 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(25 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(26 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(27 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(28 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(29 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(30 + i) + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(31 + i) #endif - } + } - PMPML_CHUNK_REDUCE_128_TO_64 - - ret.LowPart = ctr0.QuadPart; - ret.HighPart = ctr1.QuadPart; - } 
+ PMPML_CHUNK_REDUCE_128_TO_64 - // a call to be done from subsequent levels - FORCE_INLINE void hash_of_num_chunk_incomplete( const uint64_t* coeff, uint64_t constTerm, uint64_t prevConstTerm, uint64_t coeffSumLow, uint64_t coeffSumHigh, const ULARGELARGE_INTEGER__XX* x, size_t count, ULARGELARGE_INTEGER__XX& ret ) const - { - ULARGE_INTEGER__XX ctr0, ctr1, ctr2; - ctr0.QuadPart = constTerm; - ctr1.QuadPart = 0; - ctr2.QuadPart = 0; - ULARGE_INTEGER__XX c_ctr0, c_ctr1; - c_ctr0.QuadPart = 0; - c_ctr1.QuadPart = 0; - ULARGE_INTEGER__XX mulLow, mulHigh; - uint64_t i; - if ( count < ( PMPML_64_CHUNK_SIZE >> 1 ) ) - { - for ( i=0; i coeffSumLow ) - c_ctr1.QuadPart = coeffSumHigh - c_ctr1.QuadPart - 1; - else - c_ctr1.QuadPart = coeffSumHigh - c_ctr1.QuadPart; - c_ctr0.QuadPart = coeffSumLow - c_ctr0.QuadPart; - } - else - { - for ( i=0; i> 1)) { + for (i = 0; i < count; i++) { + PMPML_CHUNK_LOOP_BODY_ULI_T2_AND_ADD_COEFF_64(i); + } + if (c_ctr0.QuadPart > coeffSumLow) { + c_ctr1.QuadPart = coeffSumHigh - c_ctr1.QuadPart - 1; + } else { + c_ctr1.QuadPart = coeffSumHigh - c_ctr1.QuadPart; + } + c_ctr0.QuadPart = coeffSumLow - c_ctr0.QuadPart; + } else { + for (i = 0; i < count; i++) { + PMPML_64_CHUNK_LOOP_BODY_ULI_T2(i) + } + for (; i < PMPML_64_CHUNK_SIZE; i++) { + PMPML_64_CHUNK_LOOP_BODY_ULI_ADD_COEFF(i) + } + } + PMPML_CHUNK_LOOP_BODY_ULI_T2_AND_ADD_SUM_OF_COEFF_64 + + PMPML_CHUNK_REDUCE_128_TO_64 + + ret.LowPart = ctr0.QuadPart; + ret.HighPart = ctr1.QuadPart; + } - FORCE_INLINE _ULARGELARGE_INTEGER__XX& finalize( int level, _ULARGELARGE_INTEGER__XX * allValues, std::size_t * cnts, std::size_t& flag ) const - { - ULARGELARGE_INTEGER__XX value; - for ( int i=level;;i++ ) - { -// ASSERT ( level != PMPML_LEVELS ) - if ( ( ( flag & ( 1 << i ) ) == 0 ) && cnts[ i ] == 1 ) - { - return allValues[ i << PMPML_64_CHUNK_SIZE_LOG2 ]; - } - if ( cnts[ i ] ) - { - if ( ( flag & ( 1 << i ) ) == 0 ) - { - cnts[ i + 1] = 0; - flag |= 1 << i; - } - 
hash_of_num_chunk_incomplete(curr_rd[ i ].random_coeff, - curr_rd[i].const_term, curr_rd[i].const_term, - curr_rd[i].cachedSumLow, curr_rd[i].cachedSumHigh, - allValues + (i << PMPML_64_CHUNK_SIZE_LOG2), cnts[i], value ); - procesNextValue( i + 1, value, allValues, cnts, flag ); - } - } - } + FORCE_INLINE void procesNextValue( int level, _ULARGELARGE_INTEGER__XX & value, _ULARGELARGE_INTEGER__XX * allValues, + std::size_t * cnts, std::size_t & flag ) const { + for (int i = level;; i++) { + // NOTE: it's not necessary to check whether ( i < PMPML_64_LEVELS ), + // if it is guaranteed that the string size is less than 1 << USHF_MACHINE_WORD_SIZE_BITS + allValues[(i << PMPML_64_CHUNK_SIZE_LOG2) + cnts[i]] = value; + (cnts[i])++; + if (cnts[i] != PMPML_64_CHUNK_SIZE) { + break; + } + cnts[i] = 0; + hash_of_num_chunk(curr_rd[i].random_coeff, *(ULARGE_INTEGER__XX *)(&(curr_rd[i].const_term)), + allValues + (i << PMPML_64_CHUNK_SIZE_LOG2), value); + if ((flag & (1 << i)) == 0) { + cnts[i + 1] = 0; + flag |= 1 << i; + } + } + } - template < bool bswap > - NEVER_INLINE uint64_t _hash_noRecursionNoInline_SingleChunk( const uint8_t* chars, std::size_t cnt ) const - { - _ULARGELARGE_INTEGER__XX tmp_hash; - hash_of_beginning_of_string_chunk_type2( curr_rd[0].random_coeff, *(ULARGE_INTEGER__XX*)(&(curr_rd[0].const_term)), chars, cnt, tmp_hash ); - if ( tmp_hash.HighPart == 0 ) { - return fmix64_short( tmp_hash.LowPart ); + FORCE_INLINE _ULARGELARGE_INTEGER__XX & finalize( int level, _ULARGELARGE_INTEGER__XX * allValues, + std::size_t * cnts, std::size_t & flag ) const { + ULARGELARGE_INTEGER__XX value; + + for (int i = level;; i++) { +// ASSERT ( level != PMPML_LEVELS ) + if (((flag & (1 << i)) == 0) && (cnts[i] == 1)) { + return allValues[i << PMPML_64_CHUNK_SIZE_LOG2]; + } + if (cnts[i]) { + if ((flag & (1 << i)) == 0) { + cnts[i + 1] = 0; + flag |= 1 << i; + } + hash_of_num_chunk_incomplete(curr_rd[i].random_coeff, curr_rd[i].const_term, curr_rd[i].const_term, + 
curr_rd[i].cachedSumLow, curr_rd[i].cachedSumHigh, allValues + (i << PMPML_64_CHUNK_SIZE_LOG2), + cnts[i], value); + procesNextValue(i + 1, value, allValues, cnts, flag); + } + } } - return tmp_hash.LowPart; - } - template < bool bswap > - NEVER_INLINE uint64_t _hash_noRecursionNoInline_type2( const uint8_t* chars, std::size_t cnt ) const - { - _ULARGELARGE_INTEGER__XX allValues[ PMPML_64_LEVELS * PMPML_64_CHUNK_SIZE ]; - std::size_t cnts[ PMPML_64_LEVELS ]; - std::size_t flag; - cnts[ 1 ] = 0; - flag = 0; - - std::size_t i; - _ULARGELARGE_INTEGER__XX tmp_hash; - // process full chunks - for ( i=0; i<(cnt>>PMPML_64_CHUNK_SIZE_BYTES_LOG2); i++ ) { - hash_of_string_chunk_compact(curr_rd[0].random_coeff, *(ULARGE_INTEGER__XX*)(&(curr_rd[0].const_term)), - ((const uint64_t*)(chars)) + ( i << PMPML_64_CHUNK_SIZE_LOG2 ), tmp_hash ); - procesNextValue( 1, tmp_hash, allValues, cnts, flag ); + template + NEVER_INLINE uint64_t _hash_noRecursionNoInline_SingleChunk( const uint8_t * chars, std::size_t cnt ) const { + _ULARGELARGE_INTEGER__XX tmp_hash; + + hash_of_beginning_of_string_chunk_type2(curr_rd[0].random_coeff, *(ULARGE_INTEGER__XX *)(&(curr_rd[0].const_term)), + chars, cnt, tmp_hash); + if (tmp_hash.HighPart == 0) { + return fmix64_short(tmp_hash.LowPart); + } + return tmp_hash.LowPart; } - // process remaining incomplete chunk(s) - // note: if string size is a multiple of chunk size, we create a new chunk (1,0,0,...0), - // so THIS PROCESSING IS ALWAYS PERFORMED - std::size_t tailCnt = cnt & ( PMPML_64_CHUNK_SIZE_BYTES - 1 ); - const uint8_t* tail = chars + ( (cnt>>PMPML_64_CHUNK_SIZE_BYTES_LOG2) << PMPML_64_CHUNK_SIZE_BYTES_LOG2 ); - hash_of_beginning_of_string_chunk_type2( curr_rd[0].random_coeff, *(ULARGE_INTEGER__XX*)(&(curr_rd[0].const_term)), - tail, tailCnt, tmp_hash ); - procesNextValue( 1, tmp_hash, allValues, cnts, flag ); - _ULARGELARGE_INTEGER__XX finRet = finalize( 1, allValues, cnts, flag ); - if ( finRet.HighPart == 0 ) { //LIKELY - return 
fmix64_short( finRet.LowPart ); + + template + NEVER_INLINE uint64_t _hash_noRecursionNoInline_type2( const uint8_t * chars, std::size_t cnt ) const { + _ULARGELARGE_INTEGER__XX allValues[PMPML_64_LEVELS * PMPML_64_CHUNK_SIZE]; + std::size_t cnts[PMPML_64_LEVELS]; + std::size_t flag; + + cnts[1] = 0; + flag = 0; + + std::size_t i; + _ULARGELARGE_INTEGER__XX tmp_hash; + // process full chunks + for (i = 0; i < (cnt >> PMPML_64_CHUNK_SIZE_BYTES_LOG2); i++) { + hash_of_string_chunk_compact(curr_rd[0].random_coeff, *(ULARGE_INTEGER__XX *)(&(curr_rd[0].const_term)), + ((const uint64_t *)(chars)) + (i << PMPML_64_CHUNK_SIZE_LOG2), tmp_hash); + procesNextValue(1, tmp_hash, allValues, cnts, flag); + } + // process remaining incomplete chunk(s) + // note: if string size is a multiple of chunk size, we create a new chunk (1,0,0,...0), + // so THIS PROCESSING IS ALWAYS PERFORMED + std::size_t tailCnt = cnt & (PMPML_64_CHUNK_SIZE_BYTES - 1); + const uint8_t * tail = chars + ((cnt >> PMPML_64_CHUNK_SIZE_BYTES_LOG2) << PMPML_64_CHUNK_SIZE_BYTES_LOG2); + hash_of_beginning_of_string_chunk_type2(curr_rd[0].random_coeff, *(ULARGE_INTEGER__XX *)(&(curr_rd[0].const_term)), + tail, tailCnt, tmp_hash); + procesNextValue(1, tmp_hash, allValues, cnts, flag); + _ULARGELARGE_INTEGER__XX finRet = finalize(1, allValues, cnts, flag); + if (finRet.HighPart == 0) { // LIKELY + return fmix64_short(finRet.LowPart); + } + return finRet.LowPart; } - return finRet.LowPart; - } -public: - template < bool bswap > - FORCE_INLINE uint64_t hash( const uint8_t* chars, std::size_t cnt ) const - { - if (likely(cnt < 64)) { - const uint64_t* coeff = curr_rd[0].random_coeff; - ULARGE_INTEGER__XX constTerm = *(ULARGE_INTEGER__XX*)(&(curr_rd[0].const_term)); - PMPML_CHUNK_LOOP_INTRO_L0_64 - std::size_t size = cnt >> PMPML_64_WORD_SIZE_BYTES_LOG2; - const uint64_t* x = (const uint64_t*)chars; - - switch (size) { - case 1: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 ) } break; - case 2: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 
0 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 ) } break; - case 3: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 ) } break; - case 4: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 3 ) } break; - case 5: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 3 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 4 ) } break; - case 6: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 3 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 5 ) } break; - case 7: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 0 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 1 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 2 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 3 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 4 ) PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 5 ) - PMPML_64_CHUNK_LOOP_BODY_ULI_T1( 6 ) } break; - } - - uint64_t xLast = ReadTail(chars, cnt); - - PMPML_64_CHUNK_LOOP_BODY_ULI_T1_LAST(size); - - PMPML_CHUNK_LOOP_PRE_REDUCE_L0_64; - PMPML_CHUNK_REDUCE_128_TO_64_AND_RETURN; - } else if (cnt < PMPML_64_CHUNK_SIZE) { - return _hash_noRecursionNoInline_SingleChunk( chars, cnt ); - } else { - return _hash_noRecursionNoInline_type2( chars, cnt ); + public: + + template + FORCE_INLINE uint64_t hash( const uint8_t * chars, std::size_t cnt ) const { + if (likely(cnt < 64)) { + const uint64_t * coeff = curr_rd[0].random_coeff; + ULARGE_INTEGER__XX constTerm = *(ULARGE_INTEGER__XX *)(&(curr_rd[0].const_term)); + PMPML_CHUNK_LOOP_INTRO_L0_64 + std::size_t size = cnt >> PMPML_64_WORD_SIZE_BYTES_LOG2; + const uint64_t * x = (const uint64_t *)chars; + + switch (size) { + case 1: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) } + break; + case 2: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) 
PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1) } + break; + case 3: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2) } + break; + case 4: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(3) } + break; + case 5: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(3) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(4) } + break; + case 6: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(3) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(5) } + break; + case 7: { PMPML_64_CHUNK_LOOP_BODY_ULI_T1(0) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(1) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(2) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(3) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(4) PMPML_64_CHUNK_LOOP_BODY_ULI_T1(5) + PMPML_64_CHUNK_LOOP_BODY_ULI_T1(6) } + break; + } + + uint64_t xLast = ReadTail(chars, cnt); + + PMPML_64_CHUNK_LOOP_BODY_ULI_T1_LAST(size); + + PMPML_CHUNK_LOOP_PRE_REDUCE_L0_64; + PMPML_CHUNK_REDUCE_128_TO_64_AND_RETURN; + } else if (cnt < PMPML_64_CHUNK_SIZE) { + return _hash_noRecursionNoInline_SingleChunk(chars, cnt); + } else { + return _hash_noRecursionNoInline_type2(chars, cnt); + } } - } public: - PMP_Multilinear_Hasher_64() - { - curr_rd = (random_data_for_PMPML_64*)rd_for_PMPML_64; - coeff0 = curr_rd[0].random_coeff[0]; - } - void seed( uint64_t seed ) - { - curr_rd[0].random_coeff[0] = coeff0 ^ seed; - } -}; + + PMP_Multilinear_Hasher_64() { + curr_rd = (random_data_for_PMPML_64 *)rd_for_PMPML_64; + coeff0 = curr_rd[0].random_coeff[0]; + } + + void seed( uint64_t seed ) { + curr_rd[0].random_coeff[0] = coeff0 ^ seed; + } +}; // class PMP_Multilinear_Hasher_64 //------------------------------------------------------------- // SMHasher3 API functions @@ -2561,63 
+2716,65 @@ class PMP_Multilinear_Hasher_64 static thread_local PMP_Multilinear_Hasher_32 pmpml_hasher_32; static thread_local PMP_Multilinear_Hasher_64 pmpml_hasher_64; -static uintptr_t PMPML_32_seed(const seed_t seed) { - pmpml_hasher_32.seed((uint64_t)seed); - return (uintptr_t)(&pmpml_hasher_32); +static uintptr_t PMPML_32_seed( const seed_t seed ) { + pmpml_hasher_32.seed((uint64_t)seed); + return (uintptr_t)(&pmpml_hasher_32); } -static uintptr_t PMPML_64_seed(const seed_t seed) { - pmpml_hasher_64.seed((uint64_t)seed); - return (uintptr_t)(&pmpml_hasher_64); +static uintptr_t PMPML_64_seed( const seed_t seed ) { + pmpml_hasher_64.seed((uint64_t)seed); + return (uintptr_t)(&pmpml_hasher_64); } -template < bool bswap > -static void PMPML_32(const void * in, const size_t len, const seed_t seed, void * out) { - PMP_Multilinear_Hasher_32 * p = (PMP_Multilinear_Hasher_32 *)(uintptr_t)seed; - uint32_t h = p->hash((const uint8_t *)in, len); - PUT_U32(h, (uint8_t *)out, 0); +template +static void PMPML_32( const void * in, const size_t len, const seed_t seed, void * out ) { + PMP_Multilinear_Hasher_32 * p = (PMP_Multilinear_Hasher_32 *)(uintptr_t)seed; + uint32_t h = p->hash((const uint8_t *)in, len); + + PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void PMPML_64(const void * in, const size_t len, const seed_t seed, void * out) { - PMP_Multilinear_Hasher_64 * p = (PMP_Multilinear_Hasher_64 *)(uintptr_t)seed; - uint64_t h = p->hash((const uint8_t *)in, len); - PUT_U64(h, (uint8_t *)out, 0); +template +static void PMPML_64( const void * in, const size_t len, const seed_t seed, void * out ) { + PMP_Multilinear_Hasher_64 * p = (PMP_Multilinear_Hasher_64 *)(uintptr_t)seed; + uint64_t h = p->hash((const uint8_t *)in, len); + + PUT_U64(h, (uint8_t *)out, 0); } REGISTER_FAMILY(PMP_mutilinear, - $.src_url = "https://github.com/lemire/StronglyUniversalStringHashing", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = 
"https://github.com/lemire/StronglyUniversalStringHashing", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(PMP_Multilinear_32, - $.desc = "PMP_Multilinear 32-bit", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE , - $.impl_flags = - FLAG_IMPL_TYPE_PUNNING | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_BSD, - $.bits = 32, - $.verification_LE = 0xF3199670, - $.verification_BE = 0xF602E963, - $.seedfn = PMPML_32_seed, - $.hashfn_native = PMPML_32, - $.hashfn_bswap = PMPML_32 -); + $.desc = "PMP_Multilinear 32-bit", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE, + $.impl_flags = + FLAG_IMPL_TYPE_PUNNING | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_BSD, + $.bits = 32, + $.verification_LE = 0xF3199670, + $.verification_BE = 0xF602E963, + $.seedfn = PMPML_32_seed, + $.hashfn_native = PMPML_32, + $.hashfn_bswap = PMPML_32 + ); REGISTER_HASH(PMP_Multilinear_64, - $.desc = "PMP_Multilinear 64-bit", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE , - $.impl_flags = - FLAG_IMPL_TYPE_PUNNING | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_LICENSE_BSD, - $.bits = 64, - $.verification_LE = 0xB776D2B9, - $.verification_BE = 0x8E1E0CDF, - $.seedfn = PMPML_64_seed, - $.hashfn_native = PMPML_64, - $.hashfn_bswap = PMPML_64 -); + $.desc = "PMP_Multilinear 64-bit", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE, + $.impl_flags = + FLAG_IMPL_TYPE_PUNNING | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_LICENSE_BSD, + $.bits = 64, + $.verification_LE = 0xB776D2B9, + $.verification_BE = 0x8E1E0CDF, + $.seedfn = PMPML_64_seed, + $.hashfn_native = PMPML_64, + $.hashfn_bswap = PMPML_64 + ); diff --git a/hashes/poly_mersenne.cpp b/hashes/poly_mersenne.cpp index f27160c9..b6fe5444 100644 --- a/hashes/poly_mersenne.cpp +++ b/hashes/poly_mersenne.cpp @@ -4,7 +4,7 @@ * Copyright (c) 2020-2021 Reini Urban * Copyright (c) 2020 Thomas Dybdahl Ahle * Copyright (c) 1990, 1993 - * The Regents of the University of California. All rights reserved. + * The Regents of the University of California. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -48,11 +48,11 @@ // test it with the RNG you plan on using to seed it. static uint64_t BSD_nextrand; -static void BSD_srand(uint64_t seed) { +static void BSD_srand( uint64_t seed ) { BSD_nextrand = seed; } -static uint32_t BSD_rand(void) { +static uint32_t BSD_rand( void ) { /* * Compute x = (7^5 * x) mod (2^31 - 1) * without overflowing 31 bits: @@ -61,41 +61,43 @@ static uint32_t BSD_rand(void) { * Park and Miller, Communications of the ACM, vol. 31, no. 10, * October 1988, p. 1195. */ - uint64_t hi, lo, x; - - x = (BSD_nextrand % 0x7ffffffe) + 1; - hi = x / 127773; - lo = x % 127773; - x = 16807 * lo - 2836 * hi; - if (x < 0) - x += 0x7fffffff; + uint64_t hi, lo, x; + + x = (BSD_nextrand % 0x7ffffffe) + 1; + hi = x / 127773; + lo = x % 127773; + x = 16807 * lo - 2836 * hi; + if (x < 0) { + x += 0x7fffffff; + } BSD_nextrand = --x; - return x; + return x; } -const static uint64_t MERSENNE_61 = (1ull << 61) - 1; +const static uint64_t MERSENNE_61 = (1ull << 61) - 1; const static uint32_t POLY_MERSENNE_MAX_K = 4; -static uint64_t poly_mersenne_random[POLY_MERSENNE_MAX_K+1]; -static uint64_t poly_mersenne_a; -static uint64_t poly_mersenne_b; +static uint64_t poly_mersenne_random[POLY_MERSENNE_MAX_K + 1]; +static uint64_t poly_mersenne_a; +static uint64_t poly_mersenne_b; -static uint128_t rand_u128(void) { +static uint128_t rand_u128( void ) { // We don't know how many bits we get from rand(), // but it is at least 16, so we concattenate a couple. uint128_t r = BSD_rand(); + for (int i = 0; i < 7; i++) { r <<= 16; - r ^= BSD_rand(); + r ^= BSD_rand(); } return r; } -static uintptr_t poly_mersenne_seed_init(const seed_t seed) { +static uintptr_t poly_mersenne_seed_init( const seed_t seed ) { BSD_srand(seed); // a has be at most 2^60, or the lazy modular reduction won't work. 
- poly_mersenne_a = rand_u128() % (MERSENNE_61/2); + poly_mersenne_a = rand_u128() % (MERSENNE_61 / 2); poly_mersenne_b = rand_u128() % MERSENNE_61; - for (int i = 0; i < POLY_MERSENNE_MAX_K+1; i++) { + for (int i = 0; i < POLY_MERSENNE_MAX_K + 1; i++) { // The random values should be at most 2^61-2, or the lazy // modular reduction won't work. poly_mersenne_random[i] = rand_u128() % MERSENNE_61; @@ -103,20 +105,21 @@ static uintptr_t poly_mersenne_seed_init(const seed_t seed) { return 0; } -static uint64_t mult_combine61(uint64_t h, uint64_t x, uint64_t a) { +static uint64_t mult_combine61( uint64_t h, uint64_t x, uint64_t a ) { uint64_t rhi = 0, rlo = a; + fma64_128(rlo, rhi, h, x); - rhi <<= (64 - 61); - rhi |= (rlo >> 61); - rlo &= MERSENNE_61; + rhi <<= (64 - 61); + rhi |= (rlo >> 61); + rlo &= MERSENNE_61; return rlo + rhi; } // This function ignores the seed, because it uses a separate seeding function. -template < uint32_t K, bool bswap > -static void Poly_Mersenne(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void Poly_Mersenne( const void * in, const size_t len, const seed_t seed, void * out ) { const uint8_t * buf = (const uint8_t *)in; // We first combine hashes using a polynomial in `a`: @@ -128,7 +131,7 @@ static void Poly_Mersenne(const void * in, const size_t len, const seed_t seed, // We use the length as the first character. uint64_t h = len; - for (size_t i = 0; i < len/4; i++, buf += 4) { + for (size_t i = 0; i < len / 4; i++, buf += 4) { // Partial modular reduction. Since each round adds 32 bits, and this // subtracts (up to) 61 bits, we make sure to never overflow. 
h = mult_combine61(h, a, GET_U32(buf, 0)); @@ -138,8 +141,8 @@ static void Poly_Mersenne(const void * in, const size_t len, const seed_t seed, int remaining_bytes = len % 4; if (remaining_bytes) { uint32_t last = 0; - if (remaining_bytes & 2) {last = GET_U16(buf, 0); buf += 2;} - if (remaining_bytes & 1) {last = (last << 8) | (*buf);} + if (remaining_bytes & 2) { last = GET_U16(buf, 0); buf += 2; } + if (remaining_bytes & 1) { last = (last << 8) | (*buf); } h = mult_combine61(h, a, last); } @@ -161,81 +164,81 @@ static void Poly_Mersenne(const void * in, const size_t len, const seed_t seed, } REGISTER_FAMILY(poly_mersenne, - $.src_url = "https://github.com/rurban/smhasher/blob/master/Hashes.cpp", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/rurban/smhasher/blob/master/Hashes.cpp", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(poly_mersenne__deg1, - $.desc = "Degree 1 Hashing mod 2^61-1", - $.hash_flags = - FLAG_HASH_SYSTEM_SPECIFIC, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | // Implementation not yet thread-safe - FLAG_IMPL_128BIT | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_LICENSE_BSD, - $.bits = 32, - $.verification_LE = 0x50526DA4, - $.verification_BE = 0xBB8CF709, - $.seedfn = poly_mersenne_seed_init, - $.hashfn_native = Poly_Mersenne<1, false>, - $.hashfn_bswap = Poly_Mersenne<1, true> -); + $.desc = "Degree 1 Hashing mod 2^61-1", + $.hash_flags = + FLAG_HASH_SYSTEM_SPECIFIC, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS |// Implementation not yet thread-safe + FLAG_IMPL_128BIT | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_LICENSE_BSD, + $.bits = 32, + $.verification_LE = 0x50526DA4, + $.verification_BE = 0xBB8CF709, + $.seedfn = poly_mersenne_seed_init, + $.hashfn_native = Poly_Mersenne<1, false>, + $.hashfn_bswap = Poly_Mersenne<1, true> + ); REGISTER_HASH(poly_mersenne__deg2, - $.desc = "Degree 2 Hashing mod 2^61-1", - $.hash_flags = - FLAG_HASH_SYSTEM_SPECIFIC, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | // 
Implementation not yet thread-safe - FLAG_IMPL_128BIT | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_LICENSE_BSD, - $.bits = 32, - $.verification_LE = 0xCDDDA91B, - $.verification_BE = 0x9507D811, - $.seedfn = poly_mersenne_seed_init, - $.hashfn_native = Poly_Mersenne<2, false>, - $.hashfn_bswap = Poly_Mersenne<2, true>, - $.badseeds = {0x60e8512c}, - $.seedfixfn = excludeBadseeds -); + $.desc = "Degree 2 Hashing mod 2^61-1", + $.hash_flags = + FLAG_HASH_SYSTEM_SPECIFIC, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS |// Implementation not yet thread-safe + FLAG_IMPL_128BIT | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_LICENSE_BSD, + $.bits = 32, + $.verification_LE = 0xCDDDA91B, + $.verification_BE = 0x9507D811, + $.seedfn = poly_mersenne_seed_init, + $.hashfn_native = Poly_Mersenne<2, false>, + $.hashfn_bswap = Poly_Mersenne<2, true>, + $.badseeds = { 0x60e8512c }, + $.seedfixfn = excludeBadseeds + ); REGISTER_HASH(poly_mersenne__deg3, - $.desc = "Degree 3 Hashing mod 2^61-1", - $.hash_flags = - FLAG_HASH_SYSTEM_SPECIFIC, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | // Implementation not yet thread-safe - FLAG_IMPL_128BIT | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_LICENSE_BSD, - $.bits = 32, - $.verification_LE = 0x7D822707, - $.verification_BE = 0x7273EB0A, - $.seedfn = poly_mersenne_seed_init, - $.hashfn_native = Poly_Mersenne<3, false>, - $.hashfn_bswap = Poly_Mersenne<3, true>, - $.badseeds = {0x3d25f745}, - $.seedfixfn = excludeBadseeds -); + $.desc = "Degree 3 Hashing mod 2^61-1", + $.hash_flags = + FLAG_HASH_SYSTEM_SPECIFIC, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS |// Implementation not yet thread-safe + FLAG_IMPL_128BIT | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_LICENSE_BSD, + $.bits = 32, + $.verification_LE = 0x7D822707, + $.verification_BE = 0x7273EB0A, + $.seedfn = poly_mersenne_seed_init, + $.hashfn_native = Poly_Mersenne<3, false>, + $.hashfn_bswap = Poly_Mersenne<3, true>, + $.badseeds = { 0x3d25f745 }, + $.seedfixfn = excludeBadseeds + ); 
REGISTER_HASH(poly_mersenne__deg4, - $.desc = "Degree 4 Hashing mod 2^61-1", - $.hash_flags = - FLAG_HASH_SYSTEM_SPECIFIC, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | // Implementation not yet thread-safe - FLAG_IMPL_128BIT | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_LICENSE_BSD, - $.bits = 32, - $.verification_LE = 0xBF0273E6, - $.verification_BE = 0xAA526413, - $.seedfn = poly_mersenne_seed_init, - $.hashfn_native = Poly_Mersenne<4, false>, - $.hashfn_bswap = Poly_Mersenne<4, true> -); + $.desc = "Degree 4 Hashing mod 2^61-1", + $.hash_flags = + FLAG_HASH_SYSTEM_SPECIFIC, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS |// Implementation not yet thread-safe + FLAG_IMPL_128BIT | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_LICENSE_BSD, + $.bits = 32, + $.verification_LE = 0xBF0273E6, + $.verification_BE = 0xAA526413, + $.seedfn = poly_mersenne_seed_init, + $.hashfn_native = Poly_Mersenne<4, false>, + $.hashfn_bswap = Poly_Mersenne<4, true> + ); #else REGISTER_FAMILY(poly_mersenne); diff --git a/hashes/prvhash.cpp b/hashes/prvhash.cpp index e514b1e8..072f9a87 100644 --- a/hashes/prvhash.cpp +++ b/hashes/prvhash.cpp @@ -36,19 +36,19 @@ * @param MsgEnd Message's end pointer. * @param fb Final byte used for padding. 
*/ -template < bool bswap > -static inline uint64_t prvhash_lpu64ec( const uint8_t * const Msg, - const uint8_t * const MsgEnd, uint64_t fb ) { -const int l = (int)(MsgEnd - Msg); - fb <<= ( l << 3 ); +template +static inline uint64_t prvhash_lpu64ec( const uint8_t * const Msg, const uint8_t * const MsgEnd, uint64_t fb ) { + const int l = (int)(MsgEnd - Msg); - if( l > 3 ) { + fb <<= (l << 3); + + if (l > 3) { fb |= (uint64_t)GET_U32(Msg, 0); - if( l > 4 ) { + if (l > 4) { fb |= (uint64_t)Msg[4] << 32; - if( l > 5 ) { + if (l > 5) { fb |= (uint64_t)Msg[5] << 40; - if( l > 6 ) { + if (l > 6) { fb |= (uint64_t)Msg[6] << 48; } } @@ -56,11 +56,11 @@ const int l = (int)(MsgEnd - Msg); return fb; } - if( l != 0 ) { + if (l != 0) { fb |= Msg[0]; - if( l > 1 ) { + if (l > 1) { fb |= (uint64_t)Msg[1] << 8; - if( l > 2 ) { + if (l > 2) { fb |= (uint64_t)Msg[2] << 16; } } @@ -69,11 +69,11 @@ const int l = (int)(MsgEnd - Msg); return fb; } -static inline uint64_t prvhash_core64(uint64_t & Seed, uint64_t & lcg, uint64_t & Hash) { +static inline uint64_t prvhash_core64( uint64_t & Seed, uint64_t & lcg, uint64_t & Hash ) { Seed *= lcg * 2 + 1; const uint64_t rs = Seed >> 32 | Seed << 32; - Hash += rs + UINT64_C(0xAAAAAAAAAAAAAAAA); - lcg += Seed + UINT64_C(0x5555555555555555); + Hash += rs + UINT64_C(0xAAAAAAAAAAAAAAAA); + lcg += Seed + UINT64_C(0x5555555555555555); Seed ^= Hash; const uint64_t out = lcg ^ rs; @@ -98,17 +98,17 @@ static inline uint64_t prvhash_core64(uint64_t & Seed, uint64_t & lcg, uint64_t * @param Hash2p Location to write the second 8-byte hash result to, * if width128 == true. 
*/ -template < bool bswap, bool width128 > -static inline uint64_t prvhash64_64m(const void * const Msg0, - const size_t MsgLen, const uint64_t UseSeed, uint64_t * Hash2p = NULL) { - const uint8_t * Msg = (const uint8_t *)Msg0; - const uint8_t* const MsgEnd = Msg + MsgLen; - - uint64_t Seed = UINT64_C(0x217992B44669F46A); // The state after 5 PRVHASH rounds - uint64_t lcg = UINT64_C(0xB5E2CC2FE9F0B35B); // from the "zero-state". - uint64_t Hash = UINT64_C(0x949B5E0A608D76D5); +template +static inline uint64_t prvhash64_64m( const void * const Msg0, const size_t MsgLen, + const uint64_t UseSeed, uint64_t * Hash2p = NULL ) { + const uint8_t * Msg = (const uint8_t *)Msg0; + const uint8_t * const MsgEnd = Msg + MsgLen; + + uint64_t Seed = UINT64_C(0x217992B44669F46A); // The state after 5 PRVHASH rounds + uint64_t lcg = UINT64_C(0xB5E2CC2FE9F0B35B); // from the "zero-state". + uint64_t Hash = UINT64_C(0x949B5E0A608D76D5); uint64_t Hash2 = 0; - bool hc = true; + bool hc = true; Hash ^= UseSeed; @@ -120,15 +120,15 @@ static inline uint64_t prvhash64_64m(const void * const Msg0, while (1) { if (Msg < (MsgEnd - (sizeof(uint64_t) - 1))) { - const uint64_t msgw = GET_U64(Msg, 0); + const uint64_t msgw = GET_U64 (Msg, 0); Seed ^= msgw; - lcg ^= msgw; + lcg ^= msgw; } else if (Msg <= MsgEnd) { const uint64_t msgw = prvhash_lpu64ec(Msg, MsgEnd, fb); Seed ^= msgw; - lcg ^= msgw; + lcg ^= msgw; } else { break; } @@ -156,11 +156,11 @@ static inline uint64_t prvhash64_64m(const void * const Msg0, uint64_t h; if (hc) { - h = prvhash_core64(Seed, lcg, Hash); + h = prvhash_core64(Seed, lcg, Hash ); *Hash2p = prvhash_core64(Seed, lcg, Hash2); } else { *Hash2p = prvhash_core64(Seed, lcg, Hash2); - h = prvhash_core64(Seed, lcg, Hash); + h = prvhash_core64(Seed, lcg, Hash ); } return h; } @@ -173,21 +173,22 @@ static inline uint64_t prvhash64_64m(const void * const Msg0, * (with a Seed0 of 0) to the official "prvhash64s_oneshot" function * with HashLen == 8 or 16, but returns an immediate 
result. */ -#define PRVHASH_INIT_COUNT 5 // Common number of initialization rounds. -#define PRH64S_PAR 4 // PRVHASH parallelism +#define PRVHASH_INIT_COUNT 5 // Common number of initialization rounds. +#define PRH64S_PAR 4 // PRVHASH parallelism #define PRH64S_LEN (sizeof(uint64_t) * PRH64S_PAR) // Intermediate block's length. -template < bool bswap, bool width128 > -static inline void prvhash64s_oneshot(const void * const Msg0, - size_t MsgLen0, uint64_t Seed0, uint8_t * const HashOut) { + +template +static inline void prvhash64s_oneshot( const void * const Msg0, size_t MsgLen0, + uint64_t Seed0, uint8_t * const HashOut ) { uint64_t Seed[PRH64S_PAR]; uint64_t lcg[PRH64S_PAR]; uint64_t Hash[2]; - bool hc = true; + bool hc = true; memset(Hash, 0, sizeof(Hash)); for (int i = 0; i < PRH64S_PAR; i++) { Seed[i] = Seed0; - lcg[i] = 0; + lcg[i] = 0; } for (int i = 0; i < PRVHASH_INIT_COUNT; i++) { for (int j = 0; j < PRH64S_PAR; j++) { @@ -195,8 +196,8 @@ static inline void prvhash64s_oneshot(const void * const Msg0, } } - const uint8_t * Msg = (const uint8_t *)Msg0; - size_t MsgLen = MsgLen0; + const uint8_t * Msg = (const uint8_t *)Msg0; + size_t MsgLen = MsgLen0; while (MsgLen >= PRH64S_LEN) { for (int j = 0; j < PRH64S_PAR; j++) { @@ -204,7 +205,7 @@ static inline void prvhash64s_oneshot(const void * const Msg0, Seed[j] ^= m; lcg[j] ^= m; prvhash_core64(Seed[j], lcg[j], hc ? Hash[0] : Hash[1]); - Msg += sizeof(uint64_t); + Msg += sizeof(uint64_t); } if (width128) { hc = !hc; @@ -212,12 +213,11 @@ static inline void prvhash64s_oneshot(const void * const Msg0, MsgLen -= PRH64S_LEN; } - uint8_t fb = (MsgLen0 == 0) ? 1 : - (uint8_t)(1 << (*(Msg + MsgLen - 1) >> 7)); + uint8_t fb = (MsgLen0 == 0) ? 
1 : (uint8_t)(1 << (*(Msg + MsgLen - 1) >> 7)); - uint8_t fbytes[PRH64S_LEN * 2 + 24]; - uint8_t * ptr = fbytes; - size_t MsgExtra = 0; + uint8_t fbytes[PRH64S_LEN * 2 + 24]; + uint8_t * ptr = fbytes; + size_t MsgExtra = 0; memcpy(ptr, Msg, MsgLen); ptr += MsgLen; @@ -225,18 +225,17 @@ static inline void prvhash64s_oneshot(const void * const Msg0, memset(ptr, 0, sizeof(fbytes) - MsgLen); ptr[sizeof(uint64_t) - 1] = fb; - ptr += sizeof(uint64_t); + ptr += sizeof(uint64_t); MsgExtra += sizeof(uint64_t); PUT_U64(MsgLen0 + sizeof(uint64_t), ptr, 0); - ptr += sizeof(uint64_t); + ptr += sizeof(uint64_t); MsgExtra += sizeof(uint64_t); - fb = (MsgLen0 == 0) ? 1 : - (uint8_t)(1 << (*(ptr - 1) >> 7)); + fb = (MsgLen0 == 0) ? 1 : (uint8_t)(1 << (*(ptr - 1) >> 7)); ptr[sizeof(uint64_t) - 1] = fb; - ptr += sizeof(uint64_t); + ptr += sizeof(uint64_t); MsgExtra += sizeof(uint64_t); if (((ptr - fbytes) % PRH64S_LEN) != 0) { @@ -244,12 +243,12 @@ static inline void prvhash64s_oneshot(const void * const Msg0, } MsgLen += MsgExtra; - ptr = fbytes; + ptr = fbytes; while (MsgLen >= PRH64S_LEN) { for (int j = 0; j < PRH64S_PAR; j++) { const uint64_t m = GET_U64(ptr, 0); - ptr += sizeof(uint64_t); + ptr += sizeof(uint64_t); Seed[j] ^= m; lcg[j] ^= m; prvhash_core64(Seed[j], lcg[j], hc ? Hash[0] : Hash[1]); @@ -260,8 +259,8 @@ static inline void prvhash64s_oneshot(const void * const Msg0, MsgLen -= PRH64S_LEN; } - const size_t fc = 8 + (!width128 ? 0 : - (16 + (((((MsgLen0 + MsgExtra) < (16 * PRH64S_PAR)) && !hc)) ? 8 : 0))); + const size_t fc = 8 + (!width128 ? + 0 : (16 + (((((MsgLen0 + MsgExtra) < (16 * PRH64S_PAR)) && !hc)) ? 8 : 0))); for (size_t k = 0; k <= fc; k += sizeof(uint64_t)) { for (int j = 0; j < PRH64S_PAR; j++) { prvhash_core64(Seed[j], lcg[j], hc ? 
Hash[0] : Hash[1]); @@ -287,97 +286,99 @@ static inline void prvhash64s_oneshot(const void * const Msg0, } } -template < bool bswap > -static void prvhash64(const void * in, const size_t len, const seed_t seed, void * out) { - uint64_t h = prvhash64_64m(in, len, (uint64_t)seed); +template +static void prvhash64( const void * in, const size_t len, const seed_t seed, void * out ) { + uint64_t h = prvhash64_64m(in, len, (uint64_t)seed); + PUT_U64(h, (uint8_t *)out, 0); } -template < bool bswap > -static void prvhash128(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void prvhash128( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h1, h2; - h1 = prvhash64_64m(in, len, (uint64_t)seed, &h2); + + h1 = prvhash64_64m(in, len, (uint64_t)seed, &h2); PUT_U64(h1, (uint8_t *)out, 0); PUT_U64(h2, (uint8_t *)out, 8); } -template < bool bswap > -static void prvhash64s(const void * in, const size_t len, const seed_t seed, void * out) { - prvhash64s_oneshot(in, len, (uint64_t)seed, (uint8_t *)out); +template +static void prvhash64s( const void * in, const size_t len, const seed_t seed, void * out ) { + prvhash64s_oneshot(in, len, (uint64_t)seed, (uint8_t *)out); } -template < bool bswap > -static void prvhash128s(const void * in, const size_t len, const seed_t seed, void * out) { - prvhash64s_oneshot(in, len, (uint64_t)seed, (uint8_t *)out); +template +static void prvhash128s( const void * in, const size_t len, const seed_t seed, void * out ) { + prvhash64s_oneshot(in, len, (uint64_t)seed, (uint8_t *)out); } REGISTER_FAMILY(prvhash, - $.src_url = "https://github.com/avaneev/prvhash", - $.src_status = HashFamilyInfo::SRC_ACTIVE -); + $.src_url = "https://github.com/avaneev/prvhash", + $.src_status = HashFamilyInfo::SRC_ACTIVE + ); REGISTER_HASH(prvhash_64, - $.desc = "prvhash64 v4.3 64-bit output", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_SHIFT_VARIABLE | - 
FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xD37C7E74, - $.verification_BE = 0xBAD02709, - $.hashfn_native = prvhash64, - $.hashfn_bswap = prvhash64 -); + $.desc = "prvhash64 v4.3 64-bit output", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_SHIFT_VARIABLE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xD37C7E74, + $.verification_BE = 0xBAD02709, + $.hashfn_native = prvhash64, + $.hashfn_bswap = prvhash64 + ); REGISTER_HASH(prvhash_128, - $.desc = "prvhash64 v4.3 128-bit output", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_SHIFT_VARIABLE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0xB447480F, - $.verification_BE = 0xF93A26FC, - $.hashfn_native = prvhash128, - $.hashfn_bswap = prvhash128 -); + $.desc = "prvhash64 v4.3 128-bit output", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_SHIFT_VARIABLE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0xB447480F, + $.verification_BE = 0xF93A26FC, + $.hashfn_native = prvhash128, + $.hashfn_bswap = prvhash128 + ); REGISTER_HASH(prvhash_64__incr, - $.desc = "prvhash64 v4.3 streaming mode 64-bit output", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_SHIFT_VARIABLE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x891521D6, - $.verification_BE = 0xD41B8DB5, - $.hashfn_native = prvhash64s, - $.hashfn_bswap = prvhash64s -); + $.desc = "prvhash64 v4.3 streaming mode 64-bit output", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_SHIFT_VARIABLE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x891521D6, + $.verification_BE = 0xD41B8DB5, + $.hashfn_native = prvhash64s, + $.hashfn_bswap = prvhash64s + ); 
REGISTER_HASH(prvhash_128__incr, - $.desc = "prvhash64 v4.3 streaming mode 128-bit output", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_SHIFT_VARIABLE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x0199728A, - $.verification_BE = 0xD2B2DE25, - $.hashfn_native = prvhash128s, - $.hashfn_bswap = prvhash128s -); + $.desc = "prvhash64 v4.3 streaming mode 128-bit output", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_SHIFT_VARIABLE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x0199728A, + $.verification_BE = 0xD2B2DE25, + $.hashfn_native = prvhash128s, + $.hashfn_bswap = prvhash128s + ); diff --git a/hashes/rmd.cpp b/hashes/rmd.cpp index b343c6d4..20eb959c 100644 --- a/hashes/rmd.cpp +++ b/hashes/rmd.cpp @@ -31,9 +31,9 @@ #include "Hashlib.h" typedef struct { - uint64_t length; - uint8_t buf[64]; - uint32_t curlen, state[8]; + uint64_t length; + uint8_t buf[64]; + uint32_t curlen, state[8]; } rmd_ctx; /* the five basic functions */ @@ -43,448 +43,436 @@ typedef struct { #define I(x, y, z) (((x) & (z)) | ((y) & ~(z))) #define J(x, y, z) ((x) ^ ((y) | ~(z))) -#define OP4(f, a, b, c, d, x, s, k) \ - (a) += f((b), (c), (d)) + (x) + (k); \ +#define OP4(f, a, b, c, d, x, s, k) \ + (a) += f((b), (c), (d)) + (x) + (k); \ (a) = ROTL32((a), (s)); -#define OP5(f, a, b, c, d, e, x, s, k) \ - (a) += f((b), (c), (d)) + (x) + (k); \ - (a) = ROTL32((a), (s)) + (e); \ +#define OP5(f, a, b, c, d, e, x, s, k) \ + (a) += f((b), (c), (d)) + (x) + (k); \ + (a) = ROTL32((a), (s)) + (e); \ (c) = ROTL32((c), 10); -template < uint32_t hashwidth, bool bswap > -static void rmd_compress(rmd_ctx * ctx, const uint8_t * buf) { - uint32_t aa,bb,cc,dd,ee,aaa,bbb,ccc,ddd,eee,X[16]; - int i; - const uint32_t k0 = 0; - const uint32_t k1 = 0x50a28be6; - const uint32_t k2 = 0x5a827999; - const uint32_t k3 = 0x5c4dd124; - 
const uint32_t k4 = 0x6ed9eba1; - const uint32_t k5 = 0x6d703ef3; - const uint32_t k6 = 0x8f1bbcdc; - const uint32_t k7 = 0; - const uint32_t k8 = 0xa953fd4e; - const uint32_t k9 = 0x7a6d76e9; - - /* load words X */ - for (i = 0; i < 16; i++){ - X[i] = GET_U32(buf, (4 * i)); - } - - /* load state */ - aa = aaa = ctx->state[0]; - bb = bbb = ctx->state[1]; - cc = ccc = ctx->state[2]; - dd = ddd = ctx->state[3]; - if (hashwidth == 160) { - ee = eee = ctx->state[4]; - } else if (hashwidth == 256) { - aaa = ctx->state[4]; - bbb = ctx->state[5]; - ccc = ctx->state[6]; - ddd = ctx->state[7]; - } - - /* round 1 */ - if (hashwidth == 160) { - - OP5(F, aa, bb, cc, dd, ee, X[ 0], 11, k0); - OP5(F, ee, aa, bb, cc, dd, X[ 1], 14, k0); - OP5(F, dd, ee, aa, bb, cc, X[ 2], 15, k0); - OP5(F, cc, dd, ee, aa, bb, X[ 3], 12, k0); - OP5(F, bb, cc, dd, ee, aa, X[ 4], 5, k0); - OP5(F, aa, bb, cc, dd, ee, X[ 5], 8, k0); - OP5(F, ee, aa, bb, cc, dd, X[ 6], 7, k0); - OP5(F, dd, ee, aa, bb, cc, X[ 7], 9, k0); - OP5(F, cc, dd, ee, aa, bb, X[ 8], 11, k0); - OP5(F, bb, cc, dd, ee, aa, X[ 9], 13, k0); - OP5(F, aa, bb, cc, dd, ee, X[10], 14, k0); - OP5(F, ee, aa, bb, cc, dd, X[11], 15, k0); - OP5(F, dd, ee, aa, bb, cc, X[12], 6, k0); - OP5(F, cc, dd, ee, aa, bb, X[13], 7, k0); - OP5(F, bb, cc, dd, ee, aa, X[14], 9, k0); - OP5(F, aa, bb, cc, dd, ee, X[15], 8, k0); - - OP5(J, aaa, bbb, ccc, ddd, eee, X[ 5], 8, k1); - OP5(J, eee, aaa, bbb, ccc, ddd, X[14], 9, k1); - OP5(J, ddd, eee, aaa, bbb, ccc, X[ 7], 9, k1); - OP5(J, ccc, ddd, eee, aaa, bbb, X[ 0], 11, k1); - OP5(J, bbb, ccc, ddd, eee, aaa, X[ 9], 13, k1); - OP5(J, aaa, bbb, ccc, ddd, eee, X[ 2], 15, k1); - OP5(J, eee, aaa, bbb, ccc, ddd, X[11], 15, k1); - OP5(J, ddd, eee, aaa, bbb, ccc, X[ 4], 5, k1); - OP5(J, ccc, ddd, eee, aaa, bbb, X[13], 7, k1); - OP5(J, bbb, ccc, ddd, eee, aaa, X[ 6], 7, k1); - OP5(J, aaa, bbb, ccc, ddd, eee, X[15], 8, k1); - OP5(J, eee, aaa, bbb, ccc, ddd, X[ 8], 11, k1); - OP5(J, ddd, eee, aaa, bbb, ccc, X[ 1], 14, k1); 
- OP5(J, ccc, ddd, eee, aaa, bbb, X[10], 14, k1); - OP5(J, bbb, ccc, ddd, eee, aaa, X[ 3], 12, k1); - OP5(J, aaa, bbb, ccc, ddd, eee, X[12], 6, k1); - - } else { - - OP4(F, aa, bb, cc, dd, X[ 0], 11, k0); - OP4(F, dd, aa, bb, cc, X[ 1], 14, k0); - OP4(F, cc, dd, aa, bb, X[ 2], 15, k0); - OP4(F, bb, cc, dd, aa, X[ 3], 12, k0); - OP4(F, aa, bb, cc, dd, X[ 4], 5, k0); - OP4(F, dd, aa, bb, cc, X[ 5], 8, k0); - OP4(F, cc, dd, aa, bb, X[ 6], 7, k0); - OP4(F, bb, cc, dd, aa, X[ 7], 9, k0); - OP4(F, aa, bb, cc, dd, X[ 8], 11, k0); - OP4(F, dd, aa, bb, cc, X[ 9], 13, k0); - OP4(F, cc, dd, aa, bb, X[10], 14, k0); - OP4(F, bb, cc, dd, aa, X[11], 15, k0); - OP4(F, aa, bb, cc, dd, X[12], 6, k0); - OP4(F, dd, aa, bb, cc, X[13], 7, k0); - OP4(F, cc, dd, aa, bb, X[14], 9, k0); - OP4(F, bb, cc, dd, aa, X[15], 8, k0); - - OP4(I, aaa, bbb, ccc, ddd, X[ 5], 8, k1); - OP4(I, ddd, aaa, bbb, ccc, X[14], 9, k1); - OP4(I, ccc, ddd, aaa, bbb, X[ 7], 9, k1); - OP4(I, bbb, ccc, ddd, aaa, X[ 0], 11, k1); - OP4(I, aaa, bbb, ccc, ddd, X[ 9], 13, k1); - OP4(I, ddd, aaa, bbb, ccc, X[ 2], 15, k1); - OP4(I, ccc, ddd, aaa, bbb, X[11], 15, k1); - OP4(I, bbb, ccc, ddd, aaa, X[ 4], 5, k1); - OP4(I, aaa, bbb, ccc, ddd, X[13], 7, k1); - OP4(I, ddd, aaa, bbb, ccc, X[ 6], 7, k1); - OP4(I, ccc, ddd, aaa, bbb, X[15], 8, k1); - OP4(I, bbb, ccc, ddd, aaa, X[ 8], 11, k1); - OP4(I, aaa, bbb, ccc, ddd, X[ 1], 14, k1); - OP4(I, ddd, aaa, bbb, ccc, X[10], 14, k1); - OP4(I, ccc, ddd, aaa, bbb, X[ 3], 12, k1); - OP4(I, bbb, ccc, ddd, aaa, X[12], 6, k1); - - if (hashwidth == 256) { - uint64_t tmp = aa; aa = aaa; aaa = tmp; - } - } - - /* round 2 */ - if (hashwidth == 160) { - - OP5(G, ee, aa, bb, cc, dd, X[ 7], 7, k2); - OP5(G, dd, ee, aa, bb, cc, X[ 4], 6, k2); - OP5(G, cc, dd, ee, aa, bb, X[13], 8, k2); - OP5(G, bb, cc, dd, ee, aa, X[ 1], 13, k2); - OP5(G, aa, bb, cc, dd, ee, X[10], 11, k2); - OP5(G, ee, aa, bb, cc, dd, X[ 6], 9, k2); - OP5(G, dd, ee, aa, bb, cc, X[15], 7, k2); - OP5(G, cc, dd, ee, aa, bb, X[ 3], 15, 
k2); - OP5(G, bb, cc, dd, ee, aa, X[12], 7, k2); - OP5(G, aa, bb, cc, dd, ee, X[ 0], 12, k2); - OP5(G, ee, aa, bb, cc, dd, X[ 9], 15, k2); - OP5(G, dd, ee, aa, bb, cc, X[ 5], 9, k2); - OP5(G, cc, dd, ee, aa, bb, X[ 2], 11, k2); - OP5(G, bb, cc, dd, ee, aa, X[14], 7, k2); - OP5(G, aa, bb, cc, dd, ee, X[11], 13, k2); - OP5(G, ee, aa, bb, cc, dd, X[ 8], 12, k2); - - OP5(I, eee, aaa, bbb, ccc, ddd, X[ 6], 9, k3); - OP5(I, ddd, eee, aaa, bbb, ccc, X[11], 13, k3); - OP5(I, ccc, ddd, eee, aaa, bbb, X[ 3], 15, k3); - OP5(I, bbb, ccc, ddd, eee, aaa, X[ 7], 7, k3); - OP5(I, aaa, bbb, ccc, ddd, eee, X[ 0], 12, k3); - OP5(I, eee, aaa, bbb, ccc, ddd, X[13], 8, k3); - OP5(I, ddd, eee, aaa, bbb, ccc, X[ 5], 9, k3); - OP5(I, ccc, ddd, eee, aaa, bbb, X[10], 11, k3); - OP5(I, bbb, ccc, ddd, eee, aaa, X[14], 7, k3); - OP5(I, aaa, bbb, ccc, ddd, eee, X[15], 7, k3); - OP5(I, eee, aaa, bbb, ccc, ddd, X[ 8], 12, k3); - OP5(I, ddd, eee, aaa, bbb, ccc, X[12], 7, k3); - OP5(I, ccc, ddd, eee, aaa, bbb, X[ 4], 6, k3); - OP5(I, bbb, ccc, ddd, eee, aaa, X[ 9], 15, k3); - OP5(I, aaa, bbb, ccc, ddd, eee, X[ 1], 13, k3); - OP5(I, eee, aaa, bbb, ccc, ddd, X[ 2], 11, k3); - - } else { - - OP4(G, aa, bb, cc, dd, X[ 7], 7, k2); - OP4(G, dd, aa, bb, cc, X[ 4], 6, k2); - OP4(G, cc, dd, aa, bb, X[13], 8, k2); - OP4(G, bb, cc, dd, aa, X[ 1], 13, k2); - OP4(G, aa, bb, cc, dd, X[10], 11, k2); - OP4(G, dd, aa, bb, cc, X[ 6], 9, k2); - OP4(G, cc, dd, aa, bb, X[15], 7, k2); - OP4(G, bb, cc, dd, aa, X[ 3], 15, k2); - OP4(G, aa, bb, cc, dd, X[12], 7, k2); - OP4(G, dd, aa, bb, cc, X[ 0], 12, k2); - OP4(G, cc, dd, aa, bb, X[ 9], 15, k2); - OP4(G, bb, cc, dd, aa, X[ 5], 9, k2); - OP4(G, aa, bb, cc, dd, X[ 2], 11, k2); - OP4(G, dd, aa, bb, cc, X[14], 7, k2); - OP4(G, cc, dd, aa, bb, X[11], 13, k2); - OP4(G, bb, cc, dd, aa, X[ 8], 12, k2); - - OP4(H, aaa, bbb, ccc, ddd, X[ 6], 9, k3); - OP4(H, ddd, aaa, bbb, ccc, X[11], 13, k3); - OP4(H, ccc, ddd, aaa, bbb, X[ 3], 15, k3); - OP4(H, bbb, ccc, ddd, aaa, X[ 7], 7, k3); 
- OP4(H, aaa, bbb, ccc, ddd, X[ 0], 12, k3); - OP4(H, ddd, aaa, bbb, ccc, X[13], 8, k3); - OP4(H, ccc, ddd, aaa, bbb, X[ 5], 9, k3); - OP4(H, bbb, ccc, ddd, aaa, X[10], 11, k3); - OP4(H, aaa, bbb, ccc, ddd, X[14], 7, k3); - OP4(H, ddd, aaa, bbb, ccc, X[15], 7, k3); - OP4(H, ccc, ddd, aaa, bbb, X[ 8], 12, k3); - OP4(H, bbb, ccc, ddd, aaa, X[12], 7, k3); - OP4(H, aaa, bbb, ccc, ddd, X[ 4], 6, k3); - OP4(H, ddd, aaa, bbb, ccc, X[ 9], 15, k3); - OP4(H, ccc, ddd, aaa, bbb, X[ 1], 13, k3); - OP4(H, bbb, ccc, ddd, aaa, X[ 2], 11, k3); - - if (hashwidth == 256) { - uint64_t tmp = bb; bb = bbb; bbb = tmp; - } - } - - /* round 3 */ - if (hashwidth == 160) { - - OP5(H, dd, ee, aa, bb, cc, X[ 3], 11, k4); - OP5(H, cc, dd, ee, aa, bb, X[10], 13, k4); - OP5(H, bb, cc, dd, ee, aa, X[14], 6, k4); - OP5(H, aa, bb, cc, dd, ee, X[ 4], 7, k4); - OP5(H, ee, aa, bb, cc, dd, X[ 9], 14, k4); - OP5(H, dd, ee, aa, bb, cc, X[15], 9, k4); - OP5(H, cc, dd, ee, aa, bb, X[ 8], 13, k4); - OP5(H, bb, cc, dd, ee, aa, X[ 1], 15, k4); - OP5(H, aa, bb, cc, dd, ee, X[ 2], 14, k4); - OP5(H, ee, aa, bb, cc, dd, X[ 7], 8, k4); - OP5(H, dd, ee, aa, bb, cc, X[ 0], 13, k4); - OP5(H, cc, dd, ee, aa, bb, X[ 6], 6, k4); - OP5(H, bb, cc, dd, ee, aa, X[13], 5, k4); - OP5(H, aa, bb, cc, dd, ee, X[11], 12, k4); - OP5(H, ee, aa, bb, cc, dd, X[ 5], 7, k4); - OP5(H, dd, ee, aa, bb, cc, X[12], 5, k4); - - OP5(H, ddd, eee, aaa, bbb, ccc, X[15], 9, k5); - OP5(H, ccc, ddd, eee, aaa, bbb, X[ 5], 7, k5); - OP5(H, bbb, ccc, ddd, eee, aaa, X[ 1], 15, k5); - OP5(H, aaa, bbb, ccc, ddd, eee, X[ 3], 11, k5); - OP5(H, eee, aaa, bbb, ccc, ddd, X[ 7], 8, k5); - OP5(H, ddd, eee, aaa, bbb, ccc, X[14], 6, k5); - OP5(H, ccc, ddd, eee, aaa, bbb, X[ 6], 6, k5); - OP5(H, bbb, ccc, ddd, eee, aaa, X[ 9], 14, k5); - OP5(H, aaa, bbb, ccc, ddd, eee, X[11], 12, k5); - OP5(H, eee, aaa, bbb, ccc, ddd, X[ 8], 13, k5); - OP5(H, ddd, eee, aaa, bbb, ccc, X[12], 5, k5); - OP5(H, ccc, ddd, eee, aaa, bbb, X[ 2], 14, k5); - OP5(H, bbb, ccc, ddd, eee, aaa, 
X[10], 13, k5); - OP5(H, aaa, bbb, ccc, ddd, eee, X[ 0], 13, k5); - OP5(H, eee, aaa, bbb, ccc, ddd, X[ 4], 7, k5); - OP5(H, ddd, eee, aaa, bbb, ccc, X[13], 5, k5); - - } else { - - OP4(H, aa, bb, cc, dd, X[ 3], 11, k4); - OP4(H, dd, aa, bb, cc, X[10], 13, k4); - OP4(H, cc, dd, aa, bb, X[14], 6, k4); - OP4(H, bb, cc, dd, aa, X[ 4], 7, k4); - OP4(H, aa, bb, cc, dd, X[ 9], 14, k4); - OP4(H, dd, aa, bb, cc, X[15], 9, k4); - OP4(H, cc, dd, aa, bb, X[ 8], 13, k4); - OP4(H, bb, cc, dd, aa, X[ 1], 15, k4); - OP4(H, aa, bb, cc, dd, X[ 2], 14, k4); - OP4(H, dd, aa, bb, cc, X[ 7], 8, k4); - OP4(H, cc, dd, aa, bb, X[ 0], 13, k4); - OP4(H, bb, cc, dd, aa, X[ 6], 6, k4); - OP4(H, aa, bb, cc, dd, X[13], 5, k4); - OP4(H, dd, aa, bb, cc, X[11], 12, k4); - OP4(H, cc, dd, aa, bb, X[ 5], 7, k4); - OP4(H, bb, cc, dd, aa, X[12], 5, k4); - - OP4(G, aaa, bbb, ccc, ddd, X[15], 9, k5); - OP4(G, ddd, aaa, bbb, ccc, X[ 5], 7, k5); - OP4(G, ccc, ddd, aaa, bbb, X[ 1], 15, k5); - OP4(G, bbb, ccc, ddd, aaa, X[ 3], 11, k5); - OP4(G, aaa, bbb, ccc, ddd, X[ 7], 8, k5); - OP4(G, ddd, aaa, bbb, ccc, X[14], 6, k5); - OP4(G, ccc, ddd, aaa, bbb, X[ 6], 6, k5); - OP4(G, bbb, ccc, ddd, aaa, X[ 9], 14, k5); - OP4(G, aaa, bbb, ccc, ddd, X[11], 12, k5); - OP4(G, ddd, aaa, bbb, ccc, X[ 8], 13, k5); - OP4(G, ccc, ddd, aaa, bbb, X[12], 5, k5); - OP4(G, bbb, ccc, ddd, aaa, X[ 2], 14, k5); - OP4(G, aaa, bbb, ccc, ddd, X[10], 13, k5); - OP4(G, ddd, aaa, bbb, ccc, X[ 0], 13, k5); - OP4(G, ccc, ddd, aaa, bbb, X[ 4], 7, k5); - OP4(G, bbb, ccc, ddd, aaa, X[13], 5, k5); - - if (hashwidth == 256) { - uint64_t tmp = cc; cc = ccc; ccc = tmp; - } - } - - /* round 4 */ - if (hashwidth == 160) { - - OP5(I, cc, dd, ee, aa, bb, X[ 1], 11, k6); - OP5(I, bb, cc, dd, ee, aa, X[ 9], 12, k6); - OP5(I, aa, bb, cc, dd, ee, X[11], 14, k6); - OP5(I, ee, aa, bb, cc, dd, X[10], 15, k6); - OP5(I, dd, ee, aa, bb, cc, X[ 0], 14, k6); - OP5(I, cc, dd, ee, aa, bb, X[ 8], 15, k6); - OP5(I, bb, cc, dd, ee, aa, X[12], 9, k6); - OP5(I, aa, bb, cc, 
dd, ee, X[ 4], 8, k6); - OP5(I, ee, aa, bb, cc, dd, X[13], 9, k6); - OP5(I, dd, ee, aa, bb, cc, X[ 3], 14, k6); - OP5(I, cc, dd, ee, aa, bb, X[ 7], 5, k6); - OP5(I, bb, cc, dd, ee, aa, X[15], 6, k6); - OP5(I, aa, bb, cc, dd, ee, X[14], 8, k6); - OP5(I, ee, aa, bb, cc, dd, X[ 5], 6, k6); - OP5(I, dd, ee, aa, bb, cc, X[ 6], 5, k6); - OP5(I, cc, dd, ee, aa, bb, X[ 2], 12, k6); - - OP5(G, ccc, ddd, eee, aaa, bbb, X[ 8], 15, k9); - OP5(G, bbb, ccc, ddd, eee, aaa, X[ 6], 5, k9); - OP5(G, aaa, bbb, ccc, ddd, eee, X[ 4], 8, k9); - OP5(G, eee, aaa, bbb, ccc, ddd, X[ 1], 11, k9); - OP5(G, ddd, eee, aaa, bbb, ccc, X[ 3], 14, k9); - OP5(G, ccc, ddd, eee, aaa, bbb, X[11], 14, k9); - OP5(G, bbb, ccc, ddd, eee, aaa, X[15], 6, k9); - OP5(G, aaa, bbb, ccc, ddd, eee, X[ 0], 14, k9); - OP5(G, eee, aaa, bbb, ccc, ddd, X[ 5], 6, k9); - OP5(G, ddd, eee, aaa, bbb, ccc, X[12], 9, k9); - OP5(G, ccc, ddd, eee, aaa, bbb, X[ 2], 12, k9); - OP5(G, bbb, ccc, ddd, eee, aaa, X[13], 9, k9); - OP5(G, aaa, bbb, ccc, ddd, eee, X[ 9], 12, k9); - OP5(G, eee, aaa, bbb, ccc, ddd, X[ 7], 5, k9); - OP5(G, ddd, eee, aaa, bbb, ccc, X[10], 15, k9); - OP5(G, ccc, ddd, eee, aaa, bbb, X[14], 8, k9); - - } else { - - OP4(I, aa, bb, cc, dd, X[ 1], 11, k6); - OP4(I, dd, aa, bb, cc, X[ 9], 12, k6); - OP4(I, cc, dd, aa, bb, X[11], 14, k6); - OP4(I, bb, cc, dd, aa, X[10], 15, k6); - OP4(I, aa, bb, cc, dd, X[ 0], 14, k6); - OP4(I, dd, aa, bb, cc, X[ 8], 15, k6); - OP4(I, cc, dd, aa, bb, X[12], 9, k6); - OP4(I, bb, cc, dd, aa, X[ 4], 8, k6); - OP4(I, aa, bb, cc, dd, X[13], 9, k6); - OP4(I, dd, aa, bb, cc, X[ 3], 14, k6); - OP4(I, cc, dd, aa, bb, X[ 7], 5, k6); - OP4(I, bb, cc, dd, aa, X[15], 6, k6); - OP4(I, aa, bb, cc, dd, X[14], 8, k6); - OP4(I, dd, aa, bb, cc, X[ 5], 6, k6); - OP4(I, cc, dd, aa, bb, X[ 6], 5, k6); - OP4(I, bb, cc, dd, aa, X[ 2], 12, k6); - - OP4(F, aaa, bbb, ccc, ddd, X[ 8], 15, k7); - OP4(F, ddd, aaa, bbb, ccc, X[ 6], 5, k7); - OP4(F, ccc, ddd, aaa, bbb, X[ 4], 8, k7); - OP4(F, bbb, ccc, ddd, aaa, 
X[ 1], 11, k7); - OP4(F, aaa, bbb, ccc, ddd, X[ 3], 14, k7); - OP4(F, ddd, aaa, bbb, ccc, X[11], 14, k7); - OP4(F, ccc, ddd, aaa, bbb, X[15], 6, k7); - OP4(F, bbb, ccc, ddd, aaa, X[ 0], 14, k7); - OP4(F, aaa, bbb, ccc, ddd, X[ 5], 6, k7); - OP4(F, ddd, aaa, bbb, ccc, X[12], 9, k7); - OP4(F, ccc, ddd, aaa, bbb, X[ 2], 12, k7); - OP4(F, bbb, ccc, ddd, aaa, X[13], 9, k7); - OP4(F, aaa, bbb, ccc, ddd, X[ 9], 12, k7); - OP4(F, ddd, aaa, bbb, ccc, X[ 7], 5, k7); - OP4(F, ccc, ddd, aaa, bbb, X[10], 15, k7); - OP4(F, bbb, ccc, ddd, aaa, X[14], 8, k7); - - if (hashwidth == 256) { - uint64_t tmp = dd; dd = ddd; ddd = tmp; - } - } - - /* round 5 */ - if (hashwidth == 160) { - OP5(J, bb, cc, dd, ee, aa, X[ 4], 9, k8); - OP5(J, aa, bb, cc, dd, ee, X[ 0], 15, k8); - OP5(J, ee, aa, bb, cc, dd, X[ 5], 5, k8); - OP5(J, dd, ee, aa, bb, cc, X[ 9], 11, k8); - OP5(J, cc, dd, ee, aa, bb, X[ 7], 6, k8); - OP5(J, bb, cc, dd, ee, aa, X[12], 8, k8); - OP5(J, aa, bb, cc, dd, ee, X[ 2], 13, k8); - OP5(J, ee, aa, bb, cc, dd, X[10], 12, k8); - OP5(J, dd, ee, aa, bb, cc, X[14], 5, k8); - OP5(J, cc, dd, ee, aa, bb, X[ 1], 12, k8); - OP5(J, bb, cc, dd, ee, aa, X[ 3], 13, k8); - OP5(J, aa, bb, cc, dd, ee, X[ 8], 14, k8); - OP5(J, ee, aa, bb, cc, dd, X[11], 11, k8); - OP5(J, dd, ee, aa, bb, cc, X[ 6], 8, k8); - OP5(J, cc, dd, ee, aa, bb, X[15], 5, k8); - OP5(J, bb, cc, dd, ee, aa, X[13], 6, k8); - - OP5(F, bbb, ccc, ddd, eee, aaa, X[12], 8, k7); - OP5(F, aaa, bbb, ccc, ddd, eee, X[15], 5, k7); - OP5(F, eee, aaa, bbb, ccc, ddd, X[10], 12, k7); - OP5(F, ddd, eee, aaa, bbb, ccc, X[ 4], 9, k7); - OP5(F, ccc, ddd, eee, aaa, bbb, X[ 1], 12, k7); - OP5(F, bbb, ccc, ddd, eee, aaa, X[ 5], 5, k7); - OP5(F, aaa, bbb, ccc, ddd, eee, X[ 8], 14, k7); - OP5(F, eee, aaa, bbb, ccc, ddd, X[ 7], 6, k7); - OP5(F, ddd, eee, aaa, bbb, ccc, X[ 6], 8, k7); - OP5(F, ccc, ddd, eee, aaa, bbb, X[ 2], 13, k7); - OP5(F, bbb, ccc, ddd, eee, aaa, X[13], 6, k7); - OP5(F, aaa, bbb, ccc, ddd, eee, X[14], 5, k7); - OP5(F, eee, aaa, 
bbb, ccc, ddd, X[ 0], 15, k7); - OP5(F, ddd, eee, aaa, bbb, ccc, X[ 3], 13, k7); - OP5(F, ccc, ddd, eee, aaa, bbb, X[ 9], 11, k7); - OP5(F, bbb, ccc, ddd, eee, aaa, X[11], 11, k7); - } - - /* combine results */ - if (hashwidth == 128) { - ddd += cc + ctx->state[1]; /* final result for MDbuf[0] */ - ctx->state[1] = ctx->state[2] + dd + aaa; - ctx->state[2] = ctx->state[3] + aa + bbb; - ctx->state[3] = ctx->state[0] + bb + ccc; - ctx->state[0] = ddd; - } else if (hashwidth == 160) { - ddd += cc + ctx->state[1]; /* final result for MDbuf[0] */ - ctx->state[1] = ctx->state[2] + dd + eee; - ctx->state[2] = ctx->state[3] + ee + aaa; - ctx->state[3] = ctx->state[4] + aa + bbb; - ctx->state[4] = ctx->state[0] + bb + ccc; - ctx->state[0] = ddd; - } else if (hashwidth == 256) { - ctx->state[0] += aa; - ctx->state[1] += bb; - ctx->state[2] += cc; - ctx->state[3] += dd; - ctx->state[4] += aaa; - ctx->state[5] += bbb; - ctx->state[6] += ccc; - ctx->state[7] += ddd; - } - - return; +template +static void rmd_compress( rmd_ctx * ctx, const uint8_t * buf ) { + uint32_t aa, bb, cc, dd, ee, aaa, bbb, ccc, ddd, eee, X[16]; + int i; + const uint32_t k0 = 0; + const uint32_t k1 = 0x50a28be6; + const uint32_t k2 = 0x5a827999; + const uint32_t k3 = 0x5c4dd124; + const uint32_t k4 = 0x6ed9eba1; + const uint32_t k5 = 0x6d703ef3; + const uint32_t k6 = 0x8f1bbcdc; + const uint32_t k7 = 0; + const uint32_t k8 = 0xa953fd4e; + const uint32_t k9 = 0x7a6d76e9; + + /* load words X */ + for (i = 0; i < 16; i++) { + X[i] = GET_U32(buf, (4 * i)); + } + + /* load state */ + aa = aaa = ctx->state[0]; + bb = bbb = ctx->state[1]; + cc = ccc = ctx->state[2]; + dd = ddd = ctx->state[3]; + if (hashwidth == 160) { + ee = eee = ctx->state[4]; + } else if (hashwidth == 256) { + aaa = ctx->state[4]; + bbb = ctx->state[5]; + ccc = ctx->state[6]; + ddd = ctx->state[7]; + } + + /* round 1 */ + if (hashwidth == 160) { + OP5(F, aa , bb , cc , dd , ee , X[0] , 11, k0); + OP5(F, ee , aa , bb , cc , dd , X[1] , 14, 
k0); + OP5(F, dd , ee , aa , bb , cc , X[2] , 15, k0); + OP5(F, cc , dd , ee , aa , bb , X[3] , 12, k0); + OP5(F, bb , cc , dd , ee , aa , X[4] , 5, k0); + OP5(F, aa , bb , cc , dd , ee , X[5] , 8, k0); + OP5(F, ee , aa , bb , cc , dd , X[6] , 7, k0); + OP5(F, dd , ee , aa , bb , cc , X[7] , 9, k0); + OP5(F, cc , dd , ee , aa , bb , X[8] , 11, k0); + OP5(F, bb , cc , dd , ee , aa , X[9] , 13, k0); + OP5(F, aa , bb , cc , dd , ee , X[10], 14, k0); + OP5(F, ee , aa , bb , cc , dd , X[11], 15, k0); + OP5(F, dd , ee , aa , bb , cc , X[12], 6, k0); + OP5(F, cc , dd , ee , aa , bb , X[13], 7, k0); + OP5(F, bb , cc , dd , ee , aa , X[14], 9, k0); + OP5(F, aa , bb , cc , dd , ee , X[15], 8, k0); + + OP5(J, aaa, bbb, ccc, ddd, eee, X[5] , 8, k1); + OP5(J, eee, aaa, bbb, ccc, ddd, X[14], 9, k1); + OP5(J, ddd, eee, aaa, bbb, ccc, X[7] , 9, k1); + OP5(J, ccc, ddd, eee, aaa, bbb, X[0] , 11, k1); + OP5(J, bbb, ccc, ddd, eee, aaa, X[9] , 13, k1); + OP5(J, aaa, bbb, ccc, ddd, eee, X[2] , 15, k1); + OP5(J, eee, aaa, bbb, ccc, ddd, X[11], 15, k1); + OP5(J, ddd, eee, aaa, bbb, ccc, X[4] , 5, k1); + OP5(J, ccc, ddd, eee, aaa, bbb, X[13], 7, k1); + OP5(J, bbb, ccc, ddd, eee, aaa, X[6] , 7, k1); + OP5(J, aaa, bbb, ccc, ddd, eee, X[15], 8, k1); + OP5(J, eee, aaa, bbb, ccc, ddd, X[8] , 11, k1); + OP5(J, ddd, eee, aaa, bbb, ccc, X[1] , 14, k1); + OP5(J, ccc, ddd, eee, aaa, bbb, X[10], 14, k1); + OP5(J, bbb, ccc, ddd, eee, aaa, X[3] , 12, k1); + OP5(J, aaa, bbb, ccc, ddd, eee, X[12], 6, k1); + } else { + OP4(F, aa , bb , cc , dd , X[0] , 11, k0); + OP4(F, dd , aa , bb , cc , X[1] , 14, k0); + OP4(F, cc , dd , aa , bb , X[2] , 15, k0); + OP4(F, bb , cc , dd , aa , X[3] , 12, k0); + OP4(F, aa , bb , cc , dd , X[4] , 5, k0); + OP4(F, dd , aa , bb , cc , X[5] , 8, k0); + OP4(F, cc , dd , aa , bb , X[6] , 7, k0); + OP4(F, bb , cc , dd , aa , X[7] , 9, k0); + OP4(F, aa , bb , cc , dd , X[8] , 11, k0); + OP4(F, dd , aa , bb , cc , X[9] , 13, k0); + OP4(F, cc , dd , aa , bb , X[10], 14, k0); + 
OP4(F, bb , cc , dd , aa , X[11], 15, k0); + OP4(F, aa , bb , cc , dd , X[12], 6, k0); + OP4(F, dd , aa , bb , cc , X[13], 7, k0); + OP4(F, cc , dd , aa , bb , X[14], 9, k0); + OP4(F, bb , cc , dd , aa , X[15], 8, k0); + + OP4(I, aaa, bbb, ccc, ddd, X[5] , 8, k1); + OP4(I, ddd, aaa, bbb, ccc, X[14], 9, k1); + OP4(I, ccc, ddd, aaa, bbb, X[7] , 9, k1); + OP4(I, bbb, ccc, ddd, aaa, X[0] , 11, k1); + OP4(I, aaa, bbb, ccc, ddd, X[9] , 13, k1); + OP4(I, ddd, aaa, bbb, ccc, X[2] , 15, k1); + OP4(I, ccc, ddd, aaa, bbb, X[11], 15, k1); + OP4(I, bbb, ccc, ddd, aaa, X[4] , 5, k1); + OP4(I, aaa, bbb, ccc, ddd, X[13], 7, k1); + OP4(I, ddd, aaa, bbb, ccc, X[6] , 7, k1); + OP4(I, ccc, ddd, aaa, bbb, X[15], 8, k1); + OP4(I, bbb, ccc, ddd, aaa, X[8] , 11, k1); + OP4(I, aaa, bbb, ccc, ddd, X[1] , 14, k1); + OP4(I, ddd, aaa, bbb, ccc, X[10], 14, k1); + OP4(I, ccc, ddd, aaa, bbb, X[3] , 12, k1); + OP4(I, bbb, ccc, ddd, aaa, X[12], 6, k1); + + if (hashwidth == 256) { + uint64_t tmp = aa; aa = aaa; aaa = tmp; + } + } + + /* round 2 */ + if (hashwidth == 160) { + OP5(G, ee , aa , bb , cc , dd , X[7] , 7, k2); + OP5(G, dd , ee , aa , bb , cc , X[4] , 6, k2); + OP5(G, cc , dd , ee , aa , bb , X[13], 8, k2); + OP5(G, bb , cc , dd , ee , aa , X[1] , 13, k2); + OP5(G, aa , bb , cc , dd , ee , X[10], 11, k2); + OP5(G, ee , aa , bb , cc , dd , X[6] , 9, k2); + OP5(G, dd , ee , aa , bb , cc , X[15], 7, k2); + OP5(G, cc , dd , ee , aa , bb , X[3] , 15, k2); + OP5(G, bb , cc , dd , ee , aa , X[12], 7, k2); + OP5(G, aa , bb , cc , dd , ee , X[0] , 12, k2); + OP5(G, ee , aa , bb , cc , dd , X[9] , 15, k2); + OP5(G, dd , ee , aa , bb , cc , X[5] , 9, k2); + OP5(G, cc , dd , ee , aa , bb , X[2] , 11, k2); + OP5(G, bb , cc , dd , ee , aa , X[14], 7, k2); + OP5(G, aa , bb , cc , dd , ee , X[11], 13, k2); + OP5(G, ee , aa , bb , cc , dd , X[8] , 12, k2); + + OP5(I, eee, aaa, bbb, ccc, ddd, X[6] , 9, k3); + OP5(I, ddd, eee, aaa, bbb, ccc, X[11], 13, k3); + OP5(I, ccc, ddd, eee, aaa, bbb, X[3] , 15, k3); + 
OP5(I, bbb, ccc, ddd, eee, aaa, X[7] , 7, k3); + OP5(I, aaa, bbb, ccc, ddd, eee, X[0] , 12, k3); + OP5(I, eee, aaa, bbb, ccc, ddd, X[13], 8, k3); + OP5(I, ddd, eee, aaa, bbb, ccc, X[5] , 9, k3); + OP5(I, ccc, ddd, eee, aaa, bbb, X[10], 11, k3); + OP5(I, bbb, ccc, ddd, eee, aaa, X[14], 7, k3); + OP5(I, aaa, bbb, ccc, ddd, eee, X[15], 7, k3); + OP5(I, eee, aaa, bbb, ccc, ddd, X[8] , 12, k3); + OP5(I, ddd, eee, aaa, bbb, ccc, X[12], 7, k3); + OP5(I, ccc, ddd, eee, aaa, bbb, X[4] , 6, k3); + OP5(I, bbb, ccc, ddd, eee, aaa, X[9] , 15, k3); + OP5(I, aaa, bbb, ccc, ddd, eee, X[1] , 13, k3); + OP5(I, eee, aaa, bbb, ccc, ddd, X[2] , 11, k3); + } else { + OP4(G, aa , bb , cc , dd , X[7] , 7, k2); + OP4(G, dd , aa , bb , cc , X[4] , 6, k2); + OP4(G, cc , dd , aa , bb , X[13], 8, k2); + OP4(G, bb , cc , dd , aa , X[1] , 13, k2); + OP4(G, aa , bb , cc , dd , X[10], 11, k2); + OP4(G, dd , aa , bb , cc , X[6] , 9, k2); + OP4(G, cc , dd , aa , bb , X[15], 7, k2); + OP4(G, bb , cc , dd , aa , X[3] , 15, k2); + OP4(G, aa , bb , cc , dd , X[12], 7, k2); + OP4(G, dd , aa , bb , cc , X[0] , 12, k2); + OP4(G, cc , dd , aa , bb , X[9] , 15, k2); + OP4(G, bb , cc , dd , aa , X[5] , 9, k2); + OP4(G, aa , bb , cc , dd , X[2] , 11, k2); + OP4(G, dd , aa , bb , cc , X[14], 7, k2); + OP4(G, cc , dd , aa , bb , X[11], 13, k2); + OP4(G, bb , cc , dd , aa , X[8] , 12, k2); + + OP4(H, aaa, bbb, ccc, ddd, X[6] , 9, k3); + OP4(H, ddd, aaa, bbb, ccc, X[11], 13, k3); + OP4(H, ccc, ddd, aaa, bbb, X[3] , 15, k3); + OP4(H, bbb, ccc, ddd, aaa, X[7] , 7, k3); + OP4(H, aaa, bbb, ccc, ddd, X[0] , 12, k3); + OP4(H, ddd, aaa, bbb, ccc, X[13], 8, k3); + OP4(H, ccc, ddd, aaa, bbb, X[5] , 9, k3); + OP4(H, bbb, ccc, ddd, aaa, X[10], 11, k3); + OP4(H, aaa, bbb, ccc, ddd, X[14], 7, k3); + OP4(H, ddd, aaa, bbb, ccc, X[15], 7, k3); + OP4(H, ccc, ddd, aaa, bbb, X[8] , 12, k3); + OP4(H, bbb, ccc, ddd, aaa, X[12], 7, k3); + OP4(H, aaa, bbb, ccc, ddd, X[4] , 6, k3); + OP4(H, ddd, aaa, bbb, ccc, X[9] , 15, k3); + OP4(H, 
ccc, ddd, aaa, bbb, X[1] , 13, k3); + OP4(H, bbb, ccc, ddd, aaa, X[2] , 11, k3); + + if (hashwidth == 256) { + uint64_t tmp = bb; bb = bbb; bbb = tmp; + } + } + + /* round 3 */ + if (hashwidth == 160) { + OP5(H, dd , ee , aa , bb , cc , X[3] , 11, k4); + OP5(H, cc , dd , ee , aa , bb , X[10], 13, k4); + OP5(H, bb , cc , dd , ee , aa , X[14], 6, k4); + OP5(H, aa , bb , cc , dd , ee , X[4] , 7, k4); + OP5(H, ee , aa , bb , cc , dd , X[9] , 14, k4); + OP5(H, dd , ee , aa , bb , cc , X[15], 9, k4); + OP5(H, cc , dd , ee , aa , bb , X[8] , 13, k4); + OP5(H, bb , cc , dd , ee , aa , X[1] , 15, k4); + OP5(H, aa , bb , cc , dd , ee , X[2] , 14, k4); + OP5(H, ee , aa , bb , cc , dd , X[7] , 8, k4); + OP5(H, dd , ee , aa , bb , cc , X[0] , 13, k4); + OP5(H, cc , dd , ee , aa , bb , X[6] , 6, k4); + OP5(H, bb , cc , dd , ee , aa , X[13], 5, k4); + OP5(H, aa , bb , cc , dd , ee , X[11], 12, k4); + OP5(H, ee , aa , bb , cc , dd , X[5] , 7, k4); + OP5(H, dd , ee , aa , bb , cc , X[12], 5, k4); + + OP5(H, ddd, eee, aaa, bbb, ccc, X[15], 9, k5); + OP5(H, ccc, ddd, eee, aaa, bbb, X[5] , 7, k5); + OP5(H, bbb, ccc, ddd, eee, aaa, X[1] , 15, k5); + OP5(H, aaa, bbb, ccc, ddd, eee, X[3] , 11, k5); + OP5(H, eee, aaa, bbb, ccc, ddd, X[7] , 8, k5); + OP5(H, ddd, eee, aaa, bbb, ccc, X[14], 6, k5); + OP5(H, ccc, ddd, eee, aaa, bbb, X[6] , 6, k5); + OP5(H, bbb, ccc, ddd, eee, aaa, X[9] , 14, k5); + OP5(H, aaa, bbb, ccc, ddd, eee, X[11], 12, k5); + OP5(H, eee, aaa, bbb, ccc, ddd, X[8] , 13, k5); + OP5(H, ddd, eee, aaa, bbb, ccc, X[12], 5, k5); + OP5(H, ccc, ddd, eee, aaa, bbb, X[2] , 14, k5); + OP5(H, bbb, ccc, ddd, eee, aaa, X[10], 13, k5); + OP5(H, aaa, bbb, ccc, ddd, eee, X[0] , 13, k5); + OP5(H, eee, aaa, bbb, ccc, ddd, X[4] , 7, k5); + OP5(H, ddd, eee, aaa, bbb, ccc, X[13], 5, k5); + } else { + OP4(H, aa , bb , cc , dd , X[3] , 11, k4); + OP4(H, dd , aa , bb , cc , X[10], 13, k4); + OP4(H, cc , dd , aa , bb , X[14], 6, k4); + OP4(H, bb , cc , dd , aa , X[4] , 7, k4); + OP4(H, aa , bb , cc 
, dd , X[9] , 14, k4); + OP4(H, dd , aa , bb , cc , X[15], 9, k4); + OP4(H, cc , dd , aa , bb , X[8] , 13, k4); + OP4(H, bb , cc , dd , aa , X[1] , 15, k4); + OP4(H, aa , bb , cc , dd , X[2] , 14, k4); + OP4(H, dd , aa , bb , cc , X[7] , 8, k4); + OP4(H, cc , dd , aa , bb , X[0] , 13, k4); + OP4(H, bb , cc , dd , aa , X[6] , 6, k4); + OP4(H, aa , bb , cc , dd , X[13], 5, k4); + OP4(H, dd , aa , bb , cc , X[11], 12, k4); + OP4(H, cc , dd , aa , bb , X[5] , 7, k4); + OP4(H, bb , cc , dd , aa , X[12], 5, k4); + + OP4(G, aaa, bbb, ccc, ddd, X[15], 9, k5); + OP4(G, ddd, aaa, bbb, ccc, X[5] , 7, k5); + OP4(G, ccc, ddd, aaa, bbb, X[1] , 15, k5); + OP4(G, bbb, ccc, ddd, aaa, X[3] , 11, k5); + OP4(G, aaa, bbb, ccc, ddd, X[7] , 8, k5); + OP4(G, ddd, aaa, bbb, ccc, X[14], 6, k5); + OP4(G, ccc, ddd, aaa, bbb, X[6] , 6, k5); + OP4(G, bbb, ccc, ddd, aaa, X[9] , 14, k5); + OP4(G, aaa, bbb, ccc, ddd, X[11], 12, k5); + OP4(G, ddd, aaa, bbb, ccc, X[8] , 13, k5); + OP4(G, ccc, ddd, aaa, bbb, X[12], 5, k5); + OP4(G, bbb, ccc, ddd, aaa, X[2] , 14, k5); + OP4(G, aaa, bbb, ccc, ddd, X[10], 13, k5); + OP4(G, ddd, aaa, bbb, ccc, X[0] , 13, k5); + OP4(G, ccc, ddd, aaa, bbb, X[4] , 7, k5); + OP4(G, bbb, ccc, ddd, aaa, X[13], 5, k5); + + if (hashwidth == 256) { + uint64_t tmp = cc; cc = ccc; ccc = tmp; + } + } + + /* round 4 */ + if (hashwidth == 160) { + OP5(I, cc , dd , ee , aa , bb , X[1] , 11, k6); + OP5(I, bb , cc , dd , ee , aa , X[9] , 12, k6); + OP5(I, aa , bb , cc , dd , ee , X[11], 14, k6); + OP5(I, ee , aa , bb , cc , dd , X[10], 15, k6); + OP5(I, dd , ee , aa , bb , cc , X[0] , 14, k6); + OP5(I, cc , dd , ee , aa , bb , X[8] , 15, k6); + OP5(I, bb , cc , dd , ee , aa , X[12], 9, k6); + OP5(I, aa , bb , cc , dd , ee , X[4] , 8, k6); + OP5(I, ee , aa , bb , cc , dd , X[13], 9, k6); + OP5(I, dd , ee , aa , bb , cc , X[3] , 14, k6); + OP5(I, cc , dd , ee , aa , bb , X[7] , 5, k6); + OP5(I, bb , cc , dd , ee , aa , X[15], 6, k6); + OP5(I, aa , bb , cc , dd , ee , X[14], 8, k6); + 
OP5(I, ee , aa , bb , cc , dd , X[5] , 6, k6); + OP5(I, dd , ee , aa , bb , cc , X[6] , 5, k6); + OP5(I, cc , dd , ee , aa , bb , X[2] , 12, k6); + + OP5(G, ccc, ddd, eee, aaa, bbb, X[8] , 15, k9); + OP5(G, bbb, ccc, ddd, eee, aaa, X[6] , 5, k9); + OP5(G, aaa, bbb, ccc, ddd, eee, X[4] , 8, k9); + OP5(G, eee, aaa, bbb, ccc, ddd, X[1] , 11, k9); + OP5(G, ddd, eee, aaa, bbb, ccc, X[3] , 14, k9); + OP5(G, ccc, ddd, eee, aaa, bbb, X[11], 14, k9); + OP5(G, bbb, ccc, ddd, eee, aaa, X[15], 6, k9); + OP5(G, aaa, bbb, ccc, ddd, eee, X[0] , 14, k9); + OP5(G, eee, aaa, bbb, ccc, ddd, X[5] , 6, k9); + OP5(G, ddd, eee, aaa, bbb, ccc, X[12], 9, k9); + OP5(G, ccc, ddd, eee, aaa, bbb, X[2] , 12, k9); + OP5(G, bbb, ccc, ddd, eee, aaa, X[13], 9, k9); + OP5(G, aaa, bbb, ccc, ddd, eee, X[9] , 12, k9); + OP5(G, eee, aaa, bbb, ccc, ddd, X[7] , 5, k9); + OP5(G, ddd, eee, aaa, bbb, ccc, X[10], 15, k9); + OP5(G, ccc, ddd, eee, aaa, bbb, X[14], 8, k9); + } else { + OP4(I, aa , bb , cc , dd , X[1] , 11, k6); + OP4(I, dd , aa , bb , cc , X[9] , 12, k6); + OP4(I, cc , dd , aa , bb , X[11], 14, k6); + OP4(I, bb , cc , dd , aa , X[10], 15, k6); + OP4(I, aa , bb , cc , dd , X[0] , 14, k6); + OP4(I, dd , aa , bb , cc , X[8] , 15, k6); + OP4(I, cc , dd , aa , bb , X[12], 9, k6); + OP4(I, bb , cc , dd , aa , X[4] , 8, k6); + OP4(I, aa , bb , cc , dd , X[13], 9, k6); + OP4(I, dd , aa , bb , cc , X[3] , 14, k6); + OP4(I, cc , dd , aa , bb , X[7] , 5, k6); + OP4(I, bb , cc , dd , aa , X[15], 6, k6); + OP4(I, aa , bb , cc , dd , X[14], 8, k6); + OP4(I, dd , aa , bb , cc , X[5] , 6, k6); + OP4(I, cc , dd , aa , bb , X[6] , 5, k6); + OP4(I, bb , cc , dd , aa , X[2] , 12, k6); + + OP4(F, aaa, bbb, ccc, ddd, X[8] , 15, k7); + OP4(F, ddd, aaa, bbb, ccc, X[6] , 5, k7); + OP4(F, ccc, ddd, aaa, bbb, X[4] , 8, k7); + OP4(F, bbb, ccc, ddd, aaa, X[1] , 11, k7); + OP4(F, aaa, bbb, ccc, ddd, X[3] , 14, k7); + OP4(F, ddd, aaa, bbb, ccc, X[11], 14, k7); + OP4(F, ccc, ddd, aaa, bbb, X[15], 6, k7); + OP4(F, bbb, ccc, 
ddd, aaa, X[0] , 14, k7); + OP4(F, aaa, bbb, ccc, ddd, X[5] , 6, k7); + OP4(F, ddd, aaa, bbb, ccc, X[12], 9, k7); + OP4(F, ccc, ddd, aaa, bbb, X[2] , 12, k7); + OP4(F, bbb, ccc, ddd, aaa, X[13], 9, k7); + OP4(F, aaa, bbb, ccc, ddd, X[9] , 12, k7); + OP4(F, ddd, aaa, bbb, ccc, X[7] , 5, k7); + OP4(F, ccc, ddd, aaa, bbb, X[10], 15, k7); + OP4(F, bbb, ccc, ddd, aaa, X[14], 8, k7); + + if (hashwidth == 256) { + uint64_t tmp = dd; dd = ddd; ddd = tmp; + } + } + + /* round 5 */ + if (hashwidth == 160) { + OP5(J, bb , cc , dd , ee , aa , X[4] , 9, k8); + OP5(J, aa , bb , cc , dd , ee , X[0] , 15, k8); + OP5(J, ee , aa , bb , cc , dd , X[5] , 5, k8); + OP5(J, dd , ee , aa , bb , cc , X[9] , 11, k8); + OP5(J, cc , dd , ee , aa , bb , X[7] , 6, k8); + OP5(J, bb , cc , dd , ee , aa , X[12], 8, k8); + OP5(J, aa , bb , cc , dd , ee , X[2] , 13, k8); + OP5(J, ee , aa , bb , cc , dd , X[10], 12, k8); + OP5(J, dd , ee , aa , bb , cc , X[14], 5, k8); + OP5(J, cc , dd , ee , aa , bb , X[1] , 12, k8); + OP5(J, bb , cc , dd , ee , aa , X[3] , 13, k8); + OP5(J, aa , bb , cc , dd , ee , X[8] , 14, k8); + OP5(J, ee , aa , bb , cc , dd , X[11], 11, k8); + OP5(J, dd , ee , aa , bb , cc , X[6] , 8, k8); + OP5(J, cc , dd , ee , aa , bb , X[15], 5, k8); + OP5(J, bb , cc , dd , ee , aa , X[13], 6, k8); + + OP5(F, bbb, ccc, ddd, eee, aaa, X[12], 8, k7); + OP5(F, aaa, bbb, ccc, ddd, eee, X[15], 5, k7); + OP5(F, eee, aaa, bbb, ccc, ddd, X[10], 12, k7); + OP5(F, ddd, eee, aaa, bbb, ccc, X[4] , 9, k7); + OP5(F, ccc, ddd, eee, aaa, bbb, X[1] , 12, k7); + OP5(F, bbb, ccc, ddd, eee, aaa, X[5] , 5, k7); + OP5(F, aaa, bbb, ccc, ddd, eee, X[8] , 14, k7); + OP5(F, eee, aaa, bbb, ccc, ddd, X[7] , 6, k7); + OP5(F, ddd, eee, aaa, bbb, ccc, X[6] , 8, k7); + OP5(F, ccc, ddd, eee, aaa, bbb, X[2] , 13, k7); + OP5(F, bbb, ccc, ddd, eee, aaa, X[13], 6, k7); + OP5(F, aaa, bbb, ccc, ddd, eee, X[14], 5, k7); + OP5(F, eee, aaa, bbb, ccc, ddd, X[0] , 15, k7); + OP5(F, ddd, eee, aaa, bbb, ccc, X[3] , 13, k7); + OP5(F, 
ccc, ddd, eee, aaa, bbb, X[9] , 11, k7); + OP5(F, bbb, ccc, ddd, eee, aaa, X[11], 11, k7); + } + + /* combine results */ + if (hashwidth == 128) { + ddd += cc + ctx->state[1]; /* final result for MDbuf[0] */ + ctx->state[1] = ctx->state[2] + dd + aaa; + ctx->state[2] = ctx->state[3] + aa + bbb; + ctx->state[3] = ctx->state[0] + bb + ccc; + ctx->state[0] = ddd; + } else if (hashwidth == 160) { + ddd += cc + ctx->state[1]; /* final result for MDbuf[0] */ + ctx->state[1] = ctx->state[2] + dd + eee; + ctx->state[2] = ctx->state[3] + ee + aaa; + ctx->state[3] = ctx->state[4] + aa + bbb; + ctx->state[4] = ctx->state[0] + bb + ccc; + ctx->state[0] = ddd; + } else if (hashwidth == 256) { + ctx->state[0] += aa; + ctx->state[1] += bb; + ctx->state[2] += cc; + ctx->state[3] += dd; + ctx->state[4] += aaa; + ctx->state[5] += bbb; + ctx->state[6] += ccc; + ctx->state[7] += ddd; + } + + return; } -template < uint32_t hashwidth > -static void rmd_init(rmd_ctx * ctx) { - ctx->state[0] = 0x67452301; - ctx->state[1] = 0xefcdab89; - ctx->state[2] = 0x98badcfe; - ctx->state[3] = 0x10325476; - if (hashwidth >= 160) { - ctx->state[4] = 0xc3d2e1f0; - } - if (hashwidth == 256) { - ctx->state[4] = 0x76543210; - ctx->state[5] = 0xfedcba98; - ctx->state[6] = 0x89abcdef; - ctx->state[7] = 0x01234567; - } - ctx->curlen = 0; - ctx->length = 0; - return; +template +static void rmd_init( rmd_ctx * ctx ) { + ctx->state[0] = 0x67452301; + ctx->state[1] = 0xefcdab89; + ctx->state[2] = 0x98badcfe; + ctx->state[3] = 0x10325476; + if (hashwidth >= 160) { + ctx->state[4] = 0xc3d2e1f0; + } + if (hashwidth == 256) { + ctx->state[4] = 0x76543210; + ctx->state[5] = 0xfedcba98; + ctx->state[6] = 0x89abcdef; + ctx->state[7] = 0x01234567; + } + ctx->curlen = 0; + ctx->length = 0; + return; } -template < uint32_t hashwidth, bool bswap > -static void rmd_done(rmd_ctx * ctx, uint8_t * out) { +template +static void rmd_done( rmd_ctx * ctx, uint8_t * out ) { int i; /* increase the length of the message */ @@ -502,7 
+490,7 @@ static void rmd_done(rmd_ctx * ctx, uint8_t * out) { while (ctx->curlen < 64) { ctx->buf[ctx->curlen++] = (unsigned char)0; } - rmd_compress(ctx, ctx->buf); + rmd_compress(ctx, ctx->buf); ctx->curlen = 0; } @@ -513,249 +501,291 @@ static void rmd_done(rmd_ctx * ctx, uint8_t * out) { /* store length */ if (isBE()) { - PUT_U64(ctx->length, ctx->buf+56, 0); + PUT_U64(ctx->length, ctx->buf + 56, 0); } else { - PUT_U64(ctx->length, ctx->buf+56, 0); + PUT_U64(ctx->length, ctx->buf + 56, 0); } - rmd_compress(ctx, ctx->buf); + rmd_compress(ctx, ctx->buf); /* copy output */ - for (i = 0; i < (hashwidth/32); i++) { - PUT_U32(ctx->state[i], (uint8_t*)out, 4*i); + for (i = 0; i < (hashwidth / 32); i++) { + PUT_U32(ctx->state[i], (uint8_t *)out, 4 * i); } } -template < uint32_t hashwidth, bool bswap > -static void rmd_update(rmd_ctx * ctx, const uint8_t * data, size_t len) { - while (len > 0) { - if ((ctx->length == 0) && (len >= sizeof(ctx->buf))) { - rmd_compress(ctx, data); - ctx->length += 64*8; - len -= 64; - data += 64; - } else { - size_t n = 64 - ctx->curlen; - if (n > len) { n = len; } - memcpy(&ctx->buf[ctx->curlen], data, n); - ctx->curlen += n; - len -= n; - data += n; - if (ctx->curlen == sizeof(ctx->buf)) { - rmd_compress(ctx, ctx->buf); - ctx->curlen = 0; - ctx->length += 64*8; - } +template +static void rmd_update( rmd_ctx * ctx, const uint8_t * data, size_t len ) { + while (len > 0) { + if ((ctx->length == 0) && (len >= sizeof(ctx->buf))) { + rmd_compress(ctx, data); + ctx->length += 64 * 8; + len -= 64; + data += 64; + } else { + size_t n = 64 - ctx->curlen; + if (n > len) { n = len; } + memcpy(&ctx->buf[ctx->curlen], data, n); + ctx->curlen += n; + len -= n; + data += n; + if (ctx->curlen == sizeof(ctx->buf)) { + rmd_compress(ctx, ctx->buf); + ctx->curlen = 0; + ctx->length += 64 * 8; + } + } } - } } /* Homegrown RMD seeding */ -static void rmd_seed(rmd_ctx * ctx, uint64_t seed) { - const uint32_t seedlo = seed & 0xFFFFFFFF; - const uint32_t seedhi 
= (seed >> 32) & 0xFFFFFFFF; - - ctx->state[0] ^= seedlo; - ctx->state[1] ^= seedlo + seedhi; - ctx->state[2] ^= seedhi; - ctx->state[3] ^= seedlo + seedhi; +static void rmd_seed( rmd_ctx * ctx, uint64_t seed ) { + const uint32_t seedlo = seed & 0xFFFFFFFF; + const uint32_t seedhi = (seed >> 32) & 0xFFFFFFFF; + + ctx->state[0] ^= seedlo; + ctx->state[1] ^= seedlo + seedhi; + ctx->state[2] ^= seedhi; + ctx->state[3] ^= seedlo + seedhi; } -template < bool bswap > -static void rmd128(const void * in, const size_t len, const seed_t seed, void * out) { - rmd_ctx ctx; +template +static void rmd128( const void * in, const size_t len, const seed_t seed, void * out ) { + rmd_ctx ctx; - rmd_init<128> (&ctx); - rmd_seed (&ctx, (uint64_t)seed); - rmd_update<128, bswap> (&ctx, (const uint8_t *)in, len); - rmd_done<128, bswap> (&ctx, (uint8_t*)out); + rmd_init<128>(&ctx); + rmd_seed(&ctx, (uint64_t)seed); + rmd_update<128, bswap>(&ctx, (const uint8_t *)in, len); + rmd_done<128, bswap>(&ctx, (uint8_t *)out); } -template < bool bswap > -static void rmd160(const void * in, const size_t len, const seed_t seed, void * out) { - rmd_ctx ctx; +template +static void rmd160( const void * in, const size_t len, const seed_t seed, void * out ) { + rmd_ctx ctx; - rmd_init<160> (&ctx); - rmd_seed (&ctx, (uint64_t)seed); - rmd_update<160, bswap> (&ctx, (const uint8_t *)in, len); - rmd_done<160, bswap> (&ctx, (uint8_t*)out); + rmd_init<160>(&ctx); + rmd_seed(&ctx, (uint64_t)seed); + rmd_update<160, bswap>(&ctx, (const uint8_t *)in, len); + rmd_done<160, bswap>(&ctx, (uint8_t *)out); } -template < bool bswap > -static void rmd256(const void * in, const size_t len, const seed_t seed, void * out) { - rmd_ctx ctx; +template +static void rmd256( const void * in, const size_t len, const seed_t seed, void * out ) { + rmd_ctx ctx; - rmd_init<256> (&ctx); - rmd_seed (&ctx, (uint64_t)seed); - rmd_update<256, bswap> (&ctx, (const uint8_t *)in, len); - rmd_done<256, bswap> (&ctx, (uint8_t*)out); + 
rmd_init<256>(&ctx); + rmd_seed(&ctx, (uint64_t)seed); + rmd_update<256, bswap>(&ctx, (const uint8_t *)in, len); + rmd_done<256, bswap>(&ctx, (uint8_t *)out); } -static bool rmd_test(void) { - static const struct { - const char *msg; - unsigned char hash128[16]; - unsigned char hash160[20]; - unsigned char hash256[32]; - } tests[] = { - { "", - { 0xcd, 0xf2, 0x62, 0x13, 0xa1, 0x50, 0xdc, 0x3e, - 0xcb, 0x61, 0x0f, 0x18, 0xf6, 0xb3, 0x8b, 0x46 }, - { 0x9c, 0x11, 0x85, 0xa5, 0xc5, 0xe9, 0xfc, 0x54, 0x61, 0x28, - 0x08, 0x97, 0x7e, 0xe8, 0xf5, 0x48, 0xb2, 0x25, 0x8d, 0x31 }, - { 0x02, 0xba, 0x4c, 0x4e, 0x5f, 0x8e, 0xcd, 0x18, - 0x77, 0xfc, 0x52, 0xd6, 0x4d, 0x30, 0xe3, 0x7a, - 0x2d, 0x97, 0x74, 0xfb, 0x1e, 0x5d, 0x02, 0x63, - 0x80, 0xae, 0x01, 0x68, 0xe3, 0xc5, 0x52, 0x2d } - }, - { "a", - { 0x86, 0xbe, 0x7a, 0xfa, 0x33, 0x9d, 0x0f, 0xc7, - 0xcf, 0xc7, 0x85, 0xe7, 0x2f, 0x57, 0x8d, 0x33 }, - { 0x0b, 0xdc, 0x9d, 0x2d, 0x25, 0x6b, 0x3e, 0xe9, 0xda, 0xae, - 0x34, 0x7b, 0xe6, 0xf4, 0xdc, 0x83, 0x5a, 0x46, 0x7f, 0xfe }, - { 0xf9, 0x33, 0x3e, 0x45, 0xd8, 0x57, 0xf5, 0xd9, - 0x0a, 0x91, 0xba, 0xb7, 0x0a, 0x1e, 0xba, 0x0c, - 0xfb, 0x1b, 0xe4, 0xb0, 0x78, 0x3c, 0x9a, 0xcf, - 0xcd, 0x88, 0x3a, 0x91, 0x34, 0x69, 0x29, 0x25 } - }, - { "abc", - { 0xc1, 0x4a, 0x12, 0x19, 0x9c, 0x66, 0xe4, 0xba, - 0x84, 0x63, 0x6b, 0x0f, 0x69, 0x14, 0x4c, 0x77 }, - { 0x8e, 0xb2, 0x08, 0xf7, 0xe0, 0x5d, 0x98, 0x7a, 0x9b, 0x04, - 0x4a, 0x8e, 0x98, 0xc6, 0xb0, 0x87, 0xf1, 0x5a, 0x0b, 0xfc }, - { 0xaf, 0xbd, 0x6e, 0x22, 0x8b, 0x9d, 0x8c, 0xbb, - 0xce, 0xf5, 0xca, 0x2d, 0x03, 0xe6, 0xdb, 0xa1, - 0x0a, 0xc0, 0xbc, 0x7d, 0xcb, 0xe4, 0x68, 0x0e, - 0x1e, 0x42, 0xd2, 0xe9, 0x75, 0x45, 0x9b, 0x65 } - }, - { "message digest", - { 0x9e, 0x32, 0x7b, 0x3d, 0x6e, 0x52, 0x30, 0x62, - 0xaf, 0xc1, 0x13, 0x2d, 0x7d, 0xf9, 0xd1, 0xb8 }, - { 0x5d, 0x06, 0x89, 0xef, 0x49, 0xd2, 0xfa, 0xe5, 0x72, 0xb8, - 0x81, 0xb1, 0x23, 0xa8, 0x5f, 0xfa, 0x21, 0x59, 0x5f, 0x36 }, - { 0x87, 0xe9, 0x71, 0x75, 0x9a, 0x1c, 0xe4, 0x7a, - 0x51, 
0x4d, 0x5c, 0x91, 0x4c, 0x39, 0x2c, 0x90, - 0x18, 0xc7, 0xc4, 0x6b, 0xc1, 0x44, 0x65, 0x55, - 0x4a, 0xfc, 0xdf, 0x54, 0xa5, 0x07, 0x0c, 0x0e } - }, - { "abcdefghijklmnopqrstuvwxyz", - { 0xfd, 0x2a, 0xa6, 0x07, 0xf7, 0x1d, 0xc8, 0xf5, - 0x10, 0x71, 0x49, 0x22, 0xb3, 0x71, 0x83, 0x4e }, - { 0xf7, 0x1c, 0x27, 0x10, 0x9c, 0x69, 0x2c, 0x1b, 0x56, 0xbb, - 0xdc, 0xeb, 0x5b, 0x9d, 0x28, 0x65, 0xb3, 0x70, 0x8d, 0xbc }, - { 0x64, 0x9d, 0x30, 0x34, 0x75, 0x1e, 0xa2, 0x16, - 0x77, 0x6b, 0xf9, 0xa1, 0x8a, 0xcc, 0x81, 0xbc, - 0x78, 0x96, 0x11, 0x8a, 0x51, 0x97, 0x96, 0x87, - 0x82, 0xdd, 0x1f, 0xd9, 0x7d, 0x8d, 0x51, 0x33 } - }, - { "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", - { 0xd1, 0xe9, 0x59, 0xeb, 0x17, 0x9c, 0x91, 0x1f, - 0xae, 0xa4, 0x62, 0x4c, 0x60, 0xc5, 0xc7, 0x02 }, - { 0xb0, 0xe2, 0x0b, 0x6e, 0x31, 0x16, 0x64, 0x02, 0x86, 0xed, - 0x3a, 0x87, 0xa5, 0x71, 0x30, 0x79, 0xb2, 0x1f, 0x51, 0x89 }, - { 0x57, 0x40, 0xa4, 0x08, 0xac, 0x16, 0xb7, 0x20, - 0xb8, 0x44, 0x24, 0xae, 0x93, 0x1c, 0xbb, 0x1f, - 0xe3, 0x63, 0xd1, 0xd0, 0xbf, 0x40, 0x17, 0xf1, - 0xa8, 0x9f, 0x7e, 0xa6, 0xde, 0x77, 0xa0, 0xb8 } - } - }; - - int i; - unsigned char tmp[32]; - bool result = true; - - for (i = 0; i < (int)(sizeof(tests)/sizeof(tests[0])); i++) { - if (isLE()) { - rmd128(tests[i].msg, strlen(tests[i].msg), 0, tmp); - } else { - rmd128(tests[i].msg, strlen(tests[i].msg), 0, tmp); - } - if (memcmp(tmp, tests[i].hash128, 16) != 0) { - //printf("128 failure test %d\n", i); - result = false; - } - if (isLE()) { - rmd160(tests[i].msg, strlen(tests[i].msg), 0, tmp); - } else { - rmd160(tests[i].msg, strlen(tests[i].msg), 0, tmp); - } - if (memcmp(tmp, tests[i].hash160, 20) != 0) { - //printf("160 failure test %d\n", i); - result = false; - } - if (isLE()) { - rmd256(tests[i].msg, strlen(tests[i].msg), 0, tmp); - } else { - rmd256(tests[i].msg, strlen(tests[i].msg), 0, tmp); - } - if (memcmp(tmp, tests[i].hash256, 32) != 0) { - //printf("256 failure test %d\n", i); - result = 
false; - } - } - return result; +static bool rmd_test( void ) { + static conststruct { + const char * msg; + unsigned char hash128[16]; + unsigned char hash160[20]; + unsigned char hash256[32]; + } tests[] = { + { + "", + { + 0xcd, 0xf2, 0x62, 0x13, 0xa1, 0x50, 0xdc, 0x3e, + 0xcb, 0x61, 0x0f, 0x18, 0xf6, 0xb3, 0x8b, 0x46 + }, + { + 0x9c, 0x11, 0x85, 0xa5, 0xc5, 0xe9, 0xfc, 0x54, 0x61, 0x28, + 0x08, 0x97, 0x7e, 0xe8, 0xf5, 0x48, 0xb2, 0x25, 0x8d, 0x31 + }, + { + 0x02, 0xba, 0x4c, 0x4e, 0x5f, 0x8e, 0xcd, 0x18, + 0x77, 0xfc, 0x52, 0xd6, 0x4d, 0x30, 0xe3, 0x7a, + 0x2d, 0x97, 0x74, 0xfb, 0x1e, 0x5d, 0x02, 0x63, + 0x80, 0xae, 0x01, 0x68, 0xe3, 0xc5, 0x52, 0x2d + } + }, + { + "a", + { + 0x86, 0xbe, 0x7a, 0xfa, 0x33, 0x9d, 0x0f, 0xc7, + 0xcf, 0xc7, 0x85, 0xe7, 0x2f, 0x57, 0x8d, 0x33 + }, + { + 0x0b, 0xdc, 0x9d, 0x2d, 0x25, 0x6b, 0x3e, 0xe9, 0xda, 0xae, + 0x34, 0x7b, 0xe6, 0xf4, 0xdc, 0x83, 0x5a, 0x46, 0x7f, 0xfe + }, + { + 0xf9, 0x33, 0x3e, 0x45, 0xd8, 0x57, 0xf5, 0xd9, + 0x0a, 0x91, 0xba, 0xb7, 0x0a, 0x1e, 0xba, 0x0c, + 0xfb, 0x1b, 0xe4, 0xb0, 0x78, 0x3c, 0x9a, 0xcf, + 0xcd, 0x88, 0x3a, 0x91, 0x34, 0x69, 0x29, 0x25 + } + }, + { + "abc", + { + 0xc1, 0x4a, 0x12, 0x19, 0x9c, 0x66, 0xe4, 0xba, + 0x84, 0x63, 0x6b, 0x0f, 0x69, 0x14, 0x4c, 0x77 + }, + { + 0x8e, 0xb2, 0x08, 0xf7, 0xe0, 0x5d, 0x98, 0x7a, 0x9b, 0x04, + 0x4a, 0x8e, 0x98, 0xc6, 0xb0, 0x87, 0xf1, 0x5a, 0x0b, 0xfc + }, + { + 0xaf, 0xbd, 0x6e, 0x22, 0x8b, 0x9d, 0x8c, 0xbb, + 0xce, 0xf5, 0xca, 0x2d, 0x03, 0xe6, 0xdb, 0xa1, + 0x0a, 0xc0, 0xbc, 0x7d, 0xcb, 0xe4, 0x68, 0x0e, + 0x1e, 0x42, 0xd2, 0xe9, 0x75, 0x45, 0x9b, 0x65 + } + }, + { + "message digest", + { + 0x9e, 0x32, 0x7b, 0x3d, 0x6e, 0x52, 0x30, 0x62, + 0xaf, 0xc1, 0x13, 0x2d, 0x7d, 0xf9, 0xd1, 0xb8 + }, + { + 0x5d, 0x06, 0x89, 0xef, 0x49, 0xd2, 0xfa, 0xe5, 0x72, 0xb8, + 0x81, 0xb1, 0x23, 0xa8, 0x5f, 0xfa, 0x21, 0x59, 0x5f, 0x36 + }, + { + 0x87, 0xe9, 0x71, 0x75, 0x9a, 0x1c, 0xe4, 0x7a, + 0x51, 0x4d, 0x5c, 0x91, 0x4c, 0x39, 0x2c, 0x90, + 0x18, 0xc7, 0xc4, 0x6b, 0xc1, 
0x44, 0x65, 0x55, + 0x4a, 0xfc, 0xdf, 0x54, 0xa5, 0x07, 0x0c, 0x0e + } + }, + { + "abcdefghijklmnopqrstuvwxyz", + { + 0xfd, 0x2a, 0xa6, 0x07, 0xf7, 0x1d, 0xc8, 0xf5, + 0x10, 0x71, 0x49, 0x22, 0xb3, 0x71, 0x83, 0x4e + }, + { + 0xf7, 0x1c, 0x27, 0x10, 0x9c, 0x69, 0x2c, 0x1b, 0x56, 0xbb, + 0xdc, 0xeb, 0x5b, 0x9d, 0x28, 0x65, 0xb3, 0x70, 0x8d, 0xbc + }, + { + 0x64, 0x9d, 0x30, 0x34, 0x75, 0x1e, 0xa2, 0x16, + 0x77, 0x6b, 0xf9, 0xa1, 0x8a, 0xcc, 0x81, 0xbc, + 0x78, 0x96, 0x11, 0x8a, 0x51, 0x97, 0x96, 0x87, + 0x82, 0xdd, 0x1f, 0xd9, 0x7d, 0x8d, 0x51, 0x33 + } + }, + { + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", + { + 0xd1, 0xe9, 0x59, 0xeb, 0x17, 0x9c, 0x91, 0x1f, + 0xae, 0xa4, 0x62, 0x4c, 0x60, 0xc5, 0xc7, 0x02 + }, + { + 0xb0, 0xe2, 0x0b, 0x6e, 0x31, 0x16, 0x64, 0x02, 0x86, 0xed, + 0x3a, 0x87, 0xa5, 0x71, 0x30, 0x79, 0xb2, 0x1f, 0x51, 0x89 + }, + { + 0x57, 0x40, 0xa4, 0x08, 0xac, 0x16, 0xb7, 0x20, + 0xb8, 0x44, 0x24, 0xae, 0x93, 0x1c, 0xbb, 0x1f, + 0xe3, 0x63, 0xd1, 0xd0, 0xbf, 0x40, 0x17, 0xf1, + 0xa8, 0x9f, 0x7e, 0xa6, 0xde, 0x77, 0xa0, 0xb8 + } + } + }; + + int i; + unsigned char tmp[32]; + bool result = true; + + for (i = 0; i < (int)(sizeof(tests) / sizeof(tests[0])); i++) { + if (isLE()) { + rmd128(tests[i].msg, strlen(tests[i].msg), 0, tmp); + } else { + rmd128(tests[i].msg, strlen(tests[i].msg), 0, tmp); + } + if (memcmp(tmp, tests[i].hash128, 16) != 0) { + // printf("128 failure test %d\n", i); + result = false; + } + if (isLE()) { + rmd160(tests[i].msg, strlen(tests[i].msg), 0, tmp); + } else { + rmd160(tests[i].msg, strlen(tests[i].msg), 0, tmp); + } + if (memcmp(tmp, tests[i].hash160, 20) != 0) { + // printf("160 failure test %d\n", i); + result = false; + } + if (isLE()) { + rmd256(tests[i].msg, strlen(tests[i].msg), 0, tmp); + } else { + rmd256(tests[i].msg, strlen(tests[i].msg), 0, tmp); + } + if (memcmp(tmp, tests[i].hash256, 32) != 0) { + // printf("256 failure test %d\n", i); + result = false; + } + } + return result; } 
REGISTER_FAMILY(ripemd, - $.src_url = "https://github.com/libtom/libtomcrypt/blob/develop/src/hashes/rmd128.c", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/libtom/libtomcrypt/blob/develop/src/hashes/rmd128.c", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(RIPEMD_128, - $.desc = "RIPE-MD 128", - $.hash_flags = - FLAG_HASH_NO_SEED | - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0xC9B0B675, - $.verification_BE = 0xD1DB09B5, - $.initfn = rmd_test, - $.hashfn_native = rmd128, - $.hashfn_bswap = rmd128 -); + $.desc = "RIPE-MD 128", + $.hash_flags = + FLAG_HASH_NO_SEED | + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0xC9B0B675, + $.verification_BE = 0xD1DB09B5, + $.initfn = rmd_test, + $.hashfn_native = rmd128, + $.hashfn_bswap = rmd128 + ); REGISTER_HASH(RIPEMD_160, - $.desc = "RIPE-MD 160", - $.hash_flags = - FLAG_HASH_NO_SEED | - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 160, - $.verification_LE = 0x8613F5B2, - $.verification_BE = 0x2265C3AA, - $.initfn = rmd_test, - $.hashfn_native = rmd160, - $.hashfn_bswap = rmd160 -); + $.desc = "RIPE-MD 160", + $.hash_flags = + FLAG_HASH_NO_SEED | + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 160, + $.verification_LE = 0x8613F5B2, + $.verification_BE = 0x2265C3AA, + $.initfn 
= rmd_test, + $.hashfn_native = rmd160, + $.hashfn_bswap = rmd160 + ); REGISTER_HASH(RIPEMD_256, - $.desc = "RIPE-MD 256", - $.hash_flags = - FLAG_HASH_NO_SEED | - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 256, - $.verification_LE = 0x870A973A, - $.verification_BE = 0xF2A877EE, - $.initfn = rmd_test, - $.hashfn_native = rmd256, - $.hashfn_bswap = rmd256 -); + $.desc = "RIPE-MD 256", + $.hash_flags = + FLAG_HASH_NO_SEED | + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 256, + $.verification_LE = 0x870A973A, + $.verification_BE = 0xF2A877EE, + $.initfn = rmd_test, + $.hashfn_native = rmd256, + $.hashfn_bswap = rmd256 + ); diff --git a/hashes/seahash.cpp b/hashes/seahash.cpp index c7565290..2bfd998a 100644 --- a/hashes/seahash.cpp +++ b/hashes/seahash.cpp @@ -31,128 +31,130 @@ #include -static inline uint64_t diffuse(uint64_t val){ - uint64_t a, b; - val *= UINT64_C(0x6eed0e9da4d94a4f); - a = val >> 32; - b = val >> 60; - val ^= a >> b; - val *= UINT64_C(0x6eed0e9da4d94a4f); - return val; +static inline uint64_t diffuse( uint64_t val ) { + uint64_t a, b; + + val *= UINT64_C(0x6eed0e9da4d94a4f); + a = val >> 32; + b = val >> 60; + val ^= a >> b; + val *= UINT64_C(0x6eed0e9da4d94a4f); + return val; } -template < bool bswap > -static uint64_t seahash(const uint8_t * key, size_t len, uint64_t seed) { - uint64_t a, b, c, d; - uint8_t pad[8] = {0}; - const uint64_t orig_len = (uint64_t)len; +template +static uint64_t seahash( const uint8_t * key, size_t len, uint64_t seed ) { + uint64_t a, b, c, d; + uint8_t pad[8] = { 0 }; + const uint64_t orig_len = (uint64_t)len; - a = UINT64_C(0x16f11fe89b0d677c) ^ seed; - b = UINT64_C(0xb480a793d8e6c86c); - c = 
UINT64_C(0x6fe2e5aaf078ebc9); - d = UINT64_C(0x14f994a4c5259381); + a = UINT64_C(0x16f11fe89b0d677c) ^ seed; + b = UINT64_C(0xb480a793d8e6c86c); + c = UINT64_C(0x6fe2e5aaf078ebc9); + d = UINT64_C(0x14f994a4c5259381); - while (len >= 32) { - a ^= GET_U64(key, 0); - b ^= GET_U64(key, 8); - c ^= GET_U64(key, 16); - d ^= GET_U64(key, 24); - a = diffuse(a); - b = diffuse(b); - c = diffuse(c); - d = diffuse(d); - len -= 32; - key += 32; - } + while (len >= 32) { + a ^= GET_U64(key, 0); + b ^= GET_U64(key, 8); + c ^= GET_U64(key, 16); + d ^= GET_U64(key, 24); + a = diffuse(a); + b = diffuse(b); + c = diffuse(c); + d = diffuse(d); + len -= 32; + key += 32; + } - switch (len) { - case 31: case 30: case 29: case 28: case 27: case 26: case 25: - a ^= GET_U64(key, 0); - b ^= GET_U64(key, 8); - c ^= GET_U64(key, 16); - memcpy(pad, key + 24, len - 24); - d ^= GET_U64(pad, 0); - a = diffuse(a); - b = diffuse(b); - c = diffuse(c); - d = diffuse(d); - break; - case 24: - a ^= GET_U64(key, 0); - b ^= GET_U64(key, 8); - c ^= GET_U64(key, 16); - a = diffuse(a); - b = diffuse(b); - c = diffuse(c); - break; - case 23: case 22: case 21: case 20: case 19: case 18: case 17: - a ^= GET_U64(key, 0); - b ^= GET_U64(key, 8); - memcpy(pad, key + 16, len - 16); - c ^= GET_U64(pad, 0); - a = diffuse(a); - b = diffuse(b); - c = diffuse(c); - break; - case 16: - a ^= GET_U64(key, 0); - b ^= GET_U64(key, 8); - a = diffuse(a); - b = diffuse(b); - break; - case 15: case 14: case 13: case 12: case 11: case 10: case 9: - a ^= GET_U64(key, 0); - memcpy(pad, key + 8, len - 8); - b ^= GET_U64(pad, 0); - a = diffuse(a); - b = diffuse(b); - break; - case 8: - a ^= GET_U64(key, 0); - a = diffuse(a); - break; - case 7: case 6: case 5: case 4: case 3: case 2: case 1: - memcpy(pad, key, len); - a ^= GET_U64(pad, 0); - a = diffuse(a); - break; - case 0: - break; - default: - unreachable(); - assert(0); - } + switch (len) { + case 31: case 30: case 29: case 28: case 27: case 26: case 25: + a ^= GET_U64(key, 0); + 
b ^= GET_U64(key, 8); + c ^= GET_U64(key, 16); + memcpy(pad, key + 24, len - 24); + d ^= GET_U64(pad, 0); + a = diffuse(a); + b = diffuse(b); + c = diffuse(c); + d = diffuse(d); + break; + case 24: + a ^= GET_U64(key, 0); + b ^= GET_U64(key, 8); + c ^= GET_U64(key, 16); + a = diffuse(a); + b = diffuse(b); + c = diffuse(c); + break; + case 23: case 22: case 21: case 20: case 19: case 18: case 17: + a ^= GET_U64(key, 0); + b ^= GET_U64(key, 8); + memcpy(pad, key + 16, len - 16); + c ^= GET_U64(pad, 0); + a = diffuse(a); + b = diffuse(b); + c = diffuse(c); + break; + case 16: + a ^= GET_U64(key, 0); + b ^= GET_U64(key, 8); + a = diffuse(a); + b = diffuse(b); + break; + case 15: case 14: case 13: case 12: case 11: case 10: case 9: + a ^= GET_U64(key, 0); + memcpy(pad, key + 8, len - 8); + b ^= GET_U64(pad, 0); + a = diffuse(a); + b = diffuse(b); + break; + case 8: + a ^= GET_U64(key, 0); + a = diffuse(a); + break; + case 7: case 6: case 5: case 4: case 3: case 2: case 1: + memcpy(pad, key, len); + a ^= GET_U64(pad, 0); + a = diffuse(a); + break; + case 0: + break; + default: + unreachable(); + assert(0); + } - a ^= b; - c ^= d; - a ^= c; - a ^= orig_len; - return BSWAP(diffuse(a)); + a ^= b; + c ^= d; + a ^= c; + a ^= orig_len; + return BSWAP(diffuse(a)); } -template < bool bswap > -static void SeaHash(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void SeaHash( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = seahash((const uint8_t *)in, len, (uint64_t)seed); + PUT_U64(h, (uint8_t *)out, 0); } REGISTER_FAMILY(seahash, - $.src_url = "https://gist.github.com/vstakhov/b58b855532a424cd634b6c7ea7baa1b9", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://gist.github.com/vstakhov/b58b855532a424cd634b6c7ea7baa1b9", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(seahash, - $.desc = "seahash", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_64 | - 
FLAG_IMPL_ROTATE | - FLAG_IMPL_SHIFT_VARIABLE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0xF0374078, - $.verification_BE = 0x5BD66274, - $.hashfn_native = SeaHash, - $.hashfn_bswap = SeaHash -); + $.desc = "seahash", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_SHIFT_VARIABLE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0xF0374078, + $.verification_BE = 0x5BD66274, + $.hashfn_native = SeaHash, + $.hashfn_bswap = SeaHash + ); diff --git a/hashes/sha1.cpp b/hashes/sha1.cpp index e042b99f..a4c50ede 100644 --- a/hashes/sha1.cpp +++ b/hashes/sha1.cpp @@ -40,352 +40,355 @@ #include "Hashlib.h" #if defined(HAVE_X86_64_SHA1) || defined(HAVE_ARM_SHA1) -#include "Intrinsics.h" + #include "Intrinsics.h" #endif //----------------------------------------------------------------------------- // Raw SHA-1 implementation typedef struct { - uint32_t state[5]; - uint32_t count[2]; - uint8_t buffer[64]; + uint32_t state[5]; + uint32_t count[2]; + uint8_t buffer[64]; } SHA1_CTX; #define SHA1_DIGEST_SIZE 20 /* SHA1_Init - Initialize new context */ -static void SHA1_Init(SHA1_CTX * context) { - /* SHA1 initialization constants */ - context->state[0] = 0x67452301; - context->state[1] = 0xEFCDAB89; - context->state[2] = 0x98BADCFE; - context->state[3] = 0x10325476; - context->state[4] = 0xC3D2E1F0; - context->count[0] = context->count[1] = 0; +static void SHA1_Init( SHA1_CTX * context ) { + /* SHA1 initialization constants */ + context->state[0] = 0x67452301; + context->state[1] = 0xEFCDAB89; + context->state[2] = 0x98BADCFE; + context->state[3] = 0x10325476; + context->state[4] = 0xC3D2E1F0; + context->count[0] = context->count[1] = 0; } /* Hash a single 512-bit block. This is the core of the algorithm. 
*/ -template < bool bswap > -static void SHA1_Transform_portable(uint32_t state[5], const uint8_t buffer[64]) { - uint32_t a, b, c, d, e; - uint32_t l[16]; +template +static void SHA1_Transform_portable( uint32_t state[5], const uint8_t buffer[64] ) { + uint32_t a, b, c, d, e; + uint32_t l[16]; /* blk0() and blk() perform the initial expand. */ /* I got the idea of expanding during the round function from SSLeay */ -#define blk0(i) (l[i] = GET_U32(buffer, 4*(i))) -#define blk(i) (l[i & 15] = ROTL32( \ - l[(i + 13) & 15] ^ \ - l[(i + 8) & 15] ^ \ - l[(i + 2) & 15] ^ \ - l[i & 15] \ +#define blk0(i) (l[i] = GET_U32(buffer, 4 * (i))) +#define blk(i) (l[i & 15] = ROTL32( \ + l[(i + 13) & 15] ^ \ + l[(i + 8) & 15] ^ \ + l[(i + 2) & 15] ^ \ + l[i & 15] \ , 1)) /* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ -#define R0(v, w, x, y, z, i) \ - z += ((w & (x ^ y)) ^ y) + blk0(i) + 0x5A827999 + ROTL32(v, 5); \ +#define R0(v, w, x, y, z, i) \ + z += ((w & (x ^ y)) ^ y) + blk0(i) + 0x5A827999 + ROTL32(v, 5); \ w = ROTL32(w, 30); -#define R1(v, w, x, y, z, i) \ - z += ((w & (x ^ y)) ^ y) + blk(i) + 0x5A827999 + ROTL32(v, 5); \ +#define R1(v, w, x, y, z, i) \ + z += ((w & (x ^ y)) ^ y) + blk(i) + 0x5A827999 + ROTL32(v, 5); \ w = ROTL32(w, 30); -#define R2(v, w, x, y, z, i) \ - z += (w ^ x ^ y) + blk(i) + 0x6ED9EBA1 + ROTL32(v, 5); \ +#define R2(v, w, x, y, z, i) \ + z += (w ^ x ^ y) + blk(i) + 0x6ED9EBA1 + ROTL32(v, 5); \ w = ROTL32(w, 30); -#define R3(v, w, x, y, z, i) \ - z += (((w | x) & y) | (w & x)) + blk(i) + 0x8F1BBCDC + ROTL32(v, 5); \ +#define R3(v, w, x, y, z, i) \ + z += (((w | x) & y) | (w & x)) + blk(i) + 0x8F1BBCDC + ROTL32(v, 5); \ w = ROTL32(w, 30); -#define R4(v, w, x, y, z, i) \ - z += (w ^ x ^ y) + blk(i) + 0xCA62C1D6 + ROTL32(v, 5); \ +#define R4(v, w, x, y, z, i) \ + z += (w ^ x ^ y) + blk(i) + 0xCA62C1D6 + ROTL32(v, 5); \ w = ROTL32(w, 30); - /* Copy context->state[] to working vars */ - a = state[0]; - b = state[1]; - c = state[2]; - d = 
state[3]; - e = state[4]; - - /* 4 rounds of 20 operations each. Loop unrolled. */ - R0(a, b, c, d, e, 0); - R0(e, a, b, c, d, 1); - R0(d, e, a, b, c, 2); - R0(c, d, e, a, b, 3); - R0(b, c, d, e, a, 4); - R0(a, b, c, d, e, 5); - R0(e, a, b, c, d, 6); - R0(d, e, a, b, c, 7); - R0(c, d, e, a, b, 8); - R0(b, c, d, e, a, 9); - R0(a, b, c, d, e, 10); - R0(e, a, b, c, d, 11); - R0(d, e, a, b, c, 12); - R0(c, d, e, a, b, 13); - R0(b, c, d, e, a, 14); - R0(a, b, c, d, e, 15); - - R1(e, a, b, c, d, 16); - R1(d, e, a, b, c, 17); - R1(c, d, e, a, b, 18); - R1(b, c, d, e, a, 19); - - R2(a, b, c, d, e, 20); - R2(e, a, b, c, d, 21); - R2(d, e, a, b, c, 22); - R2(c, d, e, a, b, 23); - R2(b, c, d, e, a, 24); - R2(a, b, c, d, e, 25); - R2(e, a, b, c, d, 26); - R2(d, e, a, b, c, 27); - R2(c, d, e, a, b, 28); - R2(b, c, d, e, a, 29); - R2(a, b, c, d, e, 30); - R2(e, a, b, c, d, 31); - R2(d, e, a, b, c, 32); - R2(c, d, e, a, b, 33); - R2(b, c, d, e, a, 34); - R2(a, b, c, d, e, 35); - R2(e, a, b, c, d, 36); - R2(d, e, a, b, c, 37); - R2(c, d, e, a, b, 38); - R2(b, c, d, e, a, 39); - - R3(a, b, c, d, e, 40); - R3(e, a, b, c, d, 41); - R3(d, e, a, b, c, 42); - R3(c, d, e, a, b, 43); - R3(b, c, d, e, a, 44); - R3(a, b, c, d, e, 45); - R3(e, a, b, c, d, 46); - R3(d, e, a, b, c, 47); - R3(c, d, e, a, b, 48); - R3(b, c, d, e, a, 49); - R3(a, b, c, d, e, 50); - R3(e, a, b, c, d, 51); - R3(d, e, a, b, c, 52); - R3(c, d, e, a, b, 53); - R3(b, c, d, e, a, 54); - R3(a, b, c, d, e, 55); - R3(e, a, b, c, d, 56); - R3(d, e, a, b, c, 57); - R3(c, d, e, a, b, 58); - R3(b, c, d, e, a, 59); - - R4(a, b, c, d, e, 60); - R4(e, a, b, c, d, 61); - R4(d, e, a, b, c, 62); - R4(c, d, e, a, b, 63); - R4(b, c, d, e, a, 64); - R4(a, b, c, d, e, 65); - R4(e, a, b, c, d, 66); - R4(d, e, a, b, c, 67); - R4(c, d, e, a, b, 68); - R4(b, c, d, e, a, 69); - R4(a, b, c, d, e, 70); - R4(e, a, b, c, d, 71); - R4(d, e, a, b, c, 72); - R4(c, d, e, a, b, 73); - R4(b, c, d, e, a, 74); - R4(a, b, c, d, e, 75); - R4(e, a, b, c, 
d, 76); - R4(d, e, a, b, c, 77); - R4(c, d, e, a, b, 78); - R4(b, c, d, e, a, 79); - - /* Add the working vars back into context.state[] */ - state[0] += a; - state[1] += b; - state[2] += c; - state[3] += d; - state[4] += e; + /* Copy context->state[] to working vars */ + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + + /* 4 rounds of 20 operations each. Loop unrolled. */ + R0(a, b, c, d, e, 0); + R0(e, a, b, c, d, 1); + R0(d, e, a, b, c, 2); + R0(c, d, e, a, b, 3); + R0(b, c, d, e, a, 4); + R0(a, b, c, d, e, 5); + R0(e, a, b, c, d, 6); + R0(d, e, a, b, c, 7); + R0(c, d, e, a, b, 8); + R0(b, c, d, e, a, 9); + R0(a, b, c, d, e, 10); + R0(e, a, b, c, d, 11); + R0(d, e, a, b, c, 12); + R0(c, d, e, a, b, 13); + R0(b, c, d, e, a, 14); + R0(a, b, c, d, e, 15); + + R1(e, a, b, c, d, 16); + R1(d, e, a, b, c, 17); + R1(c, d, e, a, b, 18); + R1(b, c, d, e, a, 19); + + R2(a, b, c, d, e, 20); + R2(e, a, b, c, d, 21); + R2(d, e, a, b, c, 22); + R2(c, d, e, a, b, 23); + R2(b, c, d, e, a, 24); + R2(a, b, c, d, e, 25); + R2(e, a, b, c, d, 26); + R2(d, e, a, b, c, 27); + R2(c, d, e, a, b, 28); + R2(b, c, d, e, a, 29); + R2(a, b, c, d, e, 30); + R2(e, a, b, c, d, 31); + R2(d, e, a, b, c, 32); + R2(c, d, e, a, b, 33); + R2(b, c, d, e, a, 34); + R2(a, b, c, d, e, 35); + R2(e, a, b, c, d, 36); + R2(d, e, a, b, c, 37); + R2(c, d, e, a, b, 38); + R2(b, c, d, e, a, 39); + + R3(a, b, c, d, e, 40); + R3(e, a, b, c, d, 41); + R3(d, e, a, b, c, 42); + R3(c, d, e, a, b, 43); + R3(b, c, d, e, a, 44); + R3(a, b, c, d, e, 45); + R3(e, a, b, c, d, 46); + R3(d, e, a, b, c, 47); + R3(c, d, e, a, b, 48); + R3(b, c, d, e, a, 49); + R3(a, b, c, d, e, 50); + R3(e, a, b, c, d, 51); + R3(d, e, a, b, c, 52); + R3(c, d, e, a, b, 53); + R3(b, c, d, e, a, 54); + R3(a, b, c, d, e, 55); + R3(e, a, b, c, d, 56); + R3(d, e, a, b, c, 57); + R3(c, d, e, a, b, 58); + R3(b, c, d, e, a, 59); + + R4(a, b, c, d, e, 60); + R4(e, a, b, c, d, 61); + R4(d, e, a, b, c, 62); + R4(c, d, e, a, 
b, 63); + R4(b, c, d, e, a, 64); + R4(a, b, c, d, e, 65); + R4(e, a, b, c, d, 66); + R4(d, e, a, b, c, 67); + R4(c, d, e, a, b, 68); + R4(b, c, d, e, a, 69); + R4(a, b, c, d, e, 70); + R4(e, a, b, c, d, 71); + R4(d, e, a, b, c, 72); + R4(c, d, e, a, b, 73); + R4(b, c, d, e, a, 74); + R4(a, b, c, d, e, 75); + R4(e, a, b, c, d, 76); + R4(d, e, a, b, c, 77); + R4(c, d, e, a, b, 78); + R4(b, c, d, e, a, 79); + + /* Add the working vars back into context.state[] */ + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; } #if defined(HAVE_X86_64_SHA1) -template < bool bswap > -static void SHA1_Transform_sha1NI(uint32_t state[5], const uint8_t buffer[64]) { - __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; - __m128i MSG0, MSG1, MSG2, MSG3; + +template +static void SHA1_Transform_sha1NI( uint32_t state[5], const uint8_t buffer[64] ) { + __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; + __m128i MSG0, MSG1, MSG2, MSG3; const __m128i MASK = bswap ? - _mm_set_epi64x(UINT64_C(0x0001020304050607), UINT64_C(0x08090a0b0c0d0e0f)) : - _mm_set_epi64x(UINT64_C(0x0302010007060504), UINT64_C(0x0b0a09080f0e0d0c)) ; + _mm_set_epi64x(UINT64_C(0x0001020304050607), UINT64_C(0x08090a0b0c0d0e0f)) : + _mm_set_epi64x(UINT64_C(0x0302010007060504), UINT64_C(0x0b0a09080f0e0d0c)); /* Load initial values */ - ABCD = _mm_loadu_si128((const __m128i*) state); - E0 = _mm_set_epi32(state[4], 0, 0, 0); + ABCD = _mm_loadu_si128((const __m128i *)state); + E0 = _mm_set_epi32(state[4], 0, 0, 0); ABCD = _mm_shuffle_epi32(ABCD, 0x1B); /* Save current state */ ABCD_SAVE = ABCD; - E0_SAVE = E0; + E0_SAVE = E0; /* Rounds 0-3 */ - MSG0 = _mm_loadu_si128((const __m128i*)(buffer + 0)); + MSG0 = _mm_loadu_si128((const __m128i *)(buffer + 0)); MSG0 = _mm_shuffle_epi8(MSG0, MASK); - E0 = _mm_add_epi32(E0, MSG0); - E1 = ABCD; + E0 = _mm_add_epi32(E0, MSG0); + E1 = ABCD; ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); /* Rounds 4-7 */ - MSG1 = _mm_loadu_si128((const __m128i*)(buffer + 16)); + MSG1 = 
_mm_loadu_si128((const __m128i *)(buffer + 16)); MSG1 = _mm_shuffle_epi8(MSG1, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); /* Rounds 8-11 */ - MSG2 = _mm_loadu_si128((const __m128i*)(buffer + 32)); + MSG2 = _mm_loadu_si128((const __m128i *)(buffer + 32)); MSG2 = _mm_shuffle_epi8(MSG2, MASK); - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); MSG0 = _mm_xor_si128(MSG0, MSG2); /* Rounds 12-15 */ - MSG3 = _mm_loadu_si128((const __m128i*)(buffer + 48)); + MSG3 = _mm_loadu_si128((const __m128i *)(buffer + 48)); MSG3 = _mm_shuffle_epi8(MSG3, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); MSG1 = _mm_xor_si128(MSG1, MSG3); /* Rounds 16-19 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); MSG2 = _mm_xor_si128(MSG2, MSG0); /* Rounds 20-23 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); MSG3 = _mm_xor_si128(MSG3, MSG1); /* Rounds 24-27 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); MSG0 = _mm_xor_si128(MSG0, MSG2); /* Rounds 28-31 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; + E1 = 
_mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); MSG1 = _mm_xor_si128(MSG1, MSG3); /* Rounds 32-35 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); MSG2 = _mm_xor_si128(MSG2, MSG0); /* Rounds 36-39 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); MSG3 = _mm_xor_si128(MSG3, MSG1); /* Rounds 40-43 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); MSG0 = _mm_xor_si128(MSG0, MSG2); /* Rounds 44-47 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); MSG1 = _mm_xor_si128(MSG1, MSG3); /* Rounds 48-51 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); MSG2 = _mm_xor_si128(MSG2, MSG0); /* Rounds 52-55 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); MSG3 = _mm_xor_si128(MSG3, MSG1); /* Rounds 56-59 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; MSG3 = _mm_sha1msg2_epu32(MSG3, 
MSG2); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); MSG0 = _mm_xor_si128(MSG0, MSG2); /* Rounds 60-63 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); MSG1 = _mm_xor_si128(MSG1, MSG3); /* Rounds 64-67 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); MSG2 = _mm_xor_si128(MSG2, MSG0); /* Rounds 68-71 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); MSG3 = _mm_xor_si128(MSG3, MSG1); /* Rounds 72-75 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); /* Rounds 76-79 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); /* Combine state */ - E0 = _mm_sha1nexte_epu32(E0, E0_SAVE); + E0 = _mm_sha1nexte_epu32(E0, E0_SAVE); ABCD = _mm_add_epi32(ABCD, ABCD_SAVE); /* Save state */ - ABCD = _mm_shuffle_epi32(ABCD, 0x1B); - _mm_storeu_si128((__m128i*) state, ABCD); + ABCD = _mm_shuffle_epi32(ABCD, 0x1B); + _mm_storeu_si128((__m128i *)state, ABCD); state[4] = _mm_extract_epi32(E0, 3); } + #endif #if defined(HAVE_ARM_SHA1) -template < bool bswap > -static void SHA1_Transform_neon(uint32_t state[5], const uint8_t buffer[64]) { + +template +static void SHA1_Transform_neon( uint32_t state[5], const uint8_t buffer[64] ) { uint32x4_t ABCD, ABCD_SAVED; uint32x4_t TMP0, TMP1; uint32x4_t MSG0, MSG1, MSG2, MSG3; @@ -393,17 +396,17 @@ static void 
SHA1_Transform_neon(uint32_t state[5], const uint8_t buffer[64]) { /* Load state */ ABCD = vld1q_u32(&state[0]); - E0 = state[4]; + E0 = state[4]; /* Save state */ ABCD_SAVED = ABCD; - E0_SAVED = E0; + E0_SAVED = E0; /* Load message */ - MSG0 = vld1q_u32((const uint32_t*)(buffer)); - MSG1 = vld1q_u32((const uint32_t*)(buffer + 16)); - MSG2 = vld1q_u32((const uint32_t*)(buffer + 32)); - MSG3 = vld1q_u32((const uint32_t*)(buffer + 48)); + MSG0 = vld1q_u32((const uint32_t *)(buffer )); + MSG1 = vld1q_u32((const uint32_t *)(buffer + 16)); + MSG2 = vld1q_u32((const uint32_t *)(buffer + 32)); + MSG3 = vld1q_u32((const uint32_t *)(buffer + 48)); if (bswap) { /* Reverse for little endian */ @@ -417,149 +420,150 @@ static void SHA1_Transform_neon(uint32_t state[5], const uint8_t buffer[64]) { TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x5A827999)); /* Rounds 0-3 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1cq_u32(ABCD, E0, TMP0); TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x5A827999)); MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); /* Rounds 4-7 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1cq_u32(ABCD, E1, TMP1); TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x5A827999)); MSG0 = vsha1su1q_u32(MSG0, MSG3); MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); /* Rounds 8-11 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1cq_u32(ABCD, E0, TMP0); TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x5A827999)); MSG1 = vsha1su1q_u32(MSG1, MSG0); MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); /* Rounds 12-15 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1cq_u32(ABCD, E1, TMP1); TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x6ED9EBA1)); MSG2 = vsha1su1q_u32(MSG2, MSG1); MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); /* Rounds 16-19 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = 
vsha1cq_u32(ABCD, E0, TMP0); TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x6ED9EBA1)); MSG3 = vsha1su1q_u32(MSG3, MSG2); MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); /* Rounds 20-23 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1pq_u32(ABCD, E1, TMP1); TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x6ED9EBA1)); MSG0 = vsha1su1q_u32(MSG0, MSG3); MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); /* Rounds 24-27 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1pq_u32(ABCD, E0, TMP0); TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x6ED9EBA1)); MSG1 = vsha1su1q_u32(MSG1, MSG0); MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); /* Rounds 28-31 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1pq_u32(ABCD, E1, TMP1); TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x6ED9EBA1)); MSG2 = vsha1su1q_u32(MSG2, MSG1); MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); /* Rounds 32-35 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1pq_u32(ABCD, E0, TMP0); TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x8F1BBCDC)); MSG3 = vsha1su1q_u32(MSG3, MSG2); MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); /* Rounds 36-39 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1pq_u32(ABCD, E1, TMP1); TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x8F1BBCDC)); MSG0 = vsha1su1q_u32(MSG0, MSG3); MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); /* Rounds 40-43 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1mq_u32(ABCD, E0, TMP0); TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x8F1BBCDC)); MSG1 = vsha1su1q_u32(MSG1, MSG0); MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); /* Rounds 44-47 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1mq_u32(ABCD, E1, TMP1); TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x8F1BBCDC)); MSG2 = vsha1su1q_u32(MSG2, MSG1); MSG3 = 
vsha1su0q_u32(MSG3, MSG0, MSG1); /* Rounds 48-51 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1mq_u32(ABCD, E0, TMP0); TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x8F1BBCDC)); MSG3 = vsha1su1q_u32(MSG3, MSG2); MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); /* Rounds 52-55 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1mq_u32(ABCD, E1, TMP1); TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0xCA62C1D6)); MSG0 = vsha1su1q_u32(MSG0, MSG3); MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); /* Rounds 56-59 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1mq_u32(ABCD, E0, TMP0); TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0xCA62C1D6)); MSG1 = vsha1su1q_u32(MSG1, MSG0); MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); /* Rounds 60-63 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1pq_u32(ABCD, E1, TMP1); TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0xCA62C1D6)); MSG2 = vsha1su1q_u32(MSG2, MSG1); MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); /* Rounds 64-67 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1pq_u32(ABCD, E0, TMP0); TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0xCA62C1D6)); MSG3 = vsha1su1q_u32(MSG3, MSG2); MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); /* Rounds 68-71 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1pq_u32(ABCD, E1, TMP1); TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0xCA62C1D6)); MSG0 = vsha1su1q_u32(MSG0, MSG3); /* Rounds 72-75 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1pq_u32(ABCD, E0, TMP0); /* Rounds 76-79 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); ABCD = vsha1pq_u32(ABCD, E1, TMP1); /* Combine state */ - E0 += E0_SAVED; + E0 += E0_SAVED; ABCD = vaddq_u32(ABCD_SAVED, ABCD); /* Save state */ 
vst1q_u32(&state[0], ABCD); state[4] = E0; } + #endif -template < bool bswap > -static void SHA1_Transform(uint32_t state[5], const uint8_t buffer[64]) { +template +static void SHA1_Transform( uint32_t state[5], const uint8_t buffer[64] ) { #if defined(HAVE_X86_64_SHA1) return SHA1_Transform_sha1NI(state, buffer); #endif @@ -569,15 +573,16 @@ static void SHA1_Transform(uint32_t state[5], const uint8_t buffer[64]) { return SHA1_Transform_portable(state, buffer); } -template < bool bswap > -static void SHA1_Update(SHA1_CTX * context, const uint8_t * data, const size_t len) { +template +static void SHA1_Update( SHA1_CTX * context, const uint8_t * data, const size_t len ) { size_t i, j; j = context->count[0]; - if ((context->count[0] += len << 3) < j) + if ((context->count[0] += len << 3) < j) { context->count[1]++; + } context->count[1] += (len >> 29); - j = (j >> 3) & 63; + j = (j >> 3) & 63; if ((j + len) > 63) { memcpy(&context->buffer[j], data, (i = 64 - j)); @@ -593,36 +598,37 @@ static void SHA1_Update(SHA1_CTX * context, const uint8_t * data, const size_t l } /* Add padding and return len bytes of the message digest. */ -template < bool bswap > -static void SHA1_Final(SHA1_CTX * context, uint32_t digest_words, uint8_t * digest) { - uint32_t i; - uint8_t finalcount[8]; - uint8_t c; - - for (i = 0; i < 8; i++) { - finalcount[i] = - /* Endian independent */ - (uint8_t)(context->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)); - } - c = 0200; - SHA1_Update(context, &c, 1); - while ((context->count[0] & 504) != 448) { - c = 0000; +template +static void SHA1_Final( SHA1_CTX * context, uint32_t digest_words, uint8_t * digest ) { + uint32_t i; + uint8_t finalcount[8]; + uint8_t c; + + for (i = 0; i < 8; i++) { + finalcount[i] = + /* Endian independent */ + (uint8_t)(context->count[(i >= 4 ? 
0 : 1)] >> ((3 - (i & 3)) * 8)); + } + c = 0200; SHA1_Update(context, &c, 1); - } - SHA1_Update(context, finalcount, 8); /* Should cause a SHA1_Transform() */ + while ((context->count[0] & 504) != 448) { + c = 0000; + SHA1_Update(context, &c, 1); + } + SHA1_Update(context, finalcount, 8); /* Should cause a SHA1_Transform() */ - if (digest_words > 5) { digest_words = 5; } - for (i = 0; i < digest_words; i++) { - PUT_U32(context->state[i], digest, 4*i); - } + if (digest_words > 5) { digest_words = 5; } + for (i = 0; i < digest_words; i++) { + PUT_U32(context->state[i], digest, 4 * i); + } } //----------------------------------------------------------------------------- // Homegrown SHA-1 seeding function -static FORCE_INLINE void SHA1_Seed(SHA1_CTX * ctx, const seed_t seed) { +static FORCE_INLINE void SHA1_Seed( SHA1_CTX * ctx, const seed_t seed ) { const uint32_t seedlo = seed & 0xFFFFFFFF; const uint32_t seedhi = (seed >> 32) & 0xFFFFFFFF; + ctx->state[0] ^= seedlo; ctx->state[1] ^= seedhi; ctx->state[2] += seedlo ^ seedhi; @@ -631,14 +637,14 @@ static FORCE_INLINE void SHA1_Seed(SHA1_CTX * ctx, const seed_t seed) { } //----------------------------------------------------------------------------- -template < uint32_t hashbits, bool bswap > -static void SHA1(const void * in, const size_t len, const seed_t seed, void * out) { - SHA1_CTX context; - - SHA1_Init (&context); - SHA1_Seed (&context, seed); - SHA1_Update(&context, (uint8_t*)in, len); - SHA1_Final (&context, (hashbits+31)/32, (uint8_t*)out); +template +static void SHA1( const void * in, const size_t len, const seed_t seed, void * out ) { + SHA1_CTX context; + + SHA1_Init(&context); + SHA1_Seed(&context, seed); + SHA1_Update(&context, (uint8_t *)in, len); + SHA1_Final(&context, (hashbits + 31) / 32, (uint8_t *)out); } //----------------------------------------------------------------------------- @@ -652,16 +658,18 @@ static void SHA1(const void * in, const size_t len, const seed_t seed, void * ou // A 
million repetitions of "a" // 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F -static const char *const test_data[] = { +static const char * const test_data[] = { "abc", "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", - "A million repetitions of 'a'"}; -static const char *const test_results[] = { + "A million repetitions of 'a'" +}; +static const char * const test_results[] = { "A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D", "84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1", - "34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F"}; + "34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F" +}; -static void digest_to_hex(const uint8_t digest[SHA1_DIGEST_SIZE], char * output) { - int i, j; +static void digest_to_hex( const uint8_t digest[SHA1_DIGEST_SIZE], char * output ) { + int i, j; char * c = output; for (i = 0; i < SHA1_DIGEST_SIZE / 4; i++) { @@ -675,119 +683,119 @@ static void digest_to_hex(const uint8_t digest[SHA1_DIGEST_SIZE], char * output) *(c - 1) = '\0'; } -template < bool bswap > -static bool SHA1_Selftest(void) { - int k; - SHA1_CTX context; - uint8_t digest[20]; - char output[80]; - - for (k = 0; k < 2; k++) { - SHA1_Init (&context); - SHA1_Update(&context, (uint8_t *)test_data[k], strlen(test_data[k])); - SHA1_Final (&context, 5, digest); - digest_to_hex(digest, output); - - if (strcmp(output, test_results[k])) { - fprintf(stdout, "SHA-1 self test FAILED\n"); - fprintf(stderr, "* hash of \"%s\" incorrect:\n", test_data[k]); - fprintf(stderr, "\t%s returned\n", output); - fprintf(stderr, "\t%s is correct\n", test_results[k]); - return false; - } - } - - /* million 'a' vector we feed separately */ - SHA1_Init(&context); - for (k = 0; k < 1000000; k++) { - SHA1_Update(&context, (uint8_t *)"a", 1); - } - SHA1_Final(&context, 5, digest); - digest_to_hex(digest, output); - if (strcmp(output, test_results[2])) { - fprintf(stdout, "SHA-1 self test FAILED\n"); - fprintf(stderr, "* hash of \"%s\" incorrect:\n", test_data[2]); - fprintf(stderr, "\t%s returned\n", 
output); - fprintf(stderr, "\t%s is correct\n", test_results[2]); - return false; - } - - /* success */ - return true; +template +static bool SHA1_Selftest( void ) { + int k; + SHA1_CTX context; + uint8_t digest[20]; + char output[80]; + + for (k = 0; k < 2; k++) { + SHA1_Init(&context); + SHA1_Update(&context, (uint8_t *)test_data[k], strlen(test_data[k])); + SHA1_Final(&context, 5, digest); + digest_to_hex(digest, output); + + if (strcmp(output, test_results[k])) { + fprintf(stdout, "SHA-1 self test FAILED\n" ); + fprintf(stderr, "* hash of \"%s\" incorrect:\n", test_data[k]); + fprintf(stderr, "\t%s returned\n", output); + fprintf(stderr, "\t%s is correct\n", test_results[k]); + return false; + } + } + + /* million 'a' vector we feed separately */ + SHA1_Init(&context); + for (k = 0; k < 1000000; k++) { + SHA1_Update(&context, (uint8_t *)"a", 1); + } + SHA1_Final(&context, 5, digest); + digest_to_hex(digest, output); + if (strcmp(output, test_results[2])) { + fprintf(stdout, "SHA-1 self test FAILED\n" ); + fprintf(stderr, "* hash of \"%s\" incorrect:\n", test_data[2]); + fprintf(stderr, "\t%s returned\n", output); + fprintf(stderr, "\t%s is correct\n", test_results[2]); + return false; + } + + /* success */ + return true; } -static bool SHA1_test(void) { - if (isBE()) { - return SHA1_Selftest(); - } else { - return SHA1_Selftest(); - } +static bool SHA1_test( void ) { + if (isBE()) { + return SHA1_Selftest(); + } else { + return SHA1_Selftest(); + } } REGISTER_FAMILY(sha1, - $.src_url = "https://github.com/noloader/SHA-Intrinsics", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/noloader/SHA-Intrinsics", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(SHA_1__32, - $.desc = "SHA-1, bits 0-31", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_CRYPTOGRAPHIC_WEAK | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_ROTATE | - 
FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 32, - $.verification_LE = 0xF0E4D9E9, - $.verification_BE = 0xE00EF4D6, - $.initfn = SHA1_test, - $.hashfn_native = SHA1<32,false>, - $.hashfn_bswap = SHA1<32,true> -); + $.desc = "SHA-1, bits 0-31", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_CRYPTOGRAPHIC_WEAK | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 32, + $.verification_LE = 0xF0E4D9E9, + $.verification_BE = 0xE00EF4D6, + $.initfn = SHA1_test, + $.hashfn_native = SHA1<32, false>, + $.hashfn_bswap = SHA1<32, true> + ); REGISTER_HASH(SHA_1__64, - $.desc = "SHA-1, bits 0-63", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_CRYPTOGRAPHIC_WEAK | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 64, - $.verification_LE = 0x36801ECB, - $.verification_BE = 0xFC26F4C7, - $.initfn = SHA1_test, - $.hashfn_native = SHA1<64,false>, - $.hashfn_bswap = SHA1<64,true> -); + $.desc = "SHA-1, bits 0-63", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_CRYPTOGRAPHIC_WEAK | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 64, + $.verification_LE = 0x36801ECB, + $.verification_BE = 0xFC26F4C7, + $.initfn = SHA1_test, + $.hashfn_native = SHA1<64, false>, + $.hashfn_bswap = SHA1<64, true> + ); REGISTER_HASH(SHA_1, - $.desc = "SHA-1", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_CRYPTOGRAPHIC_WEAK | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - 
FLAG_IMPL_VERY_SLOW, - $.bits = 128, - $.verification_LE = 0xE444A591, - $.verification_BE = 0x35E00C29, - $.initfn = SHA1_test, - $.hashfn_native = SHA1<128,false>, - $.hashfn_bswap = SHA1<128,true> -); + $.desc = "SHA-1", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_CRYPTOGRAPHIC_WEAK | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 128, + $.verification_LE = 0xE444A591, + $.verification_BE = 0x35E00C29, + $.initfn = SHA1_test, + $.hashfn_native = SHA1<128, false>, + $.hashfn_bswap = SHA1<128, true> + ); diff --git a/hashes/sha2.cpp b/hashes/sha2.cpp index 3cb446ad..46ad7ea6 100644 --- a/hashes/sha2.cpp +++ b/hashes/sha2.cpp @@ -35,42 +35,42 @@ #include "Hashlib.h" #if defined(HAVE_X86_64_SHA2) || defined(HAVE_ARM_SHA2) -#include "Intrinsics.h" + #include "Intrinsics.h" #endif //----------------------------------------------------------------------------- // Raw SHA-2 implementation typedef struct { - uint64_t length; - uint32_t state[8], curlen; - uint8_t buf[64]; + uint64_t length; + uint32_t state[8], curlen; + uint8_t buf[64]; } SHA2_CTX; -static void SHA224_Init(SHA2_CTX * context) { - context->curlen = 0; - context->length = 0; - context->state[0] = 0xc1059ed8; - context->state[1] = 0x367cd507; - context->state[2] = 0x3070dd17; - context->state[3] = 0xf70e5939; - context->state[4] = 0xffc00b31; - context->state[5] = 0x68581511; - context->state[6] = 0x64f98fa7; - context->state[7] = 0xbefa4fa4; +static void SHA224_Init( SHA2_CTX * context ) { + context->curlen = 0; + context->length = 0; + context->state[0] = 0xc1059ed8; + context->state[1] = 0x367cd507; + context->state[2] = 0x3070dd17; + context->state[3] = 0xf70e5939; + context->state[4] = 0xffc00b31; + context->state[5] = 0x68581511; + context->state[6] = 0x64f98fa7; + context->state[7] = 0xbefa4fa4; } /* SHA256_Init - Initialize new 
context */ -static void SHA256_Init(SHA2_CTX * context) { - context->curlen = 0; - context->length = 0; - context->state[0] = 0x6A09E667; - context->state[1] = 0xBB67AE85; - context->state[2] = 0x3C6EF372; - context->state[3] = 0xA54FF53A; - context->state[4] = 0x510E527F; - context->state[5] = 0x9B05688C; - context->state[6] = 0x1F83D9AB; - context->state[7] = 0x5BE0CD19; +static void SHA256_Init( SHA2_CTX * context ) { + context->curlen = 0; + context->length = 0; + context->state[0] = 0x6A09E667; + context->state[1] = 0xBB67AE85; + context->state[2] = 0x3C6EF372; + context->state[3] = 0xA54FF53A; + context->state[4] = 0x510E527F; + context->state[5] = 0x9B05688C; + context->state[6] = 0x1F83D9AB; + context->state[7] = 0x5BE0CD19; } /* Hash a single 512-bit block. This is the core of the algorithm. */ @@ -93,266 +93,268 @@ static const uint32_t K256[] = { 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 }; -#define ROTATE(x,y) (((x)>>(y)) | ((x)<<(32-(y)))) -#define Sigma0(x) (ROTATE((x), 2) ^ ROTATE((x),13) ^ ROTATE((x),22)) -#define Sigma1(x) (ROTATE((x), 6) ^ ROTATE((x),11) ^ ROTATE((x),25)) -#define sigma0(x) (ROTATE((x), 7) ^ ROTATE((x),18) ^ ((x)>> 3)) -#define sigma1(x) (ROTATE((x),17) ^ ROTATE((x),19) ^ ((x)>>10)) - -#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) -#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) - -template < bool bswap > -static void SHA256_Transform_portable(uint32_t state[8], const uint8_t buffer[64]) { - uint32_t a, b, c, d, e, f, g, h, s0, s1, T1, T2; - uint32_t X[16], i; - - a = state[0]; - b = state[1]; - c = state[2]; - d = state[3]; - e = state[4]; - f = state[5]; - g = state[6]; - h = state[7]; - - for (i = 0; i < 16; i++) { - X[i] = GET_U32(buffer, i*4); - - T1 = h; - T1 += Sigma1(e); - T1 += Ch(e, f, g); - T1 += K256[i]; - T1 += X[i]; - - T2 = Sigma0(a); - T2 += Maj(a, b, c); - - h = g; - g = f; - f = e; - e = d + T1; - d = c; - c = b; - b = a; - a = T1 + T2; - } - - for (; i < 64; i++) { - s0 = X[(i + 1) & 0x0f]; 
- s0 = sigma0(s0); - s1 = X[(i + 14) & 0x0f]; - s1 = sigma1(s1); - - T1 = X[i & 0xf] += s0 + s1 + X[(i + 9) & 0xf]; - T1 += h + Sigma1(e) + Ch(e, f, g) + K256[i]; - T2 = Sigma0(a) + Maj(a, b, c); - h = g; - g = f; - f = e; - e = d + T1; - d = c; - c = b; - b = a; - a = T1 + T2; - } - - state[0] += a; - state[1] += b; - state[2] += c; - state[3] += d; - state[4] += e; - state[5] += f; - state[6] += g; - state[7] += h; +#define ROTATE(x, y) (((x) >> (y)) | ((x) << (32 - (y)))) +#define Sigma0(x) (ROTATE((x), 2) ^ ROTATE((x), 13) ^ ROTATE((x), 22)) +#define Sigma1(x) (ROTATE((x), 6) ^ ROTATE((x), 11) ^ ROTATE((x), 25)) +#define sigma0(x) (ROTATE((x), 7) ^ ROTATE((x), 18) ^ ((x) >> 3)) +#define sigma1(x) (ROTATE((x), 17) ^ ROTATE((x), 19) ^ ((x) >> 10)) + +#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) +#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +template +static void SHA256_Transform_portable( uint32_t state[8], const uint8_t buffer[64] ) { + uint32_t a, b, c, d, e, f, g, h, s0, s1, T1, T2; + uint32_t X[16], i; + + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + f = state[5]; + g = state[6]; + h = state[7]; + + for (i = 0; i < 16; i++) { + X[i] = GET_U32(buffer, i * 4); + + T1 = h; + T1 += Sigma1(e); + T1 += Ch(e, f, g); + T1 += K256[i]; + T1 += X[i]; + + T2 = Sigma0(a); + T2 += Maj(a, b, c); + + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + for (; i < 64; i++) { + s0 = X[(i + 1) & 0x0f]; + s0 = sigma0(s0); + s1 = X[(i + 14) & 0x0f]; + s1 = sigma1(s1); + + T1 = X[i & 0xf] += s0 + s1 + X[(i + 9) & 0xf]; + T1 += h + Sigma1(e) + Ch(e, f, g) + K256[i]; + T2 = Sigma0(a) + Maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; } #if defined(HAVE_X86_64_SHA2) -template < bool bswap > -static 
void SHA256_Transform_x64(uint32_t state[8], const uint8_t data[64]) { - __m128i STATE0, STATE1; - __m128i MSG, TMP; - __m128i MSG0, MSG1, MSG2, MSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - /* Load initial values */ - TMP = _mm_loadu_si128((const __m128i*) &state[0]); - STATE1 = _mm_loadu_si128((const __m128i*) &state[4]); - - TMP = _mm_shuffle_epi32(TMP, 0xB1); /* CDAB */ - STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); /* EFGH */ - STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); /* ABEF */ - STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */ - - /* Save current state */ - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - /* Rounds 0-3 */ - MSG0 = _mm_loadu_si128((const __m128i*) (data+0)); - if (bswap) { MSG0 = mm_bswap32(MSG0); } - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(UINT64_C(0xE9B5DBA5B5C0FBCF), UINT64_C(0x71374491428A2F98))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 4-7 */ - MSG1 = _mm_loadu_si128((const __m128i*) (data+16)); - if (bswap) { MSG1 = mm_bswap32(MSG1); } - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(UINT64_C(0xAB1C5ED5923F82A4), UINT64_C(0x59F111F13956C25B))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - - /* Rounds 8-11 */ - MSG2 = _mm_loadu_si128((const __m128i*) (data+32)); - if (bswap) { MSG2 = mm_bswap32(MSG2); } - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(UINT64_C(0x550C7DC3243185BE), UINT64_C(0x12835B01D807AA98))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - - /* Rounds 12-15 */ - MSG3 = _mm_loadu_si128((const __m128i*) (data+48)); - if (bswap) { MSG3 = mm_bswap32(MSG3); } - MSG = _mm_add_epi32(MSG3, 
_mm_set_epi64x(UINT64_C(0xC19BF1749BDC06A7), UINT64_C(0x80DEB1FE72BE5D74))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - - /* Rounds 16-19 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(UINT64_C(0x240CA1CC0FC19DC6), UINT64_C(0xEFBE4786E49B69C1))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - - /* Rounds 20-23 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(UINT64_C(0x76F988DA5CB0A9DC), UINT64_C(0x4A7484AA2DE92C6F))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - - /* Rounds 24-27 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(UINT64_C(0xBF597FC7B00327C8), UINT64_C(0xA831C66D983E5152))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - - /* Rounds 28-31 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(UINT64_C(0x1429296706CA6351), UINT64_C(0xD5A79147C6E00BF3))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - 
MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - - /* Rounds 32-35 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(UINT64_C(0x53380D134D2C6DFC), UINT64_C(0x2E1B213827B70A85))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - - /* Rounds 36-39 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(UINT64_C(0x92722C8581C2C92E), UINT64_C(0x766A0ABB650A7354))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - - /* Rounds 40-43 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(UINT64_C(0xC76C51A3C24B8B70), UINT64_C(0xA81A664BA2BFE8A1))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - - /* Rounds 44-47 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(UINT64_C(0x106AA070F40E3585), UINT64_C(0xD6990624D192E819))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - - /* Rounds 48-51 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(UINT64_C(0x34B0BCB52748774C), 
UINT64_C(0x1E376C0819A4C116))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - - /* Rounds 52-55 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(UINT64_C(0x682E6FF35B9CCA4F), UINT64_C(0x4ED8AA4A391C0CB3))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 56-59 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(UINT64_C(0x8CC7020884C87814), UINT64_C(0x78A5636F748F82EE))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 60-63 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(UINT64_C(0xC67178F2BEF9A3F7), UINT64_C(0xA4506CEB90BEFFFA))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Combine state */ - STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); - STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); - - TMP = _mm_shuffle_epi32(STATE0, 0x1B); /* FEBA */ - STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); /* DCHG */ - STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */ - STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); /* ABEF */ - - /* Save state */ - _mm_storeu_si128((__m128i*) &state[0], STATE0); - _mm_storeu_si128((__m128i*) &state[4], STATE1); + +template +static void SHA256_Transform_x64( uint32_t state[8], const uint8_t data[64] ) { + __m128i STATE0, STATE1; + __m128i MSG, 
TMP; + __m128i MSG0, MSG1, MSG2, MSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + /* Load initial values */ + TMP = _mm_loadu_si128((const __m128i *)&state[0]); + STATE1 = _mm_loadu_si128((const __m128i *)&state[4]); + + TMP = _mm_shuffle_epi32(TMP , 0xB1); /* CDAB */ + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); /* EFGH */ + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); /* ABEF */ + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */ + + /* Save current state */ + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + /* Rounds 0-3 */ + MSG0 = _mm_loadu_si128((const __m128i *)(data + 0)); + if (bswap) { MSG0 = mm_bswap32(MSG0); } + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(UINT64_C(0xE9B5DBA5B5C0FBCF), UINT64_C(0x71374491428A2F98))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Rounds 4-7 */ + MSG1 = _mm_loadu_si128((const __m128i *)(data + 16)); + if (bswap) { MSG1 = mm_bswap32(MSG1); } + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(UINT64_C(0xAB1C5ED5923F82A4), UINT64_C(0x59F111F13956C25B))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + + /* Rounds 8-11 */ + MSG2 = _mm_loadu_si128((const __m128i *)(data + 32)); + if (bswap) { MSG2 = mm_bswap32(MSG2); } + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(UINT64_C(0x550C7DC3243185BE), UINT64_C(0x12835B01D807AA98))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 12-15 */ + MSG3 = _mm_loadu_si128((const __m128i *)(data + 48)); + if (bswap) { MSG3 = mm_bswap32(MSG3); } + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(UINT64_C(0xC19BF1749BDC06A7), UINT64_C(0x80DEB1FE72BE5D74))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = 
_mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 16-19 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(UINT64_C(0x240CA1CC0FC19DC6), UINT64_C(0xEFBE4786E49B69C1))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 20-23 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(UINT64_C(0x76F988DA5CB0A9DC), UINT64_C(0x4A7484AA2DE92C6F))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + + /* Rounds 24-27 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(UINT64_C(0xBF597FC7B00327C8), UINT64_C(0xA831C66D983E5152))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 28-31 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(UINT64_C(0x1429296706CA6351), UINT64_C(0xD5A79147C6E00BF3))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + 
/* Rounds 32-35 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(UINT64_C(0x53380D134D2C6DFC), UINT64_C(0x2E1B213827B70A85))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 36-39 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(UINT64_C(0x92722C8581C2C92E), UINT64_C(0x766A0ABB650A7354))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + + /* Rounds 40-43 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(UINT64_C(0xC76C51A3C24B8B70), UINT64_C(0xA81A664BA2BFE8A1))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 44-47 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(UINT64_C(0x106AA070F40E3585), UINT64_C(0xD6990624D192E819))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 48-51 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(UINT64_C(0x34B0BCB52748774C), UINT64_C(0x1E376C0819A4C116))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, 
TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 52-55 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(UINT64_C(0x682E6FF35B9CCA4F), UINT64_C(0x4ED8AA4A391C0CB3))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Rounds 56-59 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(UINT64_C(0x8CC7020884C87814), UINT64_C(0x78A5636F748F82EE))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Rounds 60-63 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(UINT64_C(0xC67178F2BEF9A3F7), UINT64_C(0xA4506CEB90BEFFFA))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Combine state */ + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); /* FEBA */ + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); /* DCHG */ + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */ + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); /* ABEF */ + + /* Save state */ + _mm_storeu_si128((__m128i *)&state[0], STATE0); + _mm_storeu_si128((__m128i *)&state[4], STATE1); } + #endif #if defined(HAVE_ARM_SHA2) @@ -375,167 +377,168 @@ static const uint32_t K[] = { 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2, }; -template < bool bswap > -static void SHA256_Transform_neon(uint32_t state[8], const uint8_t data[64]) { - uint32x4_t STATE0, STATE1, ABEF_SAVE, 
CDGH_SAVE; - uint32x4_t MSG0, MSG1, MSG2, MSG3; - uint32x4_t TMP0, TMP1, TMP2; - - /* Load state */ - STATE0 = vld1q_u32(&state[0]); - STATE1 = vld1q_u32(&state[4]); - - /* Save state */ - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - /* Load message */ - MSG0 = vld1q_u32((const uint32_t *)(data + 0)); - MSG1 = vld1q_u32((const uint32_t *)(data + 16)); - MSG2 = vld1q_u32((const uint32_t *)(data + 32)); - MSG3 = vld1q_u32((const uint32_t *)(data + 48)); - - /* Reverse for little endian */ - if (bswap) { - MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); - MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); - MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); - MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); - } - - TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x00])); - - /* Rounds 0-3 */ - MSG0 = vsha256su0q_u32(MSG0, MSG1); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x04])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); - - /* Rounds 4-7 */ - MSG1 = vsha256su0q_u32(MSG1, MSG2); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x08])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); - - /* Rounds 8-11 */ - MSG2 = vsha256su0q_u32(MSG2, MSG3); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x0c])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); - - /* Rounds 12-15 */ - MSG3 = vsha256su0q_u32(MSG3, MSG0); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x10])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); - - /* Rounds 16-19 */ - MSG0 = vsha256su0q_u32(MSG0, MSG1); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG1, 
vld1q_u32(&K[0x14])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); - - /* Rounds 20-23 */ - MSG1 = vsha256su0q_u32(MSG1, MSG2); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x18])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); - - /* Rounds 24-27 */ - MSG2 = vsha256su0q_u32(MSG2, MSG3); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x1c])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); - - /* Rounds 28-31 */ - MSG3 = vsha256su0q_u32(MSG3, MSG0); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x20])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); - - /* Rounds 32-35 */ - MSG0 = vsha256su0q_u32(MSG0, MSG1); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x24])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); - - /* Rounds 36-39 */ - MSG1 = vsha256su0q_u32(MSG1, MSG2); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x28])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); - - /* Rounds 40-43 */ - MSG2 = vsha256su0q_u32(MSG2, MSG3); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x2c])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); - - /* Rounds 44-47 */ - MSG3 = vsha256su0q_u32(MSG3, MSG0); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x30])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG3 = vsha256su1q_u32(MSG3, 
MSG1, MSG2); - - /* Rounds 48-51 */ - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x34])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - - /* Rounds 52-55 */ - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x38])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - - /* Rounds 56-59 */ - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x3c])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - - /* Rounds 60-63 */ - TMP2 = STATE0; - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - - /* Combine state */ - STATE0 = vaddq_u32(STATE0, ABEF_SAVE); - STATE1 = vaddq_u32(STATE1, CDGH_SAVE); - - /* Save state */ - vst1q_u32(&state[0], STATE0); - vst1q_u32(&state[4], STATE1); +template +static void SHA256_Transform_neon( uint32_t state[8], const uint8_t data[64] ) { + uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; + uint32x4_t MSG0, MSG1, MSG2, MSG3; + uint32x4_t TMP0, TMP1, TMP2; + + /* Load state */ + STATE0 = vld1q_u32(&state[0]); + STATE1 = vld1q_u32(&state[4]); + + /* Save state */ + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + /* Load message */ + MSG0 = vld1q_u32((const uint32_t *)(data + 0)); + MSG1 = vld1q_u32((const uint32_t *)(data + 16)); + MSG2 = vld1q_u32((const uint32_t *)(data + 32)); + MSG3 = vld1q_u32((const uint32_t *)(data + 48)); + + /* Reverse for little endian */ + if (bswap) { + MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); + MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); + MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); + MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); + } + + TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x00])); + + /* Rounds 0-3 */ + MSG0 = vsha256su0q_u32(MSG0, MSG1); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x04])); + STATE0 = 
vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); + + /* Rounds 4-7 */ + MSG1 = vsha256su0q_u32(MSG1, MSG2); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x08])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); + + /* Rounds 8-11 */ + MSG2 = vsha256su0q_u32(MSG2, MSG3); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x0c])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); + + /* Rounds 12-15 */ + MSG3 = vsha256su0q_u32(MSG3, MSG0); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x10])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); + + /* Rounds 16-19 */ + MSG0 = vsha256su0q_u32(MSG0, MSG1); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x14])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); + + /* Rounds 20-23 */ + MSG1 = vsha256su0q_u32(MSG1, MSG2); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x18])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); + + /* Rounds 24-27 */ + MSG2 = vsha256su0q_u32(MSG2, MSG3); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x1c])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); + + /* Rounds 28-31 */ + MSG3 = vsha256su0q_u32(MSG3, MSG0); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x20])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); + + /* Rounds 32-35 */ + 
MSG0 = vsha256su0q_u32(MSG0, MSG1); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x24])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); + + /* Rounds 36-39 */ + MSG1 = vsha256su0q_u32(MSG1, MSG2); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x28])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); + + /* Rounds 40-43 */ + MSG2 = vsha256su0q_u32(MSG2, MSG3); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x2c])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); + + /* Rounds 44-47 */ + MSG3 = vsha256su0q_u32(MSG3, MSG0); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x30])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); + + /* Rounds 48-51 */ + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x34])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + + /* Rounds 52-55 */ + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x38])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + + /* Rounds 56-59 */ + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x3c])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + + /* Rounds 60-63 */ + TMP2 = STATE0; + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + + /* Combine state */ + STATE0 = vaddq_u32(STATE0, ABEF_SAVE); + STATE1 = vaddq_u32(STATE1, CDGH_SAVE); + + /* Save state */ + vst1q_u32(&state[0], STATE0); + vst1q_u32(&state[4], STATE1); } + #endif -template < bool bswap > -static void SHA256_Transform(uint32_t state[8], const 
uint8_t buffer[64]) { +template +static void SHA256_Transform( uint32_t state[8], const uint8_t buffer[64] ) { #if defined(HAVE_X86_64_SHA2) return SHA256_Transform_x64(state, buffer); #endif @@ -545,84 +548,85 @@ static void SHA256_Transform(uint32_t state[8], const uint8_t buffer[64]) { return SHA256_Transform_portable(state, buffer); } -template < bool bswap > -static void SHA256_Update(SHA2_CTX * context, const uint8_t * data, size_t len) { - while (len > 0) { - if ((context->curlen == 0) && (len >= sizeof(context->buf))) { - SHA256_Transform(context->state, data); - context->length += 64*8; - len -= 64; - data += 64; - } else { - size_t n = 64 - context->curlen; - if (n > len) { n = len; } - memcpy(&context->buf[context->curlen], data, n); - context->curlen += n; - len -= n; - data += n; - if (context->curlen == 64) { - SHA256_Transform(context->state, context->buf); - context->curlen = 0; - context->length += 64*8; - } +template +static void SHA256_Update( SHA2_CTX * context, const uint8_t * data, size_t len ) { + while (len > 0) { + if ((context->curlen == 0) && (len >= sizeof(context->buf))) { + SHA256_Transform(context->state, data); + context->length += 64 * 8; + len -= 64; + data += 64; + } else { + size_t n = 64 - context->curlen; + if (n > len) { n = len; } + memcpy(&context->buf[context->curlen], data, n); + context->curlen += n; + len -= n; + data += n; + if (context->curlen == 64) { + SHA256_Transform(context->state, context->buf); + context->curlen = 0; + context->length += 64 * 8; + } + } } - } } /* Add padding and return len bytes of the message digest. 
*/ -template < bool bswap > -static void SHA256_Final(SHA2_CTX * context, uint32_t digest_words, uint8_t * digest) { - uint32_t i; - uint8_t finalcount[8]; - uint8_t c; - - context->length += context->curlen * 8; - for (i = 0; i < 8; i++) { - finalcount[i] = (uint8_t)(context->length >> ((7 - i) * 8)); // Endian independent - } - c = 0200; - SHA256_Update(context, &c, 1); - while ((context->curlen) != 56) { - c = 0000; +template +static void SHA256_Final( SHA2_CTX * context, uint32_t digest_words, uint8_t * digest ) { + uint32_t i; + uint8_t finalcount[8]; + uint8_t c; + + context->length += context->curlen * 8; + for (i = 0; i < 8; i++) { + finalcount[i] = (uint8_t)(context->length >> ((7 - i) * 8)); // Endian independent + } + c = 0200; SHA256_Update(context, &c, 1); - } - SHA256_Update(context, finalcount, 8); /* Should cause a SHA256_Transform() */ + while ((context->curlen) != 56) { + c = 0000; + SHA256_Update(context, &c, 1); + } + SHA256_Update(context, finalcount, 8); /* Should cause a SHA256_Transform() */ - if (digest_words > 8) { digest_words = 8; } - for (i = 0; i < digest_words; i++) { - PUT_U32(context->state[i], digest, 4*i); - } + if (digest_words > 8) { digest_words = 8; } + for (i = 0; i < digest_words; i++) { + PUT_U32(context->state[i], digest, 4 * i); + } } //----------------------------------------------------------------------------- // Homegrown SHA-2 seeding function -static FORCE_INLINE void SHA256_Seed(SHA2_CTX * ctx, const seed_t seed) { +static FORCE_INLINE void SHA256_Seed( SHA2_CTX * ctx, const seed_t seed ) { const uint32_t seedlo = seed & 0xFFFFFFFF; const uint32_t seedhi = (seed >> 32) & 0xFFFFFFFF; + ctx->state[1] ^= seedlo; ctx->state[3] += seedlo + seedhi; ctx->state[5] ^= seedhi; } //----------------------------------------------------------------------------- -template < uint32_t hashbits, bool bswap > -static void SHA256(const void * in, const size_t len, const seed_t seed, void * out) { - SHA2_CTX context; - - SHA256_Init 
(&context); - SHA256_Seed (&context, seed); - SHA256_Update(&context, (const uint8_t*)in, len); - SHA256_Final (&context, (hashbits+31)/32, (uint8_t*)out); +template +static void SHA256( const void * in, const size_t len, const seed_t seed, void * out ) { + SHA2_CTX context; + + SHA256_Init(&context); + SHA256_Seed(&context, seed); + SHA256_Update(&context, (const uint8_t *)in, len); + SHA256_Final(&context, (hashbits + 31) / 32, (uint8_t *)out); } -template < uint32_t hashbits, bool bswap > -static void SHA224(const void * in, const size_t len, const seed_t seed, void * out) { - SHA2_CTX context; +template +static void SHA224( const void * in, const size_t len, const seed_t seed, void * out ) { + SHA2_CTX context; - SHA224_Init (&context); - SHA256_Seed (&context, seed); - SHA256_Update(&context, (const uint8_t*)in, len); - SHA256_Final (&context, (hashbits+31)/32, (uint8_t*)out); + SHA224_Init(&context); + SHA256_Seed(&context, seed); + SHA256_Update(&context, (const uint8_t *)in, len); + SHA256_Final(&context, (hashbits + 31) / 32, (uint8_t *)out); } //----------------------------------------------------------------------------- @@ -634,22 +638,23 @@ static void SHA224(const void * in, const size_t len, const seed_t seed, void * // e3b0c442 98fc1c14 9afbf4c8 996fb924 // 27ae41e4 649b934c a495991b 7852b855 // "abc" -// ba7816bf 8f01cfea 414140de 5dae2223 +// ba7816bf 8f01cfea 414140de 5dae2223 // b00361a3 96177a9c b410ff61 f20015ad // A million repetitions of "a" // cdc76e5c 9914fb92 81a1c7e2 84d73e67 // f1809a48 a497200e 046d39cc c7112cd0 -static const char *const test_data[] = { +static const char * const test_data[] = { "", "abc", - "A million repetitions of 'a'"}; -static const char *const test_results[] = { - "e3b0c442 98fc1c14 9afbf4c8 996fb924 27ae41e4 649b934c a495991b 7852b855", - "ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad", - "cdc76e5c 9914fb92 81a1c7e2 84d73e67 f1809a48 a497200e 046d39cc c7112cd0", + "A million repetitions 
of 'a'" +}; +static const char * const test_results[] = { + "e3b0c442 98fc1c14 9afbf4c8 996fb924 27ae41e4 649b934c a495991b 7852b855", + "ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad", + "cdc76e5c 9914fb92 81a1c7e2 84d73e67 f1809a48 a497200e 046d39cc c7112cd0", }; -static void digest_to_hex(const uint8_t digest[32], char * output) { - int i, j; +static void digest_to_hex( const uint8_t digest[32], char * output ) { + int i, j; char * c = output; for (i = 0; i < 32 / 4; i++) { @@ -662,136 +667,136 @@ static void digest_to_hex(const uint8_t digest[32], char * output) { *(c - 1) = '\0'; } -template < bool bswap > -static bool SHA256_Selftest(void) { - int k; - SHA2_CTX context; - uint8_t digest[32]; - char output[72]; - - for (k = 0; k < 2; k++) { - SHA256_Init (&context); - SHA256_Update(&context, (uint8_t *)test_data[k], strlen(test_data[k])); - SHA256_Final (&context, 8, digest); - digest_to_hex(digest, output); - - if (strcmp(output, test_results[k])) { - fprintf(stdout, "SHA-256 self test FAILED\n"); - fprintf(stderr, "* hash of \"%s\" incorrect:\n", test_data[k]); - fprintf(stderr, "\t%s returned\n", output); - fprintf(stderr, "\t%s is correct\n", test_results[k]); - return false; - } - } - - /* million 'a' vector we feed separately */ - SHA256_Init(&context); - for (k = 0; k < 1000000; k++) { - SHA256_Update(&context, (uint8_t *)"a", 1); - } - SHA256_Final(&context, 8, digest); - digest_to_hex(digest, output); - if (strcmp(output, test_results[2])) { - fprintf(stdout, "SHA-256 self test FAILED\n"); - fprintf(stderr, "* hash of \"%s\" incorrect:\n", test_data[2]); - fprintf(stderr, "\t%s returned\n", output); - fprintf(stderr, "\t%s is correct\n", test_results[2]); - return false; - } - - /* success */ - return true; +template +static bool SHA256_Selftest( void ) { + int k; + SHA2_CTX context; + uint8_t digest[32]; + char output[72]; + + for (k = 0; k < 2; k++) { + SHA256_Init(&context); + SHA256_Update(&context, (uint8_t 
*)test_data[k], strlen(test_data[k])); + SHA256_Final(&context, 8, digest); + digest_to_hex(digest, output); + + if (strcmp(output, test_results[k])) { + fprintf(stdout, "SHA-256 self test FAILED\n" ); + fprintf(stderr, "* hash of \"%s\" incorrect:\n", test_data[k]); + fprintf(stderr, "\t%s returned\n", output); + fprintf(stderr, "\t%s is correct\n", test_results[k]); + return false; + } + } + + /* million 'a' vector we feed separately */ + SHA256_Init(&context); + for (k = 0; k < 1000000; k++) { + SHA256_Update(&context, (uint8_t *)"a", 1); + } + SHA256_Final(&context, 8, digest); + digest_to_hex(digest, output); + if (strcmp(output, test_results[2])) { + fprintf(stdout, "SHA-256 self test FAILED\n" ); + fprintf(stderr, "* hash of \"%s\" incorrect:\n", test_data[2]); + fprintf(stderr, "\t%s returned\n", output); + fprintf(stderr, "\t%s is correct\n", test_results[2]); + return false; + } + + /* success */ + return true; } -static bool SHA256_test(void) { - if (isBE()) { - return SHA256_Selftest(); - } else { - return SHA256_Selftest(); - } +static bool SHA256_test( void ) { + if (isBE()) { + return SHA256_Selftest(); + } else { + return SHA256_Selftest(); + } } REGISTER_FAMILY(sha2, - $.src_url = "https://github.com/noloader/SHA-Intrinsics", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/noloader/SHA-Intrinsics", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(SHA_2_256__64, - $.desc = "SHA-256, bits 0-63", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 64, - $.verification_LE = 0x31C40E74, - $.verification_BE = 0x6E81AB0B, - $.initfn = SHA256_test, - $.hashfn_native = SHA256<64,false>, - $.hashfn_bswap = SHA256<64,true> -); + $.desc = "SHA-256, bits 0-63", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + 
FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 64, + $.verification_LE = 0x31C40E74, + $.verification_BE = 0x6E81AB0B, + $.initfn = SHA256_test, + $.hashfn_native = SHA256<64, false>, + $.hashfn_bswap = SHA256<64, true> + ); REGISTER_HASH(SHA_2_256, - $.desc = "SHA-256", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 256, - $.verification_LE = 0x33BD25DE, - $.verification_BE = 0x1643B047, - $.initfn = SHA256_test, - $.hashfn_native = SHA256<256,false>, - $.hashfn_bswap = SHA256<256,true> -); + $.desc = "SHA-256", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 256, + $.verification_LE = 0x33BD25DE, + $.verification_BE = 0x1643B047, + $.initfn = SHA256_test, + $.hashfn_native = SHA256<256, false>, + $.hashfn_bswap = SHA256<256, true> + ); REGISTER_HASH(SHA_2_224__64, - $.desc = "SHA-224, bits 0-63", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 64, - $.verification_LE = 0x36C55CA5, - $.verification_BE = 0x8C3C0B2A, - $.initfn = SHA256_test, - $.hashfn_native = SHA224<64,false>, - $.hashfn_bswap = SHA224<64,true> -); + $.desc = "SHA-224, bits 0-63", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_BE | + 
FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 64, + $.verification_LE = 0x36C55CA5, + $.verification_BE = 0x8C3C0B2A, + $.initfn = SHA256_test, + $.hashfn_native = SHA224<64, false>, + $.hashfn_bswap = SHA224<64, true> + ); REGISTER_HASH(SHA_2_224, - $.desc = "SHA-224", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_BE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 224, - $.verification_LE = 0x6BA219E5, - $.verification_BE = 0x56F30297, - $.initfn = SHA256_test, - $.hashfn_native = SHA224<224,false>, - $.hashfn_bswap = SHA224<224,true> -); + $.desc = "SHA-224", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_BE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 224, + $.verification_LE = 0x6BA219E5, + $.verification_BE = 0x56F30297, + $.initfn = SHA256_test, + $.hashfn_native = SHA224<224, false>, + $.hashfn_bswap = SHA224<224, true> + ); diff --git a/hashes/sha3.cpp b/hashes/sha3.cpp index a83a9860..583f4216 100644 --- a/hashes/sha3.cpp +++ b/hashes/sha3.cpp @@ -33,83 +33,94 @@ #include /* 'Words' here refers to uint64_t */ -#define SHA3_KECCAK_SPONGE_WORDS (((1600)/8/*bits to byte*/)/sizeof(uint64_t)) +#define SHA3_KECCAK_SPONGE_WORDS (((1600) / 8 /*bits to byte*/) / sizeof(uint64_t)) #define SHA3_KECCAK_ROUNDS 24 typedef struct sha3_context_ { - uint64_t s[SHA3_KECCAK_SPONGE_WORDS]; /* Keccak's state */ - uint64_t saved; /* the portion of the input message that we - * didn't consume yet */ - uint32_t byteIndex; /* 0..7--the next byte after the set one - * (starts from 0; 0--none are buffered) */ - uint32_t wordIndex; /* 0..24--the next word to integrate input - * (starts from 0) */ - uint32_t capacityWords; /* the double size of the 
hash output in - * words (e.g. 16 for Keccak 512) */ + uint64_t s[SHA3_KECCAK_SPONGE_WORDS]; /* Keccak's state */ + uint64_t saved; /* + * the portion of the input message that we + * didn't consume yet + */ + uint32_t byteIndex; /* + * 0..7--the next byte after the set one + * (starts from 0; 0--none are buffered) + */ + uint32_t wordIndex; /* + * 0..24--the next word to integrate input + * (starts from 0) + */ + uint32_t capacityWords; /* + * the double size of the hash output in + * words (e.g. 16 for Keccak 512) + */ } sha3_context; static const uint64_t keccakf_rndc[24] = { - UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082), - UINT64_C(0x800000000000808a), UINT64_C(0x8000000080008000), - UINT64_C(0x000000000000808b), UINT64_C(0x0000000080000001), - UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009), - UINT64_C(0x000000000000008a), UINT64_C(0x0000000000000088), - UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000a), - UINT64_C(0x000000008000808b), UINT64_C(0x800000000000008b), - UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003), - UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080), - UINT64_C(0x000000000000800a), UINT64_C(0x800000008000000a), - UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080), - UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008) + UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082), + UINT64_C(0x800000000000808a), UINT64_C(0x8000000080008000), + UINT64_C(0x000000000000808b), UINT64_C(0x0000000080000001), + UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009), + UINT64_C(0x000000000000008a), UINT64_C(0x0000000000000088), + UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000a), + UINT64_C(0x000000008000808b), UINT64_C(0x800000000000008b), + UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003), + UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080), + UINT64_C(0x000000000000800a), UINT64_C(0x800000008000000a), + UINT64_C(0x8000000080008081), 
UINT64_C(0x8000000000008080), + UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008) }; static const unsigned keccakf_rotc[24] = { - 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 + 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 }; static const unsigned keccakf_piln[24] = { - 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 + 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 }; -static void keccakf(uint64_t s[25]) -{ - int i, j, round; - uint64_t t, bc[5]; - - for(round = 0; round < SHA3_KECCAK_ROUNDS; round++) { - /* Theta */ - for(i = 0; i < 5; i++) - bc[i] = s[i] ^ s[i + 5] ^ s[i + 10] ^ s[i + 15] ^ s[i + 20]; - - for(i = 0; i < 5; i++) { - t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); - for(j = 0; j < 25; j += 5) - s[j + i] ^= t; - } - /* Rho Pi */ - t = s[1]; - for(i = 0; i < 24; i++) { - j = keccakf_piln[i]; - bc[0] = s[j]; - s[j] = ROTL64(t, keccakf_rotc[i]); - t = bc[0]; - } - /* Chi */ - for(j = 0; j < 25; j += 5) { - for(i = 0; i < 5; i++) - bc[i] = s[j + i]; - for(i = 0; i < 5; i++) - s[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5]; - } - /* Iota */ - s[0] ^= keccakf_rndc[round]; - } +static void keccakf( uint64_t s[25] ) { + int i, j, round; + uint64_t t, bc[5]; + + for (round = 0; round < SHA3_KECCAK_ROUNDS; round++) { + /* Theta */ + for (i = 0; i < 5; i++) { + bc[i] = s[i] ^ s[i + 5] ^ s[i + 10] ^ s[i + 15] ^ s[i + 20]; + } + + for (i = 0; i < 5; i++) { + t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); + for (j = 0; j < 25; j += 5) { + s[j + i] ^= t; + } + } + /* Rho Pi */ + t = s[1]; + for (i = 0; i < 24; i++) { + j = keccakf_piln[i]; + bc[0] = s [j]; + s[j] = ROTL64(t, keccakf_rotc[i]); + t = bc[0]; + } + /* Chi */ + for (j = 0; j < 25; j += 5) { + for (i = 0; i < 5; i++) { + bc[i] = s[j + i]; + } + for (i = 0; i < 5; i++) { + s[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5]; + } 
+ } + /* Iota */ + s[0] ^= keccakf_rndc[round]; + } } -static void sha3_Init(sha3_context * ctx, unsigned bitSize) { - assert(bitSize == 256 || bitSize == 384 || bitSize == 512); - memset(ctx, 0, sizeof(*ctx)); - ctx->capacityWords = 2 * bitSize / (8 * sizeof(uint64_t)); +static void sha3_Init( sha3_context * ctx, unsigned bitSize ) { + assert(bitSize == 256 || bitSize == 384 || bitSize == 512); + memset(ctx, 0, sizeof(*ctx)); + ctx->capacityWords = 2 * bitSize / (8 * sizeof(uint64_t)); } /* @@ -117,137 +128,139 @@ static void sha3_Init(sha3_context * ctx, unsigned bitSize) { * changing the hashed bytes cannot easily reveal the seed nor * trivially collide the hash state. */ -static void sha3_Seed(sha3_context * ctx, uint64_t seed) { - if (ctx->capacityWords >= 2) { - ctx->s[SHA3_KECCAK_SPONGE_WORDS - 2] ^= seed; - ctx->s[SHA3_KECCAK_SPONGE_WORDS - 1] ^= seed * UINT64_C(0x9E3779B97F4A7C15); - } else { - ctx->s[SHA3_KECCAK_SPONGE_WORDS - 1] ^= seed; - } +static void sha3_Seed( sha3_context * ctx, uint64_t seed ) { + if (ctx->capacityWords >= 2) { + ctx->s[SHA3_KECCAK_SPONGE_WORDS - 2] ^= seed; + ctx->s[SHA3_KECCAK_SPONGE_WORDS - 1] ^= seed * UINT64_C(0x9E3779B97F4A7C15); + } else { + ctx->s[SHA3_KECCAK_SPONGE_WORDS - 1] ^= seed; + } } -template < bool bswap > -static void sha3_Process(sha3_context * ctx, const uint8_t * in, size_t inlen) { - /* 0...7 -- how much is needed to have a word */ - uint32_t old_tail = (8 - ctx->byteIndex) & 7; - uint32_t tail; - size_t words, i; - - if (inlen == 0) return; /* nothing to do */ - - if (inlen < old_tail) { /* have no complete word or haven't started the word yet */ - while (inlen--) - ctx->saved |= (uint64_t) (*(in++)) << ((ctx->byteIndex++) * 8); - return; - } - - if (old_tail) { /* will have one word to process */ - inlen -= old_tail; - while (old_tail--) - ctx->saved |= (uint64_t) (*(in++)) << ((ctx->byteIndex++) * 8); - - /* now ready to add saved to the sponge */ - ctx->s[ctx->wordIndex] ^= ctx->saved; - ctx->byteIndex = 
0; - ctx->saved = 0; - if(++ctx->wordIndex == (SHA3_KECCAK_SPONGE_WORDS - ctx->capacityWords)) { - keccakf(ctx->s); - ctx->wordIndex = 0; - } - } - - /* now work in full words directly from input */ - words = inlen / sizeof(uint64_t); - tail = inlen - words * sizeof(uint64_t); - - for(i = 0; i < words; i++, in += sizeof(uint64_t)) { - uint64_t t = GET_U64(in, 0); - ctx->s[ctx->wordIndex] ^= t; - if(++ctx->wordIndex == (SHA3_KECCAK_SPONGE_WORDS - ctx->capacityWords)) { - keccakf(ctx->s); - ctx->wordIndex = 0; - } - } - - /* finally, save the partial word */ - while (tail--) { - ctx->saved |= (uint64_t) (*(in++)) << ((ctx->byteIndex++) * 8); - } - return; +template +static void sha3_Process( sha3_context * ctx, const uint8_t * in, size_t inlen ) { + /* 0...7 -- how much is needed to have a word */ + uint32_t old_tail = (8 - ctx->byteIndex) & 7; + uint32_t tail; + size_t words, i; + + if (inlen == 0) { return; } /* nothing to do */ + + if (inlen < old_tail) { /* have no complete word or haven't started the word yet */ + while (inlen--) { + ctx->saved |= (uint64_t)(*(in++)) << ((ctx->byteIndex++) * 8); + } + return; + } + + if (old_tail) { /* will have one word to process */ + inlen -= old_tail; + while (old_tail--) { + ctx->saved |= (uint64_t)(*(in++)) << ((ctx->byteIndex++) * 8); + } + + /* now ready to add saved to the sponge */ + ctx->s[ctx->wordIndex] ^= ctx->saved; + ctx->byteIndex = 0; + ctx->saved = 0; + if (++ctx->wordIndex == (SHA3_KECCAK_SPONGE_WORDS - ctx->capacityWords)) { + keccakf(ctx->s); + ctx->wordIndex = 0; + } + } + + /* now work in full words directly from input */ + words = inlen / sizeof (uint64_t); + tail = inlen - words * sizeof(uint64_t); + + for (i = 0; i < words; i++, in += sizeof(uint64_t)) { + uint64_t t = GET_U64(in, 0); + ctx->s[ctx->wordIndex] ^= t; + if (++ctx->wordIndex == (SHA3_KECCAK_SPONGE_WORDS - ctx->capacityWords)) { + keccakf(ctx->s); + ctx->wordIndex = 0; + } + } + + /* finally, save the partial word */ + while (tail--) { + 
ctx->saved |= (uint64_t)(*(in++)) << ((ctx->byteIndex++) * 8); + } + return; } -template < bool bswap > -static void sha3_Finalize(sha3_context * ctx, size_t digest_words, uint8_t * digest) { - /* - * Append 2-bit suffix 01, per SHA-3 spec. Instead of 1 for padding - * we use 1<<2 below. The 0x02 below corresponds to the suffix 01. - * Overall, we feed 0, then 1, and finally 1 to start - * padding. Without M || 01, we would simply use 1 to start padding. - */ - uint64_t t = (uint64_t)(((uint64_t)(0x02 | (1 << 2))) << ((ctx->byteIndex) * 8)); - - ctx->s[ctx->wordIndex] ^= ctx->saved ^ t; - ctx->s[SHA3_KECCAK_SPONGE_WORDS - ctx->capacityWords - 1] ^= UINT64_C(0x8000000000000000); - keccakf(ctx->s); - - uint32_t maxdigest_words = ctx->capacityWords / 2; - if (digest_words > maxdigest_words) { digest_words = maxdigest_words; } - for (int i = 0; i < digest_words; i++) { - PUT_U64(ctx->s[i], digest, 8*i); - } - - return; +template +static void sha3_Finalize( sha3_context * ctx, size_t digest_words, uint8_t * digest ) { + /* + * Append 2-bit suffix 01, per SHA-3 spec. Instead of 1 for padding + * we use 1<<2 below. The 0x02 below corresponds to the suffix 01. + * Overall, we feed 0, then 1, and finally 1 to start + * padding. Without M || 01, we would simply use 1 to start padding. 
+ */ + uint64_t t = (uint64_t)(((uint64_t)(0x02 | (1 << 2))) << ((ctx->byteIndex) * 8)); + + ctx->s[ctx->wordIndex] ^= ctx->saved ^ t; + ctx->s[SHA3_KECCAK_SPONGE_WORDS - ctx->capacityWords - 1] ^= UINT64_C(0x8000000000000000); + keccakf(ctx->s); + + uint32_t maxdigest_words = ctx->capacityWords / 2; + if (digest_words > maxdigest_words) { digest_words = maxdigest_words; } + for (int i = 0; i < digest_words; i++) { + PUT_U64(ctx->s[i], digest, 8 * i); + } + + return; } -template < uint32_t hashbits, bool bswap > -static void SHA3_256(const void * in, const size_t len, const seed_t seed, void * out) { - sha3_context context; +template +static void SHA3_256( const void * in, const size_t len, const seed_t seed, void * out ) { + sha3_context context; - sha3_Init (&context, 256); - sha3_Seed (&context, (uint64_t)seed); - sha3_Process (&context, (const uint8_t *)in, len); - sha3_Finalize(&context, (hashbits+63)/64, (uint8_t *)out); + sha3_Init(&context, 256); + sha3_Seed(&context, (uint64_t)seed); + sha3_Process(&context, (const uint8_t *)in, len); + sha3_Finalize(&context, (hashbits + 63) / 64, (uint8_t *)out); } REGISTER_FAMILY(sha3, - $.src_url = "https://github.com/brainhub/SHA3IUF", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/brainhub/SHA3IUF", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(SHA_3_256__64, - $.desc = "SHA-3, bits 0-63", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 64, - $.verification_LE = 0x76804BEC, - $.verification_BE = 0xC7D2D825, - $.hashfn_native = SHA3_256<64,false>, - $.hashfn_bswap = SHA3_256<64,true> -); + $.desc = "SHA-3, bits 0-63", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + 
FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 64, + $.verification_LE = 0x76804BEC, + $.verification_BE = 0xC7D2D825, + $.hashfn_native = SHA3_256<64, false>, + $.hashfn_bswap = SHA3_256<64, true> + ); REGISTER_HASH(SHA_3, - $.desc = "SHA-3", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC | - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_NO_SEED, - $.impl_flags = - FLAG_IMPL_LICENSE_MIT | - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_VERY_SLOW, - $.bits = 256, - $.verification_LE = 0x79AEFB60, - $.verification_BE = 0x074CB90C, - $.hashfn_native = SHA3_256<256,false>, - $.hashfn_bswap = SHA3_256<256,true> -); + $.desc = "SHA-3", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC | + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_NO_SEED, + $.impl_flags = + FLAG_IMPL_LICENSE_MIT | + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_VERY_SLOW, + $.bits = 256, + $.verification_LE = 0x79AEFB60, + $.verification_BE = 0x074CB90C, + $.hashfn_native = SHA3_256<256, false>, + $.hashfn_bswap = SHA3_256<256, true> + ); diff --git a/hashes/siphash.cpp b/hashes/siphash.cpp index a63f2080..2a2da0c8 100644 --- a/hashes/siphash.cpp +++ b/hashes/siphash.cpp @@ -29,245 +29,247 @@ #include "Hashlib.h" #if defined(HAVE_SSSE_3) || defined(HAVE_SSE_2) -#include "Intrinsics.h" + #include "Intrinsics.h" #endif //------------------------------------------------------------ -#define SIPCOMPRESS_64 \ - v0 += v1; v2 += v3; \ - v1 = ROTL64(v1,13); v3 = ROTL64(v3,16); \ - v1 ^= v0; v3 ^= v2; \ - v0 = ROTL64(v0,32); \ - v2 += v1; v0 += v3; \ - v1 = ROTL64(v1,17); v3 = ROTL64(v3,21); \ - v1 ^= v2; v3 ^= v0; \ +#define SIPCOMPRESS_64 \ + v0 += v1; v2 += v3; \ + v1 = ROTL64(v1,13); v3 = ROTL64(v3,16); \ + v1 ^= v0; v3 ^= v2; \ + v0 = ROTL64(v0,32); \ + v2 += v1; v0 += v3; \ + v1 = ROTL64(v1,17); v3 = ROTL64(v3,21); \ + v1 ^= v2; v3 ^= v0; \ v2 = ROTL64(v2,32) /* The 64bit 1-3 and 2-4 
variants */ -template < bool variant_2_4, bool bswap > -static uint64_t siphash_portable(const uint64_t key[2], const uint8_t * m, size_t len) { - uint64_t v0, v1, v2, v3; - uint64_t mi, k0, k1; - uint64_t last7; - size_t i, blocks; +template +static uint64_t siphash_portable( const uint64_t key[2], const uint8_t * m, size_t len ) { + uint64_t v0, v1, v2, v3; + uint64_t mi, k0, k1; + uint64_t last7; + size_t i, blocks; - k0 = key[0]; - k1 = key[1]; + k0 = key[0]; + k1 = key[1]; - v0 = k0 ^ UINT64_C(0x736f6d6570736575); + v0 = k0 ^ UINT64_C(0x736f6d6570736575); v1 = k1 ^ UINT64_C(0x646f72616e646f6d); - v2 = k0 ^ UINT64_C(0x6c7967656e657261); - v3 = k1 ^ UINT64_C(0x7465646279746573); + v2 = k0 ^ UINT64_C(0x6c7967656e657261); + v3 = k1 ^ UINT64_C(0x7465646279746573); - for (i = 0, blocks = (len & ~7); i < blocks; i += 8) { - mi = GET_U64(m, i); - v3 ^= mi; - SIPCOMPRESS_64; + for (i = 0, blocks = (len & ~7); i < blocks; i += 8) { + mi = GET_U64(m, i); + v3 ^= mi; + SIPCOMPRESS_64; if (variant_2_4) { SIPCOMPRESS_64; } - v0 ^= mi; - } - - last7 = (uint64_t)(len & 0xff) << 56; - switch (len - blocks) { - case 7: last7 |= (uint64_t)m[i + 6] << 48; - case 6: last7 |= (uint64_t)m[i + 5] << 40; - case 5: last7 |= (uint64_t)m[i + 4] << 32; - case 4: last7 |= (uint64_t)m[i + 3] << 24; - case 3: last7 |= (uint64_t)m[i + 2] << 16; - case 2: last7 |= (uint64_t)m[i + 1] << 8; - case 1: last7 |= (uint64_t)m[i + 0] ; - case 0: - default:; - }; - - v3 ^= last7; - SIPCOMPRESS_64; + v0 ^= mi; + } + + last7 = (uint64_t)(len & 0xff) << 56; + switch (len - blocks) { + case 7: last7 |= (uint64_t)m[i + 6] << 48; + case 6: last7 |= (uint64_t)m[i + 5] << 40; + case 5: last7 |= (uint64_t)m[i + 4] << 32; + case 4: last7 |= (uint64_t)m[i + 3] << 24; + case 3: last7 |= (uint64_t)m[i + 2] << 16; + case 2: last7 |= (uint64_t)m[i + 1] << 8; + case 1: last7 |= (uint64_t)m[i + 0]; + case 0: + default:; + } + + v3 ^= last7; + SIPCOMPRESS_64; if (variant_2_4) { SIPCOMPRESS_64; } - v0 ^= last7; - v2 ^= 
0xff; - SIPCOMPRESS_64; - SIPCOMPRESS_64; - SIPCOMPRESS_64; + v0 ^= last7; + v2 ^= 0xff; + SIPCOMPRESS_64; + SIPCOMPRESS_64; + SIPCOMPRESS_64; if (variant_2_4) { SIPCOMPRESS_64; } - return v0 ^ v1 ^ v2 ^ v3; + return v0 ^ v1 ^ v2 ^ v3; } //------------------------------------------------------------ #if defined(HAVE_SSSE_3) || defined(HAVE_SSE_2) -typedef __m128i xmmi; -typedef __m64 qmm; +typedef __m128i xmmi; +typedef __m64 qmm; typedef union packedelem64_t { - uint64_t u[2]; - xmmi v; + uint64_t u[2]; + xmmi v; } packedelem64; typedef union packedelem8_t { - uint8_t u[16]; - xmmi v; + uint8_t u[16]; + xmmi v; } packedelem8; /* 0,2,1,3 */ static const packedelem64 siphash_init[2] = { - {{ UINT64_C(0x736f6d6570736575), UINT64_C(0x6c7967656e657261) }}, - {{ UINT64_C(0x646f72616e646f6d), UINT64_C(0x7465646279746573) }} + { { UINT64_C(0x736f6d6570736575), UINT64_C(0x6c7967656e657261) } }, + { { UINT64_C(0x646f72616e646f6d), UINT64_C(0x7465646279746573) } } }; static const packedelem64 siphash_final = { - { UINT64_C(0x0000000000000000), UINT64_C(0x00000000000000ff) } + { UINT64_C(0x0000000000000000), UINT64_C(0x00000000000000ff) } }; static const packedelem8 siphash_rot16v3 = { - {14,15,8,9,10,11,12,13,8,9,10,11,12,13,14,15} + { 14, 15, 8, 9, 10, 11, 12, 13, 8, 9, 10, 11, 12, 13, 14, 15 } }; -template < bool variant_2_4, bool bswap > -static uint64_t siphash_sse(const uint64_t key[2], const uint8_t * m, size_t len) { - xmmi k,v02,v20,v13,v11,v33,mi; - uint64_t last7; - uint32_t lo, hi; - size_t i, blocks; - - k = _mm_loadu_si128((xmmi *)key); - v02 = siphash_init[0].v; - v13 = siphash_init[1].v; - v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k)); - v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k)); - -#if defined(HAVE_SSSE_3) -#define sipcompress() \ - v11 = v13; \ - v33 = v13; \ - v11 = _mm_or_si128(_mm_slli_epi64(v11, 13), _mm_srli_epi64(v11, 64-13)); \ - v02 = _mm_add_epi64(v02, v13); \ - v33 = _mm_shuffle_epi8(v33, siphash_rot16v3.v); \ - v13 = 
_mm_unpacklo_epi64(v11, v33); \ - v13 = _mm_xor_si128(v13, v02); \ - v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \ - v11 = v13; \ - v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \ - v11 = _mm_or_si128(_mm_slli_epi64(v11, 17), _mm_srli_epi64(v11, 64-17)); \ - v20 = _mm_add_epi64(v20, v13); \ - v33 = _mm_or_si128(_mm_slli_epi64(v33, 21), _mm_srli_epi64(v33, 64-21)); \ - v13 = _mm_unpacklo_epi64(v11, v33); \ - v13 = _mm_unpacklo_epi64(v11, v33); \ - v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \ +template +static uint64_t siphash_sse( const uint64_t key[2], const uint8_t * m, size_t len ) { + xmmi k, v02, v20, v13, v11, v33, mi; + uint64_t last7; + uint32_t lo, hi; + size_t i, blocks; + + k = _mm_loadu_si128((xmmi *)key); + v02 = siphash_init[0].v; + v13 = siphash_init[1].v; + v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k)); + v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k)); + + #if defined(HAVE_SSSE_3) +#define sipcompress() \ + v11 = v13; \ + v33 = v13; \ + v11 = _mm_or_si128(_mm_slli_epi64(v11, 13), _mm_srli_epi64(v11, 64-13));\ + v02 = _mm_add_epi64(v02, v13); \ + v33 = _mm_shuffle_epi8(v33, siphash_rot16v3.v); \ + v13 = _mm_unpacklo_epi64(v11, v33); \ + v13 = _mm_xor_si128(v13, v02); \ + v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \ + v11 = v13; \ + v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \ + v11 = _mm_or_si128(_mm_slli_epi64(v11, 17), _mm_srli_epi64(v11, 64-17));\ + v20 = _mm_add_epi64(v20, v13); \ + v33 = _mm_or_si128(_mm_slli_epi64(v33, 21), _mm_srli_epi64(v33, 64-21));\ + v13 = _mm_unpacklo_epi64(v11, v33); \ + v13 = _mm_unpacklo_epi64(v11, v33); \ + v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \ v13 = _mm_xor_si128(v13, v20); -#else -#define sipcompress() \ - v11 = v13; \ - v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \ - v11 = _mm_or_si128(_mm_slli_epi64(v11, 13), _mm_srli_epi64(v11, 64-13)); \ - v02 = _mm_add_epi64(v02, v13); \ - v33 = _mm_or_si128(_mm_slli_epi64(v33, 16), _mm_srli_epi64(v33, 
64-16)); \ - v13 = _mm_unpacklo_epi64(v11, v33); \ - v13 = _mm_xor_si128(v13, v02); \ - v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \ - v11 = v13; \ - v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \ - v11 = _mm_or_si128(_mm_slli_epi64(v11, 17), _mm_srli_epi64(v11, 64-17)); \ - v20 = _mm_add_epi64(v20, v13); \ - v33 = _mm_or_si128(_mm_slli_epi64(v33, 21), _mm_srli_epi64(v33, 64-21)); \ - v13 = _mm_unpacklo_epi64(v11, v33); \ - v13 = _mm_unpacklo_epi64(v11, v33); \ - v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \ + #else +#define sipcompress() \ + v11 = v13; \ + v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \ + v11 = _mm_or_si128(_mm_slli_epi64(v11, 13), _mm_srli_epi64(v11, 64-13));\ + v02 = _mm_add_epi64(v02, v13); \ + v33 = _mm_or_si128(_mm_slli_epi64(v33, 16), _mm_srli_epi64(v33, 64-16));\ + v13 = _mm_unpacklo_epi64(v11, v33); \ + v13 = _mm_xor_si128(v13, v02); \ + v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \ + v11 = v13; \ + v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \ + v11 = _mm_or_si128(_mm_slli_epi64(v11, 17), _mm_srli_epi64(v11, 64-17));\ + v20 = _mm_add_epi64(v20, v13); \ + v33 = _mm_or_si128(_mm_slli_epi64(v33, 21), _mm_srli_epi64(v33, 64-21));\ + v13 = _mm_unpacklo_epi64(v11, v33); \ + v13 = _mm_unpacklo_epi64(v11, v33); \ + v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \ v13 = _mm_xor_si128(v13, v20); -#endif + #endif - for (i = 0, blocks = (len & ~7); i < blocks; i += 8) { - mi = _mm_loadl_epi64((xmmi *)(m + i)); + for (i = 0, blocks = (len & ~7); i < blocks; i += 8) { + mi = _mm_loadl_epi64((xmmi *)(m + i)); if (bswap) { mi = mm_bswap64(mi); } - v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); - sipcompress(); + v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); + sipcompress(); if (variant_2_4) { sipcompress(); } - v02 = _mm_xor_si128(v02, mi); - } - - last7 = (uint64_t)(len & 0xff) << 56; - switch (len - blocks) { - case 7: last7 |= (uint64_t)m[i + 6] << 48; - case 6: last7 |= (uint64_t)m[i + 5] << 40; 
- case 5: last7 |= (uint64_t)m[i + 4] << 32; - case 4: last7 |= (uint64_t)m[i + 3] << 24; - case 3: last7 |= (uint64_t)m[i + 2] << 16; - case 2: last7 |= (uint64_t)m[i + 1] << 8; - case 1: last7 |= (uint64_t)m[i + 0] ; - case 0: - default:; - }; - - mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),_mm_cvtsi32_si128((uint32_t)(last7 >> 32))); - v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); - sipcompress(); + v02 = _mm_xor_si128(v02, mi); + } + + last7 = (uint64_t)(len & 0xff) << 56; + switch (len - blocks) { + case 7: last7 |= (uint64_t)m[i + 6] << 48; + case 6: last7 |= (uint64_t)m[i + 5] << 40; + case 5: last7 |= (uint64_t)m[i + 4] << 32; + case 4: last7 |= (uint64_t)m[i + 3] << 24; + case 3: last7 |= (uint64_t)m[i + 2] << 16; + case 2: last7 |= (uint64_t)m[i + 1] << 8; + case 1: last7 |= (uint64_t)m[i + 0]; + case 0: + default:; + } + + mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7), _mm_cvtsi32_si128((uint32_t)(last7 >> 32))); + v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); + sipcompress(); if (variant_2_4) { sipcompress(); } - v02 = _mm_xor_si128(v02, mi); - v02 = _mm_xor_si128(v02, siphash_final.v); - sipcompress(); - sipcompress(); - sipcompress(); + v02 = _mm_xor_si128(v02, mi); + v02 = _mm_xor_si128(v02, siphash_final.v); + sipcompress(); + sipcompress(); + sipcompress(); if (variant_2_4) { sipcompress(); } - v02 = _mm_xor_si128(v02, v13); - v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2))); - lo = _mm_cvtsi128_si32(v02); - hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4)); - return ((uint64_t)hi << 32) | lo; + v02 = _mm_xor_si128(v02, v13); + v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1, 0, 3, 2))); + lo = _mm_cvtsi128_si32(v02); + hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4)); + return ((uint64_t)hi << 32) | lo; } + #endif //------------------------------------------------------------ // the faster half 32bit variant for the linux kernel -#define SIPCOMPRESS_32 \ - do { \ - v0 += v1; \ - v1 = 
ROTL32(v1, 5); \ - v1 ^= v0; \ - v0 = ROTL32(v0, 16); \ - v2 += v3; \ - v3 = ROTL32(v3, 8); \ - v3 ^= v2; \ - v0 += v3; \ - v3 = ROTL32(v3, 7); \ - v3 ^= v0; \ - v2 += v1; \ - v1 = ROTL32(v1, 13); \ - v1 ^= v2; \ - v2 = ROTL32(v2, 16); \ +#define SIPCOMPRESS_32 \ + do { \ + v0 += v1; \ + v1 = ROTL32(v1, 5); \ + v1 ^= v0; \ + v0 = ROTL32(v0, 16); \ + v2 += v3; \ + v3 = ROTL32(v3, 8); \ + v3 ^= v2; \ + v0 += v3; \ + v3 = ROTL32(v3, 7); \ + v3 ^= v0; \ + v2 += v1; \ + v1 = ROTL32(v1, 13); \ + v1 ^= v2; \ + v2 = ROTL32(v2, 16); \ } while (0) -template < bool bswap > -static uint32_t halfsiphash(const uint32_t key[2], const uint8_t *m, size_t len) { - uint32_t v0 = 0; - uint32_t v1 = 0; - uint32_t v2 = 0x6c796765; - uint32_t v3 = 0x74656462; - uint32_t k0 = key[0]; - uint32_t k1 = key[1]; - uint32_t mi; - const uint8_t *end = m + len - (len % sizeof(uint32_t)); - const int left = len & 3; - uint32_t b = ((uint32_t)len) << 24; +template +static uint32_t halfsiphash( const uint32_t key[2], const uint8_t * m, size_t len ) { + uint32_t v0 = 0; + uint32_t v1 = 0; + uint32_t v2 = 0x6c796765; + uint32_t v3 = 0x74656462; + uint32_t k0 = key[0]; + uint32_t k1 = key[1]; + uint32_t mi; + const uint8_t * end = m + len - (len % sizeof(uint32_t)); + const int left = len & 3; + uint32_t b = ((uint32_t)len) << 24; + v3 ^= k1; v2 ^= k0; v1 ^= k1; v0 ^= k0; for (; m != end; m += 4) { - mi = GET_U32(m, 0); + mi = GET_U32(m, 0); v3 ^= mi; SIPCOMPRESS_32; SIPCOMPRESS_32; @@ -276,14 +278,14 @@ static uint32_t halfsiphash(const uint32_t key[2], const uint8_t *m, size_t len) switch (left) { case 3: - b |= ((uint32_t)m[2]) << 16; + b |= ((uint32_t)m[2]) << 16; case 2: - b |= ((uint32_t)m[1]) << 8; + b |= ((uint32_t)m[1]) << 8; case 1: - b |= ((uint32_t)m[0]); - break; + b |= ((uint32_t)m[0]); + break; case 0: - break; + break; } v3 ^= b; @@ -304,173 +306,177 @@ static uint32_t halfsiphash(const uint32_t key[2], const uint8_t *m, size_t len) // I could find no source for this other than rurban's 
SMHasher // fork. The slightly-bizarre seeding routine is a hardcoded 64-bit // version of the awkward global-variable+Rand() one in that code. -template < bool bswap > -static uint64_t tsip(const uint64_t seed, const uint8_t * m, uint64_t len) { - uint64_t v0, v1; - uint64_t mi, k0, k1; - uint64_t last7; - - k0 = seed ^ UINT64_C(0x4915a64c00000000); - k1 = seed ^ UINT64_C(0x1c29205700000000); - - v0 = k0 ^ UINT64_C(0x736f6d6570736575); - v1 = k1 ^ UINT64_C(0x646f72616e646f6d); - -#define tsipcompress() \ - do { \ - v0 += v1; \ - v1 = ROTL64(v1, 13) ^ v0; \ - v0 = ROTL64(v0, 35) + v1; \ - v1 = ROTL64(v1, 17) ^ v0; \ - v0 = ROTL64(v0, 21); \ +template +static uint64_t tsip( const uint64_t seed, const uint8_t * m, uint64_t len ) { + uint64_t v0, v1; + uint64_t mi, k0, k1; + uint64_t last7; + + k0 = seed ^ UINT64_C(0x4915a64c00000000); + k1 = seed ^ UINT64_C(0x1c29205700000000); + + v0 = k0 ^ UINT64_C(0x736f6d6570736575); + v1 = k1 ^ UINT64_C(0x646f72616e646f6d); + +#define tsipcompress() \ + do { \ + v0 += v1; \ + v1 = ROTL64(v1, 13) ^ v0; \ + v0 = ROTL64(v0, 35) + v1; \ + v1 = ROTL64(v1, 17) ^ v0; \ + v0 = ROTL64(v0, 21); \ } while (0) - const uint8_t *end = m + (len & ~7); - - while (m < end) { - mi = GET_U64(m, 0); - v1 ^= mi; - tsipcompress(); - v0 ^= mi; - m += 8; - } - - last7 = (uint64_t)(len & 0xff) << 56; - switch (len & 7) { - case 7: - last7 |= (uint64_t)m[6] << 48; - case 6: - last7 |= (uint64_t)m[5] << 40; - case 5: - last7 |= (uint64_t)m[4] << 32; - case 4: - last7 |= (uint64_t)m[3] << 24; - case 3: - last7 |= (uint64_t)m[2] << 16; - case 2: - last7 |= (uint64_t)m[1] << 8; - case 1: - last7 |= (uint64_t)m[0]; - case 0: - default:; - }; - - v1 ^= last7; - tsipcompress(); - v0 ^= last7; - - // finalization - v1 ^= 0xff; - tsipcompress(); - v1 = ROTL64(v1, 32); - tsipcompress(); - v1 = ROTL64(v1, 32); - - return v0 ^ v1; + const uint8_t * end = m + (len & ~7); + + while (m < end) { + mi = GET_U64(m, 0); + v1 ^= mi; + tsipcompress(); + v0 ^= mi; + m += 8; + 
} + + last7 = (uint64_t)(len & 0xff) << 56; + switch (len & 7) { + case 7: + last7 |= (uint64_t)m[6] << 48; + case 6: + last7 |= (uint64_t)m[5] << 40; + case 5: + last7 |= (uint64_t)m[4] << 32; + case 4: + last7 |= (uint64_t)m[3] << 24; + case 3: + last7 |= (uint64_t)m[2] << 16; + case 2: + last7 |= (uint64_t)m[1] << 8; + case 1: + last7 |= (uint64_t)m[0]; + case 0: + default:; + } + + v1 ^= last7; + tsipcompress(); + v0 ^= last7; + + // finalization + v1 ^= 0xff; + tsipcompress(); + v1 = ROTL64(v1, 32); + tsipcompress(); + v1 = ROTL64(v1, 32); + + return v0 ^ v1; } //------------------------------------------------------------ -template < bool bswap > -static void SipHash_2_4(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void SipHash_2_4( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t key[2] = { seed, 0 }; uint64_t h; + #if defined(HAVE_SSSE_3) || defined(HAVE_SSE_2) - h = siphash_sse(key, (const uint8_t *)in, len); + h = siphash_sse (key, (const uint8_t *)in, len); #else h = siphash_portable(key, (const uint8_t *)in, len); #endif PUT_U64(h, (uint8_t *)out, 0); } -template < bool bswap > -static void SipHash_1_3(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void SipHash_1_3( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t key[2] = { seed, 0 }; uint64_t h; + #if defined(HAVE_SSSE_3) || defined(HAVE_SSE_2) - h = siphash_sse(key, (const uint8_t *)in, len); + h = siphash_sse (key, (const uint8_t *)in, len); #else h = siphash_portable(key, (const uint8_t *)in, len); #endif PUT_U64(h, (uint8_t *)out, 0); } -template < bool bswap > -static void HalfSipHash(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void HalfSipHash( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t key[2] = { (uint32_t)seed, (uint32_t)(((uint64_t)seed) >> 32) }; uint32_t h; + h = halfsiphash(key, 
(const uint8_t *)in, len); PUT_U32(h, (uint8_t *)out, 0); } -template < bool bswap > -static void TinySipHash(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void TinySipHash( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h; + h = tsip((uint64_t)seed, (const uint8_t *)in, len); PUT_U64(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(siphash, - $.src_url = "https://github.com/floodyberry/siphash", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/floodyberry/siphash", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(SipHash_2_4, - $.desc = "SipHash 2-4", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC, - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_TYPE_PUNNING | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x57B661ED, - $.verification_BE = 0x01B634D0, - $.hashfn_native = SipHash_2_4, - $.hashfn_bswap = SipHash_2_4 -); + $.desc = "SipHash 2-4", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_TYPE_PUNNING | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x57B661ED, + $.verification_BE = 0x01B634D0, + $.hashfn_native = SipHash_2_4, + $.hashfn_bswap = SipHash_2_4 + ); REGISTER_HASH(SipHash_1_3, - $.desc = "SipHash 1-3", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC, - $.impl_flags = - FLAG_IMPL_SLOW | - FLAG_IMPL_TYPE_PUNNING | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x8936B193, - $.verification_BE = 0xBEB90EAC, - $.hashfn_native = SipHash_1_3, - $.hashfn_bswap = SipHash_1_3 -); + $.desc = "SipHash 1-3", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_TYPE_PUNNING | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x8936B193, + $.verification_BE = 0xBEB90EAC, + 
$.hashfn_native = SipHash_1_3, + $.hashfn_bswap = SipHash_1_3 + ); REGISTER_HASH(HalfSipHash, - $.desc = "SipHash half-width version", - $.hash_flags = - FLAG_HASH_CRYPTOGRAPHIC, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0xD2BE7FD8, - $.verification_BE = 0xEC8BC9AF, - $.hashfn_native = HalfSipHash, - $.hashfn_bswap = HalfSipHash -); + $.desc = "SipHash half-width version", + $.hash_flags = + FLAG_HASH_CRYPTOGRAPHIC, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0xD2BE7FD8, + $.verification_BE = 0xEC8BC9AF, + $.hashfn_native = HalfSipHash, + $.hashfn_bswap = HalfSipHash + ); REGISTER_HASH(TinySipHash, - $.desc = "Damian Gryski's Tiny SipHash variant", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x75C732C0, - $.verification_BE = 0xEFE9C35D, - $.hashfn_native = TinySipHash, - $.hashfn_bswap = TinySipHash -); + $.desc = "Damian Gryski's Tiny SipHash variant", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x75C732C0, + $.verification_BE = 0xEFE9C35D, + $.hashfn_native = TinySipHash, + $.hashfn_bswap = TinySipHash + ); diff --git a/hashes/spookyhash.cpp b/hashes/spookyhash.cpp index a5d5906a..14d64be4 100644 --- a/hashes/spookyhash.cpp +++ b/hashes/spookyhash.cpp @@ -61,16 +61,15 @@ // By Bob Jenkins, public domain class SpookyHash { -public: + public: // // SpookyHash: hash a single message in one call, produce 128-bit output // - template < uint32_t version, bool bswap > - static void Hash128( - const void *message, // message to hash - size_t length, // length of message in bytes - uint64_t *hash1, // in/out: in seed 1, out hash value 1 - uint64_t *hash2); // in/out: in seed 2, out hash value 2 + template + static void Hash128( const void * message, // message to hash + size_t length, // length of message 
in bytes + uint64_t * hash1, // in/out: in seed 1, out hash value 1 + uint64_t * hash2 ); // in/out: in seed 2, out hash value 2 // // This is used if the input is 96 bytes long or longer. @@ -85,25 +84,22 @@ class SpookyHash { // When run forward or backwards one Mix // I tried 3 pairs of each; they all differed by at least 212 bits. // - template < bool bswap > - static FORCE_INLINE void Mix( - const uint8_t * data, - uint64_t &s0, uint64_t &s1, uint64_t &s2, uint64_t &s3, - uint64_t &s4, uint64_t &s5, uint64_t &s6, uint64_t &s7, - uint64_t &s8, uint64_t &s9, uint64_t &s10,uint64_t &s11) - { - s0 += GET_U64(data, 8*0); s2 ^= s10; s11 ^= s0; s0 = ROTL64(s0,11); s11 += s1; - s1 += GET_U64(data, 8*1); s3 ^= s11; s0 ^= s1; s1 = ROTL64(s1,32); s0 += s2; - s2 += GET_U64(data, 8*2); s4 ^= s0; s1 ^= s2; s2 = ROTL64(s2,43); s1 += s3; - s3 += GET_U64(data, 8*3); s5 ^= s1; s2 ^= s3; s3 = ROTL64(s3,31); s2 += s4; - s4 += GET_U64(data, 8*4); s6 ^= s2; s3 ^= s4; s4 = ROTL64(s4,17); s3 += s5; - s5 += GET_U64(data, 8*5); s7 ^= s3; s4 ^= s5; s5 = ROTL64(s5,28); s4 += s6; - s6 += GET_U64(data, 8*6); s8 ^= s4; s5 ^= s6; s6 = ROTL64(s6,39); s5 += s7; - s7 += GET_U64(data, 8*7); s9 ^= s5; s6 ^= s7; s7 = ROTL64(s7,57); s6 += s8; - s8 += GET_U64(data, 8*8); s10 ^= s6; s7 ^= s8; s8 = ROTL64(s8,55); s7 += s9; - s9 += GET_U64(data, 8*9); s11 ^= s7; s8 ^= s9; s9 = ROTL64(s9,54); s8 += s10; - s10 += GET_U64(data, 8*10); s0 ^= s8; s9 ^= s10; s10 = ROTL64(s10,22); s9 += s11; - s11 += GET_U64(data, 8*11); s1 ^= s9; s10 ^= s11; s11 = ROTL64(s11,46); s10 += s0; + template + static FORCE_INLINE void Mix( const uint8_t * data, uint64_t & s0, uint64_t & s1, uint64_t & s2, + uint64_t & s3, uint64_t & s4, uint64_t & s5, uint64_t & s6, uint64_t & s7, uint64_t & s8, + uint64_t & s9, uint64_t & s10, uint64_t & s11 ) { + s0 += GET_U64(data, 8 * 0); s2 ^= s10; s11 ^= s0; s0 = ROTL64(s0, 11); s11 += s1; + s1 += GET_U64(data, 8 * 1); s3 ^= s11; s0 ^= s1; s1 = ROTL64(s1, 32); s0 += s2; + s2 += GET_U64(data, 
8 * 2); s4 ^= s0; s1 ^= s2; s2 = ROTL64(s2, 43); s1 += s3; + s3 += GET_U64(data, 8 * 3); s5 ^= s1; s2 ^= s3; s3 = ROTL64(s3, 31); s2 += s4; + s4 += GET_U64(data, 8 * 4); s6 ^= s2; s3 ^= s4; s4 = ROTL64(s4, 17); s3 += s5; + s5 += GET_U64(data, 8 * 5); s7 ^= s3; s4 ^= s5; s5 = ROTL64(s5, 28); s4 += s6; + s6 += GET_U64(data, 8 * 6); s8 ^= s4; s5 ^= s6; s6 = ROTL64(s6, 39); s5 += s7; + s7 += GET_U64(data, 8 * 7); s9 ^= s5; s6 ^= s7; s7 = ROTL64(s7, 57); s6 += s8; + s8 += GET_U64(data, 8 * 8); s10 ^= s6; s7 ^= s8; s8 = ROTL64(s8, 55); s7 += s9; + s9 += GET_U64(data, 8 * 9); s11 ^= s7; s8 ^= s9; s9 = ROTL64(s9, 54); s8 += s10; + s10 += GET_U64(data, 8 * 10); s0 ^= s8; s9 ^= s10; s10 = ROTL64(s10, 22); s9 += s11; + s11 += GET_U64(data, 8 * 11); s1 ^= s9; s10 ^= s11; s11 = ROTL64(s11, 46); s10 += s0; } // @@ -122,45 +118,40 @@ class SpookyHash { // Two iterations was almost good enough for a 64-bit result, but a // 128-bit result is reported, so End() does three iterations. // - static FORCE_INLINE void EndPartial( - uint64_t &h0, uint64_t &h1, uint64_t &h2, uint64_t &h3, - uint64_t &h4, uint64_t &h5, uint64_t &h6, uint64_t &h7, - uint64_t &h8, uint64_t &h9, uint64_t &h10,uint64_t &h11) - { - h11+= h1; h2 ^= h11; h1 = ROTL64(h1,44); - h0 += h2; h3 ^= h0; h2 = ROTL64(h2,15); - h1 += h3; h4 ^= h1; h3 = ROTL64(h3,34); - h2 += h4; h5 ^= h2; h4 = ROTL64(h4,21); - h3 += h5; h6 ^= h3; h5 = ROTL64(h5,38); - h4 += h6; h7 ^= h4; h6 = ROTL64(h6,33); - h5 += h7; h8 ^= h5; h7 = ROTL64(h7,10); - h6 += h8; h9 ^= h6; h8 = ROTL64(h8,13); - h7 += h9; h10^= h7; h9 = ROTL64(h9,38); - h8 += h10; h11^= h8; h10= ROTL64(h10,53); - h9 += h11; h0 ^= h9; h11= ROTL64(h11,42); - h10+= h0; h1 ^= h10; h0 = ROTL64(h0,54); + static FORCE_INLINE void EndPartial( uint64_t & h0, uint64_t & h1, uint64_t & h2, uint64_t & h3, uint64_t & h4, + uint64_t & h5, uint64_t & h6, uint64_t & h7, uint64_t & h8, + uint64_t & h9, uint64_t & h10, uint64_t & h11 ) { + h11 += h1; h2 ^= h11; h1 = ROTL64(h1 , 44); + h0 += h2; 
h3 ^= h0; h2 = ROTL64(h2 , 15); + h1 += h3; h4 ^= h1; h3 = ROTL64(h3 , 34); + h2 += h4; h5 ^= h2; h4 = ROTL64(h4 , 21); + h3 += h5; h6 ^= h3; h5 = ROTL64(h5 , 38); + h4 += h6; h7 ^= h4; h6 = ROTL64(h6 , 33); + h5 += h7; h8 ^= h5; h7 = ROTL64(h7 , 10); + h6 += h8; h9 ^= h6; h8 = ROTL64(h8 , 13); + h7 += h9; h10 ^= h7; h9 = ROTL64(h9 , 38); + h8 += h10; h11 ^= h8; h10 = ROTL64(h10, 53); + h9 += h11; h0 ^= h9; h11 = ROTL64(h11, 42); + h10 += h0; h1 ^= h10; h0 = ROTL64(h0 , 54); } - template < uint32_t version, bool bswap > - static FORCE_INLINE void End( - uint64_t &h0, uint64_t &h1, uint64_t &h2, uint64_t &h3, - uint64_t &h4, uint64_t &h5, uint64_t &h6, uint64_t &h7, - uint64_t &h8, uint64_t &h9, uint64_t &h10,uint64_t &h11, - const uint8_t * data) - { + template + static FORCE_INLINE void End( uint64_t & h0, uint64_t & h1, uint64_t & h2, uint64_t & h3, + uint64_t & h4, uint64_t & h5, uint64_t & h6, uint64_t & h7, uint64_t & h8, uint64_t & h9, + uint64_t & h10, uint64_t & h11, const uint8_t * data ) { if (version == 2) { - h0 += GET_U64(data, 8*0); h1 += GET_U64(data, 8*1); - h2 += GET_U64(data, 8*2); h3 += GET_U64(data, 8*3); - h4 += GET_U64(data, 8*4); h5 += GET_U64(data, 8*5); - h6 += GET_U64(data, 8*6); h7 += GET_U64(data, 8*7); - h8 += GET_U64(data, 8*8); h9 += GET_U64(data, 8*9); - h10 += GET_U64(data, 8*10); h11 += GET_U64(data, 8*11); + h0 += GET_U64(data, 8 * 0); h1 += GET_U64(data, 8 * 1); + h2 += GET_U64(data, 8 * 2); h3 += GET_U64(data, 8 * 3); + h4 += GET_U64(data, 8 * 4); h5 += GET_U64(data, 8 * 5); + h6 += GET_U64(data, 8 * 6); h7 += GET_U64(data, 8 * 7); + h8 += GET_U64(data, 8 * 8); h9 += GET_U64(data, 8 * 9); + h10 += GET_U64(data, 8 * 10); h11 += GET_U64(data, 8 * 11); } else { - Mix(data,h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); + Mix(data, h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11); } - EndPartial(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); - EndPartial(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); - EndPartial(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); 
+ EndPartial(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11); + EndPartial(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11); + EndPartial(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11); } // @@ -178,20 +169,19 @@ class SpookyHash { // with diffs defined by either xor or subtraction // with a base of all zeros plus a counter, or plus another bit, or random // - static FORCE_INLINE void ShortMix(uint64_t &h0, uint64_t &h1, uint64_t &h2, uint64_t &h3) - { - h2 = ROTL64(h2,50); h2 += h3; h0 ^= h2; - h3 = ROTL64(h3,52); h3 += h0; h1 ^= h3; - h0 = ROTL64(h0,30); h0 += h1; h2 ^= h0; - h1 = ROTL64(h1,41); h1 += h2; h3 ^= h1; - h2 = ROTL64(h2,54); h2 += h3; h0 ^= h2; - h3 = ROTL64(h3,48); h3 += h0; h1 ^= h3; - h0 = ROTL64(h0,38); h0 += h1; h2 ^= h0; - h1 = ROTL64(h1,37); h1 += h2; h3 ^= h1; - h2 = ROTL64(h2,62); h2 += h3; h0 ^= h2; - h3 = ROTL64(h3,34); h3 += h0; h1 ^= h3; - h0 = ROTL64(h0,5); h0 += h1; h2 ^= h0; - h1 = ROTL64(h1,36); h1 += h2; h3 ^= h1; + static FORCE_INLINE void ShortMix( uint64_t & h0, uint64_t & h1, uint64_t & h2, uint64_t & h3 ) { + h2 = ROTL64(h2, 50); h2 += h3; h0 ^= h2; + h3 = ROTL64(h3, 52); h3 += h0; h1 ^= h3; + h0 = ROTL64(h0, 30); h0 += h1; h2 ^= h0; + h1 = ROTL64(h1, 41); h1 += h2; h3 ^= h1; + h2 = ROTL64(h2, 54); h2 += h3; h0 ^= h2; + h3 = ROTL64(h3, 48); h3 += h0; h1 ^= h3; + h0 = ROTL64(h0, 38); h0 += h1; h2 ^= h0; + h1 = ROTL64(h1, 37); h1 += h2; h3 ^= h1; + h2 = ROTL64(h2, 62); h2 += h3; h0 ^= h2; + h3 = ROTL64(h3, 34); h3 += h0; h1 ^= h3; + h0 = ROTL64(h0, 5); h0 += h1; h2 ^= h0; + h1 = ROTL64(h1, 36); h1 += h2; h3 ^= h1; } // @@ -206,44 +196,41 @@ class SpookyHash { // For every pair of input bits, // with probability 50 +- .75% (the worst case is approximately that) // - static FORCE_INLINE void ShortEnd(uint64_t &h0, uint64_t &h1, uint64_t &h2, uint64_t &h3) - { - h3 ^= h2; h2 = ROTL64(h2,15); h3 += h2; - h0 ^= h3; h3 = ROTL64(h3,52); h0 += h3; - h1 ^= h0; h0 = ROTL64(h0,26); h1 += h0; - h2 ^= h1; h1 = ROTL64(h1,51); h2 += h1; - 
h3 ^= h2; h2 = ROTL64(h2,28); h3 += h2; - h0 ^= h3; h3 = ROTL64(h3,9); h0 += h3; - h1 ^= h0; h0 = ROTL64(h0,47); h1 += h0; - h2 ^= h1; h1 = ROTL64(h1,54); h2 += h1; - h3 ^= h2; h2 = ROTL64(h2,32); h3 += h2; - h0 ^= h3; h3 = ROTL64(h3,25); h0 += h3; - h1 ^= h0; h0 = ROTL64(h0,63); h1 += h0; + static FORCE_INLINE void ShortEnd( uint64_t & h0, uint64_t & h1, uint64_t & h2, uint64_t & h3 ) { + h3 ^= h2; h2 = ROTL64(h2, 15); h3 += h2; + h0 ^= h3; h3 = ROTL64(h3, 52); h0 += h3; + h1 ^= h0; h0 = ROTL64(h0, 26); h1 += h0; + h2 ^= h1; h1 = ROTL64(h1, 51); h2 += h1; + h3 ^= h2; h2 = ROTL64(h2, 28); h3 += h2; + h0 ^= h3; h3 = ROTL64(h3, 9); h0 += h3; + h1 ^= h0; h0 = ROTL64(h0, 47); h1 += h0; + h2 ^= h1; h1 = ROTL64(h1, 54); h2 += h1; + h3 ^= h2; h2 = ROTL64(h2, 32); h3 += h2; + h0 ^= h3; h3 = ROTL64(h3, 25); h0 += h3; + h1 ^= h0; h0 = ROTL64(h0, 63); h1 += h0; } -private: - + private: // // Short is used for messages under 192 bytes in length // Short has a low startup cost, the normal mode is good for long // keys, the cost crossover is at about 192 bytes. The two modes were // held to the same quality bar. 
// - template < uint32_t version, bool bswap > - static void Short( - const void *message, // message (array of bytes, not necessarily aligned) - size_t length, // length of message (in bytes) - uint64_t *hash1, // in/out: in the seed, out the hash value - uint64_t *hash2); // in/out: in the seed, out the hash value + template + static void Short( const void * message, // message (array of bytes, not necessarily aligned) + size_t length, // length of message (in bytes) + uint64_t * hash1, // in/out: in the seed, out the hash value + uint64_t * hash2 ); // in/out: in the seed, out the hash value // number of uint64_t's in internal state - static const size_t sc_numVars = 12; + static const size_t sc_numVars = 12; // size of the internal state - static const size_t sc_blockSize = sc_numVars*8; + static const size_t sc_blockSize = sc_numVars * 8; // size of buffer of unhashed data, in bytes - static const size_t sc_bufSize = 2*sc_blockSize; + static const size_t sc_bufSize = 2 * sc_blockSize; // // sc_const: a constant which: @@ -252,45 +239,45 @@ class SpookyHash { // * is a not-very-regular mix of 1's and 0's // * does not need any other special mathematical properties // - static const uint64_t sc_const = UINT64_C(0xdeadbeefdeadbeef); -}; - -template < uint32_t version, bool bswap > -void SpookyHash::Short(const void *message, size_t length, uint64_t *hash1, uint64_t *hash2) { - size_t remainder = length % 32; - uint64_t a = *hash1; - uint64_t b = *hash2; - uint64_t c = sc_const; - uint64_t d = sc_const; - const uint8_t * ptr = (const uint8_t *)message; + static const uint64_t sc_const = UINT64_C(0xdeadbeefdeadbeef); +}; // class SpookyHash + +template +void SpookyHash::Short( const void * message, size_t length, uint64_t * hash1, uint64_t * hash2 ) { + size_t remainder = length % 32; + uint64_t a = *hash1; + uint64_t b = *hash2; + uint64_t c = sc_const; + uint64_t d = sc_const; + const uint8_t * ptr = (const uint8_t *)message; if (length > 15) { - const uint8_t * 
end = ptr + (length/32)*32; + const uint8_t * end = ptr + (length / 32) * 32; // handle all complete sets of 32 bytes for (; ptr < end; ptr += 32) { c += GET_U64(ptr, 0); d += GET_U64(ptr, 8); - ShortMix(a,b,c,d); + ShortMix(a, b, c, d); a += GET_U64(ptr, 16); b += GET_U64(ptr, 24); } - //Handle the case of 16+ remaining bytes. + // Handle the case of 16+ remaining bytes. if (remainder >= 16) { - c += GET_U64(ptr, 0); - d += GET_U64(ptr, 8); - ShortMix(a,b,c,d); - ptr += 16; + c += GET_U64(ptr, 0); + d += GET_U64(ptr, 8); + ShortMix(a, b, c, d); + ptr += 16; remainder -= 16; } } // Handle the last 0..15 bytes, and its length if (version == 1) { - d = ((uint64_t)length) << 56; + d = ((uint64_t)length) << 56; } else { - d += ((uint64_t)length) << 56; + d += ((uint64_t)length) << 56; } switch (remainder) { case 15: d += ((uint64_t)ptr[14]) << 48; @@ -298,44 +285,44 @@ void SpookyHash::Short(const void *message, size_t length, uint64_t *hash1, uint case 13: d += ((uint64_t)ptr[12]) << 32; case 12: d += GET_U32(ptr, 8); c += GET_U64(ptr, 0); break; case 11: d += ((uint64_t)ptr[10]) << 16; - case 10: d += ((uint64_t)ptr[9]) << 8; - case 9: d += (uint64_t)ptr[8]; - case 8: c += GET_U64(ptr, 0); break; - case 7: c += ((uint64_t)ptr[6]) << 48; - case 6: c += ((uint64_t)ptr[5]) << 40; - case 5: c += ((uint64_t)ptr[4]) << 32; - case 4: c += GET_U32(ptr, 0); break; - case 3: c += ((uint64_t)ptr[2]) << 16; - case 2: c += ((uint64_t)ptr[1]) << 8; - case 1: c += (uint64_t)ptr[0]; break; - case 0: c += sc_const; d += sc_const; break; + case 10: d += ((uint64_t)ptr[ 9]) << 8; + case 9: d += (uint64_t)ptr[ 8]; + case 8: c += GET_U64(ptr, 0); break; + case 7: c += ((uint64_t)ptr[ 6]) << 48; + case 6: c += ((uint64_t)ptr[ 5]) << 40; + case 5: c += ((uint64_t)ptr[ 4]) << 32; + case 4: c += GET_U32(ptr, 0); break; + case 3: c += ((uint64_t)ptr[ 2]) << 16; + case 2: c += ((uint64_t)ptr[ 1]) << 8; + case 1: c += (uint64_t)ptr[0]; break; + case 0: c += sc_const; d += sc_const; break; } - 
ShortEnd(a,b,c,d); + ShortEnd(a, b, c, d); *hash1 = a; *hash2 = b; } // do the whole hash in one call -template < uint32_t version, bool bswap > -void SpookyHash::Hash128(const void * message, size_t length, uint64_t * hash1, uint64_t * hash2) { +template +void SpookyHash::Hash128( const void * message, size_t length, uint64_t * hash1, uint64_t * hash2 ) { if (length < sc_bufSize) { Short(message, length, hash1, hash2); return; } - uint64_t h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11; + uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; const uint8_t * ptr = (const uint8_t *)message; - const uint8_t * end = ptr + (length/sc_blockSize)*sc_blockSize; - size_t remainder; + const uint8_t * end = ptr + (length / sc_blockSize) * sc_blockSize; + size_t remainder; - h0=h3=h6=h9 = *hash1; - h1=h4=h7=h10 = *hash2; - h2=h5=h8=h11 = sc_const; + h0 = h3 = h6 = h9 = *hash1; + h1 = h4 = h7 = h10 = *hash2; + h2 = h5 = h8 = h11 = sc_const; // handle all whole sc_blockSize blocks of bytes while (ptr < end) { - Mix(ptr,h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); - ptr += sc_blockSize; + Mix(ptr, h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11); + ptr += sc_blockSize; } // handle the last partial block of sc_blockSize bytes @@ -346,118 +333,119 @@ void SpookyHash::Hash128(const void * message, size_t length, uint64_t * hash1, buf[sc_blockSize - 1] = remainder; // do some final mixing - End(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11,buf); + End(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, buf); *hash1 = h0; *hash2 = h1; } -template < uint32_t version, uint32_t hashlen, bool bswap > -static void spookyhash(const void * in, const size_t len, const seed_t seed, void * out) { - uint64_t h1, h2; - h1 = h2 = (uint64_t)seed; +template +static void spookyhash( const void * in, const size_t len, const seed_t seed, void * out ) { + uint64_t h1, h2; - SpookyHash::Hash128(in, len, &h1, &h2); + h1 = h2 = (uint64_t)seed; - h1 = COND_BSWAP(h1, bswap); - h2 = COND_BSWAP(h2, bswap); + 
SpookyHash::Hash128(in, len, &h1, &h2); - if (hashlen > 64) { - memcpy(out, &h1, 8); - memcpy(((uint8_t *)out) + 8, &h2, hashlen/8 - 8); - } else { - memcpy(out, &h1, hashlen/8); - } + h1 = COND_BSWAP(h1, bswap); + h2 = COND_BSWAP(h2, bswap); + + if (hashlen > 64) { + memcpy(out, &h1, 8); + memcpy(((uint8_t *)out) + 8, &h2, hashlen / 8 - 8); + } else { + memcpy(out, &h1, hashlen / 8); + } } REGISTER_FAMILY(spookyhash, - $.src_url = "https://www.burtleburtle.net/bob/hash/spooky.html", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://www.burtleburtle.net/bob/hash/spooky.html", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); -//{ 0x111af082, 0x26bb3cda, 0x94c4f96c, 0xec24c166 } +// { 0x111af082, 0x26bb3cda, 0x94c4f96c, 0xec24c166 } REGISTER_HASH(SpookyHash1_32, - $.desc = "SpookyHash v1, 32-bit result", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 32, - $.verification_LE = 0x3F798BBB, - $.verification_BE = 0x32C8248C, - $.hashfn_native = spookyhash<1,32,false>, - $.hashfn_bswap = spookyhash<1,32,true> -); + $.desc = "SpookyHash v1, 32-bit result", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 32, + $.verification_LE = 0x3F798BBB, + $.verification_BE = 0x32C8248C, + $.hashfn_native = spookyhash<1, 32, false>, + $.hashfn_bswap = spookyhash<1, 32, true> + ); REGISTER_HASH(SpookyHash1_64, - $.desc = "SpookyHash v1, 64-bit result", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0xA7F955F1, - $.verification_BE = 0xD6BD6D2B, - $.hashfn_native = spookyhash<1,64,false>, - $.hashfn_bswap = spookyhash<1,64,true> -); + $.desc = "SpookyHash v1, 64-bit result", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0xA7F955F1, + $.verification_BE = 0xD6BD6D2B, + $.hashfn_native = 
spookyhash<1, 64, false>, + $.hashfn_bswap = spookyhash<1, 64, true> + ); REGISTER_HASH(SpookyHash1_128, - $.desc = "SpookyHash v1, 128-bit result", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 128, - $.verification_LE = 0x8D263080, - $.verification_BE = 0xE9E5572C, - $.hashfn_native = spookyhash<1,128,false>, - $.hashfn_bswap = spookyhash<1,128,true> -); + $.desc = "SpookyHash v1, 128-bit result", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 128, + $.verification_LE = 0x8D263080, + $.verification_BE = 0xE9E5572C, + $.hashfn_native = spookyhash<1, 128, false>, + $.hashfn_bswap = spookyhash<1, 128, true> + ); REGISTER_HASH(SpookyHash2_32, - $.desc = "SpookyHash v2, 32-bit result", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 32, - $.verification_LE = 0xA48BE265, - $.verification_BE = 0x9742FF7D, - $.hashfn_native = spookyhash<2,32,false>, - $.hashfn_bswap = spookyhash<2,32,true>, - $.sort_order = 10 -); + $.desc = "SpookyHash v2, 32-bit result", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 32, + $.verification_LE = 0xA48BE265, + $.verification_BE = 0x9742FF7D, + $.hashfn_native = spookyhash<2, 32, false>, + $.hashfn_bswap = spookyhash<2, 32, true>, + $.sort_order = 10 + ); REGISTER_HASH(SpookyHash2_64, - $.desc = "SpookyHash v2, 64-bit result", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0x972C4BDC, - $.verification_BE = 0x6B914F15, - $.hashfn_native = spookyhash<2,64,false>, - $.hashfn_bswap = spookyhash<2,64,true>, - $.sort_order = 10 -); + $.desc = "SpookyHash v2, 64-bit result", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0x972C4BDC, + 
$.verification_BE = 0x6B914F15, + $.hashfn_native = spookyhash<2, 64, false>, + $.hashfn_bswap = spookyhash<2, 64, true>, + $.sort_order = 10 + ); REGISTER_HASH(SpookyHash2_128, - $.desc = "SpookyHash v2, 128-bit result", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 128, - $.verification_LE = 0x893CFCBE, - $.verification_BE = 0x7C1EA273, - $.hashfn_native = spookyhash<2,128,false>, - $.hashfn_bswap = spookyhash<2,128,true>, - $.sort_order = 10 -); + $.desc = "SpookyHash v2, 128-bit result", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 128, + $.verification_LE = 0x893CFCBE, + $.verification_BE = 0x7C1EA273, + $.hashfn_native = spookyhash<2, 128, false>, + $.hashfn_bswap = spookyhash<2, 128, true>, + $.sort_order = 10 + ); diff --git a/hashes/superfasthash.cpp b/hashes/superfasthash.cpp index 633e66c0..158c98f4 100644 --- a/hashes/superfasthash.cpp +++ b/hashes/superfasthash.cpp @@ -28,44 +28,44 @@ #include "Hashlib.h" //------------------------------------------------------------ -template < bool bswap > -static uint32_t SuperFastHash(const uint8_t * data, size_t len, const uint32_t seed) { +template +static uint32_t SuperFastHash( const uint8_t * data, size_t len, const uint32_t seed ) { uint32_t hash = seed; uint32_t tmp; - size_t rem; + size_t rem; - if (len <= 0 || data == NULL) return 0; + if ((len <= 0) || (data == NULL)) { return 0; } hash += len; - rem = len & 3; + rem = len & 3; len >>= 2; /* Main loop */ - for (;len > 0; len--) { - hash += GET_U16(data, 0); - tmp = (GET_U16(data, 2) << 11) ^ hash; - hash = (hash << 16) ^ tmp; - hash += hash >> 11; - data += 2 * sizeof (uint16_t); + for (; len > 0; len--) { + hash += GET_U16(data, 0); + tmp = (GET_U16(data, 2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + hash += hash >> 11; + data += 2 * sizeof(uint16_t); } /* Handle end cases */ switch (rem) { case 3: - hash += GET_U16(data, 0); - hash 
^= hash << 16; - hash ^= ((uint32_t)(int8_t)data[sizeof (uint16_t)]) << 18; - hash += hash >> 11; - break; + hash += GET_U16(data, 0); + hash ^= hash << 16; + hash ^= ((uint32_t)(int8_t)data[sizeof(uint16_t)]) << 18; + hash += hash >> 11; + break; case 2: - hash += GET_U16(data, 0); - hash ^= hash << 11; - hash += hash >> 17; - break; + hash += GET_U16(data, 0); + hash ^= hash << 11; + hash += hash >> 17; + break; case 1: - hash += (int8_t)(*data); - hash ^= hash << 10; - hash += hash >> 1; + hash += (int8_t)(*data); + hash ^= hash << 10; + hash += hash >> 1; } /* Force "avalanching" of final 127 bits */ @@ -80,29 +80,30 @@ static uint32_t SuperFastHash(const uint8_t * data, size_t len, const uint32_t s } //------------------------------------------------------------ -template < bool bswap > -static void SFH(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void SFH( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = SuperFastHash((const uint8_t *)in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(superfasthash, - $.src_url = "http://www.azillionmonkeys.com/qed/hash.html", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "http://www.azillionmonkeys.com/qed/hash.html", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(SuperFastHash, - $.desc = "Paul Hsieh's SuperFastHash", - $.hash_flags = - FLAG_HASH_ENDIAN_INDEPENDENT | - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_LICENSE_GPL3, - $.bits = 32, - $.verification_LE = 0xCFA52B38, - $.verification_BE = 0xDF0823CA, - $.hashfn_native = SFH, - $.hashfn_bswap = SFH -); + $.desc = "Paul Hsieh's SuperFastHash", + $.hash_flags = + FLAG_HASH_ENDIAN_INDEPENDENT | + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_LICENSE_GPL3, + $.bits = 32, + $.verification_LE = 0xCFA52B38, + 
$.verification_BE = 0xDF0823CA, + $.hashfn_native = SFH, + $.hashfn_bswap = SFH + ); diff --git a/hashes/t1ha.cpp b/hashes/t1ha.cpp index 2750fb30..7c44a5f2 100644 --- a/hashes/t1ha.cpp +++ b/hashes/t1ha.cpp @@ -59,7 +59,7 @@ #include "Mathmult.h" #if defined(HAVE_X86_64_AES) -#include "Intrinsics.h" + #include "Intrinsics.h" #endif #include @@ -80,103 +80,102 @@ defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) -#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__EFFICIENT + #define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__EFFICIENT #else -#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__UNABLE + #define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__UNABLE #endif //------------------------------------------------------------ #if defined(__SANITIZE_ADDRESS__) -#undef T1HA_USE_ALIGNED_ONESHOT_READ -#define T1HA_USE_ALIGNED_ONESHOT_READ 0 -#undef T1HA_SYS_UNALIGNED_ACCESS -#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__UNABLE + #undef T1HA_USE_ALIGNED_ONESHOT_READ + #define T1HA_USE_ALIGNED_ONESHOT_READ 0 + #undef T1HA_SYS_UNALIGNED_ACCESS + #define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__UNABLE #endif #if !defined(PAGESIZE) -#define PAGESIZE 4096 + #define PAGESIZE 4096 #endif -#if T1HA_USE_ALIGNED_ONESHOT_READ && \ - T1HA_SYS_UNALIGNED_ACCESS != T1HA_UNALIGNED_ACCESS__UNABLE && \ +#if T1HA_USE_ALIGNED_ONESHOT_READ && \ + T1HA_SYS_UNALIGNED_ACCESS != T1HA_UNALIGNED_ACCESS__UNABLE && \ defined(PAGESIZE) && PAGESIZE > 42 -#define T1HA_USE_UNALIGNED_ONESHOT_READ 1 -#define can_read_underside(ptr, size) \ - ((size) <= sizeof(uintptr_t) && ((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0) + #define T1HA_USE_UNALIGNED_ONESHOT_READ 1 + #define can_read_underside(ptr, size) \ + ((size) <= sizeof(uintptr_t) && ((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0) #else -#define 
T1HA_USE_UNALIGNED_ONESHOT_READ 0 -#define can_read_underside(ptr, size) false + #define T1HA_USE_UNALIGNED_ONESHOT_READ 0 + #define can_read_underside(ptr, size) false #endif #define ALIGNMENT_16 2 #define ALIGNMENT_32 4 #if defined(HAVE_32BIT_PLATFORM) -#define ALIGNMENT_64 4 + #define ALIGNMENT_64 4 #else -#define ALIGNMENT_64 8 + #define ALIGNMENT_64 8 #endif #if defined(__GNUC__) && defined(__GNUC_MINOR__) -#define __GNUC_PREREQ(maj, min) \ - ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) + #define __GNUC_PREREQ(maj, min) \ + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) #else -#define __GNUC_PREREQ(maj, min) 0 + #define __GNUC_PREREQ(maj, min) 0 #endif #if !defined(__has_builtin) -#define __has_builtin(x) (0) + #define __has_builtin(x) (0) #endif #if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_assume_aligned) -#define read_aligned(ptr, bits) \ - (*(const uint##bits##_t *)__builtin_assume_aligned(ptr, ALIGNMENT_##bits)) + #define read_aligned(ptr, bits) \ + (*(const uint ## bits ## _t *)__builtin_assume_aligned(ptr, ALIGNMENT_ ## bits)) #elif (__GNUC_PREREQ(3, 3) || __has_attribute(aligned)) && !defined(__clang__) -#define read_aligned(ptr, bits) \ - (*(const uint##bits##_t __attribute__((aligned(ALIGNMENT_##bits))) *)(ptr)) + #define read_aligned(ptr, bits) \ + (*(const uint ## bits ## _t __attribute__((aligned(ALIGNMENT_ ## bits))) *)(ptr)) #elif __has_attribute(assume_aligned) static __always_inline const - uint16_t *__attribute__((assume_aligned(ALIGNMENT_16))) - cast_aligned_16(const void *ptr) { - return (const uint16_t *)ptr; +uint16_t * __attribute__((assume_aligned(ALIGNMENT_16))) cast_aligned_16( const void * ptr ) { + return (const uint16_t *)ptr; } + static __always_inline const - uint32_t *__attribute__((assume_aligned(ALIGNMENT_32))) - cast_aligned_32(const void *ptr) { - return (const uint32_t *)ptr; +uint32_t * __attribute__((assume_aligned(ALIGNMENT_32))) cast_aligned_32( const void * ptr ) { + return (const 
uint32_t *)ptr; } + static __always_inline const - uint64_t *__attribute__((assume_aligned(ALIGNMENT_64))) - cast_aligned_64(const void *ptr) { - return (const uint64_t *)ptr; +uint64_t * __attribute__((assume_aligned(ALIGNMENT_64))) cast_aligned_64( const void * ptr ) { + return (const uint64_t *)ptr; } -#define read_aligned(ptr, bits) (*cast_aligned_##bits(ptr)) + #define read_aligned(ptr, bits) (*cast_aligned_ ## bits(ptr)) #elif defined(_MSC_VER) -#define read_aligned(ptr, bits) \ - (*(const __declspec(align(ALIGNMENT_##bits)) uint##bits##_t *)(ptr)) + #define read_aligned(ptr, bits) \ + (*(const __declspec(align(ALIGNMENT_ ## bits)) uint ## bits ## _t *)(ptr)) #else -#define read_aligned(ptr, bits) (*(const uint##bits##_t *)(ptr)) + #define read_aligned(ptr, bits) (*(const uint ## bits ## _t *)(ptr)) #endif /* read_aligned */ //------------------------------------------------------------ // 'magic' primes -static const uint64_t prime_0 = UINT64_C(0xEC99BF0D8372CAAB); -static const uint64_t prime_1 = UINT64_C(0x82434FE90EDCEF39); -static const uint64_t prime_2 = UINT64_C(0xD4F06DB99D67BE4B); -static const uint64_t prime_3 = UINT64_C(0xBD9CACC22C6E9571); -static const uint64_t prime_4 = UINT64_C(0x9C06FAF4D023E3AB); -static const uint64_t prime_5 = UINT64_C(0xC060724A8424F345); -static const uint64_t prime_6 = UINT64_C(0xCB5AF53AE3AAAC31); +static const uint64_t prime_0 = UINT64_C(0xEC99BF0D8372CAAB); +static const uint64_t prime_1 = UINT64_C(0x82434FE90EDCEF39); +static const uint64_t prime_2 = UINT64_C(0xD4F06DB99D67BE4B); +static const uint64_t prime_3 = UINT64_C(0xBD9CACC22C6E9571); +static const uint64_t prime_4 = UINT64_C(0x9C06FAF4D023E3AB); +static const uint64_t prime_5 = UINT64_C(0xC060724A8424F345); +static const uint64_t prime_6 = UINT64_C(0xCB5AF53AE3AAAC31); static const uint32_t prime32_0 = UINT32_C(0x92D78269); static const uint32_t prime32_1 = UINT32_C(0xCA9B4735); @@ -195,17 +194,17 @@ enum t1ha_modes { }; #define MODE_NATIVE(m) (((m) == 
MODE_LE_NATIVE) || ((m) == MODE_BE_NATIVE)) -#define MODE_BSWAP(m) (((m) == MODE_LE_BSWAP ) || ((m) == MODE_BE_BSWAP )) -#define MODE_BE_SYS(m) (((m) == MODE_BE_BSWAP ) || ((m) == MODE_BE_NATIVE)) -#define MODE_LE_SYS(m) (((m) == MODE_LE_NATIVE) || ((m) == MODE_LE_BSWAP )) -#define MODE_BE_OUT(m) (((m) == MODE_LE_BSWAP ) || ((m) == MODE_BE_NATIVE)) -#define MODE_LE_OUT(m) (((m) == MODE_LE_NATIVE) || ((m) == MODE_BE_BSWAP )) +#define MODE_BSWAP(m) (((m) == MODE_LE_BSWAP) || ((m) == MODE_BE_BSWAP)) +#define MODE_BE_SYS(m) (((m) == MODE_BE_BSWAP) || ((m) == MODE_BE_NATIVE)) +#define MODE_LE_SYS(m) (((m) == MODE_LE_NATIVE) || ((m) == MODE_LE_BSWAP)) +#define MODE_BE_OUT(m) (((m) == MODE_LE_BSWAP) || ((m) == MODE_BE_NATIVE)) +#define MODE_LE_OUT(m) (((m) == MODE_LE_NATIVE) || ((m) == MODE_BE_BSWAP)) //------------------------------------------------------------ -template < enum t1ha_modes mode, bool aligned > -static FORCE_INLINE uint32_t fetch16(const void * v) { +template +static FORCE_INLINE uint32_t fetch16( const void * v ) { constexpr bool force_aligned = (T1HA_SYS_UNALIGNED_ACCESS != T1HA_UNALIGNED_ACCESS__UNABLE); - uint16_t result; + uint16_t result; if (aligned) { assert(((uintptr_t)v) % ALIGNMENT_16 == 0); } @@ -213,7 +212,7 @@ static FORCE_INLINE uint32_t fetch16(const void * v) { return COND_BSWAP(read_aligned(v, 16), MODE_BSWAP(mode)); } - const uint8_t *p = (const uint8_t *)v; + const uint8_t * p = (const uint8_t *)v; if (MODE_BE_OUT(mode)) { return (uint16_t)p[0] << 8 | p[1]; } else { @@ -221,10 +220,10 @@ static FORCE_INLINE uint32_t fetch16(const void * v) { } } -template < enum t1ha_modes mode, bool aligned > -static FORCE_INLINE uint32_t fetch32(const void * v) { +template +static FORCE_INLINE uint32_t fetch32( const void * v ) { constexpr bool force_aligned = (T1HA_SYS_UNALIGNED_ACCESS != T1HA_UNALIGNED_ACCESS__UNABLE); - uint32_t result; + uint32_t result; if (aligned) { assert(((uintptr_t)v) % ALIGNMENT_32 == 0); } @@ -233,18 +232,18 @@ static 
FORCE_INLINE uint32_t fetch32(const void * v) { } if (MODE_BE_OUT(mode)) { - return (uint32_t)fetch16(v) << 16 | - fetch16((const uint8_t *)v + 2); + return (uint32_t)fetch16(v) << 16 | + fetch16((const uint8_t *)v + 2); } else { - return fetch16(v) | - (uint32_t)fetch16((const uint8_t *)v + 2) << 16; + return fetch16(v) | + (uint32_t)fetch16((const uint8_t *)v + 2) << 16; } } -template < enum t1ha_modes mode, bool aligned > -static FORCE_INLINE uint64_t fetch64(const void * v) { +template +static FORCE_INLINE uint64_t fetch64( const void * v ) { constexpr bool force_aligned = (T1HA_SYS_UNALIGNED_ACCESS != T1HA_UNALIGNED_ACCESS__UNABLE); - uint64_t result; + uint64_t result; if (aligned) { assert(((uintptr_t)v) % ALIGNMENT_64 == 0); } @@ -253,19 +252,19 @@ static FORCE_INLINE uint64_t fetch64(const void * v) { } if (MODE_BE_OUT(mode)) { - return (uint64_t)fetch32(v) << 32 | - fetch32((const uint8_t *)v + 4); + return (uint64_t)fetch32(v) << 32 | + fetch32((const uint8_t *)v + 4); } else { - return fetch32(v) | - (uint64_t)fetch32((const uint8_t *)v + 4) << 32; + return fetch32(v) | + (uint64_t)fetch32((const uint8_t *)v + 4) << 32; } } //------------------------------------------------------------ -template < enum t1ha_modes mode, bool aligned > -static FORCE_INLINE uint32_t tail32(const void *v, size_t tail) { - constexpr bool unaligned_wordwise = (T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT); - const uint8_t *const p = (const uint8_t *)v; +template +static FORCE_INLINE uint32_t tail32( const void * v, size_t tail ) { + constexpr bool unaligned_wordwise = (T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT); + const uint8_t * const p = (const uint8_t *)v; uint32_t r = 0; if (aligned && T1HA_USE_ALIGNED_ONESHOT_READ) { @@ -283,46 +282,48 @@ static FORCE_INLINE uint32_t tail32(const void *v, size_t tail) { * for the reminder. 
*/ const unsigned offset = (4 - tail) & 3; - const unsigned shift = offset << 3; + const unsigned shift = offset << 3; if (MODE_LE_OUT(mode)) { if (likely(can_read_underside(p, 4))) { - return fetch32(p - offset) >> shift; + return fetch32(p - offset) >> shift; } - return fetch32(p) & ((~UINT32_C(0)) >> shift); + return fetch32(p) & ((~UINT32_C(0)) >> shift); } else { if (likely(can_read_underside(p, 4))) { - return fetch32(p - offset) & ((~UINT32_C(0)) >> shift); + return fetch32(p - offset) & ((~UINT32_C(0)) >> shift); } - return fetch32(p) >> shift; + return fetch32(p) >> shift; } } if ((mode == MODE_LE_NATIVE) && (aligned || unaligned_wordwise)) { switch (tail & 3) { case 3: - r = (uint32_t)p[2] << 16; - /* fall through */ + r = (uint32_t)p[2] << 16; + /* fall through */ case 2: - return r + fetch16(p); + return r + fetch16(p); case 1: - return p[0]; + return p[0]; case 0: - return fetch32(v); + return fetch32(v); } } if ((mode == MODE_BE_NATIVE) && (aligned || unaligned_wordwise)) { - /* For most CPUs this code is better when not needed - * copying for alignment or byte reordering. */ + /* + * For most CPUs this code is better when not needed + * copying for alignment or byte reordering. 
+ */ switch (tail & 3) { case 3: - return fetch16(p) << 8 | p[2]; + return fetch16(p) << 8 | p[2]; case 2: - return fetch16(p); + return fetch16(p); case 1: - return p[0]; + return p[0]; case 0: - return fetch32(p); + return fetch32(p); } } @@ -330,19 +331,19 @@ static FORCE_INLINE uint32_t tail32(const void *v, size_t tail) { ((mode == MODE_LE_NATIVE) && !aligned && !unaligned_wordwise)) { switch (tail & 3) { case 0: - r += p[3]; - r <<= 8; - /* fall through */ + r += p[3]; + r <<= 8; + /* fall through */ case 3: - r += p[2]; - r <<= 8; - /* fall through */ + r += p[2]; + r <<= 8; + /* fall through */ case 2: - r += p[1]; - r <<= 8; - /* fall through */ + r += p[1]; + r <<= 8; + /* fall through */ case 1: - return r + p[0]; + return r + p[0]; } } @@ -350,14 +351,14 @@ static FORCE_INLINE uint32_t tail32(const void *v, size_t tail) { ((mode == MODE_BE_NATIVE) && !aligned && !unaligned_wordwise)) { switch (tail & 3) { case 0: - return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | - (uint32_t)p[0] << 24; + return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | + (uint32_t)p[0] << 24; case 3: - return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; + return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; case 2: - return p[1] | (uint32_t)p[0] << 8; + return p[1] | (uint32_t)p[0] << 8; case 1: - return p[0]; + return p[0]; } } @@ -366,19 +367,19 @@ static FORCE_INLINE uint32_t tail32(const void *v, size_t tail) { } //------------------------------------------------------------ -template < enum t1ha_modes mode, bool aligned > -static FORCE_INLINE uint64_t tail64(const void *v, size_t tail) { - constexpr bool unaligned_wordwise = (T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT); - const uint8_t *const p = (const uint8_t *)v; +template +static FORCE_INLINE uint64_t tail64( const void * v, size_t tail ) { + constexpr bool unaligned_wordwise = (T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT); + const uint8_t * const p = (const 
uint8_t *)v; uint64_t r = 0; if (aligned && T1HA_USE_ALIGNED_ONESHOT_READ) { /* We can perform a 'oneshot' read, which is little bit faster. */ const unsigned shift = ((8 - tail) & 7) << 3; if (MODE_LE_OUT(mode)) { - return fetch64(p) & ((~UINT64_C(0)) >> shift); + return fetch64(p) & ((~UINT64_C(0)) >> shift); } else { - return fetch64(p) >> shift; + return fetch64(p) >> shift; } } else if (!aligned && T1HA_USE_UNALIGNED_ONESHOT_READ) { /* @@ -387,17 +388,17 @@ static FORCE_INLINE uint64_t tail64(const void *v, size_t tail) { * for the reminder. */ const unsigned offset = (8 - tail) & 7; - const unsigned shift = offset << 3; + const unsigned shift = offset << 3; if (MODE_LE_OUT(mode)) { if (likely(can_read_underside(p, 8))) { - return fetch64(p - offset) >> shift; + return fetch64(p - offset) >> shift; } - return fetch64(p) & ((~UINT64_C(0)) >> shift); + return fetch64(p) & ((~UINT64_C(0)) >> shift); } else { if (likely(can_read_underside(p, 8))) { - return fetch64(p - offset) & ((~UINT64_C(0)) >> shift); + return fetch64(p - offset) & ((~UINT64_C(0)) >> shift); } - return fetch64(p) >> shift; + return fetch64(p) >> shift; } } @@ -405,27 +406,27 @@ static FORCE_INLINE uint64_t tail64(const void *v, size_t tail) { /* For most CPUs this code is better when not needed byte reordering. 
*/ switch (tail & 7) { case 0: - return fetch64(p); + return fetch64(p); case 7: - r = (uint64_t)p[6] << 8; - /* fall through */ + r = (uint64_t)p[6] << 8; + /* fall through */ case 6: - r += p[5]; - r <<= 8; - /* fall through */ + r += p [5]; + r <<= 8; + /* fall through */ case 5: - r += p[4]; - r <<= 32; - /* fall through */ + r += p [4]; + r <<= 32; + /* fall through */ case 4: - return r + fetch32(p); + return r + fetch32(p); case 3: - r = (uint64_t)p[2] << 16; - /* fall through */ + r = (uint64_t)p[2] << 16; + /* fall through */ case 2: - return r + fetch16(p); + return r + fetch16(p); case 1: - return p[0]; + return p[0]; } } @@ -433,22 +434,22 @@ static FORCE_INLINE uint64_t tail64(const void *v, size_t tail) { /* For most CPUs this code is better when not byte reordering. */ switch (tail & 7) { case 1: - return p[0]; + return p[0]; case 2: - return fetch16(p); + return fetch16(p); case 3: - return (uint32_t)fetch16(p) << 8 | p[2]; + return (uint32_t)fetch16(p) << 8 | p[2]; case 4: - return fetch32(p); + return fetch32(p); case 5: - return (uint64_t)fetch32(p) << 8 | p[4]; + return (uint64_t)fetch32(p) << 8 | p[4]; case 6: - return (uint64_t)fetch32(p) << 16 | fetch16(p + 4); + return (uint64_t)fetch32(p) << 16 | fetch16(p + 4); case 7: - return (uint64_t)fetch32(p) << 24 | - (uint32_t)fetch16(p + 4) << 8 | p[6]; + return (uint64_t)fetch32(p) << 24 | + (uint32_t)fetch16(p + 4) << 8 | p[6]; case 0: - return fetch64(p); + return fetch64(p); } } @@ -456,34 +457,34 @@ static FORCE_INLINE uint64_t tail64(const void *v, size_t tail) { ((mode == MODE_LE_NATIVE) && !aligned && !unaligned_wordwise)) { switch (tail & 7) { case 0: - r = p[7] << 8; - /* fall through */ + r = p [7] << 8; + /* fall through */ case 7: - r += p[6]; - r <<= 8; - /* fall through */ + r += p[6]; + r <<= 8; + /* fall through */ case 6: - r += p[5]; - r <<= 8; - /* fall through */ + r += p[5]; + r <<= 8; + /* fall through */ case 5: - r += p[4]; - r <<= 8; - /* fall through */ + r += p[4]; + r 
<<= 8; + /* fall through */ case 4: - r += p[3]; - r <<= 8; - /* fall through */ + r += p[3]; + r <<= 8; + /* fall through */ case 3: - r += p[2]; - r <<= 8; - /* fall through */ + r += p[2]; + r <<= 8; + /* fall through */ case 2: - r += p[1]; - r <<= 8; - /* fall through */ + r += p[1]; + r <<= 8; + /* fall through */ case 1: - return r + p[0]; + return r + p[0]; } } @@ -491,28 +492,28 @@ static FORCE_INLINE uint64_t tail64(const void *v, size_t tail) { ((mode == MODE_BE_NATIVE) && !aligned && !unaligned_wordwise)) { switch (tail & 7) { case 1: - return p[0]; + return p[0]; case 2: - return p[1] | (uint32_t)p[0] << 8; + return p[1] | (uint32_t)p[0] << 8; case 3: - return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; + return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; case 4: - return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | - (uint32_t)p[0] << 24; + return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | + (uint32_t)p[0] << 24; case 5: - return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 | - (uint32_t)p[1] << 24 | (uint64_t)p[0] << 32; + return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 | + (uint32_t)p[1] << 24 | (uint64_t)p[0] << 32; case 6: - return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 | - (uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40; + return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 | + (uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40; case 7: - return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 | - (uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 | - (uint64_t)p[0] << 48; + return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 | + (uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 | + (uint64_t)p[0] << 48; case 0: - return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 | - (uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 | - (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56; + return p[7] | 
(uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 | + (uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 | + (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56; } } @@ -522,127 +523,131 @@ static FORCE_INLINE uint64_t tail64(const void *v, size_t tail) { //------------------------------------------------------------ // T1HA0 (non-AES version) -static FORCE_INLINE void mixup32(uint32_t *a, uint32_t *b, uint32_t v, - uint32_t prime) { +static FORCE_INLINE void mixup32( uint32_t * a, uint32_t * b, uint32_t v, uint32_t prime ) { uint32_t rlo, rhi; + mult32_64(rlo, rhi, *b + v, prime); *a ^= rlo; *b += rhi; } -static FORCE_INLINE uint64_t final32(uint32_t a, uint32_t b) { - uint64_t l = (b ^ ROTR32(a, 13)) | (uint64_t)a << 32; - l *= prime_0; - l ^= l >> 41; - l *= prime_4; - l ^= l >> 47; - l *= prime_6; - return l; +static FORCE_INLINE uint64_t final32( uint32_t a, uint32_t b ) { + uint64_t l = (b ^ ROTR32(a, 13)) | (uint64_t)a << 32; + + l *= prime_0; + l ^= l >> 41; + l *= prime_4; + l ^= l >> 47; + l *= prime_6; + return l; } -template < enum t1ha_modes mode, bool aligned32 > -static uint64_t t1ha0_32_impl(const void *data, size_t len, uint64_t seed) { - uint32_t a = ROTR32((uint32_t)len, 17) + (uint32_t)seed; - uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32); +template +static uint64_t t1ha0_32_impl( const void * data, size_t len, uint64_t seed ) { + uint32_t a = ROTR32((uint32_t)len, 17) + (uint32_t)seed; + uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32); - const uint32_t *v = (const uint32_t *)data; + const uint32_t * v = (const uint32_t *)data; - if (unlikely(len > 16)) { - uint32_t c = ~a; - uint32_t d = ROTR32(b, 5); - const uint32_t *detent = - (const uint32_t *)((const uint8_t *)data + len - 15); - do { - const uint32_t w0 = fetch32(v + 0); - const uint32_t w1 = fetch32(v + 1); - const uint32_t w2 = fetch32(v + 2); - const uint32_t w3 = fetch32(v + 3); - v += 4; - prefetch(v); - - const uint32_t d13 = w1 + ROTR32(w3 + d, 17); - const uint32_t c02 
= w0 ^ ROTR32(w2 + c, 11); - d ^= ROTR32(a + w0, 3); - c ^= ROTR32(b + w1, 7); - b = prime32_1 * (c02 + w3); - a = prime32_0 * (d13 ^ w2); - } while (likely(v < detent)); - - c += a; - d += b; - a ^= prime32_6 * (ROTR32(c, 16) + d); - b ^= prime32_5 * (c + ROTR32(d, 16)); - - len &= 15; - } - - switch (len) { - default: - mixup32(&a, &b, fetch32(v++), prime32_4); - /* fall through */ - case 12: - case 11: - case 10: - case 9: - mixup32(&b, &a, fetch32(v++), prime32_3); - /* fall through */ - case 8: - case 7: - case 6: - case 5: - mixup32(&a, &b, fetch32(v++), prime32_2); - /* fall through */ - case 4: - case 3: - case 2: - case 1: - mixup32(&b, &a, tail32(v, len), prime32_1); - /* fall through */ - case 0: - return final32(a, b); - } + if (unlikely(len > 16)) { + uint32_t c = ~a; + uint32_t d = ROTR32(b, 5); + const uint32_t * detent = + (const uint32_t *)((const uint8_t *)data + len - 15); + do { + const uint32_t w0 = fetch32(v + 0); + const uint32_t w1 = fetch32(v + 1); + const uint32_t w2 = fetch32(v + 2); + const uint32_t w3 = fetch32(v + 3); + v += 4; + prefetch(v); + + const uint32_t d13 = w1 + ROTR32(w3 + d, 17); + const uint32_t c02 = w0 ^ ROTR32(w2 + c, 11); + d ^= ROTR32(a + w0, 3); + c ^= ROTR32(b + w1, 7); + b = prime32_1 * (c02 + w3); + a = prime32_0 * (d13 ^ w2); + } while (likely(v < detent)); + + c += a; + d += b; + a ^= prime32_6 * (ROTR32(c , 16) + d); + b ^= prime32_5 * (c + ROTR32(d, 16) ); + + len &= 15; + } + + switch (len) { + default: + mixup32(&a, &b, fetch32(v++) , prime32_4); + /* fall through */ + case 12: + case 11: + case 10: + case 9: + mixup32(&b, &a, fetch32(v++) , prime32_3); + /* fall through */ + case 8: + case 7: + case 6: + case 5: + mixup32(&a, &b, fetch32(v++) , prime32_2); + /* fall through */ + case 4: + case 3: + case 2: + case 1: + mixup32(&b, &a, tail32(v, len), prime32_1); + /* fall through */ + case 0: + return final32(a, b); + } } //------------------------------------------------------------ // T1HA1 /* xor high and 
low parts of full 128-bit product */ -static FORCE_INLINE uint64_t mux64(uint64_t v, uint64_t prime) { - uint64_t l, h; - mult64_128(l, h, v, prime); - return l ^ h; +static FORCE_INLINE uint64_t mux64( uint64_t v, uint64_t prime ) { + uint64_t l, h; + + mult64_128(l, h, v, prime); + return l ^ h; } /* xor-mul-xor mixer */ -static FORCE_INLINE uint64_t mix64(uint64_t v, uint64_t p) { - v *= p; - return v ^ ROTR64(v, 41); +static FORCE_INLINE uint64_t mix64( uint64_t v, uint64_t p ) { + v *= p; + return v ^ ROTR64(v, 41); } -static FORCE_INLINE uint64_t final_weak_avalanche(uint64_t a, uint64_t b) { - /* LY: for performance reason on a some not high-end CPUs - * I replaced the second mux64() operation by mix64(). - * Unfortunately this approach fails the "strict avalanche criteria", - * see test results at https://github.com/demerphq/smhasher. */ - return mux64(ROTR64(a + b, 17), prime_4) + mix64(a ^ b, prime_0); +static FORCE_INLINE uint64_t final_weak_avalanche( uint64_t a, uint64_t b ) { + /* + * LY: for performance reason on a some not high-end CPUs + * I replaced the second mux64() operation by mix64(). + * Unfortunately this approach fails the "strict avalanche criteria", + * see test results at https://github.com/demerphq/smhasher. 
+ */ + return mux64(ROTR64(a + b, 17), prime_4) + mix64(a ^ b, prime_0); } -template < enum t1ha_modes mode, bool aligned64 > -static uint64_t t1ha1_impl(const void *data, size_t len, uint64_t seed) { - const uint64_t *v = (const uint64_t *)data; - uint64_t a = seed; - uint64_t b = len; +template +static uint64_t t1ha1_impl( const void * data, size_t len, uint64_t seed ) { + const uint64_t * v = (const uint64_t *)data; + uint64_t a = seed; + uint64_t b = len; if (unlikely(len > 32)) { - uint64_t c = ROTR64(len, 17) + seed; - uint64_t d = len ^ ROTR64(seed, 17); - const uint64_t *detent = - (const uint64_t *)((const uint8_t *)data + len - 31); + uint64_t c = ROTR64(len, 17) + seed; + uint64_t d = len ^ ROTR64(seed, 17); + const uint64_t * detent = + (const uint64_t *)((const uint8_t *)data + len - 31); do { - const uint64_t w0 = fetch64(v + 0); - const uint64_t w1 = fetch64(v + 1); - const uint64_t w2 = fetch64(v + 2); - const uint64_t w3 = fetch64(v + 3); + const uint64_t w0 = fetch64(v + 0); + const uint64_t w1 = fetch64(v + 1); + const uint64_t w2 = fetch64(v + 2); + const uint64_t w3 = fetch64(v + 3); v += 4; prefetch(v); @@ -654,15 +659,15 @@ static uint64_t t1ha1_impl(const void *data, size_t len, uint64_t seed) { a ^= prime_1 * (d02 + w3); } while (likely(v < detent)); - a ^= prime_6 * (ROTR64(c, 17) + d); - b ^= prime_5 * (c + ROTR64(d, 17)); + a ^= prime_6 * (ROTR64(c , 17) + d); + b ^= prime_5 * (c + ROTR64(d, 17) ); len &= 31; } switch (len) { default: - b += mux64(fetch64(v++), prime_4); - /* fall through */ + b += mux64(fetch64(v++) , prime_4); + /* fall through */ case 24: case 23: case 22: @@ -671,8 +676,8 @@ static uint64_t t1ha1_impl(const void *data, size_t len, uint64_t seed) { case 19: case 18: case 17: - a += mux64(fetch64(v++), prime_3); - /* fall through */ + a += mux64(fetch64(v++) , prime_3); + /* fall through */ case 16: case 15: case 14: @@ -680,21 +685,21 @@ static uint64_t t1ha1_impl(const void *data, size_t len, uint64_t seed) { case 
12: case 11: case 10: - case 9: - b += mux64(fetch64(v++), prime_2); - /* fall through */ - case 8: - case 7: - case 6: - case 5: - case 4: - case 3: - case 2: - case 1: - a += mux64(tail64(v, len), prime_1); - /* fall through */ - case 0: - return final_weak_avalanche(a, b); + case 9: + b += mux64(fetch64(v++) , prime_2); + /* fall through */ + case 8: + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + case 1: + a += mux64(tail64(v, len), prime_1); + /* fall through */ + case 0: + return final_weak_avalanche(a, b); } } @@ -703,99 +708,103 @@ static uint64_t t1ha1_impl(const void *data, size_t len, uint64_t seed) { // XXX T1HA_ALIGN_PREFIX and T1HA_ALIGN_SUFFIX were not ported typedef union t1ha_state256 { - uint8_t bytes[32]; - uint32_t u32[8]; - uint64_t u64[4]; - struct { - uint64_t a, b, c, d; - } n; + uint8_t bytes[32]; + uint32_t u32[8]; + uint64_t u64[4]; + struct { + uint64_t a, b, c, d; + } n; } t1ha_state256_t; typedef struct t1ha_context { - t1ha_state256_t state; - t1ha_state256_t buffer; - size_t partial; - uint64_t total; + t1ha_state256_t state; + t1ha_state256_t buffer; + size_t partial; + uint64_t total; } t1ha_context_t; -static FORCE_INLINE void init_ab(t1ha_state256_t * s, uint64_t x, uint64_t y) { - s->n.a = x; - s->n.b = y; +static FORCE_INLINE void init_ab( t1ha_state256_t * s, uint64_t x, uint64_t y ) { + s->n.a = x; + s->n.b = y; } -static FORCE_INLINE void init_cd(t1ha_state256_t * s, uint64_t x, uint64_t y) { - s->n.c = ROTR64(y, 23) + ~x; - s->n.d = ~y + ROTR64(x, 19); +static FORCE_INLINE void init_cd( t1ha_state256_t * s, uint64_t x, uint64_t y ) { + s->n.c = ROTR64(y, 23) + ~x; + s->n.d = ~y + ROTR64(x, 19); } -static FORCE_INLINE void squash(t1ha_state256_t * s) { - s->n.a ^= prime_6 * (s->n.c + ROTR64(s->n.d, 23)); - s->n.b ^= prime_5 * (ROTR64(s->n.c, 19) + s->n.d); +static FORCE_INLINE void squash( t1ha_state256_t * s ) { + s->n.a ^= prime_6 * (s->n.c + ROTR64(s->n.d, 23) ); + s->n.b ^= prime_5 * (ROTR64(s->n.c , 19) + 
s->n.d); } -static FORCE_INLINE void mixup64(uint64_t * RESTRICT a, - uint64_t * RESTRICT b, uint64_t v, uint64_t prime) { +static FORCE_INLINE void mixup64( uint64_t * RESTRICT a, uint64_t * RESTRICT b, uint64_t v, uint64_t prime ) { uint64_t l, h; + mult64_128(l, h, *b + v, prime); *a ^= l; *b += h; } -static FORCE_INLINE uint64_t final64(uint64_t a, uint64_t b) { - uint64_t x = (a + ROTR64(b, 41)) * prime_0; - uint64_t y = (ROTR64(a, 23) + b) * prime_6; - return mux64(x ^ y, prime_5); +static FORCE_INLINE uint64_t final64( uint64_t a, uint64_t b ) { + uint64_t x = (a + ROTR64(b, 41) ) * prime_0; + uint64_t y = (ROTR64(a , 23) + b) * prime_6; + + return mux64(x ^ y, prime_5); } -static FORCE_INLINE uint64_t final128(uint64_t a, uint64_t b, uint64_t c, - uint64_t d, uint64_t * h) { - mixup64(&a, &b, ROTR64(c, 41) ^ d, prime_0); - mixup64(&b, &c, ROTR64(d, 23) ^ a, prime_6); - mixup64(&c, &d, ROTR64(a, 19) ^ b, prime_5); - mixup64(&d, &a, ROTR64(b, 31) ^ c, prime_4); - *h = c + d; - return a ^ b; +static FORCE_INLINE uint64_t final128( uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t * h ) { + mixup64(&a, &b, ROTR64(c, 41) ^ d, prime_0); + mixup64(&b, &c, ROTR64(d, 23) ^ a, prime_6); + mixup64(&c, &d, ROTR64(a, 19) ^ b, prime_5); + mixup64(&d, &a, ROTR64(b, 31) ^ c, prime_4); + *h = c + d; + return a ^ b; } -template < enum t1ha_modes mode, bool aligned64 > -static void T1HA2_UPDATE(t1ha_state256_t * const s, const uint64_t *v) { - const uint64_t w0 = fetch64(v + 0); - const uint64_t w1 = fetch64(v + 1); - const uint64_t w2 = fetch64(v + 2); - const uint64_t w3 = fetch64(v + 3); +template +static void T1HA2_UPDATE( t1ha_state256_t * const s, const uint64_t * v ) { + const uint64_t w0 = fetch64(v + 0); + const uint64_t w1 = fetch64(v + 1); + const uint64_t w2 = fetch64(v + 2); + const uint64_t w3 = fetch64(v + 3); const uint64_t d02 = w0 + ROTR64(w2 + s->n.d, 56); const uint64_t c13 = w1 + ROTR64(w3 + s->n.c, 19); + s->n.d ^= s->n.b + ROTR64(w1, 38); s->n.c 
^= s->n.a + ROTR64(w0, 57); s->n.b ^= prime_6 * (c13 + w2); s->n.a ^= prime_5 * (d02 + w3); } -template < enum t1ha_modes mode, bool aligned64 > -static const void * T1HA2_LOOP(t1ha_state256_t * const state, const void *data, size_t len) { - const void *detent = (const uint8_t *)data + len - 31; +template +static const void * T1HA2_LOOP( t1ha_state256_t * const state, const void * data, size_t len ) { + const void * detent = (const uint8_t *)data + len - 31; + do { - const uint64_t *v = (const uint64_t *)data; + const uint64_t * v = (const uint64_t *)data; data = v + 4; prefetch(data); - T1HA2_UPDATE(state, v); + T1HA2_UPDATE(state, v); } while (likely(data < detent)); return data; } -template < enum t1ha_modes mode, bool aligned64, bool use_ABCD > -static uint64_t T1HA2_TAIL(t1ha_state256_t * const s, const void *data, size_t len, uint64_t * RESTRICT extra_result = NULL) { - const uint64_t *v = (const uint64_t *)data; - uint64_t val; +template +static uint64_t T1HA2_TAIL( t1ha_state256_t * const s, const void * data, + size_t len, uint64_t * RESTRICT extra_result = NULL ) { + const uint64_t * v = (const uint64_t *)data; + uint64_t val; + switch (len) { default: - if (use_ABCD) { - mixup64(&s->n.a, &s->n.d, fetch64(v++), prime_4); - } else { - mixup64(&s->n.a, &s->n.b, fetch64(v++), prime_4); - } - /* fall through */ + if (use_ABCD) { + mixup64(&s->n.a, &s->n.d, fetch64(v++), prime_4); + } else { + mixup64(&s->n.a, &s->n.b, fetch64(v++), prime_4); + } + /* fall through */ case 24: case 23: case 22: @@ -804,9 +813,9 @@ static uint64_t T1HA2_TAIL(t1ha_state256_t * const s, const void *data, size_t l case 19: case 18: case 17: - // ".b, .a" for either value of use_ABCD - mixup64(&s->n.b, &s->n.a, fetch64(v++), prime_3); - /* fall through */ + // ".b, .a" for either value of use_ABCD + mixup64(&s->n.b, &s->n.a, fetch64(v++), prime_3); + /* fall through */ case 16: case 15: case 14: @@ -814,92 +823,91 @@ static uint64_t T1HA2_TAIL(t1ha_state256_t * const s, const void 
*data, size_t l case 12: case 11: case 10: - case 9: - if (use_ABCD) { - mixup64(&s->n.c, &s->n.b, fetch64(v++), prime_2); - } else { - mixup64(&s->n.a, &s->n.b, fetch64(v++), prime_2); - } - /* fall through */ - case 8: - case 7: - case 6: - case 5: - case 4: - case 3: - case 2: - case 1: - val = tail64(v, len); - if (use_ABCD) { - mixup64(&s->n.d, &s->n.c, val, prime_1); - } else { - mixup64(&s->n.b, &s->n.a, val, prime_1); - } - /* fall through */ - case 0: - if (use_ABCD) { - return final128(s->n.a, s->n.b, s->n.c, s->n.d, extra_result); - } else { - return final64(s->n.a, s->n.b); - } + case 9: + if (use_ABCD) { + mixup64(&s->n.c, &s->n.b, fetch64(v++), prime_2); + } else { + mixup64(&s->n.a, &s->n.b, fetch64(v++), prime_2); + } + /* fall through */ + case 8: + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + case 1: + val = tail64(v, len); + if (use_ABCD) { + mixup64(&s->n.d, &s->n.c, val, prime_1); + } else { + mixup64(&s->n.b, &s->n.a, val, prime_1); + } + /* fall through */ + case 0: + if (use_ABCD) { + return final128(s->n.a, s->n.b, s->n.c, s->n.d, extra_result); + } else { + return final64(s->n.a, s->n.b); + } } } -static void t1ha2_init(t1ha_context_t *ctx, uint64_t seed_x, uint64_t seed_y) { - init_ab(&ctx->state, seed_x, seed_y); - init_cd(&ctx->state, seed_x, seed_y); - ctx->partial = 0; - ctx->total = 0; +static void t1ha2_init( t1ha_context_t * ctx, uint64_t seed_x, uint64_t seed_y ) { + init_ab(&ctx->state, seed_x, seed_y); + init_cd(&ctx->state, seed_x, seed_y); + ctx->partial = 0; + ctx->total = 0; } -template < enum t1ha_modes mode > -static void t1ha2_update(t1ha_context_t * RESTRICT ctx, const void * RESTRICT data, - size_t length) { - ctx->total += length; - - if (ctx->partial) { - const size_t left = 32 - ctx->partial; - const size_t chunk = (length >= left) ? 
left : length; - memcpy(ctx->buffer.bytes + ctx->partial, data, chunk); - ctx->partial += chunk; - if (ctx->partial < 32) { - assert(left >= length); - return; +template +static void t1ha2_update( t1ha_context_t * RESTRICT ctx, const void * RESTRICT data, size_t length ) { + ctx->total += length; + + if (ctx->partial) { + const size_t left = 32 - ctx->partial; + const size_t chunk = (length >= left) ? left : length; + memcpy(ctx->buffer.bytes + ctx->partial, data, chunk); + ctx->partial += chunk; + if (ctx->partial < 32) { + assert(left >= length); + return; + } + ctx->partial = 0; + data = (const uint8_t *)data + chunk; + length -= chunk; + T1HA2_UPDATE(&ctx->state, ctx->buffer.u64); + } + + if (length >= 32) { + if ((T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT) || + ((((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0)) { + data = T1HA2_LOOP(&ctx->state, data, length); + } else { + data = T1HA2_LOOP(&ctx->state, data, length); + } + length &= 31; + } + + if (length) { + memcpy(ctx->buffer.bytes, data, ctx->partial = length); } - ctx->partial = 0; - data = (const uint8_t *)data + chunk; - length -= chunk; - T1HA2_UPDATE(&ctx->state, ctx->buffer.u64); - } - - if (length >= 32) { - if ((T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT) || - ((((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0)) { - data = T1HA2_LOOP(&ctx->state, data, length); - } else { - data = T1HA2_LOOP(&ctx->state, data, length); - } - length &= 31; - } - - if (length) { - memcpy(ctx->buffer.bytes, data, ctx->partial = length); - } } -template < enum t1ha_modes mode > -static uint64_t t1ha2_final(t1ha_context_t * RESTRICT ctx, - uint64_t * RESTRICT extra_result) { - uint64_t bits = (ctx->total << 3) ^ (UINT64_C(1) << 63); - bits = COND_BSWAP(bits, MODE_BE_SYS(mode)); - t1ha2_update(ctx, &bits, 8); +template +static uint64_t t1ha2_final( t1ha_context_t * RESTRICT ctx, uint64_t * RESTRICT extra_result ) { + uint64_t bits = (ctx->total << 3) ^ (UINT64_C(1) << 63); - if 
(likely(!extra_result)) { - squash(&ctx->state); - return T1HA2_TAIL(&ctx->state, ctx->buffer.u64, ctx->partial); - } + bits = COND_BSWAP(bits, MODE_BE_SYS(mode)); + t1ha2_update(ctx, &bits, 8); - return T1HA2_TAIL(&ctx->state, ctx->buffer.u64, ctx->partial, extra_result); + if (likely(!extra_result)) { + squash(&ctx->state); + return T1HA2_TAIL(&ctx->state, ctx->buffer.u64, ctx->partial); + } + + return T1HA2_TAIL(&ctx->state, ctx->buffer.u64, ctx->partial, extra_result); } //------------------------------------------------------------ @@ -909,184 +917,186 @@ static uint64_t t1ha2_final(t1ha_context_t * RESTRICT ctx, // versionA is t1ha0_ia32aes_avx1/t1ha0_ia32aes_noavx, which appear to // be identical. versionB is t1ha0_ia32aes_avx2, which does not appear // to need AVX2. ¯\_(ツ)_/¯ -template < enum t1ha_modes mode, bool versionB > -static uint64_t t1ha0_aes_impl(const void *data, size_t len, uint64_t seed) { - uint64_t a = seed; - uint64_t b = len; - - if (unlikely(len > 32)) { - __m128i x = _mm_set_epi64x(a, b); - __m128i y; - - if (versionB) { - const __m128i *v = (const __m128i *)data; - const __m128i *const detent = - (const __m128i *)((const uint8_t *)data + (len & ~15ul)); - y = _mm_aesenc_si128(x, _mm_set_epi64x(prime_0, prime_1)); - data = detent; - - if (len & 16) { - x = _mm_add_epi64(x, _mm_loadu_si128(v++)); - y = _mm_aesenc_si128(x, y); - } - len &= 15; +template +static uint64_t t1ha0_aes_impl( const void * data, size_t len, uint64_t seed ) { + uint64_t a = seed; + uint64_t b = len; - if (v + 7 < detent) { - __m128i salt = y; - do { - __m128i t = _mm_aesenc_si128(_mm_loadu_si128(v++), salt); - t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); - t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); - t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); - - t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); - t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); - t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); - t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); - - salt = 
_mm_add_epi64(salt, _mm_set_epi64x(prime_5, prime_6)); - t = _mm_aesenc_si128(x, t); - x = _mm_add_epi64(y, x); - y = t; - } while (v + 7 < detent); - } + if (unlikely(len > 32)) { + __m128i x = _mm_set_epi64x(a, b); + __m128i y; + + if (versionB) { + const __m128i * v = (const __m128i *)data; + const __m128i * const detent = + (const __m128i *)((const uint8_t *)data + (len & ~15ul)); + y = _mm_aesenc_si128(x, _mm_set_epi64x(prime_0, prime_1)); + data = detent; + + if (len & 16) { + x = _mm_add_epi64(x, _mm_loadu_si128(v++)); + y = _mm_aesenc_si128(x, y); + } + len &= 15; + + if (v + 7 < detent) { + __m128i salt = y; + do { + __m128i t = _mm_aesenc_si128(_mm_loadu_si128(v++), salt); + t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); + t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); + t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); + + t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); + t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); + t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); + t = _mm_aesdec_si128(t, _mm_loadu_si128(v++)); + + salt = _mm_add_epi64(salt, _mm_set_epi64x(prime_5, prime_6)); + t = _mm_aesenc_si128(x, t); + x = _mm_add_epi64(y, x); + y = t; + } while (v + 7 < detent); + } - while (v < detent) { - __m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++)); - __m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); - x = _mm_aesdec_si128(x, v0y); - y = _mm_aesdec_si128(y, v1x); - } - } else { - const __m128i * RESTRICT v = (const __m128i *)data; - const __m128i * RESTRICT const detent = - (const __m128i *)((const uint8_t *)data + len - 127); - y = _mm_aesenc_si128(x, _mm_set_epi64x(prime_5, prime_6)); - - while (v < detent) { - __m128i v0 = _mm_loadu_si128(v + 0); - __m128i v1 = _mm_loadu_si128(v + 1); - __m128i v2 = _mm_loadu_si128(v + 2); - __m128i v3 = _mm_loadu_si128(v + 3); - __m128i v4 = _mm_loadu_si128(v + 4); - __m128i v5 = _mm_loadu_si128(v + 5); - __m128i v6 = _mm_loadu_si128(v + 6); - __m128i v7 = _mm_loadu_si128(v + 7); - - __m128i v0y = 
_mm_aesenc_si128(v0, y); - __m128i v2x6 = _mm_aesenc_si128(v2, _mm_xor_si128(x, v6)); - __m128i v45_67 = - _mm_xor_si128(_mm_aesenc_si128(v4, v5), _mm_add_epi64(v6, v7)); - - __m128i v0y7_1 = _mm_aesdec_si128(_mm_sub_epi64(v7, v0y), v1); - __m128i v2x6_3 = _mm_aesenc_si128(v2x6, v3); - - x = _mm_aesenc_si128(v45_67, _mm_add_epi64(x, y)); - y = _mm_aesenc_si128(v2x6_3, _mm_xor_si128(v0y7_1, v5)); - v += 8; - } + while (v < detent) { + __m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++)); + __m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); + x = _mm_aesdec_si128(x, v0y); + y = _mm_aesdec_si128(y, v1x); + } + } else { + const __m128i * RESTRICT v = (const __m128i *)data; + const __m128i * RESTRICT const detent = + (const __m128i *)((const uint8_t *)data + len - 127); + y = _mm_aesenc_si128(x, _mm_set_epi64x(prime_5, prime_6)); + + while (v < detent) { + __m128i v0 = _mm_loadu_si128(v + 0); + __m128i v1 = _mm_loadu_si128(v + 1); + __m128i v2 = _mm_loadu_si128(v + 2); + __m128i v3 = _mm_loadu_si128(v + 3); + __m128i v4 = _mm_loadu_si128(v + 4); + __m128i v5 = _mm_loadu_si128(v + 5); + __m128i v6 = _mm_loadu_si128(v + 6); + __m128i v7 = _mm_loadu_si128(v + 7); + + __m128i v0y = _mm_aesenc_si128(v0, y); + __m128i v2x6 = _mm_aesenc_si128(v2, _mm_xor_si128(x, v6)); + __m128i v45_67 = + _mm_xor_si128(_mm_aesenc_si128(v4, v5), _mm_add_epi64(v6, v7)); + + __m128i v0y7_1 = _mm_aesdec_si128(_mm_sub_epi64(v7, v0y), v1); + __m128i v2x6_3 = _mm_aesenc_si128(v2x6, v3); + + x = _mm_aesenc_si128(v45_67, _mm_add_epi64(x, y) ); + y = _mm_aesenc_si128(v2x6_3, _mm_xor_si128(v0y7_1, v5)); + v += 8; + } - if (len & 64) { - __m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++)); - __m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); - x = _mm_aesdec_si128(x, v0y); - y = _mm_aesdec_si128(y, v1x); + if (len & 64) { + __m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++)); + __m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); + x = _mm_aesdec_si128(x, v0y); + y = _mm_aesdec_si128(y, v1x); - 
__m128i v2y = _mm_add_epi64(y, _mm_loadu_si128(v++)); - __m128i v3x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); - x = _mm_aesdec_si128(x, v2y); - y = _mm_aesdec_si128(y, v3x); - } + __m128i v2y = _mm_add_epi64(y, _mm_loadu_si128(v++)); + __m128i v3x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); + x = _mm_aesdec_si128(x, v2y); + y = _mm_aesdec_si128(y, v3x); + } - if (len & 32) { - __m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++)); - __m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); - x = _mm_aesdec_si128(x, v0y); - y = _mm_aesdec_si128(y, v1x); - } + if (len & 32) { + __m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++)); + __m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++)); + x = _mm_aesdec_si128(x, v0y); + y = _mm_aesdec_si128(y, v1x); + } - if (len & 16) { - y = _mm_add_epi64(x, y); - x = _mm_aesdec_si128(x, _mm_loadu_si128(v++)); + if (len & 16) { + y = _mm_add_epi64(x, y); + x = _mm_aesdec_si128(x, _mm_loadu_si128(v++)); + } + + data = v; + len &= 15; } - data = v; - len &= 15; + x = _mm_add_epi64(_mm_aesdec_si128(x, _mm_aesenc_si128(y, x)), y); + #if defined(HAVE_32BIT_PLATFORM) + #if defined(HAVE_SSE_4_1) + a = (uint32_t)_mm_extract_epi32(x, 0) | + (uint64_t)_mm_extract_epi32(x, 1) << 32; + b = (uint32_t)_mm_extract_epi32(x, 2) | + (uint64_t)_mm_extract_epi32(x, 3) << 32; + #else + a = (uint32_t)_mm_cvtsi128_si32(x); + a |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32; + x = _mm_unpackhi_epi64(x, x); + b = (uint32_t)_mm_cvtsi128_si32(x); + b |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32; + #endif + _mm_empty(); + #else /* HAVE_32BIT_PLATFORM */ + #if defined(HAVE_SSE_4_1) + a = _mm_extract_epi64(x, 0); + b = _mm_extract_epi64(x, 1); + #else + a = _mm_cvtsi128_si64(x); + b = _mm_cvtsi128_si64(_mm_unpackhi_epi64(x, x)); + #endif + #if defined(HAVE_AVX) + _mm256_zeroall(); + #endif + #endif } - x = _mm_add_epi64(_mm_aesdec_si128(x, _mm_aesenc_si128(y, x)), y); -#if defined(HAVE_32BIT_PLATFORM) -#if defined(HAVE_SSE_4_1) - a = 
(uint32_t)_mm_extract_epi32(x, 0) | - (uint64_t)_mm_extract_epi32(x, 1) << 32; - b = (uint32_t)_mm_extract_epi32(x, 2) | - (uint64_t)_mm_extract_epi32(x, 3) << 32; -#else - a = (uint32_t)_mm_cvtsi128_si32(x); - a |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32; - x = _mm_unpackhi_epi64(x, x); - b = (uint32_t)_mm_cvtsi128_si32(x); - b |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32; -#endif - _mm_empty(); -#else /* HAVE_32BIT_PLATFORM */ -#if defined(HAVE_SSE_4_1) - a = _mm_extract_epi64(x, 0); - b = _mm_extract_epi64(x, 1); -#else - a = _mm_cvtsi128_si64(x); - b = _mm_cvtsi128_si64(_mm_unpackhi_epi64(x, x)); -#endif -#if defined(HAVE_AVX) - _mm256_zeroall(); -#endif -#endif - } - - const uint64_t *v = (const uint64_t *)data; - switch (len) { - default: - mixup64(&a, &b, fetch64(v++), prime_4); - /* fall through */ - case 24: - case 23: - case 22: - case 21: - case 20: - case 19: - case 18: - case 17: - mixup64(&b, &a, fetch64(v++), prime_3); - /* fall through */ - case 16: - case 15: - case 14: - case 13: - case 12: - case 11: - case 10: - case 9: - mixup64(&a, &b, fetch64(v++), prime_2); - /* fall through */ - case 8: - case 7: - case 6: - case 5: - case 4: - case 3: - case 2: - case 1: - mixup64(&b, &a, tail64(v, len), prime_1); - /* fall through */ - case 0: - return final64(a, b); - } + const uint64_t * v = (const uint64_t *)data; + switch (len) { + default: + mixup64(&a, &b, fetch64(v++) , prime_4); + /* fall through */ + case 24: + case 23: + case 22: + case 21: + case 20: + case 19: + case 18: + case 17: + mixup64(&b, &a, fetch64(v++) , prime_3); + /* fall through */ + case 16: + case 15: + case 14: + case 13: + case 12: + case 11: + case 10: + case 9: + mixup64(&a, &b, fetch64(v++) , prime_2); + /* fall through */ + case 8: + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + case 1: + mixup64(&b, &a, tail64(v, len), prime_1); + /* fall through */ + case 0: + return final64(a, b); + } } + #endif -template < enum 
t1ha_modes mode > -static void t1ha0(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void t1ha0( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t hash; + // If unaligned access is fast, don't worry about // checking/handling pointer alignments. Otherwise, use // aligned-specific code if possible. @@ -1103,9 +1113,10 @@ static void t1ha0(const void * in, const size_t len, const seed_t seed, void * o PUT_U64(hash, (uint8_t *)out, 0); } -template < enum t1ha_modes mode > -static void t1ha1(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void t1ha1( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t hash; + // If unaligned access is fast, don't worry about // checking/handling pointer alignments. Otherwise, use // aligned-specific code if possible. @@ -1122,22 +1133,22 @@ static void t1ha1(const void * in, const size_t len, const seed_t seed, void * o PUT_U64(hash, (uint8_t *)out, 0); } -template < enum t1ha_modes mode, bool xwidth > -static void t1ha2(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void t1ha2( const void * in, const size_t len, const seed_t seed, void * out ) { alignas(16) t1ha_state256_t state; - uint64_t hash, xhash = 0; - uint64_t length = (uint64_t)len; + uint64_t hash, xhash = 0; + uint64_t length = (uint64_t)len; const bool use_unaligned = - (T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT) || - ((((uintptr_t)in) & (ALIGNMENT_64 - 1)) != 0); + (T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT) || + ((((uintptr_t)in) & (ALIGNMENT_64 - 1)) != 0); init_ab(&state, (uint64_t)seed, length); if (unlikely(length > 32)) { init_cd(&state, (uint64_t)seed, length); if (use_unaligned) { - in = T1HA2_LOOP(&state, in, length); + in = T1HA2_LOOP(&state, in, length); } else { - in = T1HA2_LOOP(&state, in, length); + in = T1HA2_LOOP(&state, in, length); } if (!xwidth) { 
squash(&state); @@ -1148,12 +1159,12 @@ static void t1ha2(const void * in, const size_t len, const seed_t seed, void * o } if (use_unaligned) { hash = xwidth ? - T1HA2_TAIL (&state, in, length, &xhash) : - T1HA2_TAIL(&state, in, length) ; + T1HA2_TAIL(&state, in, length, &xhash) : + T1HA2_TAIL(&state, in, length); } else { hash = xwidth ? - T1HA2_TAIL (&state, in, length, &xhash) : - T1HA2_TAIL(&state, in, length) ; + T1HA2_TAIL(&state, in, length, &xhash) : + T1HA2_TAIL(&state, in, length); } PUT_U64(hash, (uint8_t *)out, 0); if (xwidth) { @@ -1165,8 +1176,8 @@ static void t1ha2(const void * in, const size_t len, const seed_t seed, void * o // initialization, while published SMHasher validation codes use it // once. Default to once so SMHasher3 tests are consistent, but allow // selftests to use published KAT tables. -template < enum t1ha_modes mode, bool xwidth, bool selftest_seeding = false > -static void t1ha2_incr(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void t1ha2_incr( const void * in, const size_t len, const seed_t seed, void * out ) { alignas(16) t1ha_context_t ctx; uint64_t hash, xhash = 0; uint64_t length = (uint64_t)len; @@ -1182,270 +1193,284 @@ static void t1ha2_incr(const void * in, const size_t len, const seed_t seed, voi } #if defined(HAVE_X86_64_AES) -template < bool bswap > -static void t1ha0_aesA(const void * in, const size_t len, const seed_t seed, void * out) { + +template +static void t1ha0_aesA( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t hash; - hash = t1ha0_aes_impl(in, len, (uint64_t)seed); + + hash = t1ha0_aes_impl(in, len, (uint64_t)seed); PUT_U64(hash, (uint8_t *)out, 0); } -template < bool bswap > -static void t1ha0_aesB(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void t1ha0_aesB( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t hash; - hash = t1ha0_aes_impl(in, len, (uint64_t)seed); + + 
hash = t1ha0_aes_impl(in, len, (uint64_t)seed); PUT_U64(hash, (uint8_t *)out, 0); } + #endif //------------------------------------------------------------ -static const uint8_t t1ha_test_pattern[64] = { - 0, 1, 2, 3, 4, 5, 6, 7, 0xFF, 0x7F, 0x3F, - 0x1F, 0xF, 8, 16, 32, 64, 0x80, 0xFE, 0xFC, 0xF8, 0xF0, - 0xE0, 0xC0, 0xFD, 0xFB, 0xF7, 0xEF, 0xDF, 0xBF, 0x55, 0xAA, 11, - 17, 19, 23, 29, 37, 42, 43, 'a', 'b', 'c', 'd', - 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', - 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x' +static const uint8_t t1ha_test_pattern [64] = { + 0, 1, 2, 3, 4, 5, 6, 7 , 0xFF, 0x7F, 0x3F, + 0x1F, 0xF, 8, 16, 32, 64, 0x80, 0xFE , 0xFC, 0xF8, 0xF0, + 0xE0, 0xC0, 0xFD, 0xFB, 0xF7, 0xEF, 0xDF, 0xBF , 0x55, 0xAA, 11, + 17, 19, 23, 29, 37, 42, 43, 'a', 'b', 'c', 'd', + 'e' , 'f', 'g', 'h', 'i', 'j', 'k', 'l' , 'm', 'n', 'o', + 'p' , 'q' , 'r' , 's' , 't' , 'u' , 'v' , 'w' , 'x' }; -static const uint64_t t1ha_refval_32le[81] = { 0, - UINT64_C(0xC92229C10FAEA50E), UINT64_C(0x3DF1354B0DFDC443), UINT64_C(0x968F016D60417BB3), UINT64_C(0x85AAFB50C6DA770F), - UINT64_C(0x66CCE3BB6842C7D6), UINT64_C(0xDDAA39C11537C226), UINT64_C(0x35958D281F0C9C8C), UINT64_C(0x8C5D64B091DE608E), - UINT64_C(0x4094DF680D39786B), UINT64_C(0x1014F4AA2A2EDF4D), UINT64_C(0x39D21891615AA310), UINT64_C(0x7EF51F67C398C7C4), - UINT64_C(0x06163990DDBF319D), UINT64_C(0xE229CAA00C8D6F3F), UINT64_C(0xD2240B4B0D54E0F5), UINT64_C(0xEA2E7E905DDEAF94), - UINT64_C(0x8D4F8A887183A5CE), UINT64_C(0x44337F9A63C5820C), UINT64_C(0x94938D1E86A9B797), UINT64_C(0x96E9CABA5CA210CC), - UINT64_C(0x6EFBB9CC9E8F7708), UINT64_C(0x3D12EA0282FB8BBC), UINT64_C(0x5DA781EE205A2C48), UINT64_C(0xFA4A51A12677FE12), - UINT64_C(0x81D5F04E20660B28), UINT64_C(0x57258D043BCD3841), UINT64_C(0x5C9BEB62059C1ED2), UINT64_C(0x57A02162F9034B33), - UINT64_C(0xBA2A13E457CE19B8), UINT64_C(0xE593263BF9451F3A), UINT64_C(0x0BC1175539606BC5), UINT64_C(0xA3E2929E9C5F289F), - UINT64_C(0x86BDBD06835E35F7), UINT64_C(0xA180950AB48BAADC), 
UINT64_C(0x7812C994D9924028), UINT64_C(0x308366011415F46B), - UINT64_C(0x77FE9A9991C5F959), UINT64_C(0x925C340B70B0B1E3), UINT64_C(0xCD9C5BA4C41E2E10), UINT64_C(0x7CC4E7758B94CD93), - UINT64_C(0x898B235962EA4625), UINT64_C(0xD7E3E5BF22893286), UINT64_C(0x396F4CDD33056C64), UINT64_C(0x740AB2E32F17CD9F), - UINT64_C(0x60D12FF9CD15B321), UINT64_C(0xBEE3A6C9903A81D8), UINT64_C(0xB47040913B33C35E), UINT64_C(0x19EE8C2ACC013CFF), - UINT64_C(0x5DEC94C5783B55C4), UINT64_C(0x78DC122D562C5F1D), UINT64_C(0x6520F008DA1C181E), UINT64_C(0x77CAF155A36EBF7C), - UINT64_C(0x0A09E02BDB883CA6), UINT64_C(0xFD5D9ADA7E3FB895), UINT64_C(0xC6F5FDD9EEAB83B5), UINT64_C(0x84589BB29F52A92A), - UINT64_C(0x9B2517F13F8E9814), UINT64_C(0x6F752AF6A52E31EC), UINT64_C(0x8E717799E324CE8A), UINT64_C(0x84D90AEF39262D58), - UINT64_C(0x79C27B13FC28944D), UINT64_C(0xE6D6DF6438E0044A), UINT64_C(0x51B603E400D79CA4), UINT64_C(0x6A902B28C588B390), - UINT64_C(0x8D7F8DE9E6CB1D83), UINT64_C(0xCF1A4DC11CA7F044), UINT64_C(0xEF02E43C366786F1), UINT64_C(0x89915BCDBCFBE30F), - UINT64_C(0x5928B306F1A9CC7F), UINT64_C(0xA8B59092996851C5), UINT64_C(0x22050A20427E8B25), UINT64_C(0x6E6D64018941E7EE), - UINT64_C(0x9798C898B81AE846), UINT64_C(0x80EF218CDC30124A), UINT64_C(0xFCE45E60D55B0284), UINT64_C(0x4010E735D3147C35), - UINT64_C(0xEB647D999FD8DC7E), UINT64_C(0xD3544DCAB14FE907), UINT64_C(0xB588B27D8438700C), UINT64_C(0xA49EBFC43E057A4C) +static const uint64_t t1ha_refval_32le [81] = { + 0, + UINT64_C(0xC92229C10FAEA50E), UINT64_C(0x3DF1354B0DFDC443), UINT64_C(0x968F016D60417BB3), UINT64_C(0x85AAFB50C6DA770F), + UINT64_C(0x66CCE3BB6842C7D6), UINT64_C(0xDDAA39C11537C226), UINT64_C(0x35958D281F0C9C8C), UINT64_C(0x8C5D64B091DE608E), + UINT64_C(0x4094DF680D39786B), UINT64_C(0x1014F4AA2A2EDF4D), UINT64_C(0x39D21891615AA310), UINT64_C(0x7EF51F67C398C7C4), + UINT64_C(0x06163990DDBF319D), UINT64_C(0xE229CAA00C8D6F3F), UINT64_C(0xD2240B4B0D54E0F5), UINT64_C(0xEA2E7E905DDEAF94), + UINT64_C(0x8D4F8A887183A5CE), 
UINT64_C(0x44337F9A63C5820C), UINT64_C(0x94938D1E86A9B797), UINT64_C(0x96E9CABA5CA210CC), + UINT64_C(0x6EFBB9CC9E8F7708), UINT64_C(0x3D12EA0282FB8BBC), UINT64_C(0x5DA781EE205A2C48), UINT64_C(0xFA4A51A12677FE12), + UINT64_C(0x81D5F04E20660B28), UINT64_C(0x57258D043BCD3841), UINT64_C(0x5C9BEB62059C1ED2), UINT64_C(0x57A02162F9034B33), + UINT64_C(0xBA2A13E457CE19B8), UINT64_C(0xE593263BF9451F3A), UINT64_C(0x0BC1175539606BC5), UINT64_C(0xA3E2929E9C5F289F), + UINT64_C(0x86BDBD06835E35F7), UINT64_C(0xA180950AB48BAADC), UINT64_C(0x7812C994D9924028), UINT64_C(0x308366011415F46B), + UINT64_C(0x77FE9A9991C5F959), UINT64_C(0x925C340B70B0B1E3), UINT64_C(0xCD9C5BA4C41E2E10), UINT64_C(0x7CC4E7758B94CD93), + UINT64_C(0x898B235962EA4625), UINT64_C(0xD7E3E5BF22893286), UINT64_C(0x396F4CDD33056C64), UINT64_C(0x740AB2E32F17CD9F), + UINT64_C(0x60D12FF9CD15B321), UINT64_C(0xBEE3A6C9903A81D8), UINT64_C(0xB47040913B33C35E), UINT64_C(0x19EE8C2ACC013CFF), + UINT64_C(0x5DEC94C5783B55C4), UINT64_C(0x78DC122D562C5F1D), UINT64_C(0x6520F008DA1C181E), UINT64_C(0x77CAF155A36EBF7C), + UINT64_C(0x0A09E02BDB883CA6), UINT64_C(0xFD5D9ADA7E3FB895), UINT64_C(0xC6F5FDD9EEAB83B5), UINT64_C(0x84589BB29F52A92A), + UINT64_C(0x9B2517F13F8E9814), UINT64_C(0x6F752AF6A52E31EC), UINT64_C(0x8E717799E324CE8A), UINT64_C(0x84D90AEF39262D58), + UINT64_C(0x79C27B13FC28944D), UINT64_C(0xE6D6DF6438E0044A), UINT64_C(0x51B603E400D79CA4), UINT64_C(0x6A902B28C588B390), + UINT64_C(0x8D7F8DE9E6CB1D83), UINT64_C(0xCF1A4DC11CA7F044), UINT64_C(0xEF02E43C366786F1), UINT64_C(0x89915BCDBCFBE30F), + UINT64_C(0x5928B306F1A9CC7F), UINT64_C(0xA8B59092996851C5), UINT64_C(0x22050A20427E8B25), UINT64_C(0x6E6D64018941E7EE), + UINT64_C(0x9798C898B81AE846), UINT64_C(0x80EF218CDC30124A), UINT64_C(0xFCE45E60D55B0284), UINT64_C(0x4010E735D3147C35), + UINT64_C(0xEB647D999FD8DC7E), UINT64_C(0xD3544DCAB14FE907), UINT64_C(0xB588B27D8438700C), UINT64_C(0xA49EBFC43E057A4C) }; -static const uint64_t t1ha_refval_32be[81] = { 0, - 
UINT64_C(0xC92229C10FAEA50E), UINT64_C(0x0FE212630DD87E0F), UINT64_C(0x968F016D60417BB3), UINT64_C(0xE6B12B2C889913AB), - UINT64_C(0xAA3787887A9DA368), UINT64_C(0x06EE7202D53CEF39), UINT64_C(0x6149AFB2C296664B), UINT64_C(0x86C893210F9A5805), - UINT64_C(0x8379E5DA988AA04C), UINT64_C(0x24763AA7CE411A60), UINT64_C(0x9CF9C64B395A4CF8), UINT64_C(0xFFC192C338DDE904), - UINT64_C(0x094575BAB319E5F5), UINT64_C(0xBBBACFE7728C6511), UINT64_C(0x36B8C3CEBE4EF409), UINT64_C(0xAA0BA8A3397BA4D0), - UINT64_C(0xF9F85CF7124EE653), UINT64_C(0x3ADF4F7DF2A887AE), UINT64_C(0xAA2A0F5964AA9A7A), UINT64_C(0xF18B563F42D36EB8), - UINT64_C(0x034366CEF8334F5C), UINT64_C(0xAE2E85180E330E5F), UINT64_C(0xA5CE9FBFDF5C65B8), UINT64_C(0x5E509F25A9CA9B0B), - UINT64_C(0xE30D1358C2013BD2), UINT64_C(0xBB3A04D5EB8111FE), UINT64_C(0xB04234E82A15A28D), UINT64_C(0x87426A56D0EA0E2F), - UINT64_C(0x095086668E07F9F8), UINT64_C(0xF4CD3A43B6A6AEA5), UINT64_C(0x73F9B9B674D472A6), UINT64_C(0x558344229A1E4DCF), - UINT64_C(0x0AD4C95B2279181A), UINT64_C(0x5E3D19D80821CA6B), UINT64_C(0x652492D25BEBA258), UINT64_C(0xEFA84B02EAB849B1), - UINT64_C(0x81AD2D253059AC2C), UINT64_C(0x1400CCB0DFB2F457), UINT64_C(0x5688DC72A839860E), UINT64_C(0x67CC130E0FD1B0A7), - UINT64_C(0x0A851E3A94E21E69), UINT64_C(0x2EA0000B6A073907), UINT64_C(0xAE9776FF9BF1D02E), UINT64_C(0xC0A96B66B160631C), - UINT64_C(0xA93341DE4ED7C8F0), UINT64_C(0x6FBADD8F5B85E141), UINT64_C(0xB7D295F1C21E0CBA), UINT64_C(0x6D6114591B8E434F), - UINT64_C(0xF5B6939B63D97BE7), UINT64_C(0x3C80D5053F0E5DB4), UINT64_C(0xAC520ACC6B73F62D), UINT64_C(0xD1051F5841CF3966), - UINT64_C(0x62245AEA644AE760), UINT64_C(0x0CD56BE15497C62D), UINT64_C(0x5BB93435C4988FB6), UINT64_C(0x5FADB88EB18DB512), - UINT64_C(0xC897CAE2242475CC), UINT64_C(0xF1A094EF846DC9BB), UINT64_C(0x2B1D8B24924F79B6), UINT64_C(0xC6DF0C0E8456EB53), - UINT64_C(0xE6A40128303A9B9C), UINT64_C(0x64D37AF5EFFA7BD9), UINT64_C(0x90FEB70A5AE2A598), UINT64_C(0xEC3BA5F126D9FF4B), - UINT64_C(0x3121C8EC3AC51B29), 
UINT64_C(0x3B41C4D422166EC1), UINT64_C(0xB4878DDCBF48ED76), UINT64_C(0x5CB850D77CB762E4), - UINT64_C(0x9A27A43CC1DD171F), UINT64_C(0x2FDFFC6F99CB424A), UINT64_C(0xF54A57E09FDEA7BB), UINT64_C(0x5F78E5EE2CAB7039), - UINT64_C(0xB8BA95883DB31CBA), UINT64_C(0x131C61EB84AF86C3), UINT64_C(0x84B1F64E9C613DA7), UINT64_C(0xE94C1888C0C37C02), - UINT64_C(0xEA08F8BFB2039CDE), UINT64_C(0xCCC6D04D243EC753), UINT64_C(0x8977D105298B0629), UINT64_C(0x7AAA976494A5905E) +static const uint64_t t1ha_refval_32be [81] = { + 0, + UINT64_C(0xC92229C10FAEA50E), UINT64_C(0x0FE212630DD87E0F), UINT64_C(0x968F016D60417BB3), UINT64_C(0xE6B12B2C889913AB), + UINT64_C(0xAA3787887A9DA368), UINT64_C(0x06EE7202D53CEF39), UINT64_C(0x6149AFB2C296664B), UINT64_C(0x86C893210F9A5805), + UINT64_C(0x8379E5DA988AA04C), UINT64_C(0x24763AA7CE411A60), UINT64_C(0x9CF9C64B395A4CF8), UINT64_C(0xFFC192C338DDE904), + UINT64_C(0x094575BAB319E5F5), UINT64_C(0xBBBACFE7728C6511), UINT64_C(0x36B8C3CEBE4EF409), UINT64_C(0xAA0BA8A3397BA4D0), + UINT64_C(0xF9F85CF7124EE653), UINT64_C(0x3ADF4F7DF2A887AE), UINT64_C(0xAA2A0F5964AA9A7A), UINT64_C(0xF18B563F42D36EB8), + UINT64_C(0x034366CEF8334F5C), UINT64_C(0xAE2E85180E330E5F), UINT64_C(0xA5CE9FBFDF5C65B8), UINT64_C(0x5E509F25A9CA9B0B), + UINT64_C(0xE30D1358C2013BD2), UINT64_C(0xBB3A04D5EB8111FE), UINT64_C(0xB04234E82A15A28D), UINT64_C(0x87426A56D0EA0E2F), + UINT64_C(0x095086668E07F9F8), UINT64_C(0xF4CD3A43B6A6AEA5), UINT64_C(0x73F9B9B674D472A6), UINT64_C(0x558344229A1E4DCF), + UINT64_C(0x0AD4C95B2279181A), UINT64_C(0x5E3D19D80821CA6B), UINT64_C(0x652492D25BEBA258), UINT64_C(0xEFA84B02EAB849B1), + UINT64_C(0x81AD2D253059AC2C), UINT64_C(0x1400CCB0DFB2F457), UINT64_C(0x5688DC72A839860E), UINT64_C(0x67CC130E0FD1B0A7), + UINT64_C(0x0A851E3A94E21E69), UINT64_C(0x2EA0000B6A073907), UINT64_C(0xAE9776FF9BF1D02E), UINT64_C(0xC0A96B66B160631C), + UINT64_C(0xA93341DE4ED7C8F0), UINT64_C(0x6FBADD8F5B85E141), UINT64_C(0xB7D295F1C21E0CBA), UINT64_C(0x6D6114591B8E434F), + 
UINT64_C(0xF5B6939B63D97BE7), UINT64_C(0x3C80D5053F0E5DB4), UINT64_C(0xAC520ACC6B73F62D), UINT64_C(0xD1051F5841CF3966), + UINT64_C(0x62245AEA644AE760), UINT64_C(0x0CD56BE15497C62D), UINT64_C(0x5BB93435C4988FB6), UINT64_C(0x5FADB88EB18DB512), + UINT64_C(0xC897CAE2242475CC), UINT64_C(0xF1A094EF846DC9BB), UINT64_C(0x2B1D8B24924F79B6), UINT64_C(0xC6DF0C0E8456EB53), + UINT64_C(0xE6A40128303A9B9C), UINT64_C(0x64D37AF5EFFA7BD9), UINT64_C(0x90FEB70A5AE2A598), UINT64_C(0xEC3BA5F126D9FF4B), + UINT64_C(0x3121C8EC3AC51B29), UINT64_C(0x3B41C4D422166EC1), UINT64_C(0xB4878DDCBF48ED76), UINT64_C(0x5CB850D77CB762E4), + UINT64_C(0x9A27A43CC1DD171F), UINT64_C(0x2FDFFC6F99CB424A), UINT64_C(0xF54A57E09FDEA7BB), UINT64_C(0x5F78E5EE2CAB7039), + UINT64_C(0xB8BA95883DB31CBA), UINT64_C(0x131C61EB84AF86C3), UINT64_C(0x84B1F64E9C613DA7), UINT64_C(0xE94C1888C0C37C02), + UINT64_C(0xEA08F8BFB2039CDE), UINT64_C(0xCCC6D04D243EC753), UINT64_C(0x8977D105298B0629), UINT64_C(0x7AAA976494A5905E) }; -static const uint64_t t1ha_refval_64le[81] = { 0, - UINT64_C(0x6A580668D6048674), UINT64_C(0xA2FE904AFF0D0879), UINT64_C(0xE3AB9C06FAF4D023), UINT64_C(0x6AF1C60874C95442), - UINT64_C(0xB3557E561A6C5D82), UINT64_C(0x0AE73C696F3D37C0), UINT64_C(0x5EF25F7062324941), UINT64_C(0x9B784F3B4CE6AF33), - UINT64_C(0x6993BB206A74F070), UINT64_C(0xF1E95DF109076C4C), UINT64_C(0x4E1EB70C58E48540), UINT64_C(0x5FDD7649D8EC44E4), - UINT64_C(0x559122C706343421), UINT64_C(0x380133D58665E93D), UINT64_C(0x9CE74296C8C55AE4), UINT64_C(0x3556F9A5757AB6D0), - UINT64_C(0xF62751F7F25C469E), UINT64_C(0x851EEC67F6516D94), UINT64_C(0xED463EE3848A8695), UINT64_C(0xDC8791FEFF8ED3AC), - UINT64_C(0x2569C744E1A282CF), UINT64_C(0xF90EB7C1D70A80B9), UINT64_C(0x68DFA6A1B8050A4C), UINT64_C(0x94CCA5E8210D2134), - UINT64_C(0xF5CC0BEABC259F52), UINT64_C(0x40DBC1F51618FDA7), UINT64_C(0x0807945BF0FB52C6), UINT64_C(0xE5EF7E09DE70848D), - UINT64_C(0x63E1DF35FEBE994A), UINT64_C(0x2025E73769720D5A), UINT64_C(0xAD6120B2B8A152E1), 
UINT64_C(0x2A71D9F13959F2B7), - UINT64_C(0x8A20849A27C32548), UINT64_C(0x0BCBC9FE3B57884E), UINT64_C(0x0E028D255667AEAD), UINT64_C(0xBE66DAD3043AB694), - UINT64_C(0xB00E4C1238F9E2D4), UINT64_C(0x5C54BDE5AE280E82), UINT64_C(0x0E22B86754BC3BC4), UINT64_C(0x016707EBF858B84D), - UINT64_C(0x990015FBC9E095EE), UINT64_C(0x8B9AF0A3E71F042F), UINT64_C(0x6AA56E88BD380564), UINT64_C(0xAACE57113E681A0F), - UINT64_C(0x19F81514AFA9A22D), UINT64_C(0x80DABA3D62BEAC79), UINT64_C(0x715210412CABBF46), UINT64_C(0xD8FA0B9E9D6AA93F), - UINT64_C(0x6C2FC5A4109FD3A2), UINT64_C(0x5B3E60EEB51DDCD8), UINT64_C(0x0A7C717017756FE7), UINT64_C(0xA73773805CA31934), - UINT64_C(0x4DBD6BB7A31E85FD), UINT64_C(0x24F619D3D5BC2DB4), UINT64_C(0x3E4AF35A1678D636), UINT64_C(0x84A1A8DF8D609239), - UINT64_C(0x359C862CD3BE4FCD), UINT64_C(0xCF3A39F5C27DC125), UINT64_C(0xC0FF62F8FD5F4C77), UINT64_C(0x5E9F2493DDAA166C), - UINT64_C(0x17424152BE1CA266), UINT64_C(0xA78AFA5AB4BBE0CD), UINT64_C(0x7BFB2E2CEF118346), UINT64_C(0x647C3E0FF3E3D241), - UINT64_C(0x0352E4055C13242E), UINT64_C(0x6F42FC70EB660E38), UINT64_C(0x0BEBAD4FABF523BA), UINT64_C(0x9269F4214414D61D), - UINT64_C(0x1CA8760277E6006C), UINT64_C(0x7BAD25A859D87B5D), UINT64_C(0xAD645ADCF7414F1D), UINT64_C(0xB07F517E88D7AFB3), - UINT64_C(0xB321C06FB5FFAB5C), UINT64_C(0xD50F162A1EFDD844), UINT64_C(0x1DFD3D1924FBE319), UINT64_C(0xDFAEAB2F09EF7E78), - UINT64_C(0xA7603B5AF07A0B1E), UINT64_C(0x41CD044C0E5A4EE3), UINT64_C(0xF64D2F86E813BF33), UINT64_C(0xFF9FDB99305EB06A) +static const uint64_t t1ha_refval_64le [81] = { + 0, + UINT64_C(0x6A580668D6048674), UINT64_C(0xA2FE904AFF0D0879), UINT64_C(0xE3AB9C06FAF4D023), UINT64_C(0x6AF1C60874C95442), + UINT64_C(0xB3557E561A6C5D82), UINT64_C(0x0AE73C696F3D37C0), UINT64_C(0x5EF25F7062324941), UINT64_C(0x9B784F3B4CE6AF33), + UINT64_C(0x6993BB206A74F070), UINT64_C(0xF1E95DF109076C4C), UINT64_C(0x4E1EB70C58E48540), UINT64_C(0x5FDD7649D8EC44E4), + UINT64_C(0x559122C706343421), UINT64_C(0x380133D58665E93D), 
UINT64_C(0x9CE74296C8C55AE4), UINT64_C(0x3556F9A5757AB6D0), + UINT64_C(0xF62751F7F25C469E), UINT64_C(0x851EEC67F6516D94), UINT64_C(0xED463EE3848A8695), UINT64_C(0xDC8791FEFF8ED3AC), + UINT64_C(0x2569C744E1A282CF), UINT64_C(0xF90EB7C1D70A80B9), UINT64_C(0x68DFA6A1B8050A4C), UINT64_C(0x94CCA5E8210D2134), + UINT64_C(0xF5CC0BEABC259F52), UINT64_C(0x40DBC1F51618FDA7), UINT64_C(0x0807945BF0FB52C6), UINT64_C(0xE5EF7E09DE70848D), + UINT64_C(0x63E1DF35FEBE994A), UINT64_C(0x2025E73769720D5A), UINT64_C(0xAD6120B2B8A152E1), UINT64_C(0x2A71D9F13959F2B7), + UINT64_C(0x8A20849A27C32548), UINT64_C(0x0BCBC9FE3B57884E), UINT64_C(0x0E028D255667AEAD), UINT64_C(0xBE66DAD3043AB694), + UINT64_C(0xB00E4C1238F9E2D4), UINT64_C(0x5C54BDE5AE280E82), UINT64_C(0x0E22B86754BC3BC4), UINT64_C(0x016707EBF858B84D), + UINT64_C(0x990015FBC9E095EE), UINT64_C(0x8B9AF0A3E71F042F), UINT64_C(0x6AA56E88BD380564), UINT64_C(0xAACE57113E681A0F), + UINT64_C(0x19F81514AFA9A22D), UINT64_C(0x80DABA3D62BEAC79), UINT64_C(0x715210412CABBF46), UINT64_C(0xD8FA0B9E9D6AA93F), + UINT64_C(0x6C2FC5A4109FD3A2), UINT64_C(0x5B3E60EEB51DDCD8), UINT64_C(0x0A7C717017756FE7), UINT64_C(0xA73773805CA31934), + UINT64_C(0x4DBD6BB7A31E85FD), UINT64_C(0x24F619D3D5BC2DB4), UINT64_C(0x3E4AF35A1678D636), UINT64_C(0x84A1A8DF8D609239), + UINT64_C(0x359C862CD3BE4FCD), UINT64_C(0xCF3A39F5C27DC125), UINT64_C(0xC0FF62F8FD5F4C77), UINT64_C(0x5E9F2493DDAA166C), + UINT64_C(0x17424152BE1CA266), UINT64_C(0xA78AFA5AB4BBE0CD), UINT64_C(0x7BFB2E2CEF118346), UINT64_C(0x647C3E0FF3E3D241), + UINT64_C(0x0352E4055C13242E), UINT64_C(0x6F42FC70EB660E38), UINT64_C(0x0BEBAD4FABF523BA), UINT64_C(0x9269F4214414D61D), + UINT64_C(0x1CA8760277E6006C), UINT64_C(0x7BAD25A859D87B5D), UINT64_C(0xAD645ADCF7414F1D), UINT64_C(0xB07F517E88D7AFB3), + UINT64_C(0xB321C06FB5FFAB5C), UINT64_C(0xD50F162A1EFDD844), UINT64_C(0x1DFD3D1924FBE319), UINT64_C(0xDFAEAB2F09EF7E78), + UINT64_C(0xA7603B5AF07A0B1E), UINT64_C(0x41CD044C0E5A4EE3), UINT64_C(0xF64D2F86E813BF33), 
UINT64_C(0xFF9FDB99305EB06A) }; -static const uint64_t t1ha_refval_64be[81] = { 0, - UINT64_C(0x6A580668D6048674), UINT64_C(0xDECC975A0E3B8177), UINT64_C(0xE3AB9C06FAF4D023), UINT64_C(0xE401FA8F1B6AF969), - UINT64_C(0x67DB1DAE56FB94E3), UINT64_C(0x1106266A09B7A073), UINT64_C(0x550339B1EF2C7BBB), UINT64_C(0x290A2BAF590045BB), - UINT64_C(0xA182C1258C09F54A), UINT64_C(0x137D53C34BE7143A), UINT64_C(0xF6D2B69C6F42BEDC), UINT64_C(0x39643EAF2CA2E4B4), - UINT64_C(0x22A81F139A2C9559), UINT64_C(0x5B3D6AEF0AF33807), UINT64_C(0x56E3F80A68643C08), UINT64_C(0x9E423BE502378780), - UINT64_C(0xCDB0986F9A5B2FD5), UINT64_C(0xD5B3C84E7933293F), UINT64_C(0xE5FB8C90399E9742), UINT64_C(0x5D393C1F77B2CF3D), - UINT64_C(0xC8C82F5B2FF09266), UINT64_C(0xACA0230CA6F7B593), UINT64_C(0xCB5805E2960D1655), UINT64_C(0x7E2AD5B704D77C95), - UINT64_C(0xC5E903CDB8B9EB5D), UINT64_C(0x4CC7D0D21CC03511), UINT64_C(0x8385DF382CFB3E93), UINT64_C(0xF17699D0564D348A), - UINT64_C(0xF77EE7F8274A4C8D), UINT64_C(0xB9D8CEE48903BABE), UINT64_C(0xFE0EBD2A82B9CFE9), UINT64_C(0xB49FB6397270F565), - UINT64_C(0x173735C8C342108E), UINT64_C(0xA37C7FBBEEC0A2EA), UINT64_C(0xC13F66F462BB0B6E), UINT64_C(0x0C04F3C2B551467E), - UINT64_C(0x76A9CB156810C96E), UINT64_C(0x2038850919B0B151), UINT64_C(0xCEA19F2B6EED647B), UINT64_C(0x6746656D2FA109A4), - UINT64_C(0xF05137F221007F37), UINT64_C(0x892FA9E13A3B4948), UINT64_C(0x4D57B70D37548A32), UINT64_C(0x1A7CFB3D566580E6), - UINT64_C(0x7CB30272A45E3FAC), UINT64_C(0x137CCFFD9D51423F), UINT64_C(0xB87D96F3B82DF266), UINT64_C(0x33349AEE7472ED37), - UINT64_C(0x5CC0D3C99555BC07), UINT64_C(0x4A8F4FA196D964EF), UINT64_C(0xE82A0D64F281FBFA), UINT64_C(0x38A1BAC2C36823E1), - UINT64_C(0x77D197C239FD737E), UINT64_C(0xFB07746B4E07DF26), UINT64_C(0xC8A2198E967672BD), UINT64_C(0x5F1A146D143FA05A), - UINT64_C(0x26B877A1201AB7AC), UINT64_C(0x74E5B145214723F8), UINT64_C(0xE9CE10E3C70254BC), UINT64_C(0x299393A0C05B79E8), - UINT64_C(0xFD2D2B9822A5E7E2), UINT64_C(0x85424FEA50C8E50A), 
UINT64_C(0xE6839E714B1FFFE5), UINT64_C(0x27971CCB46F9112A), - UINT64_C(0xC98695A2E0715AA9), UINT64_C(0x338E1CBB4F858226), UINT64_C(0xFC6B5C5CF7A8D806), UINT64_C(0x8973CAADDE8DA50C), - UINT64_C(0x9C6D47AE32EBAE72), UINT64_C(0x1EBF1F9F21D26D78), UINT64_C(0x80A9704B8E153859), UINT64_C(0x6AFD20A939F141FB), - UINT64_C(0xC35F6C2B3B553EEF), UINT64_C(0x59529E8B0DC94C1A), UINT64_C(0x1569DF036EBC4FA1), UINT64_C(0xDA32B88593C118F9), - UINT64_C(0xF01E4155FF5A5660), UINT64_C(0x765A2522DCE2B185), UINT64_C(0xCEE95554128073EF), UINT64_C(0x60F072A5CA51DE2F) +static const uint64_t t1ha_refval_64be [81] = { + 0, + UINT64_C(0x6A580668D6048674), UINT64_C(0xDECC975A0E3B8177), UINT64_C(0xE3AB9C06FAF4D023), UINT64_C(0xE401FA8F1B6AF969), + UINT64_C(0x67DB1DAE56FB94E3), UINT64_C(0x1106266A09B7A073), UINT64_C(0x550339B1EF2C7BBB), UINT64_C(0x290A2BAF590045BB), + UINT64_C(0xA182C1258C09F54A), UINT64_C(0x137D53C34BE7143A), UINT64_C(0xF6D2B69C6F42BEDC), UINT64_C(0x39643EAF2CA2E4B4), + UINT64_C(0x22A81F139A2C9559), UINT64_C(0x5B3D6AEF0AF33807), UINT64_C(0x56E3F80A68643C08), UINT64_C(0x9E423BE502378780), + UINT64_C(0xCDB0986F9A5B2FD5), UINT64_C(0xD5B3C84E7933293F), UINT64_C(0xE5FB8C90399E9742), UINT64_C(0x5D393C1F77B2CF3D), + UINT64_C(0xC8C82F5B2FF09266), UINT64_C(0xACA0230CA6F7B593), UINT64_C(0xCB5805E2960D1655), UINT64_C(0x7E2AD5B704D77C95), + UINT64_C(0xC5E903CDB8B9EB5D), UINT64_C(0x4CC7D0D21CC03511), UINT64_C(0x8385DF382CFB3E93), UINT64_C(0xF17699D0564D348A), + UINT64_C(0xF77EE7F8274A4C8D), UINT64_C(0xB9D8CEE48903BABE), UINT64_C(0xFE0EBD2A82B9CFE9), UINT64_C(0xB49FB6397270F565), + UINT64_C(0x173735C8C342108E), UINT64_C(0xA37C7FBBEEC0A2EA), UINT64_C(0xC13F66F462BB0B6E), UINT64_C(0x0C04F3C2B551467E), + UINT64_C(0x76A9CB156810C96E), UINT64_C(0x2038850919B0B151), UINT64_C(0xCEA19F2B6EED647B), UINT64_C(0x6746656D2FA109A4), + UINT64_C(0xF05137F221007F37), UINT64_C(0x892FA9E13A3B4948), UINT64_C(0x4D57B70D37548A32), UINT64_C(0x1A7CFB3D566580E6), + UINT64_C(0x7CB30272A45E3FAC), 
UINT64_C(0x137CCFFD9D51423F), UINT64_C(0xB87D96F3B82DF266), UINT64_C(0x33349AEE7472ED37), + UINT64_C(0x5CC0D3C99555BC07), UINT64_C(0x4A8F4FA196D964EF), UINT64_C(0xE82A0D64F281FBFA), UINT64_C(0x38A1BAC2C36823E1), + UINT64_C(0x77D197C239FD737E), UINT64_C(0xFB07746B4E07DF26), UINT64_C(0xC8A2198E967672BD), UINT64_C(0x5F1A146D143FA05A), + UINT64_C(0x26B877A1201AB7AC), UINT64_C(0x74E5B145214723F8), UINT64_C(0xE9CE10E3C70254BC), UINT64_C(0x299393A0C05B79E8), + UINT64_C(0xFD2D2B9822A5E7E2), UINT64_C(0x85424FEA50C8E50A), UINT64_C(0xE6839E714B1FFFE5), UINT64_C(0x27971CCB46F9112A), + UINT64_C(0xC98695A2E0715AA9), UINT64_C(0x338E1CBB4F858226), UINT64_C(0xFC6B5C5CF7A8D806), UINT64_C(0x8973CAADDE8DA50C), + UINT64_C(0x9C6D47AE32EBAE72), UINT64_C(0x1EBF1F9F21D26D78), UINT64_C(0x80A9704B8E153859), UINT64_C(0x6AFD20A939F141FB), + UINT64_C(0xC35F6C2B3B553EEF), UINT64_C(0x59529E8B0DC94C1A), UINT64_C(0x1569DF036EBC4FA1), UINT64_C(0xDA32B88593C118F9), + UINT64_C(0xF01E4155FF5A5660), UINT64_C(0x765A2522DCE2B185), UINT64_C(0xCEE95554128073EF), UINT64_C(0x60F072A5CA51DE2F) }; -static const uint64_t t1ha_refval_2atonce[81] = { 0, - UINT64_C(0x772C7311BE32FF42), UINT64_C(0x444753D23F207E03), UINT64_C(0x71F6DF5DA3B4F532), UINT64_C(0x555859635365F660), - UINT64_C(0xE98808F1CD39C626), UINT64_C(0x2EB18FAF2163BB09), UINT64_C(0x7B9DD892C8019C87), UINT64_C(0xE2B1431C4DA4D15A), - UINT64_C(0x1984E718A5477F70), UINT64_C(0x08DD17B266484F79), UINT64_C(0x4C83A05D766AD550), UINT64_C(0x92DCEBB131D1907D), - UINT64_C(0xD67BC6FC881B8549), UINT64_C(0xF6A9886555FBF66B), UINT64_C(0x6E31616D7F33E25E), UINT64_C(0x36E31B7426E3049D), - UINT64_C(0x4F8E4FAF46A13F5F), UINT64_C(0x03EB0CB3253F819F), UINT64_C(0x636A7769905770D2), UINT64_C(0x3ADF3781D16D1148), - UINT64_C(0x92D19CB1818BC9C2), UINT64_C(0x283E68F4D459C533), UINT64_C(0xFA83A8A88DECAA04), UINT64_C(0x8C6F00368EAC538C), - UINT64_C(0x7B66B0CF3797B322), UINT64_C(0x5131E122FDABA3FF), UINT64_C(0x6E59FF515C08C7A9), UINT64_C(0xBA2C5269B2C377B0), - 
UINT64_C(0xA9D24FD368FE8A2B), UINT64_C(0x22DB13D32E33E891), UINT64_C(0x7B97DFC804B876E5), UINT64_C(0xC598BDFCD0E834F9), - UINT64_C(0xB256163D3687F5A7), UINT64_C(0x66D7A73C6AEF50B3), UINT64_C(0x25A7201C85D9E2A3), UINT64_C(0x911573EDA15299AA), - UINT64_C(0x5C0062B669E18E4C), UINT64_C(0x17734ADE08D54E28), UINT64_C(0xFFF036E33883F43B), UINT64_C(0xFE0756E7777DF11E), - UINT64_C(0x37972472D023F129), UINT64_C(0x6CFCE201B55C7F57), UINT64_C(0xE019D1D89F02B3E1), UINT64_C(0xAE5CC580FA1BB7E6), - UINT64_C(0x295695FB7E59FC3A), UINT64_C(0x76B6C820A40DD35E), UINT64_C(0xB1680A1768462B17), UINT64_C(0x2FB6AF279137DADA), - UINT64_C(0x28FB6B4366C78535), UINT64_C(0xEC278E53924541B1), UINT64_C(0x164F8AAB8A2A28B5), UINT64_C(0xB6C330AEAC4578AD), - UINT64_C(0x7F6F371070085084), UINT64_C(0x94DEAD60C0F448D3), UINT64_C(0x99737AC232C559EF), UINT64_C(0x6F54A6F9CA8EDD57), - UINT64_C(0x979B01E926BFCE0C), UINT64_C(0xF7D20BC85439C5B4), UINT64_C(0x64EDB27CD8087C12), UINT64_C(0x11488DE5F79C0BE2), - UINT64_C(0x25541DDD1680B5A4), UINT64_C(0x8B633D33BE9D1973), UINT64_C(0x404A3113ACF7F6C6), UINT64_C(0xC59DBDEF8550CD56), - UINT64_C(0x039D23C68F4F992C), UINT64_C(0x5BBB48E4BDD6FD86), UINT64_C(0x41E312248780DF5A), UINT64_C(0xD34791CE75D4E94F), - UINT64_C(0xED523E5D04DCDCFF), UINT64_C(0x7A6BCE0B6182D879), UINT64_C(0x21FB37483CAC28D8), UINT64_C(0x19A1B66E8DA878AD), - UINT64_C(0x6F804C5295B09ABE), UINT64_C(0x2A4BE5014115BA81), UINT64_C(0xA678ECC5FC924BE0), UINT64_C(0x50F7A54A99A36F59), - UINT64_C(0x0FD7E63A39A66452), UINT64_C(0x5AB1B213DD29C4E4), UINT64_C(0xF3ED80D9DF6534C5), UINT64_C(0xC736B12EF90615FD) +static const uint64_t t1ha_refval_2atonce [81] = { + 0, + UINT64_C(0x772C7311BE32FF42), UINT64_C(0x444753D23F207E03), UINT64_C(0x71F6DF5DA3B4F532), UINT64_C(0x555859635365F660), + UINT64_C(0xE98808F1CD39C626), UINT64_C(0x2EB18FAF2163BB09), UINT64_C(0x7B9DD892C8019C87), UINT64_C(0xE2B1431C4DA4D15A), + UINT64_C(0x1984E718A5477F70), UINT64_C(0x08DD17B266484F79), UINT64_C(0x4C83A05D766AD550), 
UINT64_C(0x92DCEBB131D1907D), + UINT64_C(0xD67BC6FC881B8549), UINT64_C(0xF6A9886555FBF66B), UINT64_C(0x6E31616D7F33E25E), UINT64_C(0x36E31B7426E3049D), + UINT64_C(0x4F8E4FAF46A13F5F), UINT64_C(0x03EB0CB3253F819F), UINT64_C(0x636A7769905770D2), UINT64_C(0x3ADF3781D16D1148), + UINT64_C(0x92D19CB1818BC9C2), UINT64_C(0x283E68F4D459C533), UINT64_C(0xFA83A8A88DECAA04), UINT64_C(0x8C6F00368EAC538C), + UINT64_C(0x7B66B0CF3797B322), UINT64_C(0x5131E122FDABA3FF), UINT64_C(0x6E59FF515C08C7A9), UINT64_C(0xBA2C5269B2C377B0), + UINT64_C(0xA9D24FD368FE8A2B), UINT64_C(0x22DB13D32E33E891), UINT64_C(0x7B97DFC804B876E5), UINT64_C(0xC598BDFCD0E834F9), + UINT64_C(0xB256163D3687F5A7), UINT64_C(0x66D7A73C6AEF50B3), UINT64_C(0x25A7201C85D9E2A3), UINT64_C(0x911573EDA15299AA), + UINT64_C(0x5C0062B669E18E4C), UINT64_C(0x17734ADE08D54E28), UINT64_C(0xFFF036E33883F43B), UINT64_C(0xFE0756E7777DF11E), + UINT64_C(0x37972472D023F129), UINT64_C(0x6CFCE201B55C7F57), UINT64_C(0xE019D1D89F02B3E1), UINT64_C(0xAE5CC580FA1BB7E6), + UINT64_C(0x295695FB7E59FC3A), UINT64_C(0x76B6C820A40DD35E), UINT64_C(0xB1680A1768462B17), UINT64_C(0x2FB6AF279137DADA), + UINT64_C(0x28FB6B4366C78535), UINT64_C(0xEC278E53924541B1), UINT64_C(0x164F8AAB8A2A28B5), UINT64_C(0xB6C330AEAC4578AD), + UINT64_C(0x7F6F371070085084), UINT64_C(0x94DEAD60C0F448D3), UINT64_C(0x99737AC232C559EF), UINT64_C(0x6F54A6F9CA8EDD57), + UINT64_C(0x979B01E926BFCE0C), UINT64_C(0xF7D20BC85439C5B4), UINT64_C(0x64EDB27CD8087C12), UINT64_C(0x11488DE5F79C0BE2), + UINT64_C(0x25541DDD1680B5A4), UINT64_C(0x8B633D33BE9D1973), UINT64_C(0x404A3113ACF7F6C6), UINT64_C(0xC59DBDEF8550CD56), + UINT64_C(0x039D23C68F4F992C), UINT64_C(0x5BBB48E4BDD6FD86), UINT64_C(0x41E312248780DF5A), UINT64_C(0xD34791CE75D4E94F), + UINT64_C(0xED523E5D04DCDCFF), UINT64_C(0x7A6BCE0B6182D879), UINT64_C(0x21FB37483CAC28D8), UINT64_C(0x19A1B66E8DA878AD), + UINT64_C(0x6F804C5295B09ABE), UINT64_C(0x2A4BE5014115BA81), UINT64_C(0xA678ECC5FC924BE0), UINT64_C(0x50F7A54A99A36F59), + 
UINT64_C(0x0FD7E63A39A66452), UINT64_C(0x5AB1B213DD29C4E4), UINT64_C(0xF3ED80D9DF6534C5), UINT64_C(0xC736B12EF90615FD) }; -static const uint64_t t1ha_refval_2atonce128[81] = { UINT64_C(0x4EC7F6A48E33B00A), - UINT64_C(0xB7B7FAA5BD7D8C1E), UINT64_C(0x3269533F66534A76), UINT64_C(0x6C3EC6B687923BFC), UINT64_C(0xC096F5E7EFA471A9), - UINT64_C(0x79D8AFB550CEA471), UINT64_C(0xCEE0507A20FD5119), UINT64_C(0xFB04CFFC14A9F4BF), UINT64_C(0xBD4406E923807AF2), - UINT64_C(0x375C02FF11010491), UINT64_C(0xA6EA4C2A59E173FF), UINT64_C(0xE0A606F0002CADDF), UINT64_C(0xE13BEAE6EBC07897), - UINT64_C(0xF069C2463E48EA10), UINT64_C(0x75BEE1A97089B5FA), UINT64_C(0x378F22F8DE0B8085), UINT64_C(0x9C726FC4D53D0D8B), - UINT64_C(0x71F6130A2D08F788), UINT64_C(0x7A9B20433FF6CF69), UINT64_C(0xFF49B7CD59BF6D61), UINT64_C(0xCCAAEE0D1CA9C6B3), - UINT64_C(0xC77889D86039D2AD), UINT64_C(0x7B378B5BEA9B0475), UINT64_C(0x6520BFA79D59AD66), UINT64_C(0x2441490CB8A37267), - UINT64_C(0xA715A66B7D5CF473), UINT64_C(0x9AE892C88334FD67), UINT64_C(0xD2FFE9AEC1D2169A), UINT64_C(0x790B993F18B18CBB), - UINT64_C(0xA0D02FBCF6A7B1AD), UINT64_C(0xA90833E6F151D0C1), UINT64_C(0x1AC7AFA37BD79BE0), UINT64_C(0xD5383628B2881A24), - UINT64_C(0xE5526F9D63F9F8F1), UINT64_C(0xC1F165A01A6D1F4D), UINT64_C(0x6CCEF8FF3FCFA3F2), UINT64_C(0x2030F18325E6DF48), - UINT64_C(0x289207230E3FB17A), UINT64_C(0x077B66F713A3C4B9), UINT64_C(0x9F39843CAF871754), UINT64_C(0x512FDA0F808ACCF3), - UINT64_C(0xF4D9801CD0CD1F14), UINT64_C(0x28A0C749ED323638), UINT64_C(0x94844CAFA671F01C), UINT64_C(0xD0E261876B8ACA51), - UINT64_C(0x8FC2A648A4792EA2), UINT64_C(0x8EF87282136AF5FE), UINT64_C(0x5FE6A54A9FBA6B40), UINT64_C(0xA3CC5B8FE6223D54), - UINT64_C(0xA8C3C0DD651BB01C), UINT64_C(0x625E9FDD534716F3), UINT64_C(0x1AB2604083C33AC5), UINT64_C(0xDE098853F8692F12), - UINT64_C(0x4B0813891BD87624), UINT64_C(0x4AB89C4553D182AD), UINT64_C(0x92C15AA2A3C27ADA), UINT64_C(0xFF2918D68191F5D9), - UINT64_C(0x06363174F641C325), UINT64_C(0x667112ADA74A2059), 
UINT64_C(0x4BD605D6B5E53D7D), UINT64_C(0xF2512C53663A14C8), - UINT64_C(0x21857BCB1852667C), UINT64_C(0xAFBEBD0369AEE228), UINT64_C(0x7049340E48FBFD6B), UINT64_C(0x50710E1924F46954), - UINT64_C(0x869A75E04A976A3F), UINT64_C(0x5A41ABBDD6373889), UINT64_C(0xA781778389B4B188), UINT64_C(0x21A3AFCED6C925B6), - UINT64_C(0x107226192EC10B42), UINT64_C(0x62A862E84EC2F9B1), UINT64_C(0x2B15E91659606DD7), UINT64_C(0x613934D1F9EC5A42), - UINT64_C(0x4DC3A96DC5361BAF), UINT64_C(0xC80BBA4CB5F12903), UINT64_C(0x3E3EDAE99A7D6987), UINT64_C(0x8F97B2D55941DCB0), - UINT64_C(0x4C9787364C3E4EC1), UINT64_C(0xEF0A2D07BEA90CA7), UINT64_C(0x5FABF32C70AEEAFB), UINT64_C(0x3356A5CFA8F23BF4) +static const uint64_t t1ha_refval_2atonce128[81] = { + UINT64_C(0x4EC7F6A48E33B00A), + UINT64_C(0xB7B7FAA5BD7D8C1E), UINT64_C(0x3269533F66534A76), UINT64_C(0x6C3EC6B687923BFC), UINT64_C(0xC096F5E7EFA471A9), + UINT64_C(0x79D8AFB550CEA471), UINT64_C(0xCEE0507A20FD5119), UINT64_C(0xFB04CFFC14A9F4BF), UINT64_C(0xBD4406E923807AF2), + UINT64_C(0x375C02FF11010491), UINT64_C(0xA6EA4C2A59E173FF), UINT64_C(0xE0A606F0002CADDF), UINT64_C(0xE13BEAE6EBC07897), + UINT64_C(0xF069C2463E48EA10), UINT64_C(0x75BEE1A97089B5FA), UINT64_C(0x378F22F8DE0B8085), UINT64_C(0x9C726FC4D53D0D8B), + UINT64_C(0x71F6130A2D08F788), UINT64_C(0x7A9B20433FF6CF69), UINT64_C(0xFF49B7CD59BF6D61), UINT64_C(0xCCAAEE0D1CA9C6B3), + UINT64_C(0xC77889D86039D2AD), UINT64_C(0x7B378B5BEA9B0475), UINT64_C(0x6520BFA79D59AD66), UINT64_C(0x2441490CB8A37267), + UINT64_C(0xA715A66B7D5CF473), UINT64_C(0x9AE892C88334FD67), UINT64_C(0xD2FFE9AEC1D2169A), UINT64_C(0x790B993F18B18CBB), + UINT64_C(0xA0D02FBCF6A7B1AD), UINT64_C(0xA90833E6F151D0C1), UINT64_C(0x1AC7AFA37BD79BE0), UINT64_C(0xD5383628B2881A24), + UINT64_C(0xE5526F9D63F9F8F1), UINT64_C(0xC1F165A01A6D1F4D), UINT64_C(0x6CCEF8FF3FCFA3F2), UINT64_C(0x2030F18325E6DF48), + UINT64_C(0x289207230E3FB17A), UINT64_C(0x077B66F713A3C4B9), UINT64_C(0x9F39843CAF871754), UINT64_C(0x512FDA0F808ACCF3), + 
UINT64_C(0xF4D9801CD0CD1F14), UINT64_C(0x28A0C749ED323638), UINT64_C(0x94844CAFA671F01C), UINT64_C(0xD0E261876B8ACA51), + UINT64_C(0x8FC2A648A4792EA2), UINT64_C(0x8EF87282136AF5FE), UINT64_C(0x5FE6A54A9FBA6B40), UINT64_C(0xA3CC5B8FE6223D54), + UINT64_C(0xA8C3C0DD651BB01C), UINT64_C(0x625E9FDD534716F3), UINT64_C(0x1AB2604083C33AC5), UINT64_C(0xDE098853F8692F12), + UINT64_C(0x4B0813891BD87624), UINT64_C(0x4AB89C4553D182AD), UINT64_C(0x92C15AA2A3C27ADA), UINT64_C(0xFF2918D68191F5D9), + UINT64_C(0x06363174F641C325), UINT64_C(0x667112ADA74A2059), UINT64_C(0x4BD605D6B5E53D7D), UINT64_C(0xF2512C53663A14C8), + UINT64_C(0x21857BCB1852667C), UINT64_C(0xAFBEBD0369AEE228), UINT64_C(0x7049340E48FBFD6B), UINT64_C(0x50710E1924F46954), + UINT64_C(0x869A75E04A976A3F), UINT64_C(0x5A41ABBDD6373889), UINT64_C(0xA781778389B4B188), UINT64_C(0x21A3AFCED6C925B6), + UINT64_C(0x107226192EC10B42), UINT64_C(0x62A862E84EC2F9B1), UINT64_C(0x2B15E91659606DD7), UINT64_C(0x613934D1F9EC5A42), + UINT64_C(0x4DC3A96DC5361BAF), UINT64_C(0xC80BBA4CB5F12903), UINT64_C(0x3E3EDAE99A7D6987), UINT64_C(0x8F97B2D55941DCB0), + UINT64_C(0x4C9787364C3E4EC1), UINT64_C(0xEF0A2D07BEA90CA7), UINT64_C(0x5FABF32C70AEEAFB), UINT64_C(0x3356A5CFA8F23BF4) }; -static const uint64_t t1ha_refval_2stream[81] = { UINT64_C(0x3C8426E33CB41606), - UINT64_C(0xFD74BE70EE73E617), UINT64_C(0xF43DE3CDD8A20486), UINT64_C(0x882FBCB37E8EA3BB), UINT64_C(0x1AA2CDD34CAA3D4B), - UINT64_C(0xEE755B2BFAE07ED5), UINT64_C(0xD4E225250D92E213), UINT64_C(0xA09B49083205965B), UINT64_C(0xD47B21724EF9EC9E), - UINT64_C(0xAC888FC3858CEE11), UINT64_C(0x94F820D85736F244), UINT64_C(0x1707951CCA920932), UINT64_C(0x8E0E45603F7877F0), - UINT64_C(0x9FD2592C0E3A7212), UINT64_C(0x9A66370F3AE3D427), UINT64_C(0xD33382D2161DE2B7), UINT64_C(0x9A35BE079DA7115F), - UINT64_C(0x73457C7FF58B4EC3), UINT64_C(0xBE8610BD53D7CE98), UINT64_C(0x65506DFE5CCD5371), UINT64_C(0x286A321AF9D5D9FA), - UINT64_C(0xB81EF9A7EF3C536D), UINT64_C(0x2CFDB5E6825C6E86), 
UINT64_C(0xB2A58CBFDFDD303A), UINT64_C(0xD26094A42B950635), - UINT64_C(0xA34D666A5F02AD9A), UINT64_C(0x0151E013EBCC72E5), UINT64_C(0x9254A6EA7FCB6BB5), UINT64_C(0x10C9361B3869DC2B), - UINT64_C(0xD7EC55A060606276), UINT64_C(0xA2FF7F8BF8976FFD), UINT64_C(0xB5181BB6852DCC88), UINT64_C(0x0EE394BB6178BAFF), - UINT64_C(0x3A8B4B400D21B89C), UINT64_C(0xEC270461970960FD), UINT64_C(0x615967FAB053877E), UINT64_C(0xFA51BF1CFEB4714C), - UINT64_C(0x29FDA8383070F375), UINT64_C(0xC3B663061BC52EDA), UINT64_C(0x192BBAF1F1A57923), UINT64_C(0x6D193B52F93C53AF), - UINT64_C(0x7F6F5639FE87CA1E), UINT64_C(0x69F7F9140B32EDC8), UINT64_C(0xD0F2416FB24325B6), UINT64_C(0x62C0E37FEDD49FF3), - UINT64_C(0x57866A4B809D373D), UINT64_C(0x9848D24BD935E137), UINT64_C(0xDFC905B66734D50A), UINT64_C(0x9A938DD194A68529), - UINT64_C(0x8276C44DF0625228), UINT64_C(0xA4B35D00AD67C0AB), UINT64_C(0x3D9CB359842DB452), UINT64_C(0x4241BFA8C23B267F), - UINT64_C(0x650FA517BEF15952), UINT64_C(0x782DE2ABD8C7B1E1), UINT64_C(0x4EAE456166CA3E15), UINT64_C(0x40CDF3A02614E337), - UINT64_C(0xAD84092C46102172), UINT64_C(0x0C68479B03F9A167), UINT64_C(0x7E1BA046749E181C), UINT64_C(0x3F3AB41A697382C1), - UINT64_C(0xC5E5DD6586EBFDC4), UINT64_C(0xFF926CD4EB02555C), UINT64_C(0x035CFE67F89E709B), UINT64_C(0x89F06AB6464A1B9D), - UINT64_C(0x8EFF58F3F7DEA758), UINT64_C(0x8B54AC657902089F), UINT64_C(0xC6C4F1F9F8DA4D64), UINT64_C(0xBDB729048AAAC93A), - UINT64_C(0xEA76BA628F5E5CD6), UINT64_C(0x742159B728B8A979), UINT64_C(0x6D151CD3C720E53D), UINT64_C(0xE97FFF9368FCDC42), - UINT64_C(0xCA5B38314914FBDA), UINT64_C(0xDD92C91D8B858EAE), UINT64_C(0x66E5F07CF647CBF2), UINT64_C(0xD4CF9B42F4985AFB), - UINT64_C(0x72AE17AC7D92F6B7), UINT64_C(0xB8206B22AB0472E1), UINT64_C(0x385876B5CFD42479), UINT64_C(0x03294A249EBE6B26) +static const uint64_t t1ha_refval_2stream [81] = { + UINT64_C(0x3C8426E33CB41606), + UINT64_C(0xFD74BE70EE73E617), UINT64_C(0xF43DE3CDD8A20486), UINT64_C(0x882FBCB37E8EA3BB), UINT64_C(0x1AA2CDD34CAA3D4B), + 
UINT64_C(0xEE755B2BFAE07ED5), UINT64_C(0xD4E225250D92E213), UINT64_C(0xA09B49083205965B), UINT64_C(0xD47B21724EF9EC9E), + UINT64_C(0xAC888FC3858CEE11), UINT64_C(0x94F820D85736F244), UINT64_C(0x1707951CCA920932), UINT64_C(0x8E0E45603F7877F0), + UINT64_C(0x9FD2592C0E3A7212), UINT64_C(0x9A66370F3AE3D427), UINT64_C(0xD33382D2161DE2B7), UINT64_C(0x9A35BE079DA7115F), + UINT64_C(0x73457C7FF58B4EC3), UINT64_C(0xBE8610BD53D7CE98), UINT64_C(0x65506DFE5CCD5371), UINT64_C(0x286A321AF9D5D9FA), + UINT64_C(0xB81EF9A7EF3C536D), UINT64_C(0x2CFDB5E6825C6E86), UINT64_C(0xB2A58CBFDFDD303A), UINT64_C(0xD26094A42B950635), + UINT64_C(0xA34D666A5F02AD9A), UINT64_C(0x0151E013EBCC72E5), UINT64_C(0x9254A6EA7FCB6BB5), UINT64_C(0x10C9361B3869DC2B), + UINT64_C(0xD7EC55A060606276), UINT64_C(0xA2FF7F8BF8976FFD), UINT64_C(0xB5181BB6852DCC88), UINT64_C(0x0EE394BB6178BAFF), + UINT64_C(0x3A8B4B400D21B89C), UINT64_C(0xEC270461970960FD), UINT64_C(0x615967FAB053877E), UINT64_C(0xFA51BF1CFEB4714C), + UINT64_C(0x29FDA8383070F375), UINT64_C(0xC3B663061BC52EDA), UINT64_C(0x192BBAF1F1A57923), UINT64_C(0x6D193B52F93C53AF), + UINT64_C(0x7F6F5639FE87CA1E), UINT64_C(0x69F7F9140B32EDC8), UINT64_C(0xD0F2416FB24325B6), UINT64_C(0x62C0E37FEDD49FF3), + UINT64_C(0x57866A4B809D373D), UINT64_C(0x9848D24BD935E137), UINT64_C(0xDFC905B66734D50A), UINT64_C(0x9A938DD194A68529), + UINT64_C(0x8276C44DF0625228), UINT64_C(0xA4B35D00AD67C0AB), UINT64_C(0x3D9CB359842DB452), UINT64_C(0x4241BFA8C23B267F), + UINT64_C(0x650FA517BEF15952), UINT64_C(0x782DE2ABD8C7B1E1), UINT64_C(0x4EAE456166CA3E15), UINT64_C(0x40CDF3A02614E337), + UINT64_C(0xAD84092C46102172), UINT64_C(0x0C68479B03F9A167), UINT64_C(0x7E1BA046749E181C), UINT64_C(0x3F3AB41A697382C1), + UINT64_C(0xC5E5DD6586EBFDC4), UINT64_C(0xFF926CD4EB02555C), UINT64_C(0x035CFE67F89E709B), UINT64_C(0x89F06AB6464A1B9D), + UINT64_C(0x8EFF58F3F7DEA758), UINT64_C(0x8B54AC657902089F), UINT64_C(0xC6C4F1F9F8DA4D64), UINT64_C(0xBDB729048AAAC93A), + UINT64_C(0xEA76BA628F5E5CD6), 
UINT64_C(0x742159B728B8A979), UINT64_C(0x6D151CD3C720E53D), UINT64_C(0xE97FFF9368FCDC42), + UINT64_C(0xCA5B38314914FBDA), UINT64_C(0xDD92C91D8B858EAE), UINT64_C(0x66E5F07CF647CBF2), UINT64_C(0xD4CF9B42F4985AFB), + UINT64_C(0x72AE17AC7D92F6B7), UINT64_C(0xB8206B22AB0472E1), UINT64_C(0x385876B5CFD42479), UINT64_C(0x03294A249EBE6B26) }; -static const uint64_t t1ha_refval_2stream128[81] = { UINT64_C(0xCD2801D3B92237D6), - UINT64_C(0x10E4D47BD821546D), UINT64_C(0x9100704B9D65CD06), UINT64_C(0xD6951CB4016313EF), UINT64_C(0x24DB636F96F474DA), - UINT64_C(0x3F4AF7DF3C49E422), UINT64_C(0xBFF25B8AF143459B), UINT64_C(0xA157EC13538BE549), UINT64_C(0xD3F5F52C47DBD419), - UINT64_C(0x0EF3D7D735AF1575), UINT64_C(0x46B7B892823F7B1B), UINT64_C(0xEE22EA4655213289), UINT64_C(0x56AD76F02FE929BC), - UINT64_C(0x9CF6CD1AC886546E), UINT64_C(0xAF45CE47AEA0B933), UINT64_C(0x535F9DC09F3996B7), UINT64_C(0x1F0C3C01694AE128), - UINT64_C(0x18495069BE0766F7), UINT64_C(0x37E5FFB3D72A4CB1), UINT64_C(0x6D6C2E9299F30709), UINT64_C(0x4F39E693F50B41E3), - UINT64_C(0xB11FC4EF0658E116), UINT64_C(0x48BFAACB78E5079B), UINT64_C(0xE1B4C89C781B3AD0), UINT64_C(0x81D2F34888D333A1), - UINT64_C(0xF6D02270D2EA449C), UINT64_C(0xC884C3C2C3CE1503), UINT64_C(0x711AE16BA157A9B9), UINT64_C(0x1E6140C642558C9D), - UINT64_C(0x35AB3D238F5DC55B), UINT64_C(0x33F07B6AEF051177), UINT64_C(0xE57336776EEFA71C), UINT64_C(0x6D445F8318BA3752), - UINT64_C(0xD4F5F6631934C988), UINT64_C(0xD5E260085727C4A2), UINT64_C(0x5B54B41EC180B4FA), UINT64_C(0x7F5D75769C15A898), - UINT64_C(0xAE5A6DB850CA33C6), UINT64_C(0x038CCB8044663403), UINT64_C(0xDA16310133DC92B8), UINT64_C(0x6A2FFB7AB2B7CE2B), - UINT64_C(0xDC1832D9229BAE20), UINT64_C(0x8C62C479F5ABC9E4), UINT64_C(0x5EB7B617857C9CCB), UINT64_C(0xB79CF7D749A1E80D), - UINT64_C(0xDE7FAC3798324FD3), UINT64_C(0x8178911813685D06), UINT64_C(0x6A726CBD394D4410), UINT64_C(0x6CBE6B3280DA1113), - UINT64_C(0x6829BA4410CF1148), UINT64_C(0xFA7E417EB26C5BC6), UINT64_C(0x22ED87884D6E3A49), 
UINT64_C(0x15F1472D5115669D), - UINT64_C(0x2EA0B4C8BF69D318), UINT64_C(0xDFE87070AA545503), UINT64_C(0x6B4C14B5F7144AB9), UINT64_C(0xC1ED49C06126551A), - UINT64_C(0x351919FC425C3899), UINT64_C(0x7B569C0FA6F1BD3E), UINT64_C(0x713AC2350844CFFD), UINT64_C(0xE9367F9A638C2FF3), - UINT64_C(0x97F17D325AEA0786), UINT64_C(0xBCB907CC6CF75F91), UINT64_C(0x0CB7517DAF247719), UINT64_C(0xBE16093CC45BE8A9), - UINT64_C(0x786EEE97359AD6AB), UINT64_C(0xB7AFA4F326B97E78), UINT64_C(0x2694B67FE23E502E), UINT64_C(0x4CB492826E98E0B4), - UINT64_C(0x838D119F74A416C7), UINT64_C(0x70D6A91E4E5677FD), UINT64_C(0xF3E4027AD30000E6), UINT64_C(0x9BDF692795807F77), - UINT64_C(0x6A371F966E034A54), UINT64_C(0x8789CF41AE4D67EF), UINT64_C(0x02688755484D60AE), UINT64_C(0xD5834B3A4BF5CE42), - UINT64_C(0x9405FC61440DE25D), UINT64_C(0x35EB280A157979B6), UINT64_C(0x48D40D6A525297AC), UINT64_C(0x6A87DC185054BADA) +static const uint64_t t1ha_refval_2stream128[81] = { + UINT64_C(0xCD2801D3B92237D6), + UINT64_C(0x10E4D47BD821546D), UINT64_C(0x9100704B9D65CD06), UINT64_C(0xD6951CB4016313EF), UINT64_C(0x24DB636F96F474DA), + UINT64_C(0x3F4AF7DF3C49E422), UINT64_C(0xBFF25B8AF143459B), UINT64_C(0xA157EC13538BE549), UINT64_C(0xD3F5F52C47DBD419), + UINT64_C(0x0EF3D7D735AF1575), UINT64_C(0x46B7B892823F7B1B), UINT64_C(0xEE22EA4655213289), UINT64_C(0x56AD76F02FE929BC), + UINT64_C(0x9CF6CD1AC886546E), UINT64_C(0xAF45CE47AEA0B933), UINT64_C(0x535F9DC09F3996B7), UINT64_C(0x1F0C3C01694AE128), + UINT64_C(0x18495069BE0766F7), UINT64_C(0x37E5FFB3D72A4CB1), UINT64_C(0x6D6C2E9299F30709), UINT64_C(0x4F39E693F50B41E3), + UINT64_C(0xB11FC4EF0658E116), UINT64_C(0x48BFAACB78E5079B), UINT64_C(0xE1B4C89C781B3AD0), UINT64_C(0x81D2F34888D333A1), + UINT64_C(0xF6D02270D2EA449C), UINT64_C(0xC884C3C2C3CE1503), UINT64_C(0x711AE16BA157A9B9), UINT64_C(0x1E6140C642558C9D), + UINT64_C(0x35AB3D238F5DC55B), UINT64_C(0x33F07B6AEF051177), UINT64_C(0xE57336776EEFA71C), UINT64_C(0x6D445F8318BA3752), + UINT64_C(0xD4F5F6631934C988), 
UINT64_C(0xD5E260085727C4A2), UINT64_C(0x5B54B41EC180B4FA), UINT64_C(0x7F5D75769C15A898), + UINT64_C(0xAE5A6DB850CA33C6), UINT64_C(0x038CCB8044663403), UINT64_C(0xDA16310133DC92B8), UINT64_C(0x6A2FFB7AB2B7CE2B), + UINT64_C(0xDC1832D9229BAE20), UINT64_C(0x8C62C479F5ABC9E4), UINT64_C(0x5EB7B617857C9CCB), UINT64_C(0xB79CF7D749A1E80D), + UINT64_C(0xDE7FAC3798324FD3), UINT64_C(0x8178911813685D06), UINT64_C(0x6A726CBD394D4410), UINT64_C(0x6CBE6B3280DA1113), + UINT64_C(0x6829BA4410CF1148), UINT64_C(0xFA7E417EB26C5BC6), UINT64_C(0x22ED87884D6E3A49), UINT64_C(0x15F1472D5115669D), + UINT64_C(0x2EA0B4C8BF69D318), UINT64_C(0xDFE87070AA545503), UINT64_C(0x6B4C14B5F7144AB9), UINT64_C(0xC1ED49C06126551A), + UINT64_C(0x351919FC425C3899), UINT64_C(0x7B569C0FA6F1BD3E), UINT64_C(0x713AC2350844CFFD), UINT64_C(0xE9367F9A638C2FF3), + UINT64_C(0x97F17D325AEA0786), UINT64_C(0xBCB907CC6CF75F91), UINT64_C(0x0CB7517DAF247719), UINT64_C(0xBE16093CC45BE8A9), + UINT64_C(0x786EEE97359AD6AB), UINT64_C(0xB7AFA4F326B97E78), UINT64_C(0x2694B67FE23E502E), UINT64_C(0x4CB492826E98E0B4), + UINT64_C(0x838D119F74A416C7), UINT64_C(0x70D6A91E4E5677FD), UINT64_C(0xF3E4027AD30000E6), UINT64_C(0x9BDF692795807F77), + UINT64_C(0x6A371F966E034A54), UINT64_C(0x8789CF41AE4D67EF), UINT64_C(0x02688755484D60AE), UINT64_C(0xD5834B3A4BF5CE42), + UINT64_C(0x9405FC61440DE25D), UINT64_C(0x35EB280A157979B6), UINT64_C(0x48D40D6A525297AC), UINT64_C(0x6A87DC185054BADA) }; #if defined(HAVE_X86_64_AES) -static const uint64_t t1ha_refval_ia32aes_a[81] = { 0, - UINT64_C(0x772C7311BE32FF42), UINT64_C(0xB231AC660E5B23B5), UINT64_C(0x71F6DF5DA3B4F532), UINT64_C(0x555859635365F660), - UINT64_C(0xE98808F1CD39C626), UINT64_C(0x2EB18FAF2163BB09), UINT64_C(0x7B9DD892C8019C87), UINT64_C(0xE2B1431C4DA4D15A), - UINT64_C(0x1984E718A5477F70), UINT64_C(0x08DD17B266484F79), UINT64_C(0x4C83A05D766AD550), UINT64_C(0x92DCEBB131D1907D), - UINT64_C(0xD67BC6FC881B8549), UINT64_C(0xF6A9886555FBF66B), UINT64_C(0x6E31616D7F33E25E), 
UINT64_C(0x36E31B7426E3049D), - UINT64_C(0x4F8E4FAF46A13F5F), UINT64_C(0x03EB0CB3253F819F), UINT64_C(0x636A7769905770D2), UINT64_C(0x3ADF3781D16D1148), - UINT64_C(0x92D19CB1818BC9C2), UINT64_C(0x283E68F4D459C533), UINT64_C(0xFA83A8A88DECAA04), UINT64_C(0x8C6F00368EAC538C), - UINT64_C(0x7B66B0CF3797B322), UINT64_C(0x5131E122FDABA3FF), UINT64_C(0x6E59FF515C08C7A9), UINT64_C(0xBA2C5269B2C377B0), - UINT64_C(0xA9D24FD368FE8A2B), UINT64_C(0x22DB13D32E33E891), UINT64_C(0x7B97DFC804B876E5), UINT64_C(0xC598BDFCD0E834F9), - UINT64_C(0xB256163D3687F5A7), UINT64_C(0x66D7A73C6AEF50B3), UINT64_C(0xBB34C6A4396695D2), UINT64_C(0x7F46E1981C3256AD), - UINT64_C(0x4B25A9B217A6C5B4), UINT64_C(0x7A0A6BCDD2321DA9), UINT64_C(0x0A1F55E690A7B44E), UINT64_C(0x8F451A91D7F05244), - UINT64_C(0x624D5D3C9B9800A7), UINT64_C(0x09DDC2B6409DDC25), UINT64_C(0x3E155765865622B6), UINT64_C(0x96519FAC9511B381), - UINT64_C(0x512E58482FE4FBF0), UINT64_C(0x1AB260EA7D54AE1C), UINT64_C(0x67976F12CC28BBBD), UINT64_C(0x0607B5B2E6250156), - UINT64_C(0x7E700BEA717AD36E), UINT64_C(0x06A058D9D61CABB3), UINT64_C(0x57DA5324A824972F), UINT64_C(0x1193BA74DBEBF7E7), - UINT64_C(0xC18DC3140E7002D4), UINT64_C(0x9F7CCC11DFA0EF17), UINT64_C(0xC487D6C20666A13A), UINT64_C(0xB67190E4B50EF0C8), - UINT64_C(0xA53DAA608DF0B9A5), UINT64_C(0x7E13101DE87F9ED3), UINT64_C(0x7F8955AE2F05088B), UINT64_C(0x2DF7E5A097AD383F), - UINT64_C(0xF027683A21EA14B5), UINT64_C(0x9BB8AEC3E3360942), UINT64_C(0x92BE39B54967E7FE), UINT64_C(0x978C6D332E7AFD27), - UINT64_C(0xED512FE96A4FAE81), UINT64_C(0x9E1099B8140D7BA3), UINT64_C(0xDFD5A5BE1E6FE9A6), UINT64_C(0x1D82600E23B66DD4), - UINT64_C(0x3FA3C3B7EE7B52CE), UINT64_C(0xEE84F7D2A655EF4C), UINT64_C(0x2A4361EC769E3BEB), UINT64_C(0x22E4B38916636702), - UINT64_C(0x0063096F5D39A115), UINT64_C(0x6C51B24DAAFA5434), UINT64_C(0xBAFB1DB1B411E344), UINT64_C(0xFF529F161AE0C4B0), - UINT64_C(0x1290EAE3AC0A686F), UINT64_C(0xA7B0D4585447D1BE), UINT64_C(0xAED3D18CB6CCAD53), UINT64_C(0xFC73D46F8B41BEC6) +static const 
uint64_t t1ha_refval_ia32aes_a [81] = { + 0, + UINT64_C(0x772C7311BE32FF42), UINT64_C(0xB231AC660E5B23B5), UINT64_C(0x71F6DF5DA3B4F532), UINT64_C(0x555859635365F660), + UINT64_C(0xE98808F1CD39C626), UINT64_C(0x2EB18FAF2163BB09), UINT64_C(0x7B9DD892C8019C87), UINT64_C(0xE2B1431C4DA4D15A), + UINT64_C(0x1984E718A5477F70), UINT64_C(0x08DD17B266484F79), UINT64_C(0x4C83A05D766AD550), UINT64_C(0x92DCEBB131D1907D), + UINT64_C(0xD67BC6FC881B8549), UINT64_C(0xF6A9886555FBF66B), UINT64_C(0x6E31616D7F33E25E), UINT64_C(0x36E31B7426E3049D), + UINT64_C(0x4F8E4FAF46A13F5F), UINT64_C(0x03EB0CB3253F819F), UINT64_C(0x636A7769905770D2), UINT64_C(0x3ADF3781D16D1148), + UINT64_C(0x92D19CB1818BC9C2), UINT64_C(0x283E68F4D459C533), UINT64_C(0xFA83A8A88DECAA04), UINT64_C(0x8C6F00368EAC538C), + UINT64_C(0x7B66B0CF3797B322), UINT64_C(0x5131E122FDABA3FF), UINT64_C(0x6E59FF515C08C7A9), UINT64_C(0xBA2C5269B2C377B0), + UINT64_C(0xA9D24FD368FE8A2B), UINT64_C(0x22DB13D32E33E891), UINT64_C(0x7B97DFC804B876E5), UINT64_C(0xC598BDFCD0E834F9), + UINT64_C(0xB256163D3687F5A7), UINT64_C(0x66D7A73C6AEF50B3), UINT64_C(0xBB34C6A4396695D2), UINT64_C(0x7F46E1981C3256AD), + UINT64_C(0x4B25A9B217A6C5B4), UINT64_C(0x7A0A6BCDD2321DA9), UINT64_C(0x0A1F55E690A7B44E), UINT64_C(0x8F451A91D7F05244), + UINT64_C(0x624D5D3C9B9800A7), UINT64_C(0x09DDC2B6409DDC25), UINT64_C(0x3E155765865622B6), UINT64_C(0x96519FAC9511B381), + UINT64_C(0x512E58482FE4FBF0), UINT64_C(0x1AB260EA7D54AE1C), UINT64_C(0x67976F12CC28BBBD), UINT64_C(0x0607B5B2E6250156), + UINT64_C(0x7E700BEA717AD36E), UINT64_C(0x06A058D9D61CABB3), UINT64_C(0x57DA5324A824972F), UINT64_C(0x1193BA74DBEBF7E7), + UINT64_C(0xC18DC3140E7002D4), UINT64_C(0x9F7CCC11DFA0EF17), UINT64_C(0xC487D6C20666A13A), UINT64_C(0xB67190E4B50EF0C8), + UINT64_C(0xA53DAA608DF0B9A5), UINT64_C(0x7E13101DE87F9ED3), UINT64_C(0x7F8955AE2F05088B), UINT64_C(0x2DF7E5A097AD383F), + UINT64_C(0xF027683A21EA14B5), UINT64_C(0x9BB8AEC3E3360942), UINT64_C(0x92BE39B54967E7FE), UINT64_C(0x978C6D332E7AFD27), + 
UINT64_C(0xED512FE96A4FAE81), UINT64_C(0x9E1099B8140D7BA3), UINT64_C(0xDFD5A5BE1E6FE9A6), UINT64_C(0x1D82600E23B66DD4), + UINT64_C(0x3FA3C3B7EE7B52CE), UINT64_C(0xEE84F7D2A655EF4C), UINT64_C(0x2A4361EC769E3BEB), UINT64_C(0x22E4B38916636702), + UINT64_C(0x0063096F5D39A115), UINT64_C(0x6C51B24DAAFA5434), UINT64_C(0xBAFB1DB1B411E344), UINT64_C(0xFF529F161AE0C4B0), + UINT64_C(0x1290EAE3AC0A686F), UINT64_C(0xA7B0D4585447D1BE), UINT64_C(0xAED3D18CB6CCAD53), UINT64_C(0xFC73D46F8B41BEC6) }; -static const uint64_t t1ha_refval_ia32aes_b[81] = { 0, - UINT64_C(0x772C7311BE32FF42), UINT64_C(0x4398F62A8CB6F72A), UINT64_C(0x71F6DF5DA3B4F532), UINT64_C(0x555859635365F660), - UINT64_C(0xE98808F1CD39C626), UINT64_C(0x2EB18FAF2163BB09), UINT64_C(0x7B9DD892C8019C87), UINT64_C(0xE2B1431C4DA4D15A), - UINT64_C(0x1984E718A5477F70), UINT64_C(0x08DD17B266484F79), UINT64_C(0x4C83A05D766AD550), UINT64_C(0x92DCEBB131D1907D), - UINT64_C(0xD67BC6FC881B8549), UINT64_C(0xF6A9886555FBF66B), UINT64_C(0x6E31616D7F33E25E), UINT64_C(0x36E31B7426E3049D), - UINT64_C(0x4F8E4FAF46A13F5F), UINT64_C(0x03EB0CB3253F819F), UINT64_C(0x636A7769905770D2), UINT64_C(0x3ADF3781D16D1148), - UINT64_C(0x92D19CB1818BC9C2), UINT64_C(0x283E68F4D459C533), UINT64_C(0xFA83A8A88DECAA04), UINT64_C(0x8C6F00368EAC538C), - UINT64_C(0x7B66B0CF3797B322), UINT64_C(0x5131E122FDABA3FF), UINT64_C(0x6E59FF515C08C7A9), UINT64_C(0xBA2C5269B2C377B0), - UINT64_C(0xA9D24FD368FE8A2B), UINT64_C(0x22DB13D32E33E891), UINT64_C(0x7B97DFC804B876E5), UINT64_C(0xC598BDFCD0E834F9), - UINT64_C(0xB256163D3687F5A7), UINT64_C(0x66D7A73C6AEF50B3), UINT64_C(0xE810F88E85CEA11A), UINT64_C(0x4814F8F3B83E4394), - UINT64_C(0x9CABA22D10A2F690), UINT64_C(0x0D10032511F58111), UINT64_C(0xE9A36EF5EEA3CD58), UINT64_C(0xC79242DE194D9D7C), - UINT64_C(0xC3871AA0435EE5C8), UINT64_C(0x52890BED43CCF4CD), UINT64_C(0x07A1D0861ACCD373), UINT64_C(0x227B816FF0FEE9ED), - UINT64_C(0x59FFBF73AACFC0C4), UINT64_C(0x09AB564F2BEDAD0C), UINT64_C(0xC05F744F2EE38318), 
UINT64_C(0x7B50B621D547C661), - UINT64_C(0x0C1F71CB4E68E5D1), UINT64_C(0x0E33A47881D4DBAA), UINT64_C(0xF5C3BF198E9A7C2E), UINT64_C(0x16328FD8C0F68A91), - UINT64_C(0xA3E399C9AB3E9A59), UINT64_C(0x163AE71CBCBB18B8), UINT64_C(0x18F17E4A8C79F7AB), UINT64_C(0x9250E2EA37014B45), - UINT64_C(0x7BBBB111D60B03E4), UINT64_C(0x3DAA4A3071A0BD88), UINT64_C(0xA28828D790A2D6DC), UINT64_C(0xBC70FC88F64BE3F1), - UINT64_C(0xA3E48008BA4333C7), UINT64_C(0x739E435ACAFC79F7), UINT64_C(0x42BBB360BE007CC6), UINT64_C(0x4FFB6FD2AF74EC92), - UINT64_C(0x2A799A2994673146), UINT64_C(0xBE0A045B69D48E9F), UINT64_C(0x549432F54FC6A278), UINT64_C(0x371D3C60369FC702), - UINT64_C(0xDB4557D415B08CA7), UINT64_C(0xE8692F0A83850B37), UINT64_C(0x022E46AEB36E9AAB), UINT64_C(0x117AC9B814E4652D), - UINT64_C(0xA361041267AE9048), UINT64_C(0x277CB51C961C3DDA), UINT64_C(0xAFFC96F377CB8A8D), UINT64_C(0x83CC79FA01DD1BA7), - UINT64_C(0xA494842ACF4B802C), UINT64_C(0xFC6D9CDDE2C34A3F), UINT64_C(0x4ED6863CE455F7A7), UINT64_C(0x630914D0DB7AAE98) +static const uint64_t t1ha_refval_ia32aes_b [81] = { + 0, + UINT64_C(0x772C7311BE32FF42), UINT64_C(0x4398F62A8CB6F72A), UINT64_C(0x71F6DF5DA3B4F532), UINT64_C(0x555859635365F660), + UINT64_C(0xE98808F1CD39C626), UINT64_C(0x2EB18FAF2163BB09), UINT64_C(0x7B9DD892C8019C87), UINT64_C(0xE2B1431C4DA4D15A), + UINT64_C(0x1984E718A5477F70), UINT64_C(0x08DD17B266484F79), UINT64_C(0x4C83A05D766AD550), UINT64_C(0x92DCEBB131D1907D), + UINT64_C(0xD67BC6FC881B8549), UINT64_C(0xF6A9886555FBF66B), UINT64_C(0x6E31616D7F33E25E), UINT64_C(0x36E31B7426E3049D), + UINT64_C(0x4F8E4FAF46A13F5F), UINT64_C(0x03EB0CB3253F819F), UINT64_C(0x636A7769905770D2), UINT64_C(0x3ADF3781D16D1148), + UINT64_C(0x92D19CB1818BC9C2), UINT64_C(0x283E68F4D459C533), UINT64_C(0xFA83A8A88DECAA04), UINT64_C(0x8C6F00368EAC538C), + UINT64_C(0x7B66B0CF3797B322), UINT64_C(0x5131E122FDABA3FF), UINT64_C(0x6E59FF515C08C7A9), UINT64_C(0xBA2C5269B2C377B0), + UINT64_C(0xA9D24FD368FE8A2B), UINT64_C(0x22DB13D32E33E891), 
UINT64_C(0x7B97DFC804B876E5), UINT64_C(0xC598BDFCD0E834F9), + UINT64_C(0xB256163D3687F5A7), UINT64_C(0x66D7A73C6AEF50B3), UINT64_C(0xE810F88E85CEA11A), UINT64_C(0x4814F8F3B83E4394), + UINT64_C(0x9CABA22D10A2F690), UINT64_C(0x0D10032511F58111), UINT64_C(0xE9A36EF5EEA3CD58), UINT64_C(0xC79242DE194D9D7C), + UINT64_C(0xC3871AA0435EE5C8), UINT64_C(0x52890BED43CCF4CD), UINT64_C(0x07A1D0861ACCD373), UINT64_C(0x227B816FF0FEE9ED), + UINT64_C(0x59FFBF73AACFC0C4), UINT64_C(0x09AB564F2BEDAD0C), UINT64_C(0xC05F744F2EE38318), UINT64_C(0x7B50B621D547C661), + UINT64_C(0x0C1F71CB4E68E5D1), UINT64_C(0x0E33A47881D4DBAA), UINT64_C(0xF5C3BF198E9A7C2E), UINT64_C(0x16328FD8C0F68A91), + UINT64_C(0xA3E399C9AB3E9A59), UINT64_C(0x163AE71CBCBB18B8), UINT64_C(0x18F17E4A8C79F7AB), UINT64_C(0x9250E2EA37014B45), + UINT64_C(0x7BBBB111D60B03E4), UINT64_C(0x3DAA4A3071A0BD88), UINT64_C(0xA28828D790A2D6DC), UINT64_C(0xBC70FC88F64BE3F1), + UINT64_C(0xA3E48008BA4333C7), UINT64_C(0x739E435ACAFC79F7), UINT64_C(0x42BBB360BE007CC6), UINT64_C(0x4FFB6FD2AF74EC92), + UINT64_C(0x2A799A2994673146), UINT64_C(0xBE0A045B69D48E9F), UINT64_C(0x549432F54FC6A278), UINT64_C(0x371D3C60369FC702), + UINT64_C(0xDB4557D415B08CA7), UINT64_C(0xE8692F0A83850B37), UINT64_C(0x022E46AEB36E9AAB), UINT64_C(0x117AC9B814E4652D), + UINT64_C(0xA361041267AE9048), UINT64_C(0x277CB51C961C3DDA), UINT64_C(0xAFFC96F377CB8A8D), UINT64_C(0x83CC79FA01DD1BA7), + UINT64_C(0xA494842ACF4B802C), UINT64_C(0xFC6D9CDDE2C34A3F), UINT64_C(0x4ED6863CE455F7A7), UINT64_C(0x630914D0DB7AAE98) }; #endif static uint64_t testno; -static FORCE_INLINE bool probe(void (*hash)(const void * in, const size_t len, const seed_t seed, void * out), - const uint64_t reference, bool bswap, - const void *data, unsigned len, uint64_t seed) { +static FORCE_INLINE bool probe( void (* hash)(const void * in, const size_t len, const seed_t seed, + void * out), const uint64_t reference, bool bswap, const void * data, unsigned len, uint64_t seed ) { uint8_t result[32]; + hash(data, 
len, seed, &result); const uint64_t actual = bswap ? GET_U64(result, 0) : GET_U64(result, 0); testno++; @@ -1455,13 +1480,14 @@ static FORCE_INLINE bool probe(void (*hash)(const void * in, const size_t len, c return actual != reference; } -static bool t1ha_selfcheck(void (*hash)(const void * in, const size_t len, const seed_t seed, void * out), - const uint64_t *reference_values, bool bswap) { +static bool t1ha_selfcheck( void (* hash)(const void * in, const size_t len, const seed_t seed, + void * out), const uint64_t * reference_values, bool bswap ) { bool failed = false; + testno = 0; const uint64_t zero = 0; - failed |= probe(hash, /* empty-zero */ *reference_values++, bswap, NULL, 0, zero); + failed |= probe(hash, /* empty-zero */ *reference_values++, bswap, NULL, 0, zero ); failed |= probe(hash, /* empty-all1 */ *reference_values++, bswap, NULL, 0, ~zero); failed |= probe(hash, /* bin64-zero */ *reference_values++, bswap, t1ha_test_pattern, 64, zero); @@ -1469,19 +1495,20 @@ static bool t1ha_selfcheck(void (*hash)(const void * in, const size_t len, const for (int i = 1; i < 64; i++) { /* bin%i-1p%i */ failed |= probe(hash, *reference_values++, bswap, t1ha_test_pattern, i, seed); - seed <<= 1; + seed <<= 1; } seed = ~zero; for (int i = 1; i <= 7; i++) { - seed <<= 1; - /* align%i_F%i */; + seed <<= 1; + /* align%i_F%i */ failed |= probe(hash, *reference_values++, bswap, t1ha_test_pattern + i, 64 - i, seed); } uint8_t pattern_long[512]; - for (size_t i = 0; i < sizeof(pattern_long); ++i) + for (size_t i = 0; i < sizeof(pattern_long); ++i) { pattern_long[i] = (uint8_t)i; + } for (int i = 0; i <= 7; i++) { /* long-%05i */ failed |= probe(hash, *reference_values++, bswap, pattern_long + i, 128 + i * 17, seed); @@ -1490,20 +1517,16 @@ static bool t1ha_selfcheck(void (*hash)(const void * in, const size_t len, const return failed; } -static bool t1ha0_selftest(void) { +static bool t1ha0_selftest( void ) { bool failed = false; failed |= t1ha_selfcheck(isLE() ? 
- t1ha0 : - t1ha0, - t1ha_refval_32le, - isLE() ? false : true); + t1ha0 : + t1ha0, t1ha_refval_32le, isLE() ? false : true); failed |= t1ha_selfcheck(isLE() ? - t1ha0 : - t1ha0, - t1ha_refval_32be, - isBE() ? false : true); + t1ha0 : + t1ha0, t1ha_refval_32be, isBE() ? false : true); if (failed) { printf("t1ha0 self-test FAILED!\n"); @@ -1511,20 +1534,16 @@ static bool t1ha0_selftest(void) { return !failed; } -static bool t1ha1_selftest(void) { +static bool t1ha1_selftest( void ) { bool failed = false; failed |= t1ha_selfcheck(isLE() ? - t1ha1 : - t1ha1, - t1ha_refval_64le, - isLE() ? false : true); + t1ha1 : + t1ha1, t1ha_refval_64le, isLE() ? false : true); failed |= t1ha_selfcheck(isLE() ? - t1ha1 : - t1ha1, - t1ha_refval_64be, - isBE() ? false : true); + t1ha1 : + t1ha1, t1ha_refval_64be, isBE() ? false : true); if (failed) { printf("t1ha1 self-test FAILED!\n"); @@ -1532,20 +1551,16 @@ static bool t1ha1_selftest(void) { return !failed; } -static bool t1ha2_selftest(void) { +static bool t1ha2_selftest( void ) { bool failed = false; failed |= t1ha_selfcheck(isLE() ? - t1ha2 : - t1ha2, - t1ha_refval_2atonce, - isLE() ? false : true); + t1ha2 : + t1ha2, t1ha_refval_2atonce , isLE() ? false : true); failed |= t1ha_selfcheck(isLE() ? - t1ha2 : - t1ha2, - t1ha_refval_2atonce128, - isLE() ? false : true); + t1ha2 : + t1ha2, t1ha_refval_2atonce128, isLE() ? false : true); if (failed) { printf("t1ha2 self-test FAILED!\n"); @@ -1553,20 +1568,16 @@ static bool t1ha2_selftest(void) { return !failed; } -static bool t1ha2_incr_selftest(void) { +static bool t1ha2_incr_selftest( void ) { bool failed = false; failed |= t1ha_selfcheck(isLE() ? - t1ha2_incr : - t1ha2_incr, - t1ha_refval_2stream, - isLE() ? false : true); + t1ha2_incr : + t1ha2_incr, t1ha_refval_2stream , isLE() ? false : true); failed |= t1ha_selfcheck(isLE() ? - t1ha2_incr : - t1ha2_incr, - t1ha_refval_2stream128, - isLE() ? false : true); + t1ha2_incr : + t1ha2_incr, t1ha_refval_2stream128, isLE() ? 
false : true); if (failed) { printf("t1ha2-incr self-test FAILED!\n"); @@ -1575,171 +1586,169 @@ static bool t1ha2_incr_selftest(void) { } #if defined(HAVE_X86_64_AES) -static bool t1ha0_aes_selftest(void) { + +static bool t1ha0_aes_selftest( void ) { bool failed = false; - failed |= t1ha_selfcheck(t1ha0_aesA, - t1ha_refval_ia32aes_a, - false); + failed |= t1ha_selfcheck(t1ha0_aesA, t1ha_refval_ia32aes_a, false); - failed |= t1ha_selfcheck(t1ha0_aesB, - t1ha_refval_ia32aes_b, - false); + failed |= t1ha_selfcheck(t1ha0_aesB, t1ha_refval_ia32aes_b, false); if (failed) { printf("t1ha0-aes self-test FAILED!\n"); } return !failed; } + #endif REGISTER_FAMILY(t1ha, - $.src_url = "https://web.archive.org/web/20211209095620/https://github.com/erthink/t1ha", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://web.archive.org/web/20211209095620/https://github.com/erthink/t1ha", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(t1ha0, - $.desc = "Fast Positive Hash #0 (portable, 32-bit core)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_ZLIB , - $.bits = 64, - $.verification_LE = 0x7F7D7B29, - $.verification_BE = 0x6B552A17, // To get old 0xDA6A4061 value, see above - $.hashfn_native = isLE() ? t1ha0 : t1ha0, - $.hashfn_bswap = isLE() ? t1ha0 : t1ha0, - $.initfn = t1ha0_selftest -); + $.desc = "Fast Positive Hash #0 (portable, 32-bit core)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_ZLIB, + $.bits = 64, + $.verification_LE = 0x7F7D7B29, + $.verification_BE = 0x6B552A17, // To get old 0xDA6A4061 value, see above + $.hashfn_native = isLE () ? t1ha0 : t1ha0, + $.hashfn_bswap = isLE () ? 
t1ha0 : t1ha0, + $.initfn = t1ha0_selftest + ); REGISTER_HASH(t1ha1, - $.desc = "Fast Positive Hash #1 (portable, 64-bit core)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_ZLIB , - $.bits = 64, - $.verification_LE = 0xD6836381, - $.verification_BE = 0xB895E54F, // To get old 0x93F864DE value, see above - $.hashfn_native = isLE() ? t1ha1 : t1ha1, - $.hashfn_bswap = isLE() ? t1ha1 : t1ha1, - $.initfn = t1ha1_selftest -); + $.desc = "Fast Positive Hash #1 (portable, 64-bit core)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_ZLIB, + $.bits = 64, + $.verification_LE = 0xD6836381, + $.verification_BE = 0xB895E54F, // To get old 0x93F864DE value, see above + $.hashfn_native = isLE () ? t1ha1 : t1ha1, + $.hashfn_bswap = isLE () ? t1ha1 : t1ha1, + $.initfn = t1ha1_selftest + ); REGISTER_HASH(t1ha2_64, - $.desc = "Fast Positive Hash #2 (portable, 64-bit core)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_TYPE_PUNNING | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_ZLIB , - $.bits = 64, - $.verification_LE = 0x8F16C948, - $.verification_BE = 0x061CB08C, - $.hashfn_native = isLE() ? t1ha2 : t1ha2, - $.hashfn_bswap = isLE() ? t1ha2 : t1ha2, - $.initfn = t1ha2_selftest -); + $.desc = "Fast Positive Hash #2 (portable, 64-bit core)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_TYPE_PUNNING | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_ZLIB, + $.bits = 64, + $.verification_LE = 0x8F16C948, + $.verification_BE = 0x061CB08C, + $.hashfn_native = isLE () ? t1ha2 : t1ha2, + $.hashfn_bswap = isLE () ? 
t1ha2 : t1ha2, + $.initfn = t1ha2_selftest + ); REGISTER_HASH(t1ha2_128, - $.desc = "Fast Positive Hash #2 (portable, 64-bit core)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_TYPE_PUNNING | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_ZLIB , - $.bits = 128, - $.verification_LE = 0xB44C43A1, - $.verification_BE = 0x95EB2DA8, - $.hashfn_native = isLE() ? t1ha2 : t1ha2, - $.hashfn_bswap = isLE() ? t1ha2 : t1ha2, - $.initfn = t1ha2_selftest -); + $.desc = "Fast Positive Hash #2 (portable, 64-bit core)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_TYPE_PUNNING | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_ZLIB, + $.bits = 128, + $.verification_LE = 0xB44C43A1, + $.verification_BE = 0x95EB2DA8, + $.hashfn_native = isLE () ? t1ha2 : t1ha2, + $.hashfn_bswap = isLE () ? t1ha2 : t1ha2, + $.initfn = t1ha2_selftest + ); REGISTER_HASH(t1ha2_64__incr, - $.desc = "Fast Positive Hash #2 (portable, 64-bit core, incremental version)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_TYPE_PUNNING | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_INCREMENTAL_DIFFERENT | - FLAG_IMPL_LICENSE_ZLIB , - $.bits = 64, - $.verification_LE = 0xDED9B580, - $.verification_BE = 0xB355A009, - $.hashfn_native = isLE() ? t1ha2_incr : t1ha2_incr, - $.hashfn_bswap = isLE() ? t1ha2_incr : t1ha2_incr, - $.initfn = t1ha2_incr_selftest -); + $.desc = "Fast Positive Hash #2 (portable, 64-bit core, incremental version)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_TYPE_PUNNING | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_INCREMENTAL_DIFFERENT | + FLAG_IMPL_LICENSE_ZLIB, + $.bits = 64, + $.verification_LE = 0xDED9B580, + $.verification_BE = 0xB355A009, + $.hashfn_native = isLE () ? t1ha2_incr : t1ha2_incr, + $.hashfn_bswap = isLE () ? 
t1ha2_incr : t1ha2_incr, + $.initfn = t1ha2_incr_selftest + ); REGISTER_HASH(t1ha2_128__incr, - $.desc = "Fast Positive Hash #2 (portable, 64-bit core, incremental version)", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_TYPE_PUNNING | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_INCREMENTAL | - FLAG_IMPL_INCREMENTAL_DIFFERENT | - FLAG_IMPL_LICENSE_ZLIB , - $.bits = 128, - $.verification_LE = 0xE929E756, - $.verification_BE = 0x3898932B, - $.hashfn_native = isLE() ? t1ha2_incr : t1ha2_incr, - $.hashfn_bswap = isLE() ? t1ha2_incr : t1ha2_incr, - $.initfn = t1ha2_incr_selftest -); + $.desc = "Fast Positive Hash #2 (portable, 64-bit core, incremental version)", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_TYPE_PUNNING | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_INCREMENTAL | + FLAG_IMPL_INCREMENTAL_DIFFERENT | + FLAG_IMPL_LICENSE_ZLIB, + $.bits = 128, + $.verification_LE = 0xE929E756, + $.verification_BE = 0x3898932B, + $.hashfn_native = isLE () ? t1ha2_incr : t1ha2_incr, + $.hashfn_bswap = isLE () ? 
t1ha2_incr : t1ha2_incr, + $.initfn = t1ha2_incr_selftest + ); #if defined(HAVE_X86_64_AES) REGISTER_HASH(t1ha0__aesA, - $.desc = "Fast Positive Hash #0a (AES-NI)", - $.hash_flags = - FLAG_HASH_AES_BASED , - $.impl_flags = - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_ZLIB , - $.bits = 64, - $.verification_LE = 0xF07C4DA5, - $.verification_BE = 0x6848847F, - $.hashfn_native = t1ha0_aesA, - $.hashfn_bswap = t1ha0_aesA, - $.initfn = t1ha0_aes_selftest -); + $.desc = "Fast Positive Hash #0a (AES-NI)", + $.hash_flags = + FLAG_HASH_AES_BASED, + $.impl_flags = + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_ZLIB, + $.bits = 64, + $.verification_LE = 0xF07C4DA5, + $.verification_BE = 0x6848847F, + $.hashfn_native = t1ha0_aesA, + $.hashfn_bswap = t1ha0_aesA, + $.initfn = t1ha0_aes_selftest + ); REGISTER_HASH(t1ha0__aesB, - $.desc = "Fast Positive Hash #0b (AES-NI)", - $.hash_flags = - FLAG_HASH_AES_BASED , - $.impl_flags = - FLAG_IMPL_READ_PAST_EOB | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_ZLIB , - $.bits = 64, - $.verification_LE = 0x8B38C599, - $.verification_BE = 0x010611E9, - $.hashfn_native = t1ha0_aesB, - $.hashfn_bswap = t1ha0_aesB, - $.initfn = t1ha0_aes_selftest -); + $.desc = "Fast Positive Hash #0b (AES-NI)", + $.hash_flags = + FLAG_HASH_AES_BASED, + $.impl_flags = + FLAG_IMPL_READ_PAST_EOB | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_ZLIB, + $.bits = 64, + $.verification_LE = 0x8B38C599, + $.verification_BE = 0x010611E9, + $.hashfn_native = t1ha0_aesB, + $.hashfn_bswap = t1ha0_aesB, + $.initfn = t1ha0_aes_selftest + ); #endif diff --git a/hashes/tabulation.cpp b/hashes/tabulation.cpp index 672fd3c4..798745ca 100644 --- a/hashes/tabulation.cpp +++ b/hashes/tabulation.cpp @@ -4,7 +4,7 @@ * Copyright (c) 2020-2021 Reini Urban * Copyright (c) 2020 Thomas Dybdahl Ahle * Copyright (c) 1990, 1993 - * The 
Regents of the University of California. All rights reserved. + * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -52,11 +52,11 @@ // test it with the RNG you plan on using to seed it. static uint64_t BSD_nextrand; -static void BSD_srand(uint64_t seed) { +static void BSD_srand( uint64_t seed ) { BSD_nextrand = seed; } -static uint32_t BSD_rand(void) { +static uint32_t BSD_rand( void ) { /* * Compute x = (7^5 * x) mod (2^31 - 1) * without overflowing 31 bits: @@ -65,112 +65,122 @@ static uint32_t BSD_rand(void) { * Park and Miller, Communications of the ACM, vol. 31, no. 10, * October 1988, p. 1195. */ - uint64_t hi, lo, x; - - x = (BSD_nextrand % 0x7ffffffe) + 1; - hi = x / 127773; - lo = x % 127773; - x = 16807 * lo - 2836 * hi; - if (x < 0) - x += 0x7fffffff; + uint64_t hi, lo, x; + + x = (BSD_nextrand % 0x7ffffffe) + 1; + hi = x / 127773; + lo = x % 127773; + x = 16807 * lo - 2836 * hi; + if (x < 0) { + x += 0x7fffffff; + } BSD_nextrand = --x; - return x; + return x; } static uint64_t tab_rand64() { - // we don't know how many bits we get from rand(), - // but it is at least 16, so we concatenate a couple. - uint64_t r = 0; - for (int i = 0; i < 4; i++) { - r <<= 16; - r ^= BSD_rand(); - } - return r; + // we don't know how many bits we get from rand(), + // but it is at least 16, so we concatenate a couple. 
+ uint64_t r = 0; + + for (int i = 0; i < 4; i++) { + r <<= 16; + r ^= BSD_rand(); + } + return r; } #if defined(HAVE_INT128) + static inline uint128_t tab_rand128() { - return (uint128_t)tab_rand64() << 64 | tab_rand64(); + return (uint128_t)tab_rand64() << 64 | tab_rand64(); } + #endif //----------------------------------------------------------------------------- // 32 Bit Version -const static uint64_t MERSENNE_31 = (UINT64_C(1) << 31) - 1; -const static int CHAR_SIZE = 8; -const static int BLOCK_SIZE_32 = 1<<8; +const static uint64_t MERSENNE_31 = (UINT64_C(1) << 31) - 1; +const static int CHAR_SIZE = 8; +const static int BLOCK_SIZE_32 = 1 << 8; static uint64_t multiply_shift_random_64[BLOCK_SIZE_32]; static uint32_t multiply_shift_a_64; static uint64_t multiply_shift_b_64; -static int32_t tabulation_32[32/CHAR_SIZE][1<> 31); +static inline uint32_t combine31( uint32_t h, uint32_t x, uint32_t a ) { + uint64_t temp = (uint64_t)h * x + a; + + return ((uint32_t)temp & MERSENNE_31) + (uint32_t)(temp >> 31); } -template < bool bswap > -static void tabulation32(const void * in, const size_t len, const seed_t seed, void * out) { - const uint8_t * buf = (const uint8_t *)in; - size_t len_words_32 = len/4; - size_t len_blocks_32 = len_words_32/BLOCK_SIZE_32; - - uint32_t h = len ^ seed; - - for (size_t b = 0; b < len_blocks_32; b++) { - uint32_t block_hash = 0; - for (int i = 0; i < BLOCK_SIZE_32; i++, buf += 4) - block_hash ^= multiply_shift_random_64[i] * GET_U32(buf,0) >> 32; - h = combine31(h, multiply_shift_a_64, block_hash >> 2); - } - - int remaining_words = len_words_32 % BLOCK_SIZE_32; - for (int i = 0; i < remaining_words; i++, buf += 4) - h ^= multiply_shift_random_64[i] * GET_U32(buf,0) >> 32; - - int remaining_bytes = len % 4; - if (remaining_bytes) { - uint32_t last = 0; - if (remaining_bytes & 2) {last = GET_U16(buf,0); buf += 2;} - if (remaining_bytes & 1) {last = (last << 8) | (*buf);} - h ^= multiply_shift_b_64 * last >> 32; - } - - // Finalization - 
uint32_t tab = 0; - for (int i = 0; i < 32/CHAR_SIZE; i++, h >>= CHAR_SIZE) - tab ^= tabulation_32[i][h & ((1<(tab, (uint8_t *)out, 0); +template +static void tabulation32( const void * in, const size_t len, const seed_t seed, void * out ) { + const uint8_t * buf = (const uint8_t *)in; + size_t len_words_32 = len / 4; + size_t len_blocks_32 = len_words_32 / BLOCK_SIZE_32; + + uint32_t h = len ^ seed; + + for (size_t b = 0; b < len_blocks_32; b++) { + uint32_t block_hash = 0; + for (int i = 0; i < BLOCK_SIZE_32; i++, buf += 4) { + block_hash ^= multiply_shift_random_64[i] * GET_U32(buf, 0) >> 32; + } + h = combine31(h, multiply_shift_a_64, block_hash >> 2); + } + + int remaining_words = len_words_32 % BLOCK_SIZE_32; + for (int i = 0; i < remaining_words; i++, buf += 4) { + h ^= multiply_shift_random_64[i] * GET_U32(buf, 0) >> 32; + } + + int remaining_bytes = len % 4; + if (remaining_bytes) { + uint32_t last = 0; + if (remaining_bytes & 2) { last = GET_U16(buf, 0); buf += 2; } + if (remaining_bytes & 1) { last = (last << 8) | (*buf); } + h ^= multiply_shift_b_64 * last >> 32; + } + + // Finalization + uint32_t tab = 0; + for (int i = 0; i < 32 / CHAR_SIZE; i++, h >>= CHAR_SIZE) { + tab ^= tabulation_32[i][h & ((1 << CHAR_SIZE) - 1)]; + } + + PUT_U32(tab, (uint8_t *)out, 0); } #if defined(HAVE_INT128) @@ -179,178 +189,185 @@ static void tabulation32(const void * in, const size_t len, const seed_t seed, v const static uint64_t TAB_MERSENNE_61 = (UINT64_C(1) << 61) - 1; // multiply shift works on fixed length strings, so we operate in blocks. // this size can be tuned depending on the system. 
-const static int TAB_BLOCK_SIZE = 1<<8; +const static int TAB_BLOCK_SIZE = 1 << 8; static uint128_t tab_multiply_shift_random[TAB_BLOCK_SIZE]; static uint128_t tab_multiply_shift_a; static uint128_t tab_multiply_shift_b; -static int64_t tabulation[64/CHAR_SIZE][1<= 64/CHAR_SIZE); - for (int i = 0; i < 64/CHAR_SIZE; i++) - for (int j = 0; j < 1<= 64 / CHAR_SIZE); + } + for (int i = 0; i < 64 / CHAR_SIZE; i++) { + for (int j = 0; j < 1 << CHAR_SIZE; j++) { + tabulation[i][j] = have_broken_rand ? tab_multiply_shift_random[i] : tab_rand128(); + } + } + return 0; } -static inline uint64_t combine61(uint64_t h, uint64_t x, uint64_t a) { - // we assume 2^b-1 >= 2u-1. in other words - // x <= u-1 <= 2^(b-1)-1 (at most 60 bits) - // a <= p-1 = 2^b-2 (60 bits suffices) - // actually, checking the proof, it's fine if a is 61 bits. - // h <= 2p-1 = 2^62-3. this will also be guaranteed of the output. +static inline uint64_t combine61( uint64_t h, uint64_t x, uint64_t a ) { + // we assume 2^b-1 >= 2u-1. in other words + // x <= u-1 <= 2^(b-1)-1 (at most 60 bits) + // a <= p-1 = 2^b-2 (60 bits suffices) + // actually, checking the proof, it's fine if a is 61 bits. + // h <= 2p-1 = 2^62-3. this will also be guaranteed of the output. - //uint128_t temp = (uint128_t)h * x + a; - //return ((uint64_t)temp & TAB_MERSENNE_61) + (uint64_t)(temp >> 61); + // uint128_t temp = (uint128_t)h * x + a; + // return ((uint64_t)temp & TAB_MERSENNE_61) + (uint64_t)(temp >> 61); uint64_t rhi = 0, rlo = a; + fma64_128(rlo, rhi, h, x); - rhi <<= (64 - 61); - rhi |= (rlo >> 61); - rlo &= TAB_MERSENNE_61; + rhi <<= (64 - 61); + rhi |= (rlo >> 61); + rlo &= TAB_MERSENNE_61; return rlo + rhi; } -template < bool bswap > -static void tabulation64(const void * in, const size_t len, const seed_t seed, void * out) { - const uint8_t * buf = (const uint8_t *)in; - - // the idea is to compute a fast "signature" of the string before doing - // tabulation hashing. 
this signature only has to be collision resistant, - // so we can use the variabe-length-hashing polynomial mod-mersenne scheme - // from thorup. - // because of the birthday paradox, the signature needs to be around twice - // as many bits as in the number of keys tested. since smhasher tests - // collisions in keys in the order of millions, we need the signatures to - // be at least 40 bits. we settle on 64. - - // we mix in len in the basis, since smhasher considers two keys - // of different length to be different, even if all the extra bits are 0. - // this is needed for the appendzero test. - - uint64_t h = len ^ seed ^ (seed << 8); - - if (len >= 8) { - const size_t len_words = len/8; - if (len_words >= TAB_BLOCK_SIZE) { - const size_t len_blocks = len_words/TAB_BLOCK_SIZE; - - // to save time, we partition the string in blocks of ~ 256 words. - // each word is hashed using a fast strongly-universal multiply-shift, - // and since the xor of independent strongly-universal hash functions - // is also universal, we get a unique value for each block. - for (size_t b = 0; b < len_blocks; b++) { - uint64_t block_hash = 0; - for (int i = 0; i < TAB_BLOCK_SIZE; i++, buf += 8) { - // we don't have to shift yet, but shifting by 64 allows the - // compiler to produce a single "high bits only" multiplication instruction. - block_hash ^= (tab_multiply_shift_random[i] * GET_U64(buf,0)) >> 64; - - // the following is very fast, basically using mum, but theoretically wrong. - // __uint128_t mum = (__uint128_t)tab_multiply_shift_random_64[i] * take64(buf); - // block_hash ^= mum ^ (mum >> 64); +template +static void tabulation64( const void * in, const size_t len, const seed_t seed, void * out ) { + const uint8_t * buf = (const uint8_t *)in; + + // the idea is to compute a fast "signature" of the string before doing + // tabulation hashing. 
this signature only has to be collision resistant, + // so we can use the variabe-length-hashing polynomial mod-mersenne scheme + // from thorup. + // because of the birthday paradox, the signature needs to be around twice + // as many bits as in the number of keys tested. since smhasher tests + // collisions in keys in the order of millions, we need the signatures to + // be at least 40 bits. we settle on 64. + + // we mix in len in the basis, since smhasher considers two keys + // of different length to be different, even if all the extra bits are 0. + // this is needed for the appendzero test. + + uint64_t h = len ^ seed ^ (seed << 8); + + if (len >= 8) { + const size_t len_words = len / 8; + if (len_words >= TAB_BLOCK_SIZE) { + const size_t len_blocks = len_words / TAB_BLOCK_SIZE; + + // to save time, we partition the string in blocks of ~ 256 words. + // each word is hashed using a fast strongly-universal multiply-shift, + // and since the xor of independent strongly-universal hash functions + // is also universal, we get a unique value for each block. + for (size_t b = 0; b < len_blocks; b++) { + uint64_t block_hash = 0; + for (int i = 0; i < TAB_BLOCK_SIZE; i++, buf += 8) { + // we don't have to shift yet, but shifting by 64 allows the + // compiler to produce a single "high bits only" multiplication instruction. + block_hash ^= (tab_multiply_shift_random[i] * GET_U64(buf, 0)) >> 64; + + // the following is very fast, basically using mum, but theoretically wrong. + // __uint128_t mum = (__uint128_t)tab_multiply_shift_random_64[i] * take64(buf); + // block_hash ^= mum ^ (mum >> 64); + } + + // finally we combine the block hash using variable length hashing. + // values have to be less than mersenne for the combination to work. + // we can shift down, since any shift of multiply-shift outputs is + // strongly-universal. + h = combine61(h, tab_multiply_shift_a, block_hash >> 4); } - // finally we combine the block hash using variable length hashing. 
- // values have to be less than mersenne for the combination to work. - // we can shift down, since any shift of multiply-shift outputs is - // strongly-universal. - h = combine61(h, tab_multiply_shift_a, block_hash >> 4); - } - - // in principle we should finish the mersenne modular reduction. - // however, this isn't be needed, since it can never reduce collisions. - // if (h >= TAB_MERSENNE_61) h -= TAB_MERSENNE_61; - } - - // then read the remaining words - const int remaining_words = len_words % TAB_BLOCK_SIZE; - for (int i = 0; i < remaining_words; i++, buf += 8) - h ^= tab_multiply_shift_random[i] * GET_U64(buf,0) >> 64; - } - - // now get the remaining bytes - const int remaining_bytes = len % 8; - if (remaining_bytes) { - uint64_t last = 0; - if (remaining_bytes & 4) {last = GET_U32(buf,0); buf += 4;} - if (remaining_bytes & 2) {last = (last << 16) | GET_U16(buf,0); buf += 2;} - if (remaining_bytes & 1) {last = (last << 8) | (*buf);} - h ^= tab_multiply_shift_b * last >> 64; - } - - uint64_t tab = 0; - for (int i = 0; i < 64/CHAR_SIZE; i++, h >>= CHAR_SIZE) - tab ^= tabulation[i][h % (1<(tab, (uint8_t *)out, 0); + // in principle we should finish the mersenne modular reduction. + // however, this isn't be needed, since it can never reduce collisions. 
+ // if (h >= TAB_MERSENNE_61) h -= TAB_MERSENNE_61; + } + + // then read the remaining words + const int remaining_words = len_words % TAB_BLOCK_SIZE; + for (int i = 0; i < remaining_words; i++, buf += 8) { + h ^= tab_multiply_shift_random[i] * GET_U64(buf, 0) >> 64; + } + } + + // now get the remaining bytes + const int remaining_bytes = len % 8; + if (remaining_bytes) { + uint64_t last = 0; + if (remaining_bytes & 4) { last = GET_U32(buf, 0); buf += 4; } + if (remaining_bytes & 2) { last = (last << 16) | GET_U16(buf, 0); buf += 2; } + if (remaining_bytes & 1) { last = (last << 8) | (*buf); } + h ^= tab_multiply_shift_b * last >> 64; + } + + uint64_t tab = 0; + for (int i = 0; i < 64 / CHAR_SIZE; i++, h >>= CHAR_SIZE) { + tab ^= tabulation[i][h % (1 << CHAR_SIZE)]; + } + + PUT_U64(tab, (uint8_t *)out, 0); } #endif //----------------------------------------------------------------------------- REGISTER_FAMILY(tabulation, - $.src_url = "https://github.com/rurban/smhasher/blob/master/Hashes.cpp", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/rurban/smhasher/blob/master/Hashes.cpp", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(tabulation_32, - $.desc = "32-bit Tabulation with Multiply-Shift Mixer", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_SYSTEM_SPECIFIC, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | // Implementation not yet thread-safe - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_LICENSE_BSD, - $.bits = 32, - $.verification_LE = 0xF951BEFF, - $.verification_BE = 0xFEB31CB2, - $.seedfn = tabulation32_seed, - $.hashfn_native = tabulation32, - $.hashfn_bswap = tabulation32 -); + $.desc = "32-bit Tabulation with Multiply-Shift Mixer", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_SYSTEM_SPECIFIC, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS |// Implementation not yet thread-safe + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_LICENSE_BSD, + $.bits = 32, + $.verification_LE = 0xF951BEFF, + 
$.verification_BE = 0xFEB31CB2, + $.seedfn = tabulation32_seed, + $.hashfn_native = tabulation32, + $.hashfn_bswap = tabulation32 + ); #if defined(HAVE_INT128) REGISTER_HASH(tabulation_64, - $.desc = "64-bit Tabulation with Multiply-Shift Mixer", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_SYSTEM_SPECIFIC, - $.impl_flags = - FLAG_IMPL_SANITY_FAILS | // Implementation not yet thread-safe - FLAG_IMPL_128BIT | - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_LICENSE_BSD, - $.bits = 64, - $.verification_LE = 0x9CE7C3BC, - $.verification_BE = 0x4EE5569F, - $.seedfn = tabulation64_seed, - $.hashfn_native = tabulation64, - $.hashfn_bswap = tabulation64 -); + $.desc = "64-bit Tabulation with Multiply-Shift Mixer", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_SYSTEM_SPECIFIC, + $.impl_flags = + FLAG_IMPL_SANITY_FAILS |// Implementation not yet thread-safe + FLAG_IMPL_128BIT | + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_LICENSE_BSD, + $.bits = 64, + $.verification_LE = 0x9CE7C3BC, + $.verification_BE = 0x4EE5569F, + $.seedfn = tabulation64_seed, + $.hashfn_native = tabulation64, + $.hashfn_bswap = tabulation64 + ); #endif diff --git a/hashes/umash.cpp b/hashes/umash.cpp index 1485efef..a028bd1e 100644 --- a/hashes/umash.cpp +++ b/hashes/umash.cpp @@ -28,51 +28,53 @@ #include "Hashlib.h" #if defined(HAVE_X86_64_CLMUL) -#include "Intrinsics.h" -#include + #include "Intrinsics.h" + #include /* We only use 128-bit vector, as pairs of 64-bit integers. */ typedef __m128i v128; //------------------------------------------------------------ -#include "Mathmult.h" + #include "Mathmult.h" -static inline void mul128(uint64_t x, uint64_t y, uint64_t & hi, uint64_t & lo) { +static inline void mul128( uint64_t x, uint64_t y, uint64_t & hi, uint64_t & lo ) { mult64_128(lo, hi, x, y); } // This is an efficient and portable replacement for GCC's // __builtin_uaddl_overflow(). XXX The builtin detection might happen // later, but for now this is good enough. 
-static inline bool add_overflow(uint64_t x, uint64_t y, uint64_t * sumlo) { -//#if defined(HAVE_BUILTIN_UADD) +static inline bool add_overflow( uint64_t x, uint64_t y, uint64_t * sumlo ) { +// #if defined(HAVE_BUILTIN_UADD) // return __builtin_uaddl_overflow(x, y, sumlo); -//#else +// #else uint64_t c = 0; - x += y; - c += (x < y); + + x += y; + c += (x < y); *sumlo = x; return (c == 0) ? false : true; -//#endif +// #endif } -static NEVER_INLINE uint64_t add_mod_slow_slow_path(uint64_t sum, uint64_t fixup) { +static NEVER_INLINE uint64_t add_mod_slow_slow_path( uint64_t sum, uint64_t fixup ) { /* Reduce sum, mod 2**64 - 8. */ - sum = (sum >= (uint64_t)-8) ? sum + 8 : sum; + sum = (sum >= (uint64_t)-8) ? sum + 8 : sum; /* sum < 2**64 - 8, so this doesn't overflow. */ sum += fixup; /* Reduce again. */ - sum = (sum >= (uint64_t)-8) ? sum + 8 : sum; + sum = (sum >= (uint64_t)-8) ? sum + 8 : sum; return sum; } -static inline uint64_t add_mod_slow(uint64_t x, uint64_t y) { +static inline uint64_t add_mod_slow( uint64_t x, uint64_t y ) { uint64_t sum; uint64_t fixup = 0; /* x + y \equiv sum + fixup */ - if (add_overflow(x, y, &sum)) + if (add_overflow(x, y, &sum)) { fixup = 8; + } /* * We must ensure `sum + fixup < 2**64 - 8`. @@ -82,71 +84,73 @@ static inline uint64_t add_mod_slow(uint64_t x, uint64_t y) { * pseudorandom inputs, but `sum < 2**64 - 16` is almost * always true, for pseudorandom `sum`. */ - if (likely(sum < (uint64_t)-16)) + if (likely(sum < (uint64_t)-16)) { return sum + fixup; + } return add_mod_slow_slow_path(sum, fixup); } -static inline uint64_t add_mod_fast(uint64_t x, uint64_t y) { +static inline uint64_t add_mod_fast( uint64_t x, uint64_t y ) { uint64_t sum; /* If `sum` overflows, `sum + 8` does not. */ - return (add_overflow(x, y, &sum) ? sum + 8 : sum); + return add_overflow(x, y, &sum) ? 
sum + 8 : sum; } -static inline uint64_t mul_mod_fast(uint64_t m, uint64_t x) { +static inline uint64_t mul_mod_fast( uint64_t m, uint64_t x ) { uint64_t hi, lo; mul128(m, x, hi, lo); return add_mod_fast(lo, 8 * hi); } -static inline uint64_t horner_double_update(uint64_t acc, uint64_t m0, uint64_t m1, uint64_t x, uint64_t y) { +static inline uint64_t horner_double_update( uint64_t acc, uint64_t m0, uint64_t m1, uint64_t x, uint64_t y ) { acc = add_mod_fast(acc, x); return add_mod_slow(mul_mod_fast(m0, acc), mul_mod_fast(m1, y)); } -static inline v128 v128_create(uint64_t lo, uint64_t hi) { +static inline v128 v128_create( uint64_t lo, uint64_t hi ) { return _mm_set_epi64x(hi, lo); } -static inline uint64_t v128_getlo(v128 x) { +static inline uint64_t v128_getlo( v128 x ) { return _mm_cvtsi128_si64(x); } -static inline uint64_t v128_gethi(v128 x) { +static inline uint64_t v128_gethi( v128 x ) { return _mm_extract_epi64(x, 1); } /* Shift each 64-bit lane left by one bit. */ -static inline v128 v128_shift(v128 x) { +static inline v128 v128_shift( v128 x ) { return _mm_add_epi64(x, x); } /* Computes the 128-bit carryless product of x and y. */ -static inline v128 v128_clmul(uint64_t x, uint64_t y) { +static inline v128 v128_clmul( uint64_t x, uint64_t y ) { return _mm_clmulepi64_si128(_mm_cvtsi64_si128(x), _mm_cvtsi64_si128(y), 0x00); } /* Computes the 128-bit carryless product of the high and low halves of x. 
*/ -static inline v128 v128_clmul_cross(v128 x) { +static inline v128 v128_clmul_cross( v128 x ) { return _mm_clmulepi64_si128(x, x, 0x01); } //------------------------------------------------------------ enum { - UMASH_OH_PARAM_COUNT = 32, - UMASH_OH_TWISTING_COUNT = 2, - BLOCK_SIZE = (sizeof(uint64_t) * UMASH_OH_PARAM_COUNT), + UMASH_OH_PARAM_COUNT = 32, + UMASH_OH_TWISTING_COUNT = 2, + BLOCK_SIZE = (sizeof(uint64_t) * UMASH_OH_PARAM_COUNT), UMASH_MULTIPLE_BLOCKS_THRESHOLD = 1024, - SPLIT_ACCUMULATOR_MAX_FIXUP = 3, - OH_SHORT_HASH_SHIFT = 4, + SPLIT_ACCUMULATOR_MAX_FIXUP = 3, + OH_SHORT_HASH_SHIFT = 4, }; -#define ARRAY_SIZE(ARR) (sizeof(ARR) / sizeof(ARR[0])) + #define ARRAY_SIZE(ARR) (sizeof(ARR) / sizeof(ARR[0])) -/** +/* + * * A single UMASH params struct stores the parameters for a pair of * independent `UMASH` functions. */ @@ -155,12 +159,12 @@ struct umash_params { * Each uint64_t[2] array consists of {f^2, f}, where f is a * random multiplier in mod 2**61 - 1. */ - uint64_t poly[2][2]; + uint64_t poly[2][2]; /* * The second (twisted) OH function uses an additional * 128-bit constant stored in the last two elements. */ - uint64_t oh[UMASH_OH_PARAM_COUNT + UMASH_OH_TWISTING_COUNT]; + uint64_t oh[UMASH_OH_PARAM_COUNT + UMASH_OH_TWISTING_COUNT]; /* * The seed value that the params were derived from. This is added * for SMHasher3, so that the seed input parameter to the hash @@ -168,45 +172,47 @@ struct umash_params { * thread-local umash_params table. It lets this umash * implementation be thread-safe. */ - uint64_t base_seed; + uint64_t base_seed; }; -/** +/* + * * A fingerprint consists of two independent `UMASH` hash values. */ struct umash_fp { - uint64_t hash[2]; + uint64_t hash[2]; }; -/** +/* + * * Returns `then` if `cond` is true, `otherwise` if false. * * This noise helps compiler emit conditional moves. 
*/ -static inline const void * select_ptr(bool cond, const void * then, const void * otherwise) { +static inline const void * select_ptr( bool cond, const void * then, const void * otherwise ) { const void * ret; -#if defined(HAVE_X86_64_ASM) + #if defined(HAVE_X86_64_ASM) /* Force strict evaluation of both arguments. */ - __asm__("" ::"r"(then), "r"(otherwise)); -#endif + __asm__ ("" ::"r"(then), "r"(otherwise)); + #endif ret = (cond) ? then : otherwise; -#if defined(HAVE_X86_64_ASM) + #if defined(HAVE_X86_64_ASM) /* And also force the result to be materialised with a blackhole. */ - __asm__("" : "+r"(ret)); -#endif + __asm__ ("" : "+r"(ret)); + #endif return ret; } //------------------------------------------------------------ // SHORT -- [0, 8] byte inputs -template < bool bswap > -static inline uint64_t vec_to_u64(const void *data, size_t n_bytes) { +template +static inline uint64_t vec_to_u64( const void * data, size_t n_bytes ) { const uint8_t zeros[2] = { 0 }; - uint32_t hi, lo; + uint32_t hi, lo; /* * If there are at least 4 bytes to read, read the first 4 in @@ -219,7 +225,7 @@ static inline uint64_t vec_to_u64(const void *data, size_t n_bytes) { } else { /* 0 <= n_bytes < 4. Decode the size in binary. */ uint16_t word; - uint8_t byte; + uint8_t byte; /* * If the size is odd, load the first byte in `byte`; @@ -232,8 +238,7 @@ static inline uint64_t vec_to_u64(const void *data, size_t n_bytes) { * If the size is 2 or 3, load the last two bytes in `word`; * otherwise, load in a zero. */ - memcpy(&word, - select_ptr(n_bytes & 2, (const uint8_t *)data + n_bytes - 2, zeros), 2); + memcpy(&word, select_ptr(n_bytes & 2, (const uint8_t *)data + n_bytes - 2, zeros), 2); /* * We have now read `bytes[0 ... n_bytes - 1]` * exactly once without overwriting any data. 
@@ -248,66 +253,64 @@ static inline uint64_t vec_to_u64(const void *data, size_t n_bytes) { return COND_BSWAP(((uint64_t)hi << 32) | (lo + hi), bswap); } -template < bool bswap > -static uint64_t umash_short(const uint64_t *params, uint64_t seed, - const void *data, size_t n_bytes) { +template +static uint64_t umash_short( const uint64_t * params, uint64_t seed, const void * data, size_t n_bytes ) { uint64_t h; seed += params[n_bytes]; - h = vec_to_u64(data, n_bytes); - h ^= h >> 30; - h *= UINT64_C(0xbf58476d1ce4e5b9); - h = (h ^ seed) ^ (h >> 27); - h *= UINT64_C(0x94d049bb133111eb); - h ^= h >> 31; + h = vec_to_u64(data, n_bytes); + h ^= h >> 30; + h *= UINT64_C(0xbf58476d1ce4e5b9); + h = (h ^ seed) ^ (h >> 27); + h *= UINT64_C(0x94d049bb133111eb); + h ^= h >> 31; return h; } -template < bool bswap > -static struct umash_fp umash_fp_short(const uint64_t *params, uint64_t seed, - const void *data, size_t n_bytes) { +template +static struct umash_fp umash_fp_short( const uint64_t * params, uint64_t seed, const void * data, size_t n_bytes ) { struct umash_fp ret; - uint64_t h; + uint64_t h; ret.hash[0] = seed + params[n_bytes]; ret.hash[1] = seed + params[n_bytes + OH_SHORT_HASH_SHIFT]; - h = vec_to_u64(data, n_bytes); + h = vec_to_u64(data, n_bytes); h ^= h >> 30; h *= UINT64_C(0xbf58476d1ce4e5b9); h ^= h >> 27; -#define TAIL(i) \ - do { \ - ret.hash[i] ^= h; \ - ret.hash[i] *= UINT64_C(0x94d049bb133111eb); \ - ret.hash[i] ^= ret.hash[i] >> 31; \ +#define TAIL(i) \ + do { \ + ret.hash[i] ^= h; \ + ret.hash[i] *= UINT64_C(0x94d049bb133111eb); \ + ret.hash[i] ^= ret.hash[i] >> 31; \ } while (0) TAIL(0); TAIL(1); -#undef TAIL + #undef TAIL return ret; } //------------------------------------------------------------ // MEDIUM -- [9, 16] byte inputs -static inline uint64_t finalize(uint64_t x) { +static inline uint64_t finalize( uint64_t x ) { return (x ^ ROTL64(x, 8)) ^ ROTL64(x, 33); } -template < bool bswap > -static uint64_t umash_medium(const uint64_t 
multipliers[2], const uint64_t *oh, uint64_t seed, - const void *data, size_t n_bytes) { +template +static uint64_t umash_medium( const uint64_t multipliers[2], const uint64_t * oh, + uint64_t seed, const void * data, size_t n_bytes ) { uint64_t enh_hi, enh_lo; { const uint8_t * data8 = (const uint8_t *)data; - uint64_t x, y; + uint64_t x, y; - x = GET_U64(data8, 0); - y = GET_U64(data8, n_bytes - 8); + x = GET_U64(data8, 0); + y = GET_U64(data8, n_bytes - 8); x += oh[0]; y += oh[1]; @@ -317,45 +320,45 @@ static uint64_t umash_medium(const uint64_t multipliers[2], const uint64_t *oh, enh_hi ^= enh_lo; return finalize(horner_double_update( - /*acc=*/0, multipliers[0], multipliers[1], enh_lo, enh_hi)); + /*acc=*/ 0, multipliers[0], multipliers[1], enh_lo, enh_hi)); } -template < bool bswap > -static struct umash_fp umash_fp_medium(const uint64_t multipliers[2][2], - const uint64_t *oh, uint64_t seed, const void *data, size_t n_bytes) { +template +static struct umash_fp umash_fp_medium( const uint64_t multipliers[2][2], const uint64_t * oh, + uint64_t seed, const void * data, size_t n_bytes ) { struct umash_fp ret; - const uint64_t offset = seed ^ n_bytes; - uint64_t enh_hi, enh_lo; - v128 v; - uint64_t lrc[2] = { oh[UMASH_OH_PARAM_COUNT], oh[UMASH_OH_PARAM_COUNT + 1] }; - uint64_t x, y; - uint64_t a, b; + const uint64_t offset = seed ^ n_bytes; + uint64_t enh_hi, enh_lo; + v128 v; + uint64_t lrc[2] = { oh[UMASH_OH_PARAM_COUNT], oh[UMASH_OH_PARAM_COUNT + 1] }; + uint64_t x, y; + uint64_t a, b; /* Expand the 9-16 bytes to 16. 
*/ const uint8_t * data8 = (const uint8_t *)data; - x = GET_U64(data8, 0); - y = GET_U64(data8, n_bytes - 8); - a = oh[0]; - b = oh[1]; + x = GET_U64(data8, 0); + y = GET_U64(data8, n_bytes - 8); + + a = oh[0]; + b = oh[1]; lrc[0] ^= x ^ a; lrc[1] ^= y ^ b; - v = v128_clmul(lrc[0], lrc[1]); + v = v128_clmul(lrc[0], lrc[1]); - a += x; - b += y; + a += x; + b += y; mul128(a, b, enh_hi, enh_lo); - enh_hi += offset; - enh_hi ^= enh_lo; + enh_hi += offset; + enh_hi ^= enh_lo; ret.hash[0] = finalize(horner_double_update( - /*acc=*/0, multipliers[0][0], multipliers[0][1], enh_lo, enh_hi)); + /*acc=*/ 0, multipliers[0][0], multipliers[0][1], enh_lo, enh_hi)); - ret.hash[1] = finalize(horner_double_update(/*acc=*/0, - multipliers[1][0], multipliers[1][1], - enh_lo ^ v128_getlo(v), enh_hi ^ v128_gethi(v))); + ret.hash[1] = finalize(horner_double_update(/*acc=*/ 0, multipliers[1][0], multipliers[1][1], + enh_lo ^ v128_getlo(v), enh_hi ^ v128_gethi(v))); return ret; } @@ -363,26 +366,24 @@ static struct umash_fp umash_fp_medium(const uint64_t multipliers[2][2], //------------------------------------------------------------ // LONG -- [17, size_t) byte inputs struct umash_oh { - uint64_t bits[2]; + uint64_t bits[2]; }; struct split_accumulator { - uint64_t base; - uint64_t fixup; + uint64_t base; + uint64_t fixup; }; -static inline uint64_t split_accumulator_eval(struct split_accumulator acc) { +static inline uint64_t split_accumulator_eval( struct split_accumulator acc ) { return add_mod_slow(acc.base, 8 * acc.fixup); } -static inline struct split_accumulator split_accumulator_update( - const struct split_accumulator acc, const uint64_t m0, - const uint64_t m1, uint64_t h0, const uint64_t h1) { - +static inline struct split_accumulator split_accumulator_update( const struct split_accumulator acc, + const uint64_t m0, const uint64_t m1, uint64_t h0, const uint64_t h1 ) { uint64_t partial; uint64_t lo0, hi0, lo1, hi1; uint64_t hi, sum; - int8_t fixup; + int8_t fixup; mul128(m1, 
h1, hi1, lo1); @@ -407,25 +408,24 @@ static inline struct split_accumulator split_accumulator_update( assert(hi0 < (1UL << 61)); assert(hi1 < (1UL << 61)); /* hi0 and hi1 < 2**61, so this addition never overflows. */ - hi = hi0 + hi1; + hi = hi0 + hi1; fixup += (hi & (1ULL << 61)) != 0; - hi *= 8; + hi *= 8; fixup += add_overflow(sum, hi, &sum); return (struct split_accumulator) { - .base = sum, - /* Avoid sign extension: we know `fixup` is non-negative. */ - .fixup = (uint8_t)fixup, + . base = sum, + /* Avoid sign extension: we know `fixup` is non-negative. */ + .fixup = (uint8_t)fixup, }; } // This is umash_multiple_blocks_generic(). -template < bool bswap > -static uint64_t umash_multiple_blocks(uint64_t initial, - const uint64_t multipliers[2], const uint64_t *oh_ptr, uint64_t seed, - const void *blocks, size_t n_blocks) { +template +static uint64_t umash_multiple_blocks( uint64_t initial, const uint64_t multipliers[2], const uint64_t * oh_ptr, + uint64_t seed, const void * blocks, size_t n_blocks ) { const uint64_t m0 = multipliers[0]; const uint64_t m1 = multipliers[1]; const uint64_t kx = oh_ptr[UMASH_OH_PARAM_COUNT - 2]; @@ -437,7 +437,7 @@ static uint64_t umash_multiple_blocks(uint64_t initial, do { const uint8_t * data = (const uint8_t *)blocks; struct umash_oh oh; - v128 acc = { 0, 0 }; + v128 acc = { 0, 0 }; blocks = (const uint8_t *)blocks + BLOCK_SIZE; @@ -448,19 +448,19 @@ static uint64_t umash_multiple_blocks(uint64_t initial, * the inner loop's xor-reduction tree widely: the * bottleneck is in the carryless multiplications. 
*/ -#define FORCE() ((void)0) - -#define PH(I) \ - do { \ - v128 x, k; \ - \ - x = _mm_loadu_si128((const v128 *)data); \ - if (bswap) { x = mm_bswap64(x); } \ - data = data + sizeof(x); \ - \ - k = _mm_loadu_si128((const v128 *)&oh_ptr[I]); \ - x ^= k; \ - acc ^= v128_clmul_cross(x); \ + #define FORCE() ((void)0) + +#define PH(I) \ + do { \ + v128 x, k; \ + \ + x = _mm_loadu_si128((const v128 *)data); \ + if (bswap) { x = mm_bswap64(x); } \ + data = data + sizeof(x); \ + \ + k = _mm_loadu_si128((const v128 *)&oh_ptr[I]); \ + x ^= k; \ + acc ^= v128_clmul_cross(x); \ } while (0) PH(0); @@ -471,7 +471,7 @@ static uint64_t umash_multiple_blocks(uint64_t initial, PH(6); FORCE(); - PH(8); + PH( 8); PH(10); FORCE(); @@ -493,8 +493,8 @@ static uint64_t umash_multiple_blocks(uint64_t initial, PH(28); -#undef PH -#undef FORCE + #undef PH + #undef FORCE memcpy(&oh, &acc, sizeof(oh)); @@ -502,72 +502,70 @@ static uint64_t umash_multiple_blocks(uint64_t initial, { uint64_t x, y, enh_hi, enh_lo; - x = GET_U64(data, 0); - y = GET_U64(data, 8); + x = GET_U64(data, 0); + y = GET_U64(data, 8); x += kx; y += ky; mul128(x, y, enh_hi, enh_lo); - enh_hi += seed; + enh_hi += seed; oh.bits[0] ^= enh_lo; oh.bits[1] ^= enh_hi ^ enh_lo; } ret = split_accumulator_update(ret, m0, m1, oh.bits[0], oh.bits[1]); - } while (--n_blocks); return split_accumulator_eval(ret); } -template < bool bswap > -static struct umash_fp umash_fprint_multiple_blocks(struct umash_fp initial, - const uint64_t multipliers[2][2], const uint64_t *oh, uint64_t seed, - const void * blocks, size_t n_blocks) { +template +static struct umash_fp umash_fprint_multiple_blocks( struct umash_fp initial, const uint64_t multipliers[2][2], + const uint64_t * oh, uint64_t seed, const void * blocks, size_t n_blocks ) { const v128 lrc_init = - v128_create(oh[UMASH_OH_PARAM_COUNT], oh[UMASH_OH_PARAM_COUNT + 1]); - const uint64_t m00 = multipliers[0][0]; - const uint64_t m01 = multipliers[0][1]; - const uint64_t m10 = 
multipliers[1][0]; - const uint64_t m11 = multipliers[1][1]; + v128_create(oh[UMASH_OH_PARAM_COUNT], oh[UMASH_OH_PARAM_COUNT + 1]); + const uint64_t m00 = multipliers[0][0]; + const uint64_t m01 = multipliers[0][1]; + const uint64_t m10 = multipliers[1][0]; + const uint64_t m11 = multipliers[1][1]; struct split_accumulator acc0 = { .base = initial.hash[0] }; struct split_accumulator acc1 = { .base = initial.hash[1] }; do { struct umash_oh compressed[2]; - v128 acc = { 0, 0 }; /* Base umash */ - v128 acc_shifted = { 0, 0 }; /* Accumulates shifted values */ - v128 lrc = lrc_init; - const uint8_t * data = (const uint8_t *)blocks; + v128 acc = { 0, 0 }; /* Base umash */ + v128 acc_shifted = { 0, 0 }; /* Accumulates shifted values */ + v128 lrc = lrc_init; + const uint8_t * data = (const uint8_t *)blocks; blocks = (const uint8_t *)blocks + BLOCK_SIZE; -#define FORCE() ((void)0) - -#define TWIST(I) \ - do { \ - v128 x, k; \ - \ - x = _mm_loadu_si128((const v128 *)data); \ - if (bswap) { x = mm_bswap64(x); } \ - data = data + sizeof(x); \ - \ - k = _mm_loadu_si128((const v128 *)&oh[I]); \ - \ - x ^= k; \ - lrc ^= x; \ - \ - x = v128_clmul_cross(x); \ - \ - acc ^= x; \ - \ - if (I == 28) \ - break; \ - \ - acc_shifted ^= x; \ - acc_shifted = v128_shift(acc_shifted); \ + #define FORCE() ((void)0) + +#define TWIST(I) \ + do { \ + v128 x, k; \ + \ + x = _mm_loadu_si128((const v128 *)data); \ + if (bswap) { x = mm_bswap64(x); } \ + data = data + sizeof(x); \ + \ + k = _mm_loadu_si128((const v128 *)&oh[I]); \ + \ + x ^= k; \ + lrc ^= x; \ + \ + x = v128_clmul_cross(x); \ + \ + acc ^= x; \ + \ + if (I == 28) \ + break; \ + \ + acc_shifted ^= x; \ + acc_shifted = v128_shift(acc_shifted); \ } while (0) TWIST(0); @@ -601,32 +599,32 @@ static struct umash_fp umash_fprint_multiple_blocks(struct umash_fp initial, TWIST(28); FORCE(); -#undef TWIST -#undef FORCE + #undef TWIST + #undef FORCE { v128 x, k; - x = _mm_loadu_si128((const v128 *)data); + x = _mm_loadu_si128((const v128 
*)data); if (bswap) { x = mm_bswap64(x); } - k = _mm_loadu_si128((const v128 *)&oh[30]); + k = _mm_loadu_si128((const v128 *)&oh[30]); lrc ^= x ^ k; } acc_shifted ^= acc; - acc_shifted = v128_shift(acc_shifted); + acc_shifted = v128_shift(acc_shifted); acc_shifted ^= v128_clmul_cross(lrc); - memcpy(&compressed[0], &acc, sizeof(compressed[0])); + memcpy(&compressed[0], &acc , sizeof(compressed[0])); memcpy(&compressed[1], &acc_shifted, sizeof(compressed[1])); { uint64_t x, y, kx, ky, enh_hi, enh_lo; - x = GET_U64(data, 0); - y = GET_U64(data, 8); + x = GET_U64(data, 0); + y = GET_U64(data, 8); kx = x + oh[30]; ky = y + oh[31]; @@ -642,42 +640,39 @@ static struct umash_fp umash_fprint_multiple_blocks(struct umash_fp initial, compressed[1].bits[1] ^= enh_hi; } - acc0 = split_accumulator_update( - acc0, m00, m01, compressed[0].bits[0], compressed[0].bits[1]); - acc1 = split_accumulator_update( - acc1, m10, m11, compressed[1].bits[0], compressed[1].bits[1]); + acc0 = split_accumulator_update(acc0, m00, m01, compressed[0].bits[0], compressed[0].bits[1]); + acc1 = split_accumulator_update(acc1, m10, m11, compressed[1].bits[0], compressed[1].bits[1]); } while (--n_blocks); return (struct umash_fp) { - .hash = { - split_accumulator_eval(acc0), - split_accumulator_eval(acc1), - }, - }; + . hash = { + split_accumulator_eval(acc0), + split_accumulator_eval(acc1), + }, + }; } -template < bool bswap > -static struct umash_oh oh_varblock(const uint64_t *params, uint64_t tag, - const void * block, size_t n_bytes) { +template +static struct umash_oh oh_varblock( const uint64_t * params, uint64_t tag, const void * block, size_t n_bytes ) { struct umash_oh ret; - v128 acc = { 0, 0 }; + v128 acc = { 0, 0 }; /* The final block processes `remaining > 0` bytes. 
*/ - size_t remaining = 1 + ((n_bytes - 1) % sizeof(v128)); - size_t end_full_pairs = (n_bytes - remaining) / sizeof(uint64_t); - const uint8_t * last_ptr = (const uint8_t *)block + n_bytes - sizeof(v128); - size_t i; + size_t remaining = 1 + ((n_bytes - 1 ) % sizeof(v128) ); + size_t end_full_pairs = (n_bytes - remaining) / sizeof(uint64_t); + const uint8_t * last_ptr = (const uint8_t *)block + n_bytes - sizeof(v128); + size_t i; for (i = 0; i < end_full_pairs; i += 2) { v128 x, k; - x = _mm_loadu_si128((const v128 *)block); + x = _mm_loadu_si128((const v128 *)block); if (bswap) { x = mm_bswap64(x); } block = (const uint8_t *)block + sizeof(x); - k = _mm_loadu_si128((const v128 *)¶ms[i]); - x ^= k; - acc ^= v128_clmul_cross(x); + k = _mm_loadu_si128((const v128 *)¶ms[i]); + x ^= k; + acc ^= v128_clmul_cross(x); } memcpy(&ret, &acc, sizeof(ret)); @@ -686,13 +681,13 @@ static struct umash_oh oh_varblock(const uint64_t *params, uint64_t tag, { uint64_t x, y, enh_hi, enh_lo; - x = GET_U64(last_ptr, 0); - y = GET_U64(last_ptr, 8); + x = GET_U64(last_ptr, 0); + y = GET_U64(last_ptr, 8); - x += params[i]; - y += params[i + 1]; + x += params[i]; + y += params[i + 1]; mul128(x, y, enh_hi, enh_lo); - enh_hi += tag; + enh_hi += tag; ret.bits[0] ^= enh_lo; ret.bits[1] ^= enh_hi ^ enh_lo; @@ -701,39 +696,40 @@ static struct umash_oh oh_varblock(const uint64_t *params, uint64_t tag, return ret; } -template < bool bswap > -static void oh_varblock_fprint(struct umash_oh dst[2], const uint64_t *params, - uint64_t tag, const void * block, size_t n_bytes) { - v128 acc = { 0, 0 }; /* Base umash */ +template +static void oh_varblock_fprint( struct umash_oh dst[2], const uint64_t * params, + uint64_t tag, const void * block, size_t n_bytes ) { + v128 acc = { 0, 0 }; /* Base umash */ v128 acc_shifted = { 0, 0 }; /* Accumulates shifted values */ v128 lrc; /* The final block processes `remaining > 0` bytes. 
*/ - size_t remaining = 1 + ((n_bytes - 1) % sizeof(v128)); - size_t end_full_pairs = (n_bytes - remaining) / sizeof(uint64_t); - const uint8_t * last_ptr = (const uint8_t *)block + n_bytes - sizeof(v128); - size_t i; + size_t remaining = 1 + ((n_bytes - 1 ) % sizeof(v128) ); + size_t end_full_pairs = (n_bytes - remaining) / sizeof(uint64_t); + const uint8_t * last_ptr = (const uint8_t *)block + n_bytes - sizeof(v128); + size_t i; lrc = v128_create(params[UMASH_OH_PARAM_COUNT], params[UMASH_OH_PARAM_COUNT + 1]); for (i = 0; i < end_full_pairs; i += 2) { v128 x, k; - x = _mm_loadu_si128((const v128 *)block); + x = _mm_loadu_si128((const v128 *)block); if (bswap) { x = mm_bswap64(x); } block = (const uint8_t *)block + sizeof(x); - k = _mm_loadu_si128((const v128 *)¶ms[i]); + k = _mm_loadu_si128((const v128 *)¶ms[i]); - x ^= k; - lrc ^= x; + x ^= k; + lrc ^= x; - x = v128_clmul_cross(x); + x = v128_clmul_cross(x); - acc ^= x; - if (i + 2 >= end_full_pairs) + acc ^= x; + if (i + 2 >= end_full_pairs) { break; + } acc_shifted ^= x; - acc_shifted = v128_shift(acc_shifted); + acc_shifted = v128_shift(acc_shifted); } /* @@ -743,34 +739,34 @@ static void oh_varblock_fprint(struct umash_oh dst[2], const uint64_t *params, { v128 x, k; - x = _mm_loadu_si128((const v128 *)last_ptr); + x = _mm_loadu_si128((const v128 *)last_ptr); if (bswap) { x = mm_bswap64(x); } - k = _mm_loadu_si128((const v128 *)¶ms[end_full_pairs]); + k = _mm_loadu_si128((const v128 *)¶ms[end_full_pairs]); lrc ^= x ^ k; } acc_shifted ^= acc; - acc_shifted = v128_shift(acc_shifted); + acc_shifted = v128_shift(acc_shifted); acc_shifted ^= v128_clmul_cross(lrc); - memcpy(&dst[0], &acc, sizeof(dst[0])); + memcpy(&dst[0], &acc , sizeof(dst[0])); memcpy(&dst[1], &acc_shifted, sizeof(dst[1])); { uint64_t x, y, kx, ky, enh_hi, enh_lo; - x = GET_U64(last_ptr, 0); - y = GET_U64(last_ptr, 8); + x = GET_U64(last_ptr, 0); + y = GET_U64(last_ptr, 8); - kx = x + params[end_full_pairs]; + kx = x + params[end_full_pairs ]; ky 
= y + params[end_full_pairs + 1]; mul128(kx, ky, enh_hi, enh_lo); - enh_hi += tag; + enh_hi += tag; - enh_hi ^= enh_lo; + enh_hi ^= enh_lo; dst[0].bits[0] ^= enh_lo; dst[0].bits[1] ^= enh_hi; @@ -779,23 +775,24 @@ static void oh_varblock_fprint(struct umash_oh dst[2], const uint64_t *params, } } -template < bool bswap > -static uint64_t umash_long(const uint64_t multipliers[2], const uint64_t *oh, - uint64_t seed, const void *data, size_t n_bytes) { +template +static uint64_t umash_long( const uint64_t multipliers[2], const uint64_t * oh, + uint64_t seed, const void * data, size_t n_bytes ) { uint64_t acc = 0; // This invokes the optional routines for very long inputs if (unlikely(n_bytes >= UMASH_MULTIPLE_BLOCKS_THRESHOLD)) { - size_t n_block = n_bytes / BLOCK_SIZE; - const void *remaining; + size_t n_block = n_bytes / BLOCK_SIZE; + const void * remaining; - n_bytes %= BLOCK_SIZE; + n_bytes %= BLOCK_SIZE; remaining = (const uint8_t *)data + (n_block * BLOCK_SIZE); - acc = umash_multiple_blocks(acc, multipliers, oh, seed, data, n_block); + acc = umash_multiple_blocks(acc, multipliers, oh, seed, data, n_block); - data = remaining; - if (n_bytes == 0) + data = remaining; + if (n_bytes == 0) { goto finalize; + } goto last_block; } @@ -804,51 +801,50 @@ static uint64_t umash_long(const uint64_t multipliers[2], const uint64_t *oh, struct umash_oh compressed; compressed = oh_varblock(oh, seed, data, BLOCK_SIZE); - data = (const uint8_t *)data + BLOCK_SIZE; - n_bytes -= BLOCK_SIZE; + data = (const uint8_t *)data + BLOCK_SIZE; + n_bytes -= BLOCK_SIZE; - acc = horner_double_update(acc, multipliers[0], multipliers[1], - compressed.bits[0], compressed.bits[1]); + acc = horner_double_update(acc, multipliers[0], multipliers[1], compressed.bits[0], compressed.bits[1]); } -last_block: + last_block: /* Do the final block. 
*/ { struct umash_oh compressed; - seed ^= (uint8_t)n_bytes; + seed ^= (uint8_t)n_bytes; compressed = oh_varblock(oh, seed, data, n_bytes); - acc = horner_double_update(acc, multipliers[0], multipliers[1], - compressed.bits[0], compressed.bits[1]); + acc = horner_double_update(acc, multipliers[0], multipliers[1], compressed.bits[0], compressed.bits[1]); } -finalize: + finalize: return finalize(acc); } -template < bool bswap > -static struct umash_fp umash_fp_long(const uint64_t multipliers[2][2], const uint64_t *oh, - uint64_t seed, const void *data, size_t n_bytes) { +template +static struct umash_fp umash_fp_long( const uint64_t multipliers[2][2], const uint64_t * oh, + uint64_t seed, const void * data, size_t n_bytes ) { struct umash_oh compressed[2]; struct umash_fp ret; - uint64_t acc[2] = { 0, 0 }; + uint64_t acc[2] = { 0, 0 }; // This invokes the optional routines for very long inputs if (unlikely(n_bytes >= UMASH_MULTIPLE_BLOCKS_THRESHOLD)) { - struct umash_fp poly = { .hash = { 0, 0 } }; - size_t n_block = n_bytes / BLOCK_SIZE; - const void *remaining; + struct umash_fp poly = { .hash = { 0 , 0 } }; + size_t n_block = n_bytes / BLOCK_SIZE; + const void * remaining; - n_bytes %= BLOCK_SIZE; + n_bytes %= BLOCK_SIZE; remaining = (const uint8_t *)data + (n_block * BLOCK_SIZE); - poly = umash_fprint_multiple_blocks(poly, multipliers, oh, seed, data, n_block); + poly = umash_fprint_multiple_blocks(poly, multipliers, oh, seed, data, n_block); - acc[0] = poly.hash[0]; - acc[1] = poly.hash[1]; + acc[0] = poly.hash[0]; + acc[1] = poly.hash[1]; - data = remaining; - if (n_bytes == 0) + data = remaining; + if (n_bytes == 0) { goto finalize; + } goto last_block; } @@ -862,27 +858,27 @@ static struct umash_fp umash_fp_long(const uint64_t multipliers[2][2], const uin UPDATE(0); UPDATE(1); -#undef UPDATE + #undef UPDATE - data = (const uint8_t *)data + BLOCK_SIZE; + data = (const uint8_t *)data + BLOCK_SIZE; n_bytes -= BLOCK_SIZE; } -last_block: + last_block: 
oh_varblock_fprint(compressed, oh, seed ^ (uint8_t)n_bytes, data, n_bytes); -#define FINAL(i) \ - do { \ - acc[i] = horner_double_update(acc[i], multipliers[i][0], \ - multipliers[i][1], compressed[i].bits[0], \ - compressed[i].bits[1]); \ +#define FINAL(i) \ + do { \ + acc[i] = horner_double_update(acc[i], multipliers[i][0], \ + multipliers[i][1], compressed[i].bits[0], \ + compressed[i].bits[1]); \ } while (0) FINAL(0); FINAL(1); -#undef FINAL + #undef FINAL -finalize: + finalize: ret.hash[0] = finalize(acc[0]); ret.hash[1] = finalize(acc[1]); return ret; @@ -890,9 +886,8 @@ static struct umash_fp umash_fp_long(const uint64_t multipliers[2][2], const uin //------------------------------------------------------------ // This is hardcoded to which == 0. -template < bool bswap > -static uint64_t umash_full(const struct umash_params *params, uint64_t seed, - const void *data, size_t n_bytes) { +template +static uint64_t umash_full( const struct umash_params * params, uint64_t seed, const void * data, size_t n_bytes ) { /* * It's not that short inputs are necessarily more likely, but * we want to make sure they fall through correctly to @@ -909,9 +904,9 @@ static uint64_t umash_full(const struct umash_params *params, uint64_t seed, } } -template < bool bswap > -static struct umash_fp umash_fprint(const struct umash_params *params, uint64_t seed, - const void *data, size_t n_bytes) { +template +static struct umash_fp umash_fprint( const struct umash_params * params, + uint64_t seed, const void * data, size_t n_bytes ) { if (likely(n_bytes <= sizeof(v128))) { if (likely(n_bytes <= sizeof(uint64_t))) { return umash_fp_short(params->oh, seed, data, n_bytes); @@ -924,74 +919,73 @@ static struct umash_fp umash_fprint(const struct umash_params *params, uint64_t } //------------------------------------------------------------ -static void core_salsa20(uint8_t * out, const uint8_t in[16], const uint8_t key[32], - const uint8_t constant[16]) { +static void core_salsa20( uint8_t 
* out, const uint8_t in[16], const uint8_t key[32], const uint8_t constant[16] ) { enum { ROUNDS = 20 }; uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; - j0 = x0 = GET_U32(constant, 0); - j1 = x1 = GET_U32(key, 0); - j2 = x2 = GET_U32(key, 4); - j3 = x3 = GET_U32(key, 8); - j4 = x4 = GET_U32(key, 12); - j5 = x5 = GET_U32(constant, 4); - j6 = x6 = GET_U32(in, 0); - j7 = x7 = GET_U32(in, 4); - j8 = x8 = GET_U32(in, 8); - j9 = x9 = GET_U32(in, 12); - j10 = x10 = GET_U32(constant, 8); - j11 = x11 = GET_U32(key, 16); - j12 = x12 = GET_U32(key, 20); - j13 = x13 = GET_U32(key, 24); - j14 = x14 = GET_U32(key, 28); + j0 = x0 = GET_U32(constant, 0); + j1 = x1 = GET_U32(key , 0); + j2 = x2 = GET_U32(key , 4); + j3 = x3 = GET_U32(key , 8); + j4 = x4 = GET_U32(key , 12); + j5 = x5 = GET_U32(constant, 4); + j6 = x6 = GET_U32(in , 0); + j7 = x7 = GET_U32(in , 4); + j8 = x8 = GET_U32(in , 8); + j9 = x9 = GET_U32(in , 12); + j10 = x10 = GET_U32(constant, 8); + j11 = x11 = GET_U32(key , 16); + j12 = x12 = GET_U32(key , 20); + j13 = x13 = GET_U32(key , 24); + j14 = x14 = GET_U32(key , 28); j15 = x15 = GET_U32(constant, 12); for (size_t i = 0; i < ROUNDS; i += 2) { - x4 ^= ROTL32(x0 + x12, 7); - x8 ^= ROTL32(x4 + x0, 9); - x12 ^= ROTL32(x8 + x4, 13); - x0 ^= ROTL32(x12 + x8, 18); - x9 ^= ROTL32(x5 + x1, 7); - x13 ^= ROTL32(x9 + x5, 9); - x1 ^= ROTL32(x13 + x9, 13); - x5 ^= ROTL32(x1 + x13, 18); - x14 ^= ROTL32(x10 + x6, 7); - x2 ^= ROTL32(x14 + x10, 9); - x6 ^= ROTL32(x2 + x14, 13); - x10 ^= ROTL32(x6 + x2, 18); - x3 ^= ROTL32(x15 + x11, 7); - x7 ^= ROTL32(x3 + x15, 9); - x11 ^= ROTL32(x7 + x3, 13); - x15 ^= ROTL32(x11 + x7, 18); - x1 ^= ROTL32(x0 + x3, 7); - x2 ^= ROTL32(x1 + x0, 9); - x3 ^= ROTL32(x2 + x1, 13); - x0 ^= ROTL32(x3 + x2, 18); - x6 ^= ROTL32(x5 + x4, 7); - x7 ^= ROTL32(x6 + x5, 9); - x4 ^= ROTL32(x7 + x6, 13); - x5 ^= ROTL32(x4 + x7, 18); - x11 ^= ROTL32(x10 + 
x9, 7); - x8 ^= ROTL32(x11 + x10, 9); - x9 ^= ROTL32(x8 + x11, 13); - x10 ^= ROTL32(x9 + x8, 18); - x12 ^= ROTL32(x15 + x14, 7); - x13 ^= ROTL32(x12 + x15, 9); + x4 ^= ROTL32(x0 + x12 , 7); + x8 ^= ROTL32(x4 + x0 , 9); + x12 ^= ROTL32(x8 + x4 , 13); + x0 ^= ROTL32(x12 + x8 , 18); + x9 ^= ROTL32(x5 + x1 , 7); + x13 ^= ROTL32(x9 + x5 , 9); + x1 ^= ROTL32(x13 + x9 , 13); + x5 ^= ROTL32(x1 + x13 , 18); + x14 ^= ROTL32(x10 + x6 , 7); + x2 ^= ROTL32(x14 + x10, 9); + x6 ^= ROTL32(x2 + x14 , 13); + x10 ^= ROTL32(x6 + x2 , 18); + x3 ^= ROTL32(x15 + x11, 7); + x7 ^= ROTL32(x3 + x15 , 9); + x11 ^= ROTL32(x7 + x3 , 13); + x15 ^= ROTL32(x11 + x7 , 18); + x1 ^= ROTL32(x0 + x3 , 7); + x2 ^= ROTL32(x1 + x0 , 9); + x3 ^= ROTL32(x2 + x1 , 13); + x0 ^= ROTL32(x3 + x2 , 18); + x6 ^= ROTL32(x5 + x4 , 7); + x7 ^= ROTL32(x6 + x5 , 9); + x4 ^= ROTL32(x7 + x6 , 13); + x5 ^= ROTL32(x4 + x7 , 18); + x11 ^= ROTL32(x10 + x9 , 7); + x8 ^= ROTL32(x11 + x10, 9); + x9 ^= ROTL32(x8 + x11 , 13); + x10 ^= ROTL32(x9 + x8 , 18); + x12 ^= ROTL32(x15 + x14, 7); + x13 ^= ROTL32(x12 + x15, 9); x14 ^= ROTL32(x13 + x12, 13); x15 ^= ROTL32(x14 + x13, 18); } - x0 += j0; - x1 += j1; - x2 += j2; - x3 += j3; - x4 += j4; - x5 += j5; - x6 += j6; - x7 += j7; - x8 += j8; - x9 += j9; + x0 += j0; + x1 += j1; + x2 += j2; + x3 += j3; + x4 += j4; + x5 += j5; + x6 += j6; + x7 += j7; + x8 += j8; + x9 += j9; x10 += j10; x11 += j11; x12 += j12; @@ -999,16 +993,16 @@ static void core_salsa20(uint8_t * out, const uint8_t in[16], const uint8_t key[ x14 += j14; x15 += j15; - PUT_U32( x0, out, 0); - PUT_U32( x1, out, 4); - PUT_U32( x2, out, 8); - PUT_U32( x3, out, 12); - PUT_U32( x4, out, 16); - PUT_U32( x5, out, 20); - PUT_U32( x6, out, 24); - PUT_U32( x7, out, 28); - PUT_U32( x8, out, 32); - PUT_U32( x9, out, 36); + PUT_U32(x0 , out, 0); + PUT_U32(x1 , out, 4); + PUT_U32(x2 , out, 8); + PUT_U32(x3 , out, 12); + PUT_U32(x4 , out, 16); + PUT_U32(x5 , out, 20); + PUT_U32(x6 , out, 24); + PUT_U32(x7 , out, 28); + PUT_U32(x8 , out, 
32); + PUT_U32(x9 , out, 36); PUT_U32(x10, out, 40); PUT_U32(x11, out, 44); PUT_U32(x12, out, 48); @@ -1017,13 +1011,13 @@ static void core_salsa20(uint8_t * out, const uint8_t in[16], const uint8_t key[ PUT_U32(x15, out, 60); } -static void salsa20_stream(void *dst, size_t len, const uint8_t nonce[8], - const uint8_t key[32]) { +static void salsa20_stream( void * dst, size_t len, const uint8_t nonce[8], const uint8_t key[32] ) { static const uint8_t sigma[17] = "expand 32-byte k"; uint8_t in[16]; - if (len == 0) + if (len == 0) { return; + } memcpy(in, nonce, 8); memset(in + 8, 0, 8); @@ -1033,12 +1027,12 @@ static void salsa20_stream(void *dst, size_t len, const uint8_t nonce[8], unsigned int u = 1; for (size_t i = 8; i < 16; i++) { - u += in[i]; + u += in[i]; in[i] = u; - u >>= 8; + u >>= 8; } - dst = (uint8_t *)dst + 64; + dst = (uint8_t *)dst + 64; len -= 64; } @@ -1049,30 +1043,31 @@ static void salsa20_stream(void *dst, size_t len, const uint8_t nonce[8], } } -static bool value_is_repeated(const uint64_t *values, size_t n, uint64_t needle) { +static bool value_is_repeated( const uint64_t * values, size_t n, uint64_t needle ) { for (size_t i = 0; i < n; i++) { - if (values[i] == needle) + if (values[i] == needle) { return true; + } } return false; } -static bool umash_params_prepare(struct umash_params *params) { +static bool umash_params_prepare( struct umash_params * params ) { static const uint64_t modulo = (1UL << 61) - 1; /* * The polynomial parameters have two redundant fields (for * the pre-squared multipliers). Use them as our source of * extra entropy if needed. 
*/ - uint64_t buf[] = { params->poly[0][0], params->poly[1][0] }; - size_t buf_idx = 0; - -#define GET_RANDOM(DST) \ - do { \ - if (buf_idx >= ARRAY_SIZE(buf)) \ - return false; \ - \ - (DST) = buf[buf_idx++]; \ + uint64_t buf[] = { params->poly[0][0], params->poly[1][0] }; + size_t buf_idx = 0; + +#define GET_RANDOM(DST) \ + do { \ + if (buf_idx >= ARRAY_SIZE(buf)) \ + return false; \ + \ + (DST) = buf[buf_idx++]; \ } while (0) /* Check the polynomial multipliers: we don't want 0s. */ @@ -1085,8 +1080,9 @@ static bool umash_params_prepare(struct umash_params *params) { * guarantee uniformity. */ f &= (1UL << 61) - 1; - if (f != 0 && f < modulo) + if ((f != 0) && (f < modulo)) { break; + } GET_RANDOM(f); } @@ -1098,35 +1094,38 @@ static bool umash_params_prepare(struct umash_params *params) { /* Avoid repeated OH noise values. */ for (size_t i = 0; i < ARRAY_SIZE(params->oh); i++) { - while (value_is_repeated(params->oh, i, params->oh[i])) + while (value_is_repeated(params->oh, i, params->oh[i])) { GET_RANDOM(params->oh[i]); + } } return true; } -static void umash_params_derive(struct umash_params *params, uint64_t bits, const void *key) { +static void umash_params_derive( struct umash_params * params, uint64_t bits, const void * key ) { uint8_t umash_key[33] = "Do not use UMASH VS adversaries."; params->base_seed = bits; - if (key != NULL) + if (key != NULL) { memcpy(umash_key, key, sizeof(umash_key)); + } while (true) { uint8_t nonce[8]; - for (size_t i = 0; i < 8; i++) + for (size_t i = 0; i < 8; i++) { nonce[i] = bits >> (8 * i); + } /* * The "- sizeof(uint64_t)" is so that params->base_seed * doesn't get overwritten. 
*/ - salsa20_stream(params, sizeof(*params) - sizeof(uint64_t), - nonce, umash_key); - if (umash_params_prepare(params)) + salsa20_stream(params, sizeof(*params) - sizeof(uint64_t), nonce, umash_key); + if (umash_params_prepare(params)) { return; + } /* * This should practically never fail, so really @@ -1145,37 +1144,40 @@ static void umash_params_derive(struct umash_params *params, uint64_t bits, cons // mode. This is because the (now) thread-local global table would // never be initialized in the thread, and so would be all zeroes. -static uintptr_t umash_slow_reseed(const seed_t seed) { +static uintptr_t umash_slow_reseed( const seed_t seed ) { static thread_local struct umash_params umash_params_local; + umash_params_derive(&umash_params_local, seed, NULL); return (uintptr_t)(&umash_params_local); } static struct umash_params umash_params_global; -static bool umash_init(void) { +static bool umash_init( void ) { umash_params_derive(&umash_params_global, 0, NULL); umash_slow_reseed(0); return true; } -template < bool reseed, bool bswap > -static void UMASH(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void UMASH( const void * in, const size_t len, const seed_t seed, void * out ) { const struct umash_params * params = reseed ? - (const struct umash_params *)(uintptr_t)seed : - &umash_params_global; + (const struct umash_params *)(uintptr_t)seed : + &umash_params_global; const uint64_t hseed = reseed ? params->base_seed : (uint64_t)seed; - uint64_t hash = umash_full(params, hseed, in, len); + uint64_t hash = umash_full(params, hseed, in, len); + PUT_U64(hash, (uint8_t *)out, 0); } -template < bool reseed, bool bswap > -static void UMASH_FP(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void UMASH_FP( const void * in, const size_t len, const seed_t seed, void * out ) { const struct umash_params * params = reseed ? 
- (const struct umash_params *)(uintptr_t)seed : - &umash_params_global; - const uint64_t hseed = reseed ? params->base_seed : (uint64_t)seed; - struct umash_fp hash = umash_fprint(params, hseed, in, len); + (const struct umash_params *)(uintptr_t)seed : + &umash_params_global; + const uint64_t hseed = reseed ? params->base_seed : (uint64_t)seed; + struct umash_fp hash = umash_fprint(params, hseed, in, len); + PUT_U64(hash.hash[0], (uint8_t *)out, 0); PUT_U64(hash.hash[1], (uint8_t *)out, 8); } @@ -1183,80 +1185,80 @@ static void UMASH_FP(const void * in, const size_t len, const seed_t seed, void #endif //------------------------------------------------------------ REGISTER_FAMILY(umash, - $.src_url = "https://github.com/backtrace-labs/umash", - $.src_status = HashFamilyInfo::SRC_ACTIVE -); + $.src_url = "https://github.com/backtrace-labs/umash", + $.src_status = HashFamilyInfo::SRC_ACTIVE + ); #if defined(HAVE_X86_64_CLMUL) REGISTER_HASH(UMASH_64, - $.desc = "UMASH-64 (which == 0)", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_CLMUL_BASED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x36A264CD, - $.verification_BE = 0x84DA635B, - $.hashfn_native = UMASH, - $.hashfn_bswap = UMASH, - $.initfn = umash_init -); + $.desc = "UMASH-64 (which == 0)", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_CLMUL_BASED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x36A264CD, + $.verification_BE = 0x84DA635B, + $.hashfn_native = UMASH, + $.hashfn_bswap = UMASH, + $.initfn = umash_init + ); REGISTER_HASH(UMASH_64__reseed, - $.desc = "UMASH-64 (which == 0, with full reseeding)", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_CLMUL_BASED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 64, - $.verification_LE = 0x161495C6, - 
$.verification_BE = 0xF18B8420, - $.hashfn_native = UMASH, - $.hashfn_bswap = UMASH, - $.seedfn = umash_slow_reseed, - $.initfn = umash_init -); + $.desc = "UMASH-64 (which == 0, with full reseeding)", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_CLMUL_BASED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 64, + $.verification_LE = 0x161495C6, + $.verification_BE = 0xF18B8420, + $.hashfn_native = UMASH, + $.hashfn_bswap = UMASH, + $.seedfn = umash_slow_reseed, + $.initfn = umash_init + ); REGISTER_HASH(UMASH_128, - $.desc = "UMASH-128", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_CLMUL_BASED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x63857D05, - $.verification_BE = 0xE87FFB4B, - $.hashfn_native = UMASH_FP, - $.hashfn_bswap = UMASH_FP, - $.initfn = umash_init -); + $.desc = "UMASH-128", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_CLMUL_BASED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 0x63857D05, + $.verification_BE = 0xE87FFB4B, + $.hashfn_native = UMASH_FP, + $.hashfn_bswap = UMASH_FP, + $.initfn = umash_init + ); REGISTER_HASH(UMASH_128__reseed, - $.desc = "UMASH-128 (with full reseeding)", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_CLMUL_BASED, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_MIT, - $.bits = 128, - $.verification_LE = 0x36D4EC95, - $.verification_BE = 0x9F870C9C, - $.hashfn_native = UMASH_FP, - $.hashfn_bswap = UMASH_FP, - $.seedfn = umash_slow_reseed, - $.initfn = umash_init -); + $.desc = "UMASH-128 (with full reseeding)", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_CLMUL_BASED, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_MIT, + $.bits = 128, + $.verification_LE = 
0x36D4EC95, + $.verification_BE = 0x9F870C9C, + $.hashfn_native = UMASH_FP, + $.hashfn_bswap = UMASH_FP, + $.seedfn = umash_slow_reseed, + $.initfn = umash_init + ); #endif diff --git a/hashes/vmac.cpp b/hashes/vmac.cpp index 62e95e9f..8cbdead8 100644 --- a/hashes/vmac.cpp +++ b/hashes/vmac.cpp @@ -40,44 +40,46 @@ //----------------------------------------------------------------------------- // Constants and masks -const uint64_t p64 = UINT64_C(0xfffffffffffffeff); /* 2^64 - 257 prime */ -const uint64_t m62 = UINT64_C(0x3fffffffffffffff); /* 62-bit mask */ -const uint64_t m63 = UINT64_C(0x7fffffffffffffff); /* 63-bit mask */ -const uint64_t m64 = UINT64_C(0xffffffffffffffff); /* 64-bit mask */ -const uint64_t mpoly = UINT64_C(0x1fffffff1fffffff); /* Poly key mask */ +const uint64_t p64 = UINT64_C(0xfffffffffffffeff); /* 2^64 - 257 prime */ +const uint64_t m62 = UINT64_C(0x3fffffffffffffff); /* 62-bit mask */ +const uint64_t m63 = UINT64_C(0x7fffffffffffffff); /* 63-bit mask */ +const uint64_t m64 = UINT64_C(0xffffffffffffffff); /* 64-bit mask */ +const uint64_t mpoly = UINT64_C(0x1fffffff1fffffff); /* Poly key mask */ //----------------------------------------------------------------------------- // macros from Crypto++ for sharing inline assembly code between MSVC and GNU C #if defined(__GNUC__) - // define these in two steps to allow arguments to be expanded - #define GNU_AS2(x, y) #x ", " #y ";" - #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";" - #define GNU_ASL(x) "\n" #x ":" - #define GNU_ASJ(x, y, z) #x " " #y #z ";" - #define AS2(x, y) GNU_AS2(x, y) - #define AS3(x, y, z) GNU_AS3(x, y, z) - #define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";" - #define ASL(x) GNU_ASL(x) - #define ASJ(x, y, z) GNU_ASJ(x, y, z) +// define these in two steps to allow arguments to be expanded + #define GNU_AS2(x, y) #x ", " #y ";" + #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";" + #define GNU_ASL(x) "\n" #x ":" + #define GNU_ASJ(x, y, z) #x " " #y 
#z ";" + #define AS2(x, y) GNU_AS2(x, y) + #define AS3(x, y, z) GNU_AS3(x, y, z) + #define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";" + #define ASL(x) GNU_ASL(x) + #define ASJ(x, y, z) GNU_ASJ(x, y, z) #else - #define AS2(x, y) __asm {x, y} - #define AS3(x, y, z) __asm {x, y, z} - #define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)} - #define ASL(x) __asm {label##x:} - #define ASJ(x, y, z) __asm {x label##y} + #define AS2(x, y) __asm { x, y } + #define AS3(x, y, z) __asm { x, y, z } + #define ASS(x, y, a, b, c, d) __asm { x, y, _MM_SHUFFLE(a, b, c, d) } + #define ASL(x) __asm { \ + label ## x: \ + } + #define ASJ(x, y, z) __asm { x label ## y } #endif //----------------------------------------------------------------------------- -#define ADD128(rh,rl,ih,il) add128(rl, rh, il, ih) +#define ADD128(rh, rl, ih, il) add128(rl, rh, il, ih) -#define MUL64(rh,rl,i1,i2) mult64_128(rl, rh, i1, i2) +#define MUL64(rh, rl, i1, i2) mult64_128(rl, rh, i1, i2) // PMUL is a special case of MUL where one carry bit is guaranteed to // not be needed. We'll just ignore that for now. 
#define PMUL64 MUL64 -#define MUL32(i1,i2) ((uint64_t)(uint32_t)(i1)*(uint32_t)(i2)) +#define MUL32(i1, i2) ((uint64_t)(uint32_t)(i1) * (uint32_t)(i2)) //----------------------------------------------------------------------------- // For highest performance the L1 NH and L2 polynomial hashes should be @@ -86,27 +88,29 @@ const uint64_t mpoly = UINT64_C(0x1fffffff1fffffff); /* Poly key mask */ //----------------------------------------------------------------------------- // Portable code (64-bit/32-bit details are behind mathmult.h macros) -template < bool bswap > -static inline void nh_16_portable(const uint8_t * mp, const uint64_t * kp, size_t nw, uint64_t & rh, uint64_t & rl) { - //uint64_t th, tl; +template +static inline void nh_16_portable( const uint8_t * mp, const uint64_t * kp, size_t nw, uint64_t & rh, uint64_t & rl ) { + // uint64_t th, tl; rh = rl = 0; - for (size_t i = 0; i < nw; i+= 2) { + for (size_t i = 0; i < nw; i += 2) { #if 0 - MUL64(th, tl, (GET_U64(mp, i*8) + kp[i]), (GET_U64(mp, i*8 + 8) + kp[i + 1])); + MUL64(th, tl, (GET_U64(mp, i * 8) + kp[i]), (GET_U64(mp, i * 8 + 8) + kp[i + 1])); ADD128(rh, rl, th, tl); #else - fma64_128(rl, rh, (GET_U64(mp, i*8) + kp[i]), (GET_U64(mp, i*8 + 8) + kp[i + 1])); + fma64_128(rl, rh, (GET_U64(mp, i * 8) + kp[i]), (GET_U64(mp, i * 8 + 8) + kp[i + 1])); #endif } } // Using fma64_128() here is a tiny bit slower because there is less // freedom to reorder things and take advantage of more registers -template < bool bswap > -static inline void nh_vmac_nhbytes_portable(const uint8_t * mp, const uint64_t * kp, size_t nw, uint64_t & rh, uint64_t & rl) { +template +static inline void nh_vmac_nhbytes_portable( const uint8_t * mp, const uint64_t * kp, + size_t nw, uint64_t & rh, uint64_t & rl ) { uint64_t th, tl; + rh = rl = 0; - for (size_t i = 0; i < nw; i+= 8) { + for (size_t i = 0; i < nw; i += 8) { MUL64(th, tl, (GET_U64(mp, (i + 0) * 8) + kp[i + 0]), (GET_U64(mp, (i + 1) * 8) + kp[i + 1])); ADD128(rh, rl, th, 
tl); MUL64(th, tl, (GET_U64(mp, (i + 2) * 8) + kp[i + 2]), (GET_U64(mp, (i + 3) * 8) + kp[i + 3])); @@ -118,28 +122,28 @@ static inline void nh_vmac_nhbytes_portable(const uint8_t * mp, const uint64_t * } } -static inline void poly_step_portable(uint64_t & ah, uint64_t & al, const uint64_t & kh, - const uint64_t & kl, const uint64_t & mh, const uint64_t & ml) { +static inline void poly_step_portable( uint64_t & ah, uint64_t & al, const uint64_t & kh, + const uint64_t & kl, const uint64_t & mh, const uint64_t & ml ) { uint64_t t1h, t1l, t2h, t2l, t3h, t3l, z = 0; /* compute ab*cd, put bd into result registers */ - PMUL64(t3h, t3l, al, kh); - PMUL64(t2h, t2l, ah, kl); - PMUL64(t1h, t1l, ah, 2*kh); - PMUL64(ah, al, al, kl); + PMUL64(t3h, t3l, al, kh ); + PMUL64(t2h, t2l, ah, kl ); + PMUL64(t1h, t1l, ah, 2 * kh); + PMUL64(ah , al , al, kl ); /* add 2 * ac to result */ - ADD128(ah, al, t1h, t1l); + ADD128(ah , al , t1h, t1l); /* add together ad + bc */ ADD128(t2h, t2l, t3h, t3l); /* now (ah,al), (t2l,2*t2h) need summing */ /* first add the high registers, carrying into t2h */ - ADD128(t2h, ah, z, t2l); + ADD128(t2h, ah , z , t2l); /* double t2h and add top bit of ah */ t2h = 2 * t2h + (ah >> 63); ah &= m63; /* now add the low registers */ - ADD128(ah, al, mh, ml); - ADD128(ah, al, z, t2h); + ADD128(ah , al , mh , ml ); + ADD128(ah , al , z , t2h); } //----------------------------------------------------------------------------- @@ -147,236 +151,237 @@ static inline void poly_step_portable(uint64_t & ah, uint64_t & al, const uint64 #if defined(HAVE_32BIT_PLATFORM) && defined(HAVE_SSE_2) -template < bool bswap > -static void nh_16_sse2(const uint8_t * mp, const uint64_t * kp, size_t nw, uint64_t & rh, uint64_t & rl) { - // This assembly version, using MMX registers, is just as fast as the - // intrinsics version (which uses XMM registers) on the Intel Core 2, - // but is much faster on the Pentium 4. 
In order to schedule multiplies - // as early as possible, the loop interleaves operations for the current - // block and the next block. To mask out high 32-bits, we use "movd" - // to move the lower 32-bits to the stack and then back. Surprisingly, - // this is faster than any other method. -#if defined(__GNUC__) - __asm__ __volatile__ - ( - ".intel_syntax noprefix;" -#else - AS2( mov esi, mp) - AS2( mov edi, kp) - AS2( mov ecx, nw) - AS2( mov eax, &rl) - AS2( mov edx, &rh) -#endif - AS2( sub esp, 12) - AS2( movq mm6, [esi]) - AS2( paddq mm6, [edi]) - AS2( movq mm5, [esi+8]) - AS2( paddq mm5, [edi+8]) - AS2( add esi, 16) - AS2( add edi, 16) - AS2( movq mm4, mm6) - ASS( pshufw mm2, mm6, 1, 0, 3, 2) - AS2( pmuludq mm6, mm5) - ASS( pshufw mm3, mm5, 1, 0, 3, 2) - AS2( pmuludq mm5, mm2) - AS2( pmuludq mm2, mm3) - AS2( pmuludq mm3, mm4) - AS2( pxor mm7, mm7) - AS2( movd [esp], mm6) - AS2( psrlq mm6, 32) - AS2( movd [esp+4], mm5) - AS2( psrlq mm5, 32) - AS2( sub ecx, 2) - ASJ( jz, 1, f) - ASL(0) - AS2( movq mm0, [esi]) - AS2( paddq mm0, [edi]) - AS2( movq mm1, [esi+8]) - AS2( paddq mm1, [edi+8]) - AS2( add esi, 16) - AS2( add edi, 16) - AS2( movq mm4, mm0) - AS2( paddq mm5, mm2) - ASS( pshufw mm2, mm0, 1, 0, 3, 2) - AS2( pmuludq mm0, mm1) - AS2( movd [esp+8], mm3) - AS2( psrlq mm3, 32) - AS2( paddq mm5, mm3) - ASS( pshufw mm3, mm1, 1, 0, 3, 2) - AS2( pmuludq mm1, mm2) - AS2( pmuludq mm2, mm3) - AS2( pmuludq mm3, mm4) - AS2( movd mm4, [esp]) - AS2( paddq mm7, mm4) - AS2( movd mm4, [esp+4]) - AS2( paddq mm6, mm4) - AS2( movd mm4, [esp+8]) - AS2( paddq mm6, mm4) - AS2( movd [esp], mm0) - AS2( psrlq mm0, 32) - AS2( paddq mm6, mm0) - AS2( movd [esp+4], mm1) - AS2( psrlq mm1, 32) - AS2( paddq mm5, mm1) - AS2( sub ecx, 2) - ASJ( jnz, 0, b) - ASL(1) - AS2( paddq mm5, mm2) - AS2( movd [esp+8], mm3) - AS2( psrlq mm3, 32) - AS2( paddq mm5, mm3) - AS2( movd mm4, [esp]) - AS2( paddq mm7, mm4) - AS2( movd mm4, [esp+4]) - AS2( paddq mm6, mm4) - AS2( movd mm4, [esp+8]) - AS2( paddq 
mm6, mm4) - - ASS( pshufw mm0, mm7, 3, 2, 1, 0) - AS2( psrlq mm7, 32) - AS2( paddq mm6, mm7) - AS2( punpckldq mm0, mm6) - AS2( psrlq mm6, 32) - AS2( paddq mm5, mm6) - AS2( movq [eax], mm0) - AS2( movq [edx], mm5) - AS2( add esp, 12) -#if defined(__GNUC__) - ".att_syntax prefix;" - : - : "S" (mp), "D" (kp), "c" (nw), "a" (&rl), "d" (&rh) - : "memory", "cc" - ); -#endif +template +static void nh_16_sse2( const uint8_t * mp, const uint64_t * kp, size_t nw, uint64_t & rh, uint64_t & rl ) { + // This assembly version, using MMX registers, is just as fast as the + // intrinsics version (which uses XMM registers) on the Intel Core 2, + // but is much faster on the Pentium 4. In order to schedule multiplies + // as early as possible, the loop interleaves operations for the current + // block and the next block. To mask out high 32-bits, we use "movd" + // to move the lower 32-bits to the stack and then back. Surprisingly, + // this is faster than any other method. + #if defined(__GNUC__) + __asm__ __volatile__ + ( + ".intel_syntax noprefix;" + #else + AS2(mov esi, mp ) + AS2(mov edi, kp ) + AS2(mov ecx, nw ) + AS2(mov eax, &rl) + AS2(mov edx, &rh) + #endif + AS2(sub esp, 12 ) + AS2(movq mm6, [esi] ) + AS2(paddq mm6, [edi] ) + AS2(movq mm5, [esi + 8]) + AS2(paddq mm5, [edi + 8]) + AS2(add esi, 16 ) + AS2(add edi, 16 ) + AS2(movq mm4, mm6 ) + ASS( pshufw mm2, mm6, 1, 0, 3, 2) + AS2( pmuludq mm6, mm5) + ASS( pshufw mm3, mm5, 1, 0, 3, 2) + AS2(pmuludq mm5 , mm2) + AS2(pmuludq mm2 , mm3) + AS2(pmuludq mm3 , mm4) + AS2(pxor mm7 , mm7) + AS2(movd [esp] , mm6) + AS2(psrlq mm6 , 32) + AS2(movd [esp+4], mm5) + AS2(psrlq mm5 , 32) + AS2(sub ecx , 2) + ASJ( jz, 1, f) + ASL(0) + AS2(movq mm0, [esi] ) + AS2(paddq mm0, [edi] ) + AS2(movq mm1, [esi + 8]) + AS2(paddq mm1, [edi + 8]) + AS2(add esi, 16 ) + AS2(add edi, 16 ) + AS2(movq mm4, mm0 ) + AS2(paddq mm5, mm2 ) + ASS( pshufw mm2, mm0, 1, 0, 3, 2) + AS2(pmuludq mm0 , mm1) + AS2(movd [esp+8], mm3) + AS2(psrlq mm3 , 32) + AS2(paddq mm5 , 
mm3) + ASS( pshufw mm3, mm1, 1, 0, 3, 2) + AS2(pmuludq mm1 , mm2 ) + AS2(pmuludq mm2 , mm3 ) + AS2(pmuludq mm3 , mm4 ) + AS2(movd mm4 , [esp] ) + AS2(paddq mm7 , mm4 ) + AS2(movd mm4 , [esp + 4]) + AS2(paddq mm6 , mm4 ) + AS2(movd mm4 , [esp + 8]) + AS2(paddq mm6 , mm4 ) + AS2(movd [esp] , mm0 ) + AS2(psrlq mm0 , 32 ) + AS2(paddq mm6 , mm0 ) + AS2(movd [esp+4], mm1 ) + AS2(psrlq mm1 , 32 ) + AS2(paddq mm5 , mm1 ) + AS2(sub ecx , 2 ) + ASJ( jnz, 0, b) + ASL(1) + AS2(paddq mm5 , mm2 ) + AS2(movd [esp+8], mm3 ) + AS2(psrlq mm3 , 32 ) + AS2(paddq mm5 , mm3 ) + AS2(movd mm4 , [esp] ) + AS2(paddq mm7 , mm4 ) + AS2(movd mm4 , [esp + 4]) + AS2(paddq mm6 , mm4 ) + AS2(movd mm4 , [esp + 8]) + AS2(paddq mm6 , mm4 ) + + ASS( pshufw mm0, mm7, 3, 2, 1, 0) + AS2(psrlq mm7 , 32) + AS2(paddq mm6 , mm7) + AS2(punpckldq mm0, mm6) + AS2(psrlq mm6 , 32) + AS2(paddq mm5 , mm6) + AS2(movq [eax] , mm0) + AS2(movq [edx] , mm5) + AS2(add esp , 12) + #if defined(__GNUC__) + ".att_syntax prefix;" + : + : "S" (mp), "D" (kp), "c" (nw), "a" (&rl), "d" (&rh) + : "memory", "cc" + ); + #else + #endif } -static void poly_step_sse2(uint64_t & ah, uint64_t & al, const uint64_t & kh, - const uint64_t & kl, const uint64_t & mh, const uint64_t & ml) { +static void poly_step_sse2( uint64_t & ah, uint64_t & al, const uint64_t & kh, + const uint64_t & kl, const uint64_t & mh, const uint64_t & ml ) { // This code tries to schedule the multiplies as early as possible to overcome // the long latencies on the Pentium 4. It also minimizes "movq" instructions // which are very expensive on the P4. 
-#define a0 [eax+0] -#define a1 [eax+4] -#define a2 [ebx+0] -#define a3 [ebx+4] -#define k0 [ecx+0] -#define k1 [ecx+4] -#define k2 [edx+0] -#define k3 [edx+4] - -#if defined(__GNUC__) - uint32_t temp; - __asm__ __volatile__ - ( - "mov %%ebx, %0;" - "mov %1, %%ebx;" - ".intel_syntax noprefix;" -#else - AS2( mov ebx, &ah) - AS2( mov edx, &kh) - AS2( mov eax, &al) - AS2( mov ecx, &kl) - AS2( mov esi, &mh) - AS2( mov edi, &ml) -#endif - - AS2( movd mm0, a3) - AS2( movq mm4, mm0) - AS2( pmuludq mm0, k3) // a3*k3 - AS2( movd mm1, a0) - AS2( pmuludq mm1, k2) // a0*k2 - AS2( movd mm2, a1) - AS2( movd mm6, k1) - AS2( pmuludq mm2, mm6) // a1*k1 - AS2( movd mm3, a2) - AS2( movq mm5, mm3) - AS2( movd mm7, k0) - AS2( pmuludq mm3, mm7) // a2*k0 - AS2( pmuludq mm4, mm7) // a3*k0 - AS2( pmuludq mm5, mm6) // a2*k1 - AS2( psllq mm0, 1) - AS2( paddq mm0, [esi]) - AS2( paddq mm0, mm1) - AS2( movd mm1, a1) - AS2( paddq mm4, mm5) - AS2( movq mm5, mm1) - AS2( pmuludq mm1, k2) // a1*k2 - AS2( paddq mm0, mm2) - AS2( movd mm2, a0) - AS2( paddq mm0, mm3) - AS2( movq mm3, mm2) - AS2( pmuludq mm2, k3) // a0*k3 - AS2( pmuludq mm3, mm7) // a0*k0 - AS2( movd esi, mm0) - AS2( psrlq mm0, 32) - AS2( pmuludq mm7, mm5) // a1*k0 - AS2( pmuludq mm5, k3) // a1*k3 - AS2( paddq mm0, mm1) - AS2( movd mm1, a2) - AS2( pmuludq mm1, k2) // a2*k2 - AS2( paddq mm0, mm2) - AS2( paddq mm0, mm4) - AS2( movq mm4, mm0) - AS2( movd mm2, a3) - AS2( pmuludq mm2, mm6) // a3*k1 - AS2( pmuludq mm6, a0) // a0*k1 - AS2( psrlq mm0, 31) - AS2( paddq mm0, mm3) - AS2( movd mm3, [edi]) - AS2( paddq mm0, mm3) - AS2( movd mm3, a2) - AS2( pmuludq mm3, k3) // a2*k3 - AS2( paddq mm5, mm1) - AS2( movd mm1, a3) - AS2( pmuludq mm1, k2) // a3*k2 - AS2( paddq mm5, mm2) - AS2( movd mm2, [edi+4]) - AS2( psllq mm5, 1) - AS2( paddq mm0, mm5) - AS2( movq mm5, mm0) - AS2( psllq mm4, 33) - AS2( psrlq mm0, 32) - AS2( paddq mm6, mm7) - AS2( movd mm7, esi) - AS2( paddq mm0, mm6) - AS2( paddq mm0, mm2) - AS2( paddq mm3, mm1) - AS2( psllq mm3, 1) - 
AS2( paddq mm0, mm3) - AS2( psrlq mm4, 1) - AS2( punpckldq mm5, mm0) - AS2( psrlq mm0, 32) - AS2( por mm4, mm7) - AS2( paddq mm0, mm4) - AS2( movq a0, mm5) - AS2( movq a2, mm0) -#if defined(__GNUC__) - ".att_syntax prefix;" - "mov %0, %%ebx;" - : "=m" (temp) - : "m" (&ah), "D" (&ml), "d" (&kh), "a" (&al), "S" (&mh), "c" (&kl) - : "memory", "cc" - ); -#endif - - -#undef a0 -#undef a1 -#undef a2 -#undef a3 -#undef k0 -#undef k1 -#undef k2 -#undef k3 + #define a0 [eax + 0] + #define a1 [eax + 4] + #define a2 [ebx + 0] + #define a3 [ebx + 4] + #define k0 [ecx + 0] + #define k1 [ecx + 4] + #define k2 [edx + 0] + #define k3 [edx + 4] + + #if defined(__GNUC__) + uint32_t temp; + __asm__ __volatile__ + ( + "mov %%ebx, %0;" + "mov %1, %%ebx;" + ".intel_syntax noprefix;" + #else + AS2(mov ebx, &ah) + AS2(mov edx, &kh) + AS2(mov eax, &al) + AS2(mov ecx, &kl) + AS2(mov esi, &mh) + AS2(mov edi, &ml) + #endif + + AS2(movd mm0 , a3 ) + AS2(movq mm4 , mm0 ) + AS2(pmuludq mm0 , k3 ) // a3*k3 + AS2(movd mm1 , a0 ) + AS2(pmuludq mm1 , k2 ) // a0*k2 + AS2(movd mm2 , a1 ) + AS2(movd mm6 , k1 ) + AS2(pmuludq mm2 , mm6 ) // a1*k1 + AS2(movd mm3 , a2 ) + AS2(movq mm5 , mm3 ) + AS2(movd mm7 , k0 ) + AS2(pmuludq mm3 , mm7 ) // a2*k0 + AS2(pmuludq mm4 , mm7 ) // a3*k0 + AS2(pmuludq mm5 , mm6 ) // a2*k1 + AS2(psllq mm0 , 1 ) + AS2(paddq mm0 , [esi] ) + AS2(paddq mm0 , mm1 ) + AS2(movd mm1 , a1 ) + AS2(paddq mm4 , mm5 ) + AS2(movq mm5 , mm1 ) + AS2(pmuludq mm1 , k2 ) // a1*k2 + AS2(paddq mm0 , mm2 ) + AS2(movd mm2 , a0 ) + AS2(paddq mm0 , mm3 ) + AS2(movq mm3 , mm2 ) + AS2(pmuludq mm2 , k3 ) // a0*k3 + AS2(pmuludq mm3 , mm7 ) // a0*k0 + AS2(movd esi , mm0 ) + AS2(psrlq mm0 , 32 ) + AS2(pmuludq mm7 , mm5 ) // a1*k0 + AS2(pmuludq mm5 , k3 ) // a1*k3 + AS2(paddq mm0 , mm1 ) + AS2(movd mm1 , a2 ) + AS2(pmuludq mm1 , k2 ) // a2*k2 + AS2(paddq mm0 , mm2 ) + AS2(paddq mm0 , mm4 ) + AS2(movq mm4 , mm0 ) + AS2(movd mm2 , a3 ) + AS2(pmuludq mm2 , mm6 ) // a3*k1 + AS2(pmuludq mm6 , a0 ) // a0*k1 + 
AS2(psrlq mm0 , 31 ) + AS2(paddq mm0 , mm3 ) + AS2(movd mm3 , [edi] ) + AS2(paddq mm0 , mm3 ) + AS2(movd mm3 , a2 ) + AS2(pmuludq mm3 , k3 ) // a2*k3 + AS2(paddq mm5 , mm1 ) + AS2(movd mm1 , a3 ) + AS2(pmuludq mm1 , k2 ) // a3*k2 + AS2(paddq mm5 , mm2 ) + AS2(movd mm2 , [edi + 4]) + AS2(psllq mm5 , 1 ) + AS2(paddq mm0 , mm5 ) + AS2(movq mm5 , mm0 ) + AS2(psllq mm4 , 33 ) + AS2(psrlq mm0 , 32 ) + AS2(paddq mm6 , mm7 ) + AS2(movd mm7 , esi ) + AS2(paddq mm0 , mm6 ) + AS2(paddq mm0 , mm2 ) + AS2(paddq mm3 , mm1 ) + AS2(psllq mm3 , 1 ) + AS2(paddq mm0 , mm3 ) + AS2(psrlq mm4 , 1 ) + AS2(punpckldq mm5, mm0 ) + AS2(psrlq mm0 , 32 ) + AS2(por mm4 , mm7 ) + AS2(paddq mm0 , mm4 ) + AS2(movq a0 , mm5 ) + AS2(movq a2 , mm0 ) + #if defined(__GNUC__) + ".att_syntax prefix;" + "mov %0, %%ebx;" + : "=m" (temp) + : "m" (&ah), "D" (&ml), "d" (&kh), "a" (&al), "S" (&mh), "c" (&kl) + : "memory", "cc" + ); + #else + #endif + + #undef a0 + #undef a1 + #undef a2 + #undef a3 + #undef k0 + #undef k1 + #undef k2 + #undef k3 } #endif //----------------------------------------------------------------------------- // Wrapper implementations -template < bool bswap > -static void nh_16(const uint8_t * mp, const uint64_t * kp, size_t nw, uint64_t & rh, uint64_t & rl) { +template +static void nh_16( const uint8_t * mp, const uint64_t * kp, size_t nw, uint64_t & rh, uint64_t & rl ) { #if defined(HAVE_32BIT_PLATFORM) && defined(HAVE_SSE_2) nh_16_sse2(mp, kp, nw, rh, rl); #else @@ -384,8 +389,8 @@ static void nh_16(const uint8_t * mp, const uint64_t * kp, size_t nw, uint64_t & #endif } -template < bool bswap > -static void nh_vmac_nhbytes(const uint8_t * mp, const uint64_t * kp, size_t nw, uint64_t & rh, uint64_t & rl) { +template +static void nh_vmac_nhbytes( const uint8_t * mp, const uint64_t * kp, size_t nw, uint64_t & rh, uint64_t & rl ) { #if defined(HAVE_32BIT_PLATFORM) && defined(HAVE_SSE_2) nh_16_sse2(mp, kp, nw, rh, rl); #else @@ -393,8 +398,8 @@ static void nh_vmac_nhbytes(const uint8_t * 
mp, const uint64_t * kp, size_t nw, #endif } -static void poly_step(uint64_t & ah, uint64_t & al, const uint64_t & kh, - const uint64_t & kl, const uint64_t & mh, const uint64_t & ml) { +static void poly_step( uint64_t & ah, uint64_t & al, const uint64_t & kh, const uint64_t & kl, + const uint64_t & mh, const uint64_t & ml ) { #if defined(HAVE_32BIT_PLATFORM) && defined(HAVE_SSE_2) poly_step_sse2(ah, al, kh, kl, mh, ml); #else @@ -410,38 +415,38 @@ static void poly_step(uint64_t & ah, uint64_t & al, const uint64_t & kh, //----------------------------------------------------------------------------- #include "AES.h" -typedef uint32_t aes_int_key[4*(VMAC_KEY_LEN/32+7)]; +typedef uint32_t aes_int_key[4 * (VMAC_KEY_LEN / 32 + 7)]; -#define aes_encryption(in,out,int_key) \ - AES_Encrypt<10>(int_key, \ - (const uint8_t *)(in), \ +#define aes_encryption(in,out,int_key) \ + AES_Encrypt<10>(int_key, \ + (const uint8_t *)(in), \ (uint8_t *)(out)) -#define aes_key_setup(user_key,int_key) \ - AES_KeySetup_Enc(int_key, \ - (const uint8_t *)(user_key), \ +#define aes_key_setup(user_key,int_key) \ + AES_KeySetup_Enc(int_key, \ + (const uint8_t *)(user_key), \ VMAC_KEY_LEN) //----------------------------------------------------------------------------- typedef struct { - uint64_t nhkey [(VMAC_NHBYTES/8)+2*(VMAC_TAG_LEN/64-1)]; - uint64_t polykey[2*VMAC_TAG_LEN/64]; - uint64_t l3key [2*VMAC_TAG_LEN/64]; - aes_int_key cipher_key; + uint64_t nhkey[(VMAC_NHBYTES / 8) + 2 * (VMAC_TAG_LEN / 64 - 1)]; + uint64_t polykey[2 * VMAC_TAG_LEN / 64]; + uint64_t l3key[2 * VMAC_TAG_LEN / 64]; + aes_int_key cipher_key; } vmac_ctx_t; //----------------------------------------------------------------------------- #if defined(_MSC_VER) -# if !defined(_WIN64) -# define _mmm_empty _mm_empty(); -# else // _WIN64 -# define _mmm_empty -# endif // _WIN64 + #if !defined(_WIN64) + #define _mmm_empty _mm_empty(); + #else // _WIN64 + #define _mmm_empty + #endif // _WIN64 #else // _MSC_VER -# define _mmm_empty 
__asm volatile ( "emms" ::: "memory" ); + #define _mmm_empty __asm volatile ("emms" ::: "memory"); #endif // _MSC_VER -static void vhash_abort(vmac_ctx_t *ctx) { +static void vhash_abort( vmac_ctx_t * ctx ) { #if defined(HAVE_32BIT_PLATFORM) && defined(HAVE_SSE_2) _mmm_empty /* SSE2 version of poly_step uses mmx instructions */ #endif @@ -449,64 +454,64 @@ static void vhash_abort(vmac_ctx_t *ctx) { #undef _mmm_empty -template < bool bswap > -static void vmac_set_key(uint8_t user_key[], vmac_ctx_t *ctx) { - uint64_t in[2] = {0}, out[2]; +template +static void vmac_set_key( uint8_t user_key[], vmac_ctx_t * ctx ) { + uint64_t in[2] = { 0 }, out[2]; uint32_t i; aes_key_setup(user_key, ctx->cipher_key); /* Fill nh key */ ((uint8_t *)in)[0] = 0x80; - for (i = 0; i < sizeof(ctx->nhkey)/8; i+=2) { + for (i = 0; i < sizeof(ctx->nhkey) / 8; i += 2) { aes_encryption((uint8_t *)in, (uint8_t *)out, ctx->cipher_key); - ctx->nhkey[i ] = GET_U64((uint8_t *)out, 0); - ctx->nhkey[i+1] = GET_U64((uint8_t *)out, 8); + ctx->nhkey[i ] = GET_U64((uint8_t *)out, 0); + ctx->nhkey[i + 1] = GET_U64((uint8_t *)out, 8); ((uint8_t *)in)[15] += 1; } /* Fill poly key */ ((uint8_t *)in)[0] = 0xC0; - in[1] = 0; - for (i = 0; i < sizeof(ctx->polykey)/8; i+=2) { + in [1] = 0; + for (i = 0; i < sizeof(ctx->polykey) / 8; i += 2) { aes_encryption((uint8_t *)in, (uint8_t *)out, ctx->cipher_key); // "& mpoly" code is moved into vhash() due to new seeding - ctx->polykey[i ] = GET_U64((uint8_t *)out, 0); - ctx->polykey[i+1] = GET_U64((uint8_t *)out, 8); + ctx->polykey[i ] = GET_U64((uint8_t *)out, 0); + ctx->polykey[i + 1] = GET_U64((uint8_t *)out, 8); ((uint8_t *)in)[15] += 1; } /* Fill ip key */ ((uint8_t *)in)[0] = 0xE0; - in[1] = 0; - for (i = 0; i < sizeof(ctx->l3key)/8; i+=2) { + in [1] = 0; + for (i = 0; i < sizeof(ctx->l3key) / 8; i += 2) { do { aes_encryption((uint8_t *)in, (uint8_t *)out, ctx->cipher_key); - ctx->l3key[i ] = GET_U64((uint8_t *)out, 0); - ctx->l3key[i+1] = GET_U64((uint8_t *)out, 
8); + ctx->l3key[i ] = GET_U64((uint8_t *)out, 0); + ctx->l3key[i + 1] = GET_U64((uint8_t *)out, 8); ((uint8_t *)in)[15] += 1; - } while (ctx->l3key[i] >= p64 || ctx->l3key[i+1] >= p64); + } while (ctx->l3key[i] >= p64 || ctx->l3key[i + 1] >= p64); } } -static uint64_t l3hash(uint64_t p1, uint64_t p2, uint64_t k1, uint64_t k2, uint64_t len) { - uint64_t rh, rl, t, z=0; +static uint64_t l3hash( uint64_t p1, uint64_t p2, uint64_t k1, uint64_t k2, uint64_t len ) { + uint64_t rh, rl, t, z = 0; /* fully reduce (p1,p2)+(len,0) mod p127 */ - t = p1 >> 63; + t = p1 >> 63; p1 &= m63; ADD128(p1, p2, len, t); /* At this point, (p1,p2) is at most 2^127+(len<<64) */ - t = (p1 > m63) + ((p1 == m63) && (p2 == m64)); - ADD128(p1, p2, z, t); + t = (p1 > m63) + ((p1 == m63) && (p2 == m64)); + ADD128(p1, p2, z , t); p1 &= m63; /* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */ - t = p1 + (p2 >> 32); - t += (t >> 32); - t += (uint32_t)t > 0xfffffffeu; - p1 += (t >> 32); - p2 += (p1 << 32); + t = p1 + (p2 >> 32); + t += (t >> 32); + t += (uint32_t)t > 0xfffffffeu; + p1 += (t >> 32); + p2 += (p1 << 32); /* compute (p1+k1)%p64 and (p2+k2)%p64 */ p1 += k1; @@ -516,42 +521,42 @@ static uint64_t l3hash(uint64_t p1, uint64_t p2, uint64_t k1, uint64_t k2, uint6 /* compute (p1+k1)*(p2+k2)%p64 */ MUL64(rh, rl, p1, p2); - t = rh >> 56; + t = rh >> 56; ADD128(t, rl, z, rh); rh <<= 8; ADD128(t, rl, z, rh); - t += t << 8; - rl += t; - rl += (0 - (rl < t)) & 257; - rl += (0 - (rl > p64-1)) & 257; + t += t << 8; + rl += t; + rl += (0 - (rl < t )) & 257; + rl += (0 - (rl > p64 - 1)) & 257; return rl; } // Homegrown (unofficial) seeding -template < bool bswap > -static uint64_t vhash(const uint8_t * mptr, size_t mbytes, uint64_t seed, vmac_ctx_t * ctx) { - uint64_t rh, rl; +template +static uint64_t vhash( const uint8_t * mptr, size_t mbytes, uint64_t seed, vmac_ctx_t * ctx ) { + uint64_t rh, rl; const uint64_t * kptr = ctx->nhkey; - size_t i, remaining; - uint64_t ch, cl; - uint64_t pkh = 
(ctx->polykey[0] ^ ROTR64(seed, 24)) & mpoly; - uint64_t pkl = (ctx->polykey[1] ^ seed ) & mpoly; + size_t i, remaining; + uint64_t ch, cl; + uint64_t pkh = (ctx->polykey[0] ^ ROTR64(seed, 24)) & mpoly; + uint64_t pkl = (ctx->polykey[1] ^ seed ) & mpoly; - i = mbytes / VMAC_NHBYTES; + i = mbytes / VMAC_NHBYTES; remaining = mbytes % VMAC_NHBYTES; if (i) { - nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,ch,cl); + nh_vmac_nhbytes(mptr, kptr, VMAC_NHBYTES / 8, ch, cl); ch &= m62; - ADD128(ch,cl,pkh,pkl); + ADD128(ch, cl, pkh, pkl); i--; } else if (remaining) { alignas(16) uint8_t buf[VMAC_NHBYTES]; memcpy(buf, mptr, remaining); memset(buf + remaining, 0, sizeof(buf) - remaining); - nh_16(buf,kptr,2*((remaining+15)/16),ch,cl); + nh_16(buf, kptr, 2 * ((remaining + 15) / 16), ch, cl); ch &= m62; - ADD128(ch,cl,pkh,pkl); + ADD128(ch, cl, pkh, pkl); goto do_l3; } else { ch = pkh; cl = pkl; @@ -560,20 +565,20 @@ static uint64_t vhash(const uint8_t * mptr, size_t mbytes, uint64_t seed, vmac_c while (i--) { mptr += VMAC_NHBYTES; - nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl); - rh &= m62; - poly_step(ch,cl,pkh,pkl,rh,rl); + nh_vmac_nhbytes(mptr, kptr, VMAC_NHBYTES / 8, rh, rl); + rh &= m62; + poly_step(ch, cl, pkh, pkl, rh, rl); } if (remaining) { alignas(16) uint8_t buf[VMAC_NHBYTES]; memcpy(buf, mptr + VMAC_NHBYTES, remaining); memset(buf + remaining, 0, sizeof(buf) - remaining); - nh_16(buf,kptr,2*((remaining+15)/16),rh,rl); + nh_16(buf, kptr, 2 * ((remaining + 15) / 16), rh, rl); rh &= m62; - poly_step(ch,cl,pkh,pkl,rh,rl); + poly_step(ch, cl, pkh, pkl, rh, rl); } -do_l3: + do_l3: vhash_abort(ctx); remaining *= 8; return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1], remaining); @@ -582,76 +587,77 @@ static uint64_t vhash(const uint8_t * mptr, size_t mbytes, uint64_t seed, vmac_c //----------------------------------------------------------------------------- class VHASH_initializer { -public: - alignas(16) vmac_ctx_t ctx; + public: + alignas(16) vmac_ctx_t ctx; - 
VHASH_initializer() { - alignas(4) uint8_t key[1 + VMAC_KEY_LEN/8] = "abcdefghijklmnop"; + VHASH_initializer() { + alignas(4) uint8_t key[1 + VMAC_KEY_LEN / 8] = "abcdefghijklmnop"; if (isBE()) { vmac_set_key(key, &ctx); } else { vmac_set_key(key, &ctx); } - } + } - ~VHASH_initializer() { - } -}; + ~VHASH_initializer() {} +}; // class VHASH_initializer // WARNING: this is shared across CPUs, and so must be read-only // during hashing!! // Making this thread-local has a sizable performance hit. static VHASH_initializer vhi; -template < bool bswap > -static void VHASH32(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void VHASH32( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t hash = vhash((const uint8_t *)in, len, (uint64_t)seed, &(vhi.ctx)); + PUT_U32(hash, (uint8_t *)out, 0); } -template < bool bswap > -static void VHASH64(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void VHASH64( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t hash = vhash((const uint8_t *)in, len, (uint64_t)seed, &(vhi.ctx)); + PUT_U64(hash, (uint8_t *)out, 0); } //----------------------------------------------------------------------------- REGISTER_FAMILY(vmac, - $.src_url = "https://www.fastcrypto.org/vmac/", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://www.fastcrypto.org/vmac/", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(VHASH__32, - $.desc = "VHASH low 32 bits, by Ted Krovetz and Wei Dai", - $.hash_flags = - FLAG_HASH_AES_BASED | - FLAG_HASH_CRYPTOGRAPHIC, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_ASM | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 32, - $.verification_LE = 0x613E4735, - $.verification_BE = 0x8797E01C, - $.hashfn_native = VHASH32, - $.hashfn_bswap = VHASH32 -); + $.desc = "VHASH low 32 bits, by Ted Krovetz and Wei Dai", + $.hash_flags = + 
FLAG_HASH_AES_BASED | + FLAG_HASH_CRYPTOGRAPHIC, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_ASM | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 32, + $.verification_LE = 0x613E4735, + $.verification_BE = 0x8797E01C, + $.hashfn_native = VHASH32, + $.hashfn_bswap = VHASH32 + ); REGISTER_HASH(VHASH, - $.desc = "VHASH, by Ted Krovetz and Wei Dai", - $.hash_flags = - FLAG_HASH_AES_BASED | - FLAG_HASH_CRYPTOGRAPHIC, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_ASM | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0x7417A00F, - $.verification_BE = 0x81C8B066, - $.hashfn_native = VHASH64, - $.hashfn_bswap = VHASH64 -); + $.desc = "VHASH, by Ted Krovetz and Wei Dai", + $.hash_flags = + FLAG_HASH_AES_BASED | + FLAG_HASH_CRYPTOGRAPHIC, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_ASM | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0x7417A00F, + $.verification_BE = 0x81C8B066, + $.hashfn_native = VHASH64, + $.hashfn_bswap = VHASH64 + ); diff --git a/hashes/wyhash.cpp b/hashes/wyhash.cpp index 110a14ed..b48e8bdf 100644 --- a/hashes/wyhash.cpp +++ b/hashes/wyhash.cpp @@ -41,18 +41,18 @@ //----------------------------------------------------------------------------- // Data reading functions, common to 32- and 64-bit hashes -template < bool bswap > -static inline uint64_t _wyr8(const uint8_t * p) { - return GET_U64(p, 0); +template +static inline uint64_t _wyr8( const uint8_t * p ) { + return GET_U64(p, 0); } -template < bool bswap > -static inline uint64_t _wyr4(const uint8_t * p) { - return GET_U32(p, 0); +template +static inline uint64_t _wyr4( const uint8_t * p ) { + return GET_U32(p, 0); } -static inline uint64_t _wyr3(const uint8_t * p, size_t k) { - return (((uint64_t)p[0])<<16)|(((uint64_t)p[k>>1])<<8)|p[k-1]; +static inline uint64_t _wyr3( const uint8_t * p, size_t k ) { + return (((uint64_t)p[0]) << 16) | 
(((uint64_t)p[k >> 1]) << 8) | p[k - 1]; } //----------------------------------------------------------------------------- @@ -62,216 +62,219 @@ static inline uint64_t _wyr3(const uint8_t * p, size_t k) { // choice of strict. I.e. for a given set of template parameter // choices, this function should always give the same answer // regardless of platform. -static inline uint64_t _wyrot(uint64_t x) { return ROTL64(x, 32); } +static inline uint64_t _wyrot( uint64_t x ) { return ROTL64(x, 32); } // TODO: pass mum32bit template param through _wyhash64 -template < bool mum32bit, bool strict > -static inline void _wymum(uint64_t *A, uint64_t *B){ - if (mum32bit) { - uint64_t hh=(*A>>32)*(*B>>32), hl=(*A>>32)*(uint32_t)*B, lh=(uint32_t)*A*(*B>>32), ll=(uint64_t)(uint32_t)*A*(uint32_t)*B; - if (strict) { - *A^=_wyrot(hl)^hh; *B^=_wyrot(lh)^ll; +template +static inline void _wymum( uint64_t * A, uint64_t * B ) { + if (mum32bit) { + uint64_t hh = (*A >> 32) * (*B >> 32), hl = (*A >> 32) * (uint32_t)*B, + lh = (uint32_t)*A * (*B >> 32), ll = (uint64_t)(uint32_t)*A * (uint32_t)*B; + if (strict) { + *A ^= _wyrot(hl) ^ hh; *B ^= _wyrot(lh) ^ ll; + } else { + *A = _wyrot(hl) ^ hh; *B = _wyrot(lh) ^ ll; + } } else { - *A=_wyrot(hl)^hh; *B=_wyrot(lh)^ll; + uint64_t rlo, rhi; + mult64_128(rlo, rhi, *A, *B); + if (strict) { + *A ^= rlo; *B ^= rhi; + } else { + *A = rlo; *B = rhi; + } } - } else { - uint64_t rlo, rhi; - mult64_128(rlo, rhi, *A, *B); - if (strict) { - *A^=rlo; *B^=rhi; - } else { - *A=rlo; *B=rhi; - } - } } //----------------------------------------------------------------------------- // multiply and xor mix function, aka MUM -template < bool strict > -static inline uint64_t _wymix(uint64_t A, uint64_t B) { - _wymum(&A,&B); - return A^B; +template +static inline uint64_t _wymix( uint64_t A, uint64_t B ) { + _wymum(&A, &B); + return A ^ B; } // wyhash64 main function -template < bool bswap, bool strict > -static inline uint64_t _wyhash64(const void * key, size_t len, 
uint64_t seed, const uint64_t * secrets) { - const uint8_t * p = (const uint8_t *)key; - uint64_t a, b; +template +static inline uint64_t _wyhash64( const void * key, size_t len, uint64_t seed, const uint64_t * secrets ) { + const uint8_t * p = (const uint8_t *)key; + uint64_t a, b; - seed ^= secrets[0]; + seed ^= secrets[0]; - if (likely(len <= 16)) { - if (likely(len >= 4)) { - a = (_wyr4(p) << 32) | _wyr4(p+((len>>3)<<2)); - b = (_wyr4(p+len-4)<<32)| _wyr4(p+len-4-((len>>3)<<2)); - } else if (likely(len>0)) { - a = _wyr3(p,len); - b=0; + if (likely(len <= 16)) { + if (likely(len >= 4)) { + a = (_wyr4(p) << 32) | _wyr4(p + ((len >> 3) << 2)); + b = (_wyr4(p + len - 4) << 32) | _wyr4(p + len - 4 - ((len >> 3) << 2)); + } else if (likely(len > 0)) { + a = _wyr3(p, len); + b = 0; + } else { + a = b = 0; + } } else { - a = b = 0; - } - } else { - size_t i = len; - if (unlikely(i>48)) { - uint64_t see1=seed, see2=seed; - do { - seed=_wymix(_wyr8(p) ^secrets[1], _wyr8(p+8) ^seed); - see1=_wymix(_wyr8(p+16)^secrets[2], _wyr8(p+24)^see1); - see2=_wymix(_wyr8(p+32)^secrets[3], _wyr8(p+40)^see2); - p+=48; i-=48; - } while(likely(i>48)); - seed ^= see1 ^ see2; + size_t i = len; + if (unlikely(i > 48)) { + uint64_t see1 = seed, see2 = seed; + do { + seed = _wymix(_wyr8(p) ^ secrets[1], _wyr8(p + 8) ^ seed); + see1 = _wymix(_wyr8(p + 16) ^ secrets[2], _wyr8(p + 24) ^ see1); + see2 = _wymix(_wyr8(p + 32) ^ secrets[3], _wyr8(p + 40) ^ see2); + p += 48; i -= 48; + } while (likely(i > 48)); + seed ^= see1 ^ see2; + } + while (unlikely(i > 16)) { + seed = _wymix(_wyr8(p) ^ secrets[1], _wyr8(p + 8) ^ seed); + i -= 16; p += 16; + } + a = _wyr8(p + i - 16); + b = _wyr8(p + i - 8); } - while (unlikely(i>16)) { - seed = _wymix(_wyr8(p)^secrets[1], _wyr8(p+8)^seed); - i-=16; p+=16; - } - a=_wyr8(p+i-16); - b=_wyr8(p+i-8); - } - return _wymix(secrets[1]^len, _wymix(a^secrets[1], b^seed)); + return _wymix(secrets[1] ^ len, _wymix(a ^ secrets[1], b ^ seed)); } 
//----------------------------------------------------------------------------- // 32-bit hash function -static inline void _wymix32(uint32_t * A, uint32_t * B) { - uint64_t c; - c = *A ^ 0x53c5ca59; - c *= *B ^ 0x74743c1b; - *A = (uint32_t)c; - *B = (uint32_t)(c >> 32); +static inline void _wymix32( uint32_t * A, uint32_t * B ) { + uint64_t c; + + c = *A ^ 0x53c5ca59; + c *= *B ^ 0x74743c1b; + *A = (uint32_t)c; + *B = (uint32_t)(c >> 32); } -template < bool bswap > -static inline uint32_t _wyhash32(const void * key, uint64_t len, uint32_t seed) { - const uint8_t * p = (const uint8_t *)key; - uint64_t i = len; - uint32_t see1 = (uint32_t)len; +template +static inline uint32_t _wyhash32( const void * key, uint64_t len, uint32_t seed ) { + const uint8_t * p = (const uint8_t *)key; + uint64_t i = len; + uint32_t see1 = (uint32_t )len; - seed ^= (uint32_t)(len>>32); - _wymix32(&seed, &see1); + seed ^= (uint32_t)(len >> 32); + _wymix32(&seed, &see1); - for (;i>8;i-=8,p+=8) { - seed ^= _wyr4(p); - see1 ^= _wyr4(p+4); + for (; i > 8; i -= 8, p += 8) { + seed ^= _wyr4(p ); + see1 ^= _wyr4(p + 4); + _wymix32(&seed, &see1); + } + if (i >= 4) { + seed ^= _wyr4(p ); + see1 ^= _wyr4(p + i - 4); + } else if (i) { + seed ^= _wyr3(p, (size_t)i); + } + _wymix32(&seed, &see1); _wymix32(&seed, &see1); - } - if (i>=4) { - seed ^= _wyr4(p); - see1 ^= _wyr4(p + i - 4); - } else if (i) { - seed ^= _wyr3(p, (size_t)i); - } - _wymix32(&seed, &see1); - _wymix32(&seed, &see1); - return seed ^ see1; + return seed ^ see1; } //----------------------------------------------------------------------------- // the default secret parameters static const uint64_t _wyp[4] = { - UINT64_C(0xa0761d6478bd642f), UINT64_C(0xe7037ed1a0b428db), - UINT64_C(0x8ebc6af09c88c6e3), UINT64_C(0x589965cc75374cc3) + UINT64_C(0xa0761d6478bd642f), UINT64_C(0xe7037ed1a0b428db), + UINT64_C(0x8ebc6af09c88c6e3), UINT64_C(0x589965cc75374cc3) }; //----------------------------------------------------------------------------- 
-template < bool bswap > -static void Wyhash32(const void * in, const size_t len, const seed_t seed, void * out) { - PUT_U32(_wyhash32(in, (uint64_t)len, (uint32_t)seed), (uint8_t *)out, 0); +template +static void Wyhash32( const void * in, const size_t len, const seed_t seed, void * out ) { + PUT_U32(_wyhash32(in, (uint64_t)len, (uint32_t)seed), (uint8_t *)out, 0); } -template < bool bswap, bool strict > -static void Wyhash64(const void * in, const size_t len, const seed_t seed, void * out) { - PUT_U64(_wyhash64(in, len, (uint64_t)seed, _wyp), (uint8_t *)out, 0); +template +static void Wyhash64( const void * in, const size_t len, const seed_t seed, void * out ) { + PUT_U64(_wyhash64(in, len, (uint64_t)seed, _wyp), (uint8_t *)out, 0); } //----------------------------------------------------------------------------- -static bool wyhash64_selftest(void) { - struct { - const uint64_t hash; - const char * key; - } selftests[] = { - { UINT64_C(0x42bc986dc5eec4d3), "" }, - { UINT64_C(0x84508dc903c31551), "a" }, - { UINT64_C(0x0bc54887cfc9ecb1), "abc" }, - { UINT64_C(0x6e2ff3298208a67c), "message digest" }, - { UINT64_C(0x9a64e42e897195b9), "abcdefghijklmnopqrstuvwxyz" }, - { UINT64_C(0x9199383239c32554), "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" }, - { UINT64_C(0x7c1ccf6bba30f5a5), "12345678901234567890123456789012345678901234567890123456789012345678901234567890" }, - }; +static bool wyhash64_selftest( void ) { + struct { + const uint64_t hash; + const char * key; + } selftests[] = { + { UINT64_C (0x42bc986dc5eec4d3), "" } , + { UINT64_C (0x84508dc903c31551), "a" } , + { UINT64_C (0x0bc54887cfc9ecb1), "abc" } , + { UINT64_C (0x6e2ff3298208a67c), "message digest" } , + { UINT64_C (0x9a64e42e897195b9), "abcdefghijklmnopqrstuvwxyz" }, + { UINT64_C (0x9199383239c32554), "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" }, + { + UINT64_C(0x7c1ccf6bba30f5a5), + 
"12345678901234567890123456789012345678901234567890123456789012345678901234567890" + }, + }; - for (int i = 0; i < sizeof(selftests)/sizeof(selftests[0]); i++) { - uint64_t h; - if (isLE()) { - Wyhash64(selftests[i].key, strlen(selftests[i].key), i, &h); - } else { - Wyhash64(selftests[i].key, strlen(selftests[i].key), i, &h); - // h is in little-endian format - h = COND_BSWAP(h, true); - } - if (h != selftests[i].hash) { - printf("Hash %016lx != expected %016lx for string \"%s\"\n", - h, selftests[i].hash, selftests[i].key); - return false; + for (int i = 0; i < sizeof(selftests) / sizeof(selftests[0]); i++) { + uint64_t h; + if (isLE()) { + Wyhash64(selftests[i].key, strlen(selftests[i].key), i, &h); + } else { + Wyhash64(selftests[i].key, strlen(selftests[i].key), i, &h); + // h is in little-endian format + h = COND_BSWAP(h, true); + } + if (h != selftests[i].hash) { + printf("Hash %016lx != expected %016lx for string \"%s\"\n", h, selftests[i].hash, selftests[i].key); + return false; + } } - } - return true; + return true; } - //----------------------------------------------------------------------------- REGISTER_FAMILY(wyhash, - $.src_url = "https://github.com/wangyi-fudan/wyhash", - $.src_status = HashFamilyInfo::SRC_ACTIVE -); + $.src_url = "https://github.com/wangyi-fudan/wyhash", + $.src_status = HashFamilyInfo::SRC_ACTIVE + ); REGISTER_HASH(wyhash_32, - $.desc = "wyhash v3, 32-bit native version", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 32, - $.verification_LE = 0x09DE8066, - $.verification_BE = 0x9D86BAC7, - $.hashfn_native = Wyhash32, - $.hashfn_bswap = Wyhash32, - $.seedfixfn = excludeBadseeds, - $.badseeds = { 0x429dacdd, 0xd637dbf3 } -); + $.desc = "wyhash v3, 32-bit native version", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 32, + $.verification_LE = 0x09DE8066, + 
$.verification_BE = 0x9D86BAC7, + $.hashfn_native = Wyhash32, + $.hashfn_bswap = Wyhash32, + $.seedfixfn = excludeBadseeds, + $.badseeds = { 0x429dacdd, 0xd637dbf3 } + ); REGISTER_HASH(wyhash, - $.desc = "wyhash v3, 64-bit non-strict version", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0x67031D43, - $.verification_BE = 0x912E4607, - $.hashfn_native = Wyhash64, - $.hashfn_bswap = Wyhash64, - $.initfn = wyhash64_selftest, - $.seedfixfn = excludeBadseeds, - $.badseeds = { 0x14cc886e, 0x1bf4ed84, UINT64_C(0x14cc886e14cc886e) } // all seeds with those lower bits ? -); + $.desc = "wyhash v3, 64-bit non-strict version", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0x67031D43, + $.verification_BE = 0x912E4607, + $.hashfn_native = Wyhash64, + $.hashfn_bswap = Wyhash64, + $.initfn = wyhash64_selftest, + $.seedfixfn = excludeBadseeds, + $.badseeds = { 0x14cc886e, 0x1bf4ed84, UINT64_C (0x14cc886e14cc886e) } // all seeds with those lower bits ? 
+ ); REGISTER_HASH(wyhash__strict, - $.desc = "wyhash v3, 64-bit strict version", - $.hash_flags = - 0, - $.impl_flags = - FLAG_IMPL_MULTIPLY_64_128 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, - $.bits = 64, - $.verification_LE = 0xA82DBAD7, - $.verification_BE = 0xDB7957D4, - $.hashfn_native = Wyhash64, - $.hashfn_bswap = Wyhash64 -); + $.desc = "wyhash v3, 64-bit strict version", + $.hash_flags = + 0, + $.impl_flags = + FLAG_IMPL_MULTIPLY_64_128 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_PUBLIC_DOMAIN, + $.bits = 64, + $.verification_LE = 0xA82DBAD7, + $.verification_BE = 0xDB7957D4, + $.hashfn_native = Wyhash64, + $.hashfn_bswap = Wyhash64 + ); diff --git a/hashes/x17.cpp b/hashes/x17.cpp index 68b3e702..82e687c7 100644 --- a/hashes/x17.cpp +++ b/hashes/x17.cpp @@ -28,37 +28,38 @@ #include "Hashlib.h" //------------------------------------------------------------ -static uint32_t x17_impl(const uint8_t * data, size_t len, uint32_t h) { - for(size_t i = 0; i < len; ++i) { +static uint32_t x17_impl( const uint8_t * data, size_t len, uint32_t h ) { + for (size_t i = 0; i < len; ++i) { h = 17 * h + (data[i] - ' '); } return h ^ (h >> 16); } //------------------------------------------------------------ -template < bool bswap > -static void x17(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void x17( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = x17_impl((const uint8_t *)in, len, (uint32_t)seed); + PUT_U32(h, (uint8_t *)out, 0); } //------------------------------------------------------------ REGISTER_FAMILY(x17, - $.src_url = "https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp", - $.src_status = HashFamilyInfo::SRC_FROZEN -); + $.src_url = "https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp", + $.src_status = HashFamilyInfo::SRC_FROZEN + ); REGISTER_HASH(x17, - $.desc = "x17", - $.hash_flags = - FLAG_HASH_SMALL_SEED, - $.impl_flags = - FLAG_IMPL_SLOW | 
- FLAG_IMPL_MULTIPLY | - FLAG_IMPL_LICENSE_MIT, - $.bits = 32, - $.verification_LE = 0x8128E14C, - $.verification_BE = 0x9AD0FE22, - $.hashfn_native = x17, - $.hashfn_bswap = x17 -); + $.desc = "x17", + $.hash_flags = + FLAG_HASH_SMALL_SEED, + $.impl_flags = + FLAG_IMPL_SLOW | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_LICENSE_MIT, + $.bits = 32, + $.verification_LE = 0x8128E14C, + $.verification_BE = 0x9AD0FE22, + $.hashfn_native = x17, + $.hashfn_bswap = x17 + ); diff --git a/hashes/xxhash.cpp b/hashes/xxhash.cpp index cc515ec7..6aa2496e 100644 --- a/hashes/xxhash.cpp +++ b/hashes/xxhash.cpp @@ -34,13 +34,13 @@ #include "Mathmult.h" -//#define FORCE_SCALAR +// #define FORCE_SCALAR //------------------------------------------------------------ #define XXH_VERSION_MAJOR 0 #define XXH_VERSION_MINOR 8 #define XXH_VERSION_RELEASE 1 -#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 + XXH_VERSION_RELEASE) // Used to prevent unwanted optimizations for var. // @@ -56,26 +56,26 @@ // XXH3_initCustomSecret_scalar(). 
#if defined(HAVE_X86_64_ASM) || defined(HAVE_ARM_ASM) || \ defined(HAVE_ARM64_ASM) || defined(HAVE_PPC_ASM) -#define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var)) + #define XXH_COMPILER_GUARD(var) __asm__ __volatile__ ("" : "+r" (var)) #else -#define XXH_COMPILER_GUARD(var) ((void)var) + #define XXH_COMPILER_GUARD(var) ((void)var) #endif //------------------------------------------------------------ // XXH32 family -- functions used in the classic 32-bit xxHash algorithm // #define instead of static const, to be used as initializers -#define XXH_PRIME32_1 0x9E3779B1 // 0b10011110001101110111100110110001 -#define XXH_PRIME32_2 0x85EBCA77 // 0b10000101111010111100101001110111 -#define XXH_PRIME32_3 0xC2B2AE3D // 0b11000010101100101010111000111101 -#define XXH_PRIME32_4 0x27D4EB2F // 0b00100111110101001110101100101111 -#define XXH_PRIME32_5 0x165667B1 // 0b00010110010101100110011110110001 +#define XXH_PRIME32_1 0x9E3779B1 // 0b10011110001101110111100110110001 +#define XXH_PRIME32_2 0x85EBCA77 // 0b10000101111010111100101001110111 +#define XXH_PRIME32_3 0xC2B2AE3D // 0b11000010101100101010111000111101 +#define XXH_PRIME32_4 0x27D4EB2F // 0b00100111110101001110101100101111 +#define XXH_PRIME32_5 0x165667B1 // 0b00010110010101100110011110110001 // Mixes all bits to finalize the hash. // The final mix ensures that all input bits have a chance to impact // any bit in the output digest, resulting in an unbiased // distribution. -static uint32_t XXH32_avalanche(uint32_t hash) { +static uint32_t XXH32_avalanche( uint32_t hash ) { hash ^= hash >> 15; hash *= XXH_PRIME32_2; hash ^= hash >> 13; @@ -88,17 +88,17 @@ static uint32_t XXH32_avalanche(uint32_t hash) { // There may be up to 15 bytes remaining to consume from the input. // This final stage will digest them to ensure that all input bytes // are present in the final mix. 
-template < bool bswap > -static uint32_t XXH32_finalize(uint32_t hash, const uint8_t * ptr, size_t len) { +template +static uint32_t XXH32_finalize( uint32_t hash, const uint8_t * ptr, size_t len ) { while (len >= 4) { hash += GET_U32(ptr, 0) * XXH_PRIME32_3; - ptr += 4; - hash = ROTL32(hash, 17) * XXH_PRIME32_4; - len -= 4; + ptr += 4; + hash = ROTL32(hash, 17) * XXH_PRIME32_4; + len -= 4; } while (len > 0) { hash += (*ptr++) * XXH_PRIME32_5; - hash = ROTL32(hash, 11) * XXH_PRIME32_1; + hash = ROTL32(hash, 11) * XXH_PRIME32_1; --len; } return XXH32_avalanche(hash); @@ -138,7 +138,7 @@ static uint32_t XXH32_finalize(uint32_t hash, const uint8_t * ptr, size_t len) { // This is also enabled on AArch64, as Clang autovectorizes it incorrectly // and it is pointless writing a NEON implementation that is basically the // same speed as scalar for XXH32. -static uint32_t XXH32_round(uint32_t acc, uint32_t input) { +static uint32_t XXH32_round( uint32_t acc, uint32_t input ) { acc += input * XXH_PRIME32_2; acc = ROTL32(acc, 13); acc *= XXH_PRIME32_1; @@ -148,12 +148,12 @@ static uint32_t XXH32_round(uint32_t acc, uint32_t input) { return acc; } -template < bool bswap > -static uint32_t XXH32_impl(const uint8_t * input, size_t len, uint32_t seed) { +template +static uint32_t XXH32_impl( const uint8_t * input, size_t len, uint32_t seed ) { uint32_t h32; - if (len>=16) { - const uint8_t * const bEnd = input + len; + if (len >= 16) { + const uint8_t * const bEnd = input + len; const uint8_t * const limit = bEnd - 15; uint32_t v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; uint32_t v2 = seed + XXH_PRIME32_2; @@ -161,21 +161,21 @@ static uint32_t XXH32_impl(const uint8_t * input, size_t len, uint32_t seed) { uint32_t v4 = seed - XXH_PRIME32_1; do { - v1 = XXH32_round(v1, GET_U32(input, 0)); - v2 = XXH32_round(v2, GET_U32(input, 4)); - v3 = XXH32_round(v3, GET_U32(input, 8)); - v4 = XXH32_round(v4, GET_U32(input, 12)); + v1 = XXH32_round(v1, GET_U32(input, 0)); + v2 = XXH32_round(v2, 
GET_U32(input, 4)); + v3 = XXH32_round(v3, GET_U32(input, 8)); + v4 = XXH32_round(v4, GET_U32(input, 12)); input += 16; } while (input < limit); h32 = ROTL32(v1, 1) + ROTL32(v2, 7) + ROTL32(v3, 12) + ROTL32(v4, 18); } else { - h32 = seed + XXH_PRIME32_5; + h32 = seed + XXH_PRIME32_5; } h32 += (uint32_t)len; - return XXH32_finalize(h32, input, len&15); + return XXH32_finalize(h32, input, len & 15); } //------------------------------------------------------------ @@ -193,21 +193,21 @@ static uint32_t XXH32_impl(const uint8_t * input, size_t len, uint32_t seed) { // 0b0010011111010100111010110010111100010110010101100110011111000101 #define XXH_PRIME64_5 UINT64_C(0x27D4EB2F165667C5) -static uint64_t XXH64_round(uint64_t acc, uint64_t input) { +static uint64_t XXH64_round( uint64_t acc, uint64_t input ) { acc += input * XXH_PRIME64_2; acc = ROTL64(acc, 31); acc *= XXH_PRIME64_1; return acc; } -static uint64_t XXH64_mergeRound(uint64_t acc, uint64_t val) { +static uint64_t XXH64_mergeRound( uint64_t acc, uint64_t val ) { val = XXH64_round(0, val); acc ^= val; acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; return acc; } -static uint64_t XXH64_avalanche(uint64_t hash) { +static uint64_t XXH64_avalanche( uint64_t hash ) { hash ^= hash >> 33; hash *= XXH_PRIME64_2; hash ^= hash >> 29; @@ -220,35 +220,35 @@ static uint64_t XXH64_avalanche(uint64_t hash) { // There may be up to 31 bytes remaining to consume from the input. // This final stage will digest them to ensure that all input bytes // are present in the final mix. 
-template < bool bswap > -static uint64_t XXH64_finalize(uint64_t hash, const uint8_t * ptr, size_t len) { +template +static uint64_t XXH64_finalize( uint64_t hash, const uint8_t * ptr, size_t len ) { while (len >= 8) { uint64_t const k1 = XXH64_round(0, GET_U64(ptr, 0)); - ptr += 8; + ptr += 8; hash ^= k1; - hash = ROTL64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4; - len -= 8; + hash = ROTL64(hash, 27) * XXH_PRIME64_1 + XXH_PRIME64_4; + len -= 8; } if (len >= 4) { hash ^= (uint64_t)(GET_U32(ptr, 0)) * XXH_PRIME64_1; - ptr += 4; - hash = ROTL64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; - len -= 4; + ptr += 4; + hash = ROTL64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; + len -= 4; } while (len > 0) { hash ^= (*ptr++) * XXH_PRIME64_5; - hash = ROTL64(hash, 11) * XXH_PRIME64_1; + hash = ROTL64(hash, 11) * XXH_PRIME64_1; --len; } - return XXH64_avalanche(hash); + return XXH64_avalanche(hash); } -template < bool bswap > -static uint64_t XXH64_impl(const uint8_t * input, size_t len, uint64_t seed) { +template +static uint64_t XXH64_impl( const uint8_t * input, size_t len, uint64_t seed ) { uint64_t h64; - if (len>=32) { - const uint8_t * const bEnd = input + len; + if (len >= 32) { + const uint8_t * const bEnd = input + len; const uint8_t * const limit = bEnd - 31; uint64_t v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; uint64_t v2 = seed + XXH_PRIME64_2; @@ -256,12 +256,12 @@ static uint64_t XXH64_impl(const uint8_t * input, size_t len, uint64_t seed) { uint64_t v4 = seed - XXH_PRIME64_1; do { - v1 = XXH64_round(v1, GET_U64(input, 0)); - v2 = XXH64_round(v2, GET_U64(input, 8)); - v3 = XXH64_round(v3, GET_U64(input, 16)); - v4 = XXH64_round(v4, GET_U64(input, 24)); + v1 = XXH64_round(v1, GET_U64(input, 0)); + v2 = XXH64_round(v2, GET_U64(input, 8)); + v3 = XXH64_round(v3, GET_U64(input, 16)); + v4 = XXH64_round(v4, GET_U64(input, 24)); input += 32; - } while (input(h64, input, len&31); + return XXH64_finalize(h64, input, len & 31); } 
//------------------------------------------------------------ @@ -412,59 +412,62 @@ alignas(64) static const uint8_t XXH3_kSecret[XXH3_SECRET_DEFAULT_SIZE] = { * -O2, but the other one we can't control without "failed to inline always * inline function due to target mismatch" warnings. */ -# if defined(__GNUC__) && !defined(__clang__) && /* GCC, not Clang */ \ + #if defined(__GNUC__) && !defined(__clang__) && /* GCC, not Clang */ \ defined(__OPTIMIZE__) -# define XXH3_POP_PRAGMA -# pragma GCC push_options -# pragma GCC optimize("-O2") -# endif + #define XXH3_POP_PRAGMA + #pragma GCC push_options + #pragma GCC optimize("-O2") + #endif #endif //------------------------------------------------------------ typedef struct { - uint64_t low64; // value & 0xFFFFFFFFFFFFFFFF - uint64_t high64; // value >> 64 + uint64_t low64; // value & 0xFFFFFFFFFFFFFFFF + uint64_t high64; // value >> 64 } XXH128_hash_t; -static inline uint64_t XXH_mult32to64(uint32_t lhs, uint32_t rhs) { +static inline uint64_t XXH_mult32to64( uint32_t lhs, uint32_t rhs ) { uint64_t r64; + mult32_64(r64, lhs, rhs); return r64; } -static inline XXH128_hash_t XXH_mult64to128(uint64_t lhs, uint64_t rhs) { +static inline XXH128_hash_t XXH_mult64to128( uint64_t lhs, uint64_t rhs ) { XXH128_hash_t r128; + mult64_128(r128.low64, r128.high64, lhs, rhs); return r128; } -static uint64_t XXH3_mul128_fold64(uint64_t lhs, uint64_t rhs) { +static uint64_t XXH3_mul128_fold64( uint64_t lhs, uint64_t rhs ) { XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; } // Seems to produce slightly better code on GCC for some reason. 
-static FORCE_INLINE uint64_t XXH_xorshift64(uint64_t v64, const int shift) { - //static_assert(0 <= shift && shift < 64, "valid shift value"); +static FORCE_INLINE uint64_t XXH_xorshift64( uint64_t v64, const int shift ) { + // static_assert(0 <= shift && shift < 64, "valid shift value"); return v64 ^ (v64 >> shift); } // This is a fast avalanche stage, suitable when input bits are // already partially mixed. -static uint64_t XXH3_avalanche(uint64_t h64) { - h64 = XXH_xorshift64(h64, 37); +static uint64_t XXH3_avalanche( uint64_t h64 ) { + h64 = XXH_xorshift64(h64, 37); h64 *= UINT64_C(0x165667919E3779F9); - h64 = XXH_xorshift64(h64, 32); + h64 = XXH_xorshift64(h64, 32); return h64; } // This is a stronger avalanche, inspired by Pelle Evensen's rrmxmx. // preferable when input has not been previously mixed. -static uint64_t XXH3_rrmxmx(uint64_t h64, uint64_t len) { +static uint64_t XXH3_rrmxmx( uint64_t h64, uint64_t len ) { /* this mix is inspired by Pelle Evensen's rrmxmx */ h64 ^= ROTL64(h64, 49) ^ ROTL64(h64, 24); h64 *= UINT64_C(0x9FB21C651E98DF25); - h64 ^= (h64 >> 35) + len ; + h64 ^= (h64 >> 35) + len; h64 *= UINT64_C(0x9FB21C651E98DF25); return XXH_xorshift64(h64, 28); } @@ -502,50 +505,56 @@ static uint64_t XXH3_rrmxmx(uint64_t h64, uint64_t len) { // // This adds an extra layer of strength for custom secrets. 
-template < bool bswap > -static FORCE_INLINE uint64_t XXH3_len_1to3_64b(const uint8_t * input, size_t len, const uint8_t * secret, uint64_t seed) { +template +static FORCE_INLINE uint64_t XXH3_len_1to3_64b( const uint8_t * input, + size_t len, const uint8_t * secret, uint64_t seed ) { // len = 1: combined = { input[0], 0x01, input[0], input[0] } // len = 2: combined = { input[1], 0x02, input[0], input[1] } // len = 3: combined = { input[2], 0x03, input[0], input[1] } - uint8_t const c1 = input[0]; - uint8_t const c2 = input[len >> 1]; - uint8_t const c3 = input[len - 1]; - uint32_t const combined = ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24) | - ((uint32_t)c3 << 0) | ((uint32_t)len << 8); - uint64_t const bitflip = (GET_U32(secret,0) ^ GET_U32(secret,4)) + seed; - uint64_t const keyed = (uint64_t)combined ^ bitflip; + uint8_t const c1 = input[0]; + uint8_t const c2 = input[len >> 1]; + uint8_t const c3 = input[len - 1]; + uint32_t const combined = ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24) | + ((uint32_t)c3 << 0) | ((uint32_t)len << 8); + uint64_t const bitflip = (GET_U32(secret, 0) ^ GET_U32(secret, 4)) + seed; + uint64_t const keyed = (uint64_t)combined ^ bitflip; + return XXH64_avalanche(keyed); } -template < bool bswap > -static FORCE_INLINE uint64_t XXH3_len_4to8_64b(const uint8_t * input, size_t len, const uint8_t * secret, uint64_t seed) { +template +static FORCE_INLINE uint64_t XXH3_len_4to8_64b( const uint8_t * input, + size_t len, const uint8_t * secret, uint64_t seed ) { seed ^= (uint64_t)BSWAP((uint32_t)seed) << 32; - uint32_t const input1 = GET_U32(input, 0); - uint32_t const input2 = GET_U32(input, len - 4); + uint32_t const input1 = GET_U32(input, 0 ); + uint32_t const input2 = GET_U32(input, len - 4); uint64_t const input64 = input2 + (((uint64_t)input1) << 32); - uint64_t const bitflip = (GET_U64(secret, 8) ^ GET_U64(secret,16)) - seed; - uint64_t const keyed = input64 ^ bitflip; + uint64_t const bitflip = (GET_U64(secret, 8) ^ GET_U64(secret, 
16)) - seed; + uint64_t const keyed = input64 ^ bitflip; return XXH3_rrmxmx(keyed, len); } -template < bool bswap > -static FORCE_INLINE uint64_t XXH3_len_9to16_64b(const uint8_t * input, size_t len, const uint8_t * secret, uint64_t seed) { - uint64_t const bitflip1 = (GET_U64(secret,24) ^ GET_U64(secret,32)) + seed; - uint64_t const bitflip2 = (GET_U64(secret,40) ^ GET_U64(secret,48)) - seed; - uint64_t const input_lo = GET_U64(input, 0) ^ bitflip1; +template +static FORCE_INLINE uint64_t XXH3_len_9to16_64b( const uint8_t * input, + size_t len, const uint8_t * secret, uint64_t seed ) { + uint64_t const bitflip1 = (GET_U64(secret, 24) ^ GET_U64(secret, 32)) + seed; + uint64_t const bitflip2 = (GET_U64(secret, 40) ^ GET_U64(secret, 48)) - seed; + uint64_t const input_lo = GET_U64(input, 0 ) ^ bitflip1; uint64_t const input_hi = GET_U64(input, len - 8) ^ bitflip2; - uint64_t const acc = len + input_hi + BSWAP(input_lo) + - XXH3_mul128_fold64(input_lo, input_hi); + uint64_t const acc = len + input_hi + BSWAP(input_lo) + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); } -template < bool bswap > -static FORCE_INLINE uint64_t XXH3_len_0to16_64b(const uint8_t * input, size_t len, const uint8_t * secret, uint64_t seed) { - if (likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); - if (likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); - if (len) return XXH3_len_1to3_64b(input, len, secret, seed); - return XXH64_avalanche(seed ^ GET_U64(secret,56) ^ - GET_U64(secret,64)); +template +static FORCE_INLINE uint64_t XXH3_len_0to16_64b( const uint8_t * input, + size_t len, const uint8_t * secret, uint64_t seed ) { + if (likely(len > 8)) { return XXH3_len_9to16_64b(input, len, secret, seed); } + if (likely(len >= 4)) { return XXH3_len_4to8_64b(input, len, secret, seed); } + if (len) { return XXH3_len_1to3_64b(input, len, secret, seed); } + return XXH64_avalanche(seed ^ GET_U64(secret, 56) ^ + GET_U64(secret, 64)); } 
//------------------------------------------------------------ @@ -592,41 +601,38 @@ static FORCE_INLINE uint64_t XXH3_len_0to16_64b(const uint8_t * input, size_t le #define XXH3_MIDSIZE_MAX 240 -template < bool bswap > -static FORCE_INLINE uint64_t XXH3_mix16B(const uint8_t * RESTRICT input, - const uint8_t * RESTRICT secret, uint64_t seed64) { -#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ +template +static FORCE_INLINE uint64_t XXH3_mix16B( const uint8_t * RESTRICT input, + const uint8_t * RESTRICT secret, uint64_t seed64 ) { +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ XXH_COMPILER_GUARD(seed64); #endif uint64_t const input_lo = GET_U64(input, 0); uint64_t const input_hi = GET_U64(input, 8); - return XXH3_mul128_fold64( - input_lo ^ (GET_U64(secret, 0) + seed64), + return XXH3_mul128_fold64(input_lo ^ (GET_U64(secret, 0) + seed64), input_hi ^ (GET_U64(secret, 8) - seed64)); } -template < bool bswap > -static FORCE_INLINE uint64_t XXH3_len_17to128_64b( - const uint8_t * RESTRICT input, size_t len, - const uint8_t * RESTRICT secret, size_t secretSize, - uint64_t seed) { +template +static FORCE_INLINE uint64_t XXH3_len_17to128_64b( const uint8_t * RESTRICT input, size_t len, + const uint8_t * RESTRICT secret, size_t secretSize, uint64_t seed ) { uint64_t acc = len * XXH_PRIME64_1; if (len > 32) { if (len > 64) { if (len > 96) { - acc += XXH3_mix16B(input+48, secret+96, seed); - acc += XXH3_mix16B(input+len-64, secret+112, seed); + acc += XXH3_mix16B(input + 48 , secret + 96, seed); + acc += XXH3_mix16B(input + len - 64, secret + 112, seed); } - acc += XXH3_mix16B(input+32, secret+64, seed); - acc += XXH3_mix16B(input+len-48, secret+80, seed); + acc += XXH3_mix16B(input + 32 , secret + 64, seed); + acc += XXH3_mix16B(input + len - 48, secret + 80, seed); } - acc += XXH3_mix16B(input+16, secret+32, 
seed); - acc += XXH3_mix16B(input+len-32, secret+48, seed); + acc += XXH3_mix16B(input + 16 , secret + 32, seed); + acc += XXH3_mix16B(input + len - 32, secret + 48, seed); } - acc += XXH3_mix16B(input+0, secret+0, seed); - acc += XXH3_mix16B(input+len-16, secret+16, seed); + acc += XXH3_mix16B(input + 0 , secret + 0, seed); + acc += XXH3_mix16B(input + len - 16, secret + 16, seed); return XXH3_avalanche(acc); } @@ -649,27 +655,25 @@ static FORCE_INLINE uint64_t XXH3_len_17to128_64b( // This loop is the easiest to fix, as unlike XXH32, this pragma // _actually works_ because it is a loop vectorization instead of an // SLP vectorization. -template < bool bswap > -static NEVER_INLINE uint64_t XXH3_len_129to240_64b( - const uint8_t * RESTRICT input, size_t len, - const uint8_t * RESTRICT secret, size_t secretSize, - uint64_t seed) { - #define XXH3_MIDSIZE_STARTOFFSET 3 - #define XXH3_MIDSIZE_LASTOFFSET 17 +template +static NEVER_INLINE uint64_t XXH3_len_129to240_64b( const uint8_t * RESTRICT input, size_t len, + const uint8_t * RESTRICT secret, size_t secretSize, uint64_t seed ) { +#define XXH3_MIDSIZE_STARTOFFSET 3 +#define XXH3_MIDSIZE_LASTOFFSET 17 - uint64_t acc = len * XXH_PRIME64_1; + uint64_t acc = len * XXH_PRIME64_1; int const nbRounds = (int)len / 16; for (int i = 0; i < 8; i++) { - acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); + acc += XXH3_mix16B(input + (16 * i), secret + (16 * i), seed); } acc = XXH3_avalanche(acc); #if defined(__clang__) && (defined(__ARM_NEON) || defined(__ARM_NEON__)) -# pragma clang loop vectorize(disable) + #pragma clang loop vectorize(disable) #endif - for (int i = 8 ; i < nbRounds; i++) { - acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + for (int i = 8; i < nbRounds; i++) { + acc += XXH3_mix16B(input + (16 * i), secret + (16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET, seed); } /* last bytes */ acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, 
seed); @@ -695,42 +699,43 @@ static NEVER_INLINE uint64_t XXH3_len_129to240_64b( // XXH64). // A doubled version of 1to3_64b with different constants. -template < bool bswap > -static FORCE_INLINE XXH128_hash_t XXH3_len_1to3_128b(const uint8_t* input, - size_t len, const uint8_t* secret, uint64_t seed) { +template +static FORCE_INLINE XXH128_hash_t XXH3_len_1to3_128b( const uint8_t * input, + size_t len, const uint8_t * secret, uint64_t seed ) { /* * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } */ - uint8_t const c1 = input[0]; - uint8_t const c2 = input[len >> 1]; - uint8_t const c3 = input[len - 1]; - uint32_t const combinedl = ((uint32_t)c1 <<16) | ((uint32_t)c2 << 24) - | ((uint32_t)c3 << 0) | ((uint32_t)len << 8); + uint8_t const c1 = input[0]; + uint8_t const c2 = input[len >> 1]; + uint8_t const c3 = input[len - 1]; + uint32_t const combinedl = ((uint32_t)c1 << 16) | ((uint32_t)c2 << 24) | + ((uint32_t)c3 << 0) | ((uint32_t)len << 8); uint32_t const combinedh = ROTL32(BSWAP(combinedl), 13); - uint64_t const bitflipl = (GET_U32(secret,0) ^ GET_U32(secret, 4)) + seed; - uint64_t const bitfliph = (GET_U32(secret,8) ^ GET_U32(secret,12)) - seed; - uint64_t const keyed_lo = (uint64_t)combinedl ^ bitflipl; - uint64_t const keyed_hi = (uint64_t)combinedh ^ bitfliph; - XXH128_hash_t h128 = { XXH64_avalanche(keyed_lo), XXH64_avalanche(keyed_hi) }; + uint64_t const bitflipl = (GET_U32(secret, 0) ^ GET_U32(secret, 4)) + seed; + uint64_t const bitfliph = (GET_U32(secret, 8) ^ GET_U32(secret, 12)) - seed; + uint64_t const keyed_lo = (uint64_t)combinedl ^ bitflipl; + uint64_t const keyed_hi = (uint64_t)combinedh ^ bitfliph; + XXH128_hash_t h128 = { XXH64_avalanche(keyed_lo), XXH64_avalanche(keyed_hi) }; + return h128; } -template < bool bswap > -static FORCE_INLINE XXH128_hash_t XXH3_len_4to8_128b(const uint8_t* input, - size_t len, 
const uint8_t* secret, uint64_t seed) { +template +static FORCE_INLINE XXH128_hash_t XXH3_len_4to8_128b( const uint8_t * input, + size_t len, const uint8_t * secret, uint64_t seed ) { seed ^= (uint64_t)BSWAP((uint32_t)seed) << 32; - uint32_t const input_lo = GET_U32(input, 0); + uint32_t const input_lo = GET_U32(input, 0 ); uint32_t const input_hi = GET_U32(input, len - 4); uint64_t const input_64 = input_lo + ((uint64_t)input_hi << 32); - uint64_t const bitflip = (GET_U64(secret,16) ^ GET_U64(secret,24)) + seed; - uint64_t const keyed = input_64 ^ bitflip; + uint64_t const bitflip = (GET_U64(secret, 16) ^ GET_U64(secret, 24)) + seed; + uint64_t const keyed = input_64 ^ bitflip; /* Shift len to the left to ensure it is even, this avoids even multiplies. */ XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); - m128.high64 += (m128.low64 << 1); + m128.high64 += (m128.low64 << 1); m128.low64 ^= (m128.high64 >> 3); m128.low64 = XXH_xorshift64(m128.low64, 35); @@ -740,14 +745,15 @@ static FORCE_INLINE XXH128_hash_t XXH3_len_4to8_128b(const uint8_t* input, return m128; } -template < bool bswap > -static FORCE_INLINE XXH128_hash_t XXH3_len_9to16_128b(const uint8_t* input, - size_t len, const uint8_t* secret, uint64_t seed) { - uint64_t const bitflipl = (GET_U64(secret,32) ^ GET_U64(secret,40)) - seed; - uint64_t const bitfliph = (GET_U64(secret,48) ^ GET_U64(secret,56)) + seed; - uint64_t const input_lo = GET_U64(input, 0); +template +static FORCE_INLINE XXH128_hash_t XXH3_len_9to16_128b( const uint8_t * input, + size_t len, const uint8_t * secret, uint64_t seed ) { + uint64_t const bitflipl = (GET_U64(secret, 32) ^ GET_U64(secret, 40)) - seed; + uint64_t const bitfliph = (GET_U64(secret, 48) ^ GET_U64(secret, 56)) + seed; + uint64_t const input_lo = GET_U64(input, 0 ); uint64_t input_hi = GET_U64(input, len - 8); - XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ 
input_hi ^ bitflipl, XXH_PRIME64_1); + /* * Put len in the middle of m128 to ensure that the length gets mixed to * both the low and high bits in the 128x64 multiply below. @@ -798,28 +804,28 @@ static FORCE_INLINE XXH128_hash_t XXH3_len_9to16_128b(const uint8_t* input, m128.high64 += input_hi + XXH_mult32to64((uint32_t)input_hi, XXH_PRIME32_2 - 1); #endif /* m128 ^= XXH_swap64(m128 >> 64); */ - m128.low64 ^= BSWAP(m128.high64); + m128.low64 ^= BSWAP(m128.high64); /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); h128.high64 += m128.high64 * XXH_PRIME64_2; - h128.low64 = XXH3_avalanche(h128.low64); + h128.low64 = XXH3_avalanche(h128.low64 ); h128.high64 = XXH3_avalanche(h128.high64); return h128; } // Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN -template < bool bswap > -static FORCE_INLINE XXH128_hash_t XXH3_len_0to16_128b(const uint8_t* input, - size_t len, const uint8_t* secret, uint64_t seed) { - if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); - if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); - if (len) return XXH3_len_1to3_128b(input, len, secret, seed); - - uint64_t const bitflipl = GET_U64(secret,64) ^ GET_U64(secret,72); - uint64_t const bitfliph = GET_U64(secret,80) ^ GET_U64(secret,88); - XXH128_hash_t h128 = { XXH64_avalanche(seed ^ bitflipl), XXH64_avalanche( seed ^ bitfliph) }; +template +static FORCE_INLINE XXH128_hash_t XXH3_len_0to16_128b( const uint8_t * input, + size_t len, const uint8_t * secret, uint64_t seed ) { + if (len > 8) { return XXH3_len_9to16_128b(input, len, secret, seed); } + if (len >= 4) { return XXH3_len_4to8_128b(input, len, secret, seed); } + if (len) { return XXH3_len_1to3_128b(input, len, secret, seed); } + + uint64_t const bitflipl = GET_U64(secret, 64) ^ GET_U64(secret, 72); + uint64_t const bitfliph = GET_U64(secret, 80) ^ GET_U64(secret, 88); + XXH128_hash_t h128 = { XXH64_avalanche(seed ^ bitflipl), 
XXH64_avalanche(seed ^ bitfliph) }; return h128; } @@ -827,83 +833,70 @@ static FORCE_INLINE XXH128_hash_t XXH3_len_0to16_128b(const uint8_t* input, // XXH3-128 mid-range keys // A bit slower than XXH3_mix16B, but handles multiply by zero better. -template < bool bswap > -static FORCE_INLINE XXH128_hash_t XXH128_mix32B(XXH128_hash_t acc, - const uint8_t* input_1, const uint8_t* input_2, - const uint8_t* secret, uint64_t seed) { - acc.low64 += XXH3_mix16B(input_1, secret+0, seed); +template +static FORCE_INLINE XXH128_hash_t XXH128_mix32B( XXH128_hash_t acc, const uint8_t * input_1, + const uint8_t * input_2, const uint8_t * secret, uint64_t seed ) { + acc.low64 += XXH3_mix16B(input_1, secret + 0, seed); acc.low64 ^= GET_U64(input_2, 0) + GET_U64(input_2, 8); - acc.high64 += XXH3_mix16B(input_2, secret+16, seed); + acc.high64 += XXH3_mix16B(input_2, secret + 16, seed); acc.high64 ^= GET_U64(input_1, 0) + GET_U64(input_1, 8); return acc; } -template < bool bswap > -static FORCE_INLINE XXH128_hash_t XXH3_len_17to128_128b( - const uint8_t* RESTRICT input, size_t len, - const uint8_t* RESTRICT secret, size_t secretSize, uint64_t seed) { +template +static FORCE_INLINE XXH128_hash_t XXH3_len_17to128_128b( const uint8_t * RESTRICT input, size_t len, + const uint8_t * RESTRICT secret, size_t secretSize, uint64_t seed ) { XXH128_hash_t acc = { len * XXH_PRIME64_1, acc.high64 = 0 }; if (len > 32) { if (len > 64) { if (len > 96) { - acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + acc = XXH128_mix32B(acc, input + 48, input + len - 64, secret + 96, seed); } - acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + acc = XXH128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed); } - acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + acc = XXH128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed); } - acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); + acc = XXH128_mix32B(acc, input, input + len - 
16, secret, seed); XXH128_hash_t h128; h128.low64 = acc.low64 + acc.high64; - h128.high64 = (acc.low64 * XXH_PRIME64_1) + - (acc.high64 * XXH_PRIME64_4) + - ((len - seed) * XXH_PRIME64_2) ; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ( (len - seed ) * XXH_PRIME64_2); h128.low64 = XXH3_avalanche(h128.low64); h128.high64 = (uint64_t)0 - XXH3_avalanche(h128.high64); return h128; } -template < bool bswap > -static NEVER_INLINE XXH128_hash_t XXH3_len_129to240_128b( - const uint8_t* RESTRICT input, size_t len, - const uint8_t* RESTRICT secret, size_t secretSize, uint64_t seed) { +template +static NEVER_INLINE XXH128_hash_t XXH3_len_129to240_128b( const uint8_t * RESTRICT input, size_t len, + const uint8_t * RESTRICT secret, size_t secretSize, uint64_t seed ) { XXH128_hash_t acc; - int const nbRounds = (int)len / 32; + int const nbRounds = (int)len / 32; - acc.low64 = len * XXH_PRIME64_1; + acc.low64 = len * XXH_PRIME64_1; acc.high64 = 0; for (int i = 0; i < 4; i++) { - acc = XXH128_mix32B(acc, - input + (32 * i), - input + (32 * i) + 16, - secret + (32 * i), - seed); + acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16, secret + (32 * i), seed); } - acc.low64 = XXH3_avalanche(acc.low64); + acc.low64 = XXH3_avalanche(acc.low64 ); acc.high64 = XXH3_avalanche(acc.high64); for (int i = 4; i < nbRounds; i++) { - acc = XXH128_mix32B(acc, - input + (32 * i), - input + (32 * i) + 16, - secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), - seed); + acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), seed); } /* last bytes */ - acc = XXH128_mix32B(acc, - input + len - 16, - input + len - 32, - secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, - UINT64_C(0) - seed); + acc = XXH128_mix32B(acc, input + len - 16, input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, UINT64_C(0) - seed); XXH128_hash_t h128; h128.low64 = acc.low64 
+ acc.high64; - h128.high64 = (acc.low64 * XXH_PRIME64_1) + - (acc.high64 * XXH_PRIME64_4) + - ((len - seed) * XXH_PRIME64_2) ; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ( (len - seed ) * XXH_PRIME64_2); h128.low64 = XXH3_avalanche(h128.low64); h128.high64 = (uint64_t)0 - XXH3_avalanche(h128.high64); return h128; @@ -933,26 +926,27 @@ static NEVER_INLINE XXH128_hash_t XXH3_len_129to240_128b( // // This doesn't matter on 64-bit hashes since they all get merged // together in the end, so we skip the extra step. -template < bool bswap > -static FORCE_INLINE void XXH3_scalarRound(void * RESTRICT acc, - void const * RESTRICT input, void const * RESTRICT secret, size_t lane) { - uint64_t * xacc = (uint64_t*) acc; - uint8_t const * xinput = (uint8_t const*) input; - uint8_t const * xsecret = (uint8_t const*) secret; - uint64_t const data_val = GET_U64(xinput, lane * 8); - uint64_t const data_key = data_val ^ GET_U64(xsecret, lane * 8); +template +static FORCE_INLINE void XXH3_scalarRound( void * RESTRICT acc, void const * RESTRICT input, + void const * RESTRICT secret, size_t lane ) { + uint64_t * xacc = (uint64_t * )acc; + uint8_t const * xinput = (uint8_t const *)input; + uint8_t const * xsecret = (uint8_t const *)secret; + uint64_t const data_val = GET_U64 (xinput, lane * 8); + uint64_t const data_key = data_val ^ GET_U64(xsecret, lane * 8); + xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ - xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); } -template < bool bswap > -static FORCE_INLINE void XXH3_accumulate_512_scalar(void * RESTRICT acc, - const void * RESTRICT input, const void * RESTRICT secret) { +template +static FORCE_INLINE void XXH3_accumulate_512_scalar( void * RESTRICT acc, + const void * RESTRICT input, const void * RESTRICT secret ) { /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. 
*/ -#if defined(__GNUC__) && !defined(__clang__) \ - && (defined(__arm__) || defined(__thumb2__)) \ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__arm__) || defined(__thumb2__)) \ && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ -# pragma GCC unroll 8 + #pragma GCC unroll 8 #endif for (size_t i = 0; i < XXH_ACC_NB; i++) { XXH3_scalarRound(acc, input, secret, i); @@ -973,22 +967,21 @@ static FORCE_INLINE void XXH3_accumulate_512_scalar(void * RESTRICT acc, // Since our algorithm uses a pseudorandom secret to add some variance // into the mix, we don't need to (or want to) mix as often or as much // as HighwayHash does. -template < bool bswap > -static FORCE_INLINE void XXH3_scalarScrambleRound(void * RESTRICT acc, - void const* RESTRICT secret, size_t lane) { - uint64_t* const xacc = (uint64_t*) acc; /* presumed aligned */ - const uint8_t* const xsecret = (const uint8_t*) secret; /* no alignment restriction */ - uint64_t const key64 = GET_U64(xsecret, lane * 8); +template +static FORCE_INLINE void XXH3_scalarScrambleRound( void * RESTRICT acc, void const * RESTRICT secret, size_t lane ) { + uint64_t * const xacc = (uint64_t * )acc; /* presumed aligned */ + const uint8_t * const xsecret = (const uint8_t *)secret; /* no alignment restriction */ + uint64_t const key64 = GET_U64(xsecret, lane * 8); uint64_t acc64 = xacc[lane]; - acc64 = XXH_xorshift64(acc64, 47); - acc64 ^= key64; - acc64 *= XXH_PRIME32_1; + + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= XXH_PRIME32_1; xacc[lane] = acc64; } -template < bool bswap > -static FORCE_INLINE void XXH3_scrambleAcc_scalar(void * RESTRICT acc, - const void * RESTRICT secret) { +template +static FORCE_INLINE void XXH3_scrambleAcc_scalar( void * RESTRICT acc, const void * RESTRICT secret ) { for (size_t i = 0; i < XXH_ACC_NB; i++) { XXH3_scalarScrambleRound(acc, secret, i); } @@ -1024,15 +1017,15 @@ static FORCE_INLINE void XXH3_scrambleAcc_scalar(void * RESTRICT 
acc, // XXH3_64bits_withSeed, len == 256, Snapdragon 835 // without hack: 2654.4 MB/s // with hack: 3202.9 MB/s -template < bool bswap > -static FORCE_INLINE void XXH3_initCustomSecret_scalar(void * RESTRICT customSecret, - uint64_t seed64) { +template +static FORCE_INLINE void XXH3_initCustomSecret_scalar( void * RESTRICT customSecret, uint64_t seed64 ) { /* * We need a separate pointer for the GUARD hack below, * which requires a non-const pointer. * Any decent compiler will optimize this out otherwise. */ - const uint8_t* kSecretPtr = XXH3_kSecret; + const uint8_t * kSecretPtr = XXH3_kSecret; + #if defined(__clang__) && defined(__aarch64__) XXH_COMPILER_GUARD(kSecretPtr); #endif @@ -1045,10 +1038,10 @@ static FORCE_INLINE void XXH3_initCustomSecret_scalar(void * RESTRICT customSecr * loads together for free. Putting the loads together before the stores * properly generates LDP. */ - uint64_t lo = GET_U64(kSecretPtr, 16*i) + seed64; - uint64_t hi = GET_U64(kSecretPtr, 16*i + 8) - seed64; - PUT_U64(lo, (uint8_t*)customSecret, 16*i ); - PUT_U64(hi, (uint8_t*)customSecret, 16*i + 8); + uint64_t lo = GET_U64(kSecretPtr, 16 * i ) + seed64; + uint64_t hi = GET_U64(kSecretPtr, 16 * i + 8) - seed64; + PUT_U64(lo, (uint8_t *)customSecret, 16 * i ); + PUT_U64(hi, (uint8_t *)customSecret, 16 * i + 8); } } @@ -1064,65 +1057,64 @@ static FORCE_INLINE void XXH3_initCustomSecret_scalar(void * RESTRICT customSecr #define XXH_VSX 5 #if defined(__has_builtin) -# define XXH_HAS_BUILTIN(x) __has_builtin(x) + #define XXH_HAS_BUILTIN(x) __has_builtin(x) #else -# define XXH_HAS_BUILTIN(x) 0 + #define XXH_HAS_BUILTIN(x) 0 #endif -#if !defined(FORCE_SCALAR) && defined(HAVE_PPC_VSX) && \ - !defined(HAVE_PPC_ASM) && !defined(__s390x__) && \ +#if !defined(FORCE_SCALAR) && defined(HAVE_PPC_VSX) && \ + !defined(HAVE_PPC_ASM) && !defined(__s390x__) && \ !(defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)) -#warning "PPC mulo/mule compiler support not found; falling back to 
scalar code" -#define FORCE_SCALAR + #warning "PPC mulo/mule compiler support not found; falling back to scalar code" + #define FORCE_SCALAR #endif #if defined(FORCE_SCALAR) -#define XXH_VECTOR XXH_SCALAR -#define XXH_ACC_ALIGN 8 -#define XXH_SEC_ALIGN 8 + #define XXH_VECTOR XXH_SCALAR + #define XXH_ACC_ALIGN 8 + #define XXH_SEC_ALIGN 8 #elif defined(HAVE_ARM_NEON) -#define XXH_VECTOR XXH_NEON -#define XXH_ACC_ALIGN 16 -#define XXH_SEC_ALIGN 8 -#include "Intrinsics.h" -#include "xxhash/xxh3-arm.h" + #define XXH_VECTOR XXH_NEON + #define XXH_ACC_ALIGN 16 + #define XXH_SEC_ALIGN 8 + #include "Intrinsics.h" + #include "xxhash/xxh3-arm.h" #elif defined(HAVE_PPC_VSX) -#define XXH_VECTOR XXH_VSX -#define XXH_ACC_ALIGN 16 -#define XXH_SEC_ALIGN 8 -#include "Intrinsics.h" -#include "xxhash/xxh3-ppc.h" + #define XXH_VECTOR XXH_VSX + #define XXH_ACC_ALIGN 16 + #define XXH_SEC_ALIGN 8 + #include "Intrinsics.h" + #include "xxhash/xxh3-ppc.h" #elif defined(HAVE_AVX512_F) -#define XXH_VECTOR XXH_AVX512 -#define XXH_ACC_ALIGN 64 -#define XXH_SEC_ALIGN 64 -#include "Intrinsics.h" -#include "xxhash/xxh3-avx512.h" + #define XXH_VECTOR XXH_AVX512 + #define XXH_ACC_ALIGN 64 + #define XXH_SEC_ALIGN 64 + #include "Intrinsics.h" + #include "xxhash/xxh3-avx512.h" #elif defined(HAVE_AVX2) -#define XXH_VECTOR XXH_AVX2 -#define XXH_ACC_ALIGN 32 -#define XXH_SEC_ALIGN 32 -#include "Intrinsics.h" -#include "xxhash/xxh3-avx2.h" + #define XXH_VECTOR XXH_AVX2 + #define XXH_ACC_ALIGN 32 + #define XXH_SEC_ALIGN 32 + #include "Intrinsics.h" + #include "xxhash/xxh3-avx2.h" #elif defined(HAVE_SSE_2) -#define XXH_VECTOR XXH_SSE2 -#define XXH_ACC_ALIGN 16 -#define XXH_SEC_ALIGN 16 -#include "Intrinsics.h" -#include "xxhash/xxh3-sse2.h" + #define XXH_VECTOR XXH_SSE2 + #define XXH_ACC_ALIGN 16 + #define XXH_SEC_ALIGN 16 + #include "Intrinsics.h" + #include "xxhash/xxh3-sse2.h" #else -#define XXH_VECTOR XXH_SCALAR -#define XXH_ACC_ALIGN 8 -#define XXH_SEC_ALIGN 8 + #define XXH_VECTOR XXH_SCALAR + #define 
XXH_ACC_ALIGN 8 + #define XXH_SEC_ALIGN 8 #endif //------------------------------------------------------------ // XXH3 and XXH3-128 long keys // "Dispatcher" code -template < bool bswap > -static void XXH3_accumulate_512(void * RESTRICT acc, const void * RESTRICT input, - const void * RESTRICT secret) { +template +static void XXH3_accumulate_512( void * RESTRICT acc, const void * RESTRICT input, const void * RESTRICT secret ) { #if (XXH_VECTOR == XXH_AVX512) XXH3_accumulate_512_avx512(acc, input, secret); #elif (XXH_VECTOR == XXH_AVX2) @@ -1138,8 +1130,8 @@ static void XXH3_accumulate_512(void * RESTRICT acc, const void * RESTRICT input #endif } -template < bool bswap > -static void XXH3_scrambleAcc(void * RESTRICT acc, const void * RESTRICT secret) { +template +static void XXH3_scrambleAcc( void * RESTRICT acc, const void * RESTRICT secret ) { #if (XXH_VECTOR == XXH_AVX512) XXH3_scrambleAcc_avx512(acc, secret); #elif (XXH_VECTOR == XXH_AVX2) @@ -1155,8 +1147,8 @@ static void XXH3_scrambleAcc(void * RESTRICT acc, const void * RESTRICT secret) #endif } -template < bool bswap > -static void XXH3_initCustomSecret(void * RESTRICT customSecret, uint64_t seed64) { +template +static void XXH3_initCustomSecret( void * RESTRICT customSecret, uint64_t seed64 ) { #if (XXH_VECTOR == XXH_AVX512) XXH3_initCustomSecret_avx512(customSecret, seed64); #elif (XXH_VECTOR == XXH_AVX2) @@ -1176,53 +1168,48 @@ static void XXH3_initCustomSecret(void * RESTRICT customSecret, uint64_t seed64) // XXH3 and XXH3-128 long keys #if defined(__clang__) -# define XXH_PREFETCH_DIST 320 + #define XXH_PREFETCH_DIST 320 #elif (XXH_VECTOR == XXH_AVX512) -# define XXH_PREFETCH_DIST 512 + #define XXH_PREFETCH_DIST 512 #else -# define XXH_PREFETCH_DIST 384 + #define XXH_PREFETCH_DIST 384 #endif /* __clang__ */ -template < bool bswap > -static FORCE_INLINE void XXH3_accumulate(uint64_t * RESTRICT acc, - const uint8_t* RESTRICT input, - const uint8_t* RESTRICT secret, - size_t nbStripes) { - for (size_t n = 
0; n < nbStripes; n++ ) { - const uint8_t* const in = input + n*XXH_STRIPE_LEN; +template +static FORCE_INLINE void XXH3_accumulate( uint64_t * RESTRICT acc, const uint8_t * RESTRICT input, + const uint8_t * RESTRICT secret, size_t nbStripes ) { + for (size_t n = 0; n < nbStripes; n++) { + const uint8_t * const in = input + n * XXH_STRIPE_LEN; prefetch(in + XXH_PREFETCH_DIST); - XXH3_accumulate_512(acc, in, secret + n*XXH_SECRET_CONSUME_RATE); + XXH3_accumulate_512(acc, in, secret + n * XXH_SECRET_CONSUME_RATE); } } -template < bool bswap > -static FORCE_INLINE void XXH3_hashLong_internal_loop(uint64_t* RESTRICT acc, - const uint8_t* RESTRICT input, size_t len, - const uint8_t* RESTRICT secret, size_t secretSize) { +template +static FORCE_INLINE void XXH3_hashLong_internal_loop( uint64_t * RESTRICT acc, const uint8_t * RESTRICT input, + size_t len, const uint8_t * RESTRICT secret, size_t secretSize ) { size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; - size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; - size_t const nb_blocks = (len - 1) / block_len; + size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; + size_t const nb_blocks = (len - 1 ) / block_len; for (size_t n = 0; n < nb_blocks; n++) { - XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock); + XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock); XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN); } /* last partial block */ size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; - XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes); + XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes); /* last stripe */ - const uint8_t* const p = input + len - XXH_STRIPE_LEN; -#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ + const uint8_t * const p = input + len - XXH_STRIPE_LEN; +#define 
XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); } -template < bool bswap > -static FORCE_INLINE uint64_t XXH3_mix2Accs(const uint64_t* RESTRICT acc, const uint8_t* RESTRICT secret) { - return XXH3_mul128_fold64( - acc[0] ^ GET_U64(secret, 0), - acc[1] ^ GET_U64(secret, 8)); +template +static FORCE_INLINE uint64_t XXH3_mix2Accs( const uint64_t * RESTRICT acc, const uint8_t * RESTRICT secret ) { + return XXH3_mul128_fold64(acc[0] ^ GET_U64(secret, 0), acc[1] ^ GET_U64(secret, 8)); } // UGLY HACK: @@ -1231,12 +1218,12 @@ static FORCE_INLINE uint64_t XXH3_mix2Accs(const uint64_t* RESTRICT acc, const u // XXH3_64bits, len == 256, Snapdragon 835: // without hack: 2063.7 MB/s // with hack: 2560.7 MB/s -template < bool bswap > -static uint64_t XXH3_mergeAccs(const uint64_t* RESTRICT acc, - const uint8_t* RESTRICT secret, uint64_t start) { +template +static uint64_t XXH3_mergeAccs( const uint64_t * RESTRICT acc, const uint8_t * RESTRICT secret, uint64_t start ) { uint64_t result64 = start; + for (size_t i = 0; i < 4; i++) { - result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); + result64 += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i); #if defined(__clang__) /* Clang */ \ && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ @@ -1250,43 +1237,36 @@ static uint64_t XXH3_mergeAccs(const uint64_t* RESTRICT acc, // It's important for performance that XXH3_hashLong is not inlined. Not sure // why (uop cache maybe?), but the difference is large and easily measurable. 
-template < bool bswap > -static NEVER_INLINE uint64_t XXH3_hashLong_64b_internal( - const void* RESTRICT input, size_t len, - const void* RESTRICT secret, size_t secretSize) { +template +static NEVER_INLINE uint64_t XXH3_hashLong_64b_internal( const void * RESTRICT input, + size_t len, const void * RESTRICT secret, size_t secretSize ) { alignas(XXH_ACC_ALIGN) uint64_t acc[XXH_ACC_NB] = { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1, }; - XXH3_hashLong_internal_loop(acc, (const uint8_t*)input, len, - (const uint8_t*)secret, secretSize); + XXH3_hashLong_internal_loop(acc, (const uint8_t *)input, len, (const uint8_t *)secret, secretSize); return XXH3_mergeAccs(acc, (const uint8_t *)secret + XXH_SECRET_MERGEACCS_START, (uint64_t)len * XXH_PRIME64_1); } -template < bool bswap > -static NEVER_INLINE XXH128_hash_t XXH3_hashLong_128b_internal( - const void* RESTRICT input, size_t len, - const void* RESTRICT secret, size_t secretSize) { +template +static NEVER_INLINE XXH128_hash_t XXH3_hashLong_128b_internal( const void * RESTRICT input, + size_t len, const void * RESTRICT secret, size_t secretSize ) { alignas(XXH_ACC_ALIGN) uint64_t acc[XXH_ACC_NB] = { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1, }; - XXH3_hashLong_internal_loop(acc, (const uint8_t*)input, len, - (const uint8_t*)secret, secretSize); + XXH3_hashLong_internal_loop(acc, (const uint8_t *)input, len, (const uint8_t *)secret, secretSize); // converge into final hash const XXH128_hash_t h128 = { - /* .low64 = */ XXH3_mergeAccs(acc, - (const uint8_t *)secret + XXH_SECRET_MERGEACCS_START, + /* .low64 = */ XXH3_mergeAccs (acc, (const uint8_t *)secret + XXH_SECRET_MERGEACCS_START, (uint64_t)len * XXH_PRIME64_1), - /* .high64 = */ XXH3_mergeAccs(acc, - (const uint8_t *)secret + secretSize - - sizeof(acc) - XXH_SECRET_MERGEACCS_START, - ~((uint64_t)len * XXH_PRIME64_2)), 
+ /* .high64 = */ XXH3_mergeAccs(acc, (const uint8_t *)secret + secretSize - + sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((uint64_t)len * XXH_PRIME64_2)), }; return h128; } @@ -1294,46 +1274,48 @@ static NEVER_INLINE XXH128_hash_t XXH3_hashLong_128b_internal( //------------------------------------------------------------ // XXH3 and XXH3-128 top-level functions -template < bool bswap > -static uint64_t XXH3_64bits_withSeed(const void * input, size_t len, uint64_t seed) { +template +static uint64_t XXH3_64bits_withSeed( const void * input, size_t len, uint64_t seed ) { const uint8_t * RESTRICT secret = (const uint8_t *)XXH3_kSecret; size_t secretLen = sizeof(XXH3_kSecret); - if (len <= 16) - return XXH3_len_0to16_64b((const uint8_t*)input, len, - secret, seed); - if (len <= 128) - return XXH3_len_17to128_64b((const uint8_t*)input, len, - secret, secretLen, seed); - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_len_129to240_64b((const uint8_t*)input, len, - secret, secretLen, seed); - - if (seed == 0) + if (len <= 16) { + return XXH3_len_0to16_64b((const uint8_t *)input, len, secret, seed); + } + if (len <= 128) { + return XXH3_len_17to128_64b((const uint8_t *)input, len, secret, secretLen, seed); + } + if (len <= XXH3_MIDSIZE_MAX) { + return XXH3_len_129to240_64b((const uint8_t *)input, len, secret, secretLen, seed); + } + + if (seed == 0) { return XXH3_hashLong_64b_internal(input, len, secret, secretLen); + } alignas(XXH_SEC_ALIGN) uint8_t secretbuf[XXH3_SECRET_DEFAULT_SIZE]; XXH3_initCustomSecret(secretbuf, seed); return XXH3_hashLong_64b_internal(input, len, secretbuf, sizeof(secretbuf)); } -template < bool bswap > -static XXH128_hash_t XXH3_128bits_withSeed(const void * input, size_t len, uint64_t seed) { +template +static XXH128_hash_t XXH3_128bits_withSeed( const void * input, size_t len, uint64_t seed ) { const uint8_t * RESTRICT secret = (const uint8_t *)XXH3_kSecret; size_t secretLen = sizeof(XXH3_kSecret); - if (len <= 16) - return XXH3_len_0to16_128b((const 
uint8_t*)input, len, - secret, seed); - if (len <= 128) - return XXH3_len_17to128_128b((const uint8_t*)input, len, - secret, secretLen, seed); - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_len_129to240_128b((const uint8_t*)input, len, - secret, secretLen, seed); - - if (seed == 0) + if (len <= 16) { + return XXH3_len_0to16_128b((const uint8_t *)input, len, secret, seed); + } + if (len <= 128) { + return XXH3_len_17to128_128b((const uint8_t *)input, len, secret, secretLen, seed); + } + if (len <= XXH3_MIDSIZE_MAX) { + return XXH3_len_129to240_128b((const uint8_t *)input, len, secret, secretLen, seed); + } + + if (seed == 0) { return XXH3_hashLong_128b_internal(input, len, secret, secretLen); + } alignas(XXH_SEC_ALIGN) uint8_t secretbuf[XXH3_SECRET_DEFAULT_SIZE]; XXH3_initCustomSecret(secretbuf, seed); @@ -1341,13 +1323,14 @@ static XXH128_hash_t XXH3_128bits_withSeed(const void * input, size_t len, uint6 } #if defined(XXH3_POP_PRAGMA) -# pragma GCC pop_options + #pragma GCC pop_options #endif //------------------------------------------------------------ -template < bool bswap > -static void XXH32(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void XXH32( const void * in, const size_t len, const seed_t seed, void * out ) { uint32_t h = XXH32_impl((const uint8_t *)in, len, (uint32_t)seed); + #if 0 // Output in "canonical" format if (isLE()) { @@ -1360,9 +1343,10 @@ static void XXH32(const void * in, const size_t len, const seed_t seed, void * o #endif } -template < bool bswap > -static void XXH64(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void XXH64( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = XXH64_impl((const uint8_t *)in, len, (uint64_t)seed); + #if 0 // Output in "canonical" format if (isLE()) { @@ -1376,9 +1360,10 @@ static void XXH64(const void * in, const size_t len, const seed_t seed, void * o } 
//------------------------------------------------------------ -template < bool bswap > -static void XXH3_64(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void XXH3_64( const void * in, const size_t len, const seed_t seed, void * out ) { uint64_t h = XXH3_64bits_withSeed(in, len, seed); + // Output in "canonical" BE format if (isLE()) { PUT_U64(h, (uint8_t *)out, 0); @@ -1387,88 +1372,89 @@ static void XXH3_64(const void * in, const size_t len, const seed_t seed, void * } } -template < bool bswap > -static void XXH3_128(const void * in, const size_t len, const seed_t seed, void * out) { +template +static void XXH3_128( const void * in, const size_t len, const seed_t seed, void * out ) { XXH128_hash_t h = XXH3_128bits_withSeed(in, len, seed); + // Output in "canonical" BE format if (isLE()) { PUT_U64(h.high64, (uint8_t *)out, 0); - PUT_U64(h.low64, (uint8_t *)out, 8); + PUT_U64(h.low64 , (uint8_t *)out, 8); } else { PUT_U64(h.high64, (uint8_t *)out, 0); - PUT_U64(h.low64, (uint8_t *)out, 8); + PUT_U64(h.low64 , (uint8_t *)out, 8); } } //------------------------------------------------------------ REGISTER_FAMILY(xxhash, - $.src_url = "https://github.com/Cyan4973/xxHash", - $.src_status = HashFamilyInfo::SRC_ACTIVE -); + $.src_url = "https://github.com/Cyan4973/xxHash", + $.src_status = HashFamilyInfo::SRC_ACTIVE + ); REGISTER_HASH(XXH_32, - $.desc = "xxHash, 32-bit version", - $.hash_flags = - FLAG_HASH_SMALL_SEED | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_BSD, - $.bits = 32, - $.verification_LE = 0xBA88B743, - $.verification_BE = 0x2BC79298, - $.hashfn_native = XXH32, - $.hashfn_bswap = XXH32 -); + $.desc = "xxHash, 32-bit version", + $.hash_flags = + FLAG_HASH_SMALL_SEED | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_BSD, + $.bits = 
32, + $.verification_LE = 0xBA88B743, + $.verification_BE = 0x2BC79298, + $.hashfn_native = XXH32, + $.hashfn_bswap = XXH32 + ); REGISTER_HASH(XXH_64, - $.desc = "xxHash, 64-bit version", - $.hash_flags = - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_MULTIPLY_64_64 | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_BSD, - $.bits = 64, - $.verification_LE = 0x024B7CF4, - $.verification_BE = 0xB96ABE81, - $.hashfn_native = XXH64, - $.hashfn_bswap = XXH64 -); + $.desc = "xxHash, 64-bit version", + $.hash_flags = + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_MULTIPLY_64_64 | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_BSD, + $.bits = 64, + $.verification_LE = 0x024B7CF4, + $.verification_BE = 0xB96ABE81, + $.hashfn_native = XXH64, + $.hashfn_bswap = XXH64 + ); REGISTER_HASH(XXH3_64, - $.desc = "xxh3, 64-bit version", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_BSD, - $.bits = 64, - $.verification_LE = 0x1AAEE62C, - $.verification_BE = 0xF8DBB4DD, - $.hashfn_native = XXH3_64, - $.hashfn_bswap = XXH3_64 -); + $.desc = "xxh3, 64-bit version", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_BSD, + $.bits = 64, + $.verification_LE = 0x1AAEE62C, + $.verification_BE = 0xF8DBB4DD, + $.hashfn_native = XXH3_64, + $.hashfn_bswap = XXH3_64 + ); REGISTER_HASH(XXH3_128, - $.desc = "xxh3, 128-bit version", - $.hash_flags = - FLAG_HASH_LOOKUP_TABLE | - FLAG_HASH_ENDIAN_INDEPENDENT, - $.impl_flags = - FLAG_IMPL_CANONICAL_LE | - FLAG_IMPL_MULTIPLY | - FLAG_IMPL_ROTATE | - FLAG_IMPL_LICENSE_BSD, - $.bits = 128, - $.verification_LE = 0x288DAA94, - $.verification_BE = 0x6C82FA25, - $.hashfn_native = XXH3_128, - $.hashfn_bswap = XXH3_128 -); + 
$.desc = "xxh3, 128-bit version", + $.hash_flags = + FLAG_HASH_LOOKUP_TABLE | + FLAG_HASH_ENDIAN_INDEPENDENT, + $.impl_flags = + FLAG_IMPL_CANONICAL_LE | + FLAG_IMPL_MULTIPLY | + FLAG_IMPL_ROTATE | + FLAG_IMPL_LICENSE_BSD, + $.bits = 128, + $.verification_LE = 0x288DAA94, + $.verification_BE = 0x6C82FA25, + $.hashfn_native = XXH3_128, + $.hashfn_bswap = XXH3_128 + ); diff --git a/hashes/xxhash/xxh3-arm.h b/hashes/xxhash/xxh3-arm.h index 5be01687..fbd4182f 100644 --- a/hashes/xxhash/xxh3-arm.h +++ b/hashes/xxhash/xxh3-arm.h @@ -111,22 +111,22 @@ /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ -#if (defined(__GNUC__) || defined(__clang__)) && \ +#if (defined(__GNUC__) || defined(__clang__)) && \ (defined(__arm__) || defined(__thumb__) || defined(_M_ARM)) -#define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ - do { \ - /* Undocumented GCC/Clang operand modifier: */ \ - /* %e0 = lower D half, %f0 = upper D half */ \ - __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ - (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ - (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ - } while (0) + #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + /* Undocumented GCC/Clang operand modifier: */ \ + /* %e0 = lower D half, %f0 = upper D half */ \ + __asm__ ("vzip.32 %e0, %f0" : "+w" (in)); \ + (outLo) = vget_low_u32(vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + } while (0) #else -#define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ - do { \ - (outLo) = vmovn_u64 (in); \ - (outHi) = vshrn_n_u64 ((in), 32); \ - } while (0) + #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + (outLo) = vmovn_u64(in); \ + (outHi) = vshrn_n_u64((in), 32); \ + } while (0) #endif /* @@ -142,14 +142,18 @@ * unaligned load. 
*/ #if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) + /* silence -Wcast-align */ -static FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) { - return *(uint64x2_t const*)ptr; +static FORCE_INLINE uint64x2_t XXH_vld1q_u64( void const * ptr ) { + return *(uint64x2_t const *)ptr; } + #else -static FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) { - return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); + +static FORCE_INLINE uint64x2_t XXH_vld1q_u64( void const * ptr ) { + return vreinterpretq_u64_u8(vld1q_u8((uint8_t const *)ptr)); } + #endif // Controls the NEON to scalar ratio for XXH3 @@ -187,9 +191,9 @@ static FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) { // // XXH_ACC_NB is #defined already, back in the main file. #if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) -#define XXH3_NEON_LANES 6 + #define XXH3_NEON_LANES 6 #else -#define XXH3_NEON_LANES XXH_ACC_NB + #define XXH3_NEON_LANES XXH_ACC_NB #endif /* @@ -201,27 +205,27 @@ static FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) { * * See XXH3_NEON_LANES for configuring this and details about this optimization. */ -template < bool bswap > -static FORCE_INLINE void XXH3_accumulate_512_neon(void * RESTRICT acc, - const void * RESTRICT input, const void * RESTRICT secret) { - uint64x2_t* const xacc = (uint64x2_t *) acc; +template +static FORCE_INLINE void XXH3_accumulate_512_neon( void * RESTRICT acc, const void * RESTRICT input, + const void * RESTRICT secret ) { + uint64x2_t * const xacc = (uint64x2_t * )acc; /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
*/ - uint8_t const* const xinput = (const uint8_t *) input; - uint8_t const* const xsecret = (const uint8_t *) secret; + uint8_t const * const xinput = (const uint8_t *)input; + uint8_t const * const xsecret = (const uint8_t *)secret; /* AArch64 uses both scalar and neon at the same time */ for (size_t i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { XXH3_scalarRound(acc, input, secret, i); } for (size_t i = 0; i < XXH3_NEON_LANES / 2; i++) { - uint64x2_t acc_vec = xacc[i]; + uint64x2_t acc_vec = xacc[i]; /* data_vec = xinput[i]; */ uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); /* key_vec = xsecret[i]; */ uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); if (bswap) { - data_vec = Vbswap64_u64(data_vec); - key_vec = Vbswap64_u64(key_vec); + data_vec = Vbswap64_u64(data_vec); + key_vec = Vbswap64_u64(key_vec ); } uint64x2_t data_key; uint32x2_t data_key_lo, data_key_hi; @@ -229,24 +233,25 @@ static FORCE_INLINE void XXH3_accumulate_512_neon(void * RESTRICT acc, uint64x2_t acc_vec_2 = vextq_u64(data_vec, data_vec, 1); /* data_key = data_vec ^ key_vec; */ data_key = veorq_u64(data_vec, key_vec); - /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + /* + * data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); * data_key_hi = (uint32x2_t) (data_key >> 32); - * data_key = UNDEFINED; */ + * data_key = UNDEFINED; + */ XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ - acc_vec_2 = vmlal_u32 (acc_vec_2, data_key_lo, data_key_hi); + acc_vec_2 = vmlal_u32(acc_vec_2, data_key_lo, data_key_hi); /* xacc[i] += acc_vec_2; */ - acc_vec = vaddq_u64 (acc_vec, acc_vec_2); - xacc[i] = acc_vec; + acc_vec = vaddq_u64(acc_vec, acc_vec_2); + xacc[i] = acc_vec; } } -template < bool bswap > -static FORCE_INLINE void XXH3_scrambleAcc_neon(void * RESTRICT acc, - const void * RESTRICT secret) { - uint64x2_t* xacc = (uint64x2_t*) acc; - uint8_t const* xsecret = (uint8_t const*) secret; - uint32x2_t prime = 
vdup_n_u32 (XXH_PRIME32_1); +template +static FORCE_INLINE void XXH3_scrambleAcc_neon( void * RESTRICT acc, const void * RESTRICT secret ) { + uint64x2_t * xacc = (uint64x2_t * )acc; + uint8_t const * xsecret = (uint8_t const *)secret; + uint32x2_t prime = vdup_n_u32(XXH_PRIME32_1); /* AArch64 uses both scalar and neon at the same time */ for (size_t i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { @@ -255,21 +260,23 @@ static FORCE_INLINE void XXH3_scrambleAcc_neon(void * RESTRICT acc, for (size_t i = 0; i < XXH3_NEON_LANES / 2; i++) { /* xacc[i] ^= (xacc[i] >> 47); */ uint64x2_t acc_vec = xacc[i]; - uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); - uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + uint64x2_t shifted = vshrq_n_u64(acc_vec, 47); + uint64x2_t data_vec = veorq_u64(acc_vec, shifted); /* xacc[i] ^= xsecret[i]; */ - uint64x2_t key_vec = XXH_vld1q_u64 (xsecret + (i * 16)); + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); if (bswap) { key_vec = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(key_vec))); } - uint64x2_t data_key = veorq_u64 (data_vec, key_vec); + uint64x2_t data_key = veorq_u64(data_vec, key_vec); /* xacc[i] *= XXH_PRIME32_1 */ uint32x2_t data_key_lo, data_key_hi; - /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + /* + * data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); * data_key_hi = (uint32x2_t) (xacc[i] >> 32); - * xacc[i] = UNDEFINED; */ + * xacc[i] = UNDEFINED; + */ XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); { /* @@ -290,7 +297,7 @@ static FORCE_INLINE void XXH3_scrambleAcc_neon(void * RESTRICT acc, * this bug completely. 
* See https://bugs.llvm.org/show_bug.cgi?id=39967 */ - uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + uint64x2_t prod_hi = vmull_u32(data_key_hi, prime); /* xacc[i] = prod_hi << 32; */ prod_hi = vshlq_n_u64(prod_hi, 32); /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */ diff --git a/hashes/xxhash/xxh3-avx2.h b/hashes/xxhash/xxh3-avx2.h index 1a09b450..5d0ebd37 100644 --- a/hashes/xxhash/xxh3-avx2.h +++ b/hashes/xxhash/xxh3-avx2.h @@ -30,65 +30,69 @@ * - xxHash homepage: https://www.xxhash.com * - xxHash source repository: https://github.com/Cyan4973/xxHash */ -template < bool bswap > -static FORCE_INLINE void XXH3_accumulate_512_avx2( - void* RESTRICT acc, const void* RESTRICT input, - const void* RESTRICT secret) { - __m256i* const xacc = (__m256i *) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ - const __m256i* const xinput = (const __m256i *) input; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ - const __m256i* const xsecret = (const __m256i *) secret; +template +static FORCE_INLINE void XXH3_accumulate_512_avx2( void * RESTRICT acc, const void * RESTRICT input, + const void * RESTRICT secret ) { + __m256i * const xacc = (__m256i * )acc; + /* + * Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. + */ + const __m256i * const xinput = (const __m256i *)input; + /* + * Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. + */ + const __m256i * const xsecret = (const __m256i *)secret; - for (size_t i = 0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) { /* data_vec = xinput[i]; */ - __m256i const data_vec = bswap ? 
- mm256_bswap64(_mm256_loadu_si256(xinput+i)) : - _mm256_loadu_si256(xinput+i); + __m256i const data_vec = bswap ? + mm256_bswap64(_mm256_loadu_si256(xinput + i)) : + _mm256_loadu_si256(xinput + i); /* key_vec = xsecret[i]; */ - __m256i const key_vec = bswap ? - mm256_bswap64(_mm256_loadu_si256(xsecret+i)) : - _mm256_loadu_si256(xsecret+i); + __m256i const key_vec = bswap ? + mm256_bswap64(_mm256_loadu_si256(xsecret + i)) : + _mm256_loadu_si256(xsecret + i); /* data_key = data_vec ^ key_vec; */ - __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + __m256i const data_key = _mm256_xor_si256(data_vec, key_vec); /* data_key_lo = data_key >> 32; */ - __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const data_key_lo = _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1)); /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ - __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + __m256i const product = _mm256_mul_epu32(data_key, data_key_lo); /* xacc[i] += swap(data_vec); */ - __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); - __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); /* xacc[i] += product; */ xacc[i] = _mm256_add_epi64(product, sum); } } -template < bool bswap > -static FORCE_INLINE void XXH3_scrambleAcc_avx2(void * RESTRICT acc, - const void * RESTRICT secret) { - __m256i* const xacc = (__m256i*) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ - const __m256i* const xsecret = (const __m256i *) secret; - const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); +template +static FORCE_INLINE void XXH3_scrambleAcc_avx2( void * RESTRICT acc, const void * RESTRICT secret ) { + __m256i * const xacc = (__m256i * )acc; + /* + * Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. + */ + const __m256i * const xsecret = (const __m256i *)secret; + const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); - for (size_t i = 0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) { /* xacc[i] ^= (xacc[i] >> 47) */ - __m256i const acc_vec = xacc[i]; - __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); - __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64(acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256(acc_vec , shifted); /* xacc[i] ^= xsecret; */ - __m256i const key_vec = bswap ? - mm256_bswap64(_mm256_loadu_si256(xsecret+i)) : - _mm256_loadu_si256(xsecret+i); - __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + __m256i const key_vec = bswap ? 
+ mm256_bswap64(_mm256_loadu_si256(xsecret + i)) : + _mm256_loadu_si256(xsecret + i); + __m256i const data_key = _mm256_xor_si256(data_vec, key_vec); /* xacc[i] *= XXH_PRIME32_1; */ - __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); - __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); - __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + __m256i const data_key_hi = _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const prod_lo = _mm256_mul_epu32(data_key , prime32); + __m256i const prod_hi = _mm256_mul_epu32(data_key_hi, prime32); xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); } } @@ -98,14 +102,14 @@ static FORCE_INLINE void XXH3_scrambleAcc_avx2(void * RESTRICT acc, * - not extract the secret from sse registers in the internal loop * - use less common registers, and avoid pushing these reg into stack */ -template < bool bswap > -static FORCE_INLINE void XXH3_initCustomSecret_avx2(void * RESTRICT customSecret, - uint64_t seed64) { - _mm_prefetch((const char*)customSecret, _MM_HINT_T0); - __m256i const seed = _mm256_set_epi64x((int64_t)(UINT64_C(0) - seed64), (int64_t)seed64, (int64_t)(UINT64_C(0) - seed64), (int64_t)seed64); +template +static FORCE_INLINE void XXH3_initCustomSecret_avx2( void * RESTRICT customSecret, uint64_t seed64 ) { + _mm_prefetch((const char *)customSecret, _MM_HINT_T0); + __m256i const seed = _mm256_set_epi64x((int64_t)(UINT64_C(0) - seed64), (int64_t)seed64, + (int64_t)(UINT64_C(0) - seed64), (int64_t)seed64); - const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); - __m256i* dest = ( __m256i*) customSecret; + const __m256i * const src = (const __m256i *)((const void *)XXH3_kSecret); + __m256i * dest = (__m256i * )customSecret; #if defined(__GNUC__) || defined(__clang__) XXH_COMPILER_GUARD(dest); @@ -113,18 +117,18 @@ static FORCE_INLINE void XXH3_initCustomSecret_avx2(void * RESTRICT customSecret /* GCC -O2 need unroll 
loop manually */ if (bswap) { - dest[0] = mm256_bswap64(_mm256_add_epi64(mm256_bswap64(_mm256_stream_load_si256(src+0)), seed)); - dest[1] = mm256_bswap64(_mm256_add_epi64(mm256_bswap64(_mm256_stream_load_si256(src+1)), seed)); - dest[2] = mm256_bswap64(_mm256_add_epi64(mm256_bswap64(_mm256_stream_load_si256(src+2)), seed)); - dest[3] = mm256_bswap64(_mm256_add_epi64(mm256_bswap64(_mm256_stream_load_si256(src+3)), seed)); - dest[4] = mm256_bswap64(_mm256_add_epi64(mm256_bswap64(_mm256_stream_load_si256(src+4)), seed)); - dest[5] = mm256_bswap64(_mm256_add_epi64(mm256_bswap64(_mm256_stream_load_si256(src+5)), seed)); + dest[0] = mm256_bswap64(_mm256_add_epi64(mm256_bswap64(_mm256_stream_load_si256(src + 0)), seed)); + dest[1] = mm256_bswap64(_mm256_add_epi64(mm256_bswap64(_mm256_stream_load_si256(src + 1)), seed)); + dest[2] = mm256_bswap64(_mm256_add_epi64(mm256_bswap64(_mm256_stream_load_si256(src + 2)), seed)); + dest[3] = mm256_bswap64(_mm256_add_epi64(mm256_bswap64(_mm256_stream_load_si256(src + 3)), seed)); + dest[4] = mm256_bswap64(_mm256_add_epi64(mm256_bswap64(_mm256_stream_load_si256(src + 4)), seed)); + dest[5] = mm256_bswap64(_mm256_add_epi64(mm256_bswap64(_mm256_stream_load_si256(src + 5)), seed)); } else { - dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed); - dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed); - dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed); - dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed); - dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed); - dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed); + dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src + 0), seed); + dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src + 1), seed); + dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src + 2), seed); + dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src + 3), seed); + dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src + 
4), seed); + dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src + 5), seed); } } diff --git a/hashes/xxhash/xxh3-avx512.h b/hashes/xxhash/xxh3-avx512.h index ceab0035..90a98663 100644 --- a/hashes/xxhash/xxh3-avx512.h +++ b/hashes/xxhash/xxh3-avx512.h @@ -30,53 +30,52 @@ * - xxHash homepage: https://www.xxhash.com * - xxHash source repository: https://github.com/Cyan4973/xxHash */ -template < bool bswap > -static FORCE_INLINE void XXH3_accumulate_512_avx512( - void * RESTRICT acc, const void * RESTRICT input, - const void * RESTRICT secret) { - - __m512i * const xacc = (__m512i *) acc; +template +static FORCE_INLINE void XXH3_accumulate_512_avx512( void * RESTRICT acc, + const void * RESTRICT input, const void * RESTRICT secret ) { + __m512i * const xacc = (__m512i *)acc; /* data_vec = input[0]; */ - __m512i const data_vec = bswap ? - mm512_bswap64(_mm512_loadu_si512 (input)) : - _mm512_loadu_si512 (input); + __m512i const data_vec = bswap ? + mm512_bswap64(_mm512_loadu_si512(input)) : + _mm512_loadu_si512(input); /* key_vec = secret[0]; */ - __m512i const key_vec = bswap ? - mm512_bswap64(_mm512_loadu_si512 (secret)) : - _mm512_loadu_si512 (secret); + __m512i const key_vec = bswap ? 
+ mm512_bswap64(_mm512_loadu_si512(secret)) : + _mm512_loadu_si512(secret); /* data_key = data_vec ^ key_vec; */ - __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + __m512i const data_key = _mm512_xor_si512(data_vec, key_vec); /* data_key_lo = data_key >> 32; */ - __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); + __m512i const data_key_lo = _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ - __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + __m512i const product = _mm512_mul_epu32(data_key, data_key_lo); /* xacc[0] += swap(data_vec); */ - __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); - __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ *xacc = _mm512_add_epi64(product, sum); } -template < bool bswap > -static FORCE_INLINE void XXH3_scrambleAcc_avx512( - void * RESTRICT acc, const void * RESTRICT secret) { - __m512i* const xacc = (__m512i*) acc; - const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); +template +static FORCE_INLINE void XXH3_scrambleAcc_avx512( void * RESTRICT acc, const void * RESTRICT secret ) { + __m512i * const xacc = (__m512i *)acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); /* xacc[0] ^= (xacc[0] >> 47) */ - __m512i const acc_vec = *xacc; - __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); - __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64(acc_vec, 47); + __m512i const data_vec = _mm512_xor_si512(acc_vec , shifted); /* xacc[0] ^= secret; */ - __m512i const key_vec = bswap ? 
- mm512_bswap64(_mm512_loadu_si512 (secret)) : - _mm512_loadu_si512 (secret); - __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + __m512i const key_vec = bswap ? + mm512_bswap64(_mm512_loadu_si512(secret)) : + _mm512_loadu_si512(secret); + __m512i const data_key = _mm512_xor_si512(data_vec, key_vec); /* xacc[0] *= XXH_PRIME32_1; */ - __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); - __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); - __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + __m512i const data_key_hi = _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); + __m512i const prod_lo = _mm512_mul_epu32(data_key , prime32); + __m512i const prod_hi = _mm512_mul_epu32(data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); } @@ -85,23 +84,25 @@ static FORCE_INLINE void XXH3_scrambleAcc_avx512( // // fwojcik: Make this GCC-only, since it explicitly supports // union-based type punning, which is otherwise Undefined Behavior -template < bool bswap > -static FORCE_INLINE void XXH3_initCustomSecret_avx512( - void * RESTRICT customSecret, uint64_t seed64) { - int const nbRounds = XXH3_SECRET_DEFAULT_SIZE / sizeof(__m512i); - __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((uint64_t)seed64), 0xAA, (uint64_t)(UINT64_C(0) - seed64)); +template +static FORCE_INLINE void XXH3_initCustomSecret_avx512( void * RESTRICT customSecret, uint64_t seed64 ) { + int const nbRounds = XXH3_SECRET_DEFAULT_SIZE / sizeof(__m512i); + __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64( + (uint64_t)seed64), 0xAA, (uint64_t)(UINT64_C(0) - seed64)); + + const __m512i * const src = (const __m512i *)((const void *)XXH3_kSecret); + __m512i * const dest = (__m512i * )customSecret; - const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret); - __m512i* const dest = ( __m512i*) customSecret; for (int i = 0; i < 
nbRounds; ++i) { #if defined(__GNUC__) || !defined(__clang__) union { - const __m512i* cp; - void* p; + const __m512i * cp; + void * p; } remote_const_void; remote_const_void.cp = src + i; if (bswap) { - dest[i] = mm512_bswap64(_mm512_add_epi64(mm512_bswap64(_mm512_stream_load_si512(remote_const_void.p)), seed)); + dest[i] = mm512_bswap64(_mm512_add_epi64(mm512_bswap64(_mm512_stream_load_si512( + remote_const_void.p)), seed)); } else { dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed); } diff --git a/hashes/xxhash/xxh3-ppc.h b/hashes/xxhash/xxh3-ppc.h index d8d21e97..f013dc36 100644 --- a/hashes/xxhash/xxh3-ppc.h +++ b/hashes/xxhash/xxh3-ppc.h @@ -40,25 +40,31 @@ * inconsistent intrinsics, spotty coverage, and multiple endiannesses. */ -typedef __vector unsigned long long xxh_u64x2; -typedef __vector unsigned char xxh_u8x16; -typedef __vector unsigned int xxh_u32x4; +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned int xxh_u32x4; #if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) -# define XXH_vec_revb vec_revb + #define XXH_vec_revb vec_revb #else + // A polyfill for POWER9's vec_revb(). -static FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) { - xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, - 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; +static FORCE_INLINE xxh_u64x2 XXH_vec_revb( xxh_u64x2 val ) { + xxh_u8x16 const vByteSwap = { + 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 + }; + return vec_perm(val, val, vByteSwap); } + #endif // Performs an unaligned vector load and byte swaps it on big endian. 
-template < bool bswap > -static FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) { +template +static FORCE_INLINE xxh_u64x2 XXH_vec_loadu( const void * ptr ) { xxh_u64x2 ret; + memcpy(&ret, ptr, sizeof(xxh_u64x2)); if (bswap) { ret = XXH_vec_revb(ret); @@ -71,51 +77,58 @@ static FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) { * * These intrinsics weren't added until GCC 8, despite existing for a while, * and they are endian dependent. Also, their meaning swap depending on version. - * */ + * + */ #if defined(__s390x__) /* s390x is always big endian, no issue on this platform */ -# define XXH_vec_mulo vec_mulo -# define XXH_vec_mule vec_mule + #define XXH_vec_mulo vec_mulo + #define XXH_vec_mule vec_mule #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) /* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ -# define XXH_vec_mulo __builtin_altivec_vmulouw -# define XXH_vec_mule __builtin_altivec_vmuleuw + #define XXH_vec_mulo __builtin_altivec_vmulouw + #define XXH_vec_mule __builtin_altivec_vmuleuw #else + /* gcc needs inline assembly */ + /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. 
*/ -static FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) { +static FORCE_INLINE xxh_u64x2 XXH_vec_mulo( xxh_u32x4 a, xxh_u32x4 b ) { xxh_u64x2 result; - __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + + __asm__ ("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); return result; } -static FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) { + +static FORCE_INLINE xxh_u64x2 XXH_vec_mule( xxh_u32x4 a, xxh_u32x4 b ) { xxh_u64x2 result; - __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + + __asm__ ("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); return result; } + #endif /* XXH_vec_mulo, XXH_vec_mule */ -template < bool bswap > -static FORCE_INLINE void XXH3_accumulate_512_vsx(void * RESTRICT acc, - const void * RESTRICT input, const void * RESTRICT secret) { +template +static FORCE_INLINE void XXH3_accumulate_512_vsx( void * RESTRICT acc, const void * RESTRICT input, + const void * RESTRICT secret ) { /* presumed aligned */ - uint32_t * const xacc = (uint32_t *) acc; - xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ - xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ - xxh_u64x2 const v32 = { 32, 32 }; + uint32_t * const xacc = (uint32_t * )acc; + xxh_u64x2 const * const xinput = (xxh_u64x2 const *)input; /* no alignment restriction */ + xxh_u64x2 const * const xsecret = (xxh_u64x2 const *)secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { /* data_vec = xinput[i]; */ - xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); /* key_vec = xsecret[i]; */ xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); xxh_u64x2 const data_key = data_vec ^ key_vec; /* shuffled = (data_key << 32) | (data_key >> 32); */ - xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + 
xxh_u32x4 const shuffled = (xxh_u32x4 )vec_rl(data_key , v32); /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); /* acc_vec = xacc[i]; */ - xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i); + xxh_u64x2 acc_vec = (xxh_u64x2 )vec_xl(0 , xacc + 4 * i); acc_vec += product; /* swap high and low halves */ @@ -129,14 +142,13 @@ static FORCE_INLINE void XXH3_accumulate_512_vsx(void * RESTRICT acc, } } -template < bool bswap > -static FORCE_INLINE void XXH3_scrambleAcc_vsx(void * RESTRICT acc, - const void * RESTRICT secret) { - xxh_u64x2* const xacc = (xxh_u64x2*) acc; - const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; +template +static FORCE_INLINE void XXH3_scrambleAcc_vsx( void * RESTRICT acc, const void * RESTRICT secret ) { + xxh_u64x2 * const xacc = (xxh_u64x2 * )acc; + const xxh_u64x2 * const xsecret = (const xxh_u64x2 *)secret; /* constants */ - xxh_u64x2 const v32 = { 32, 32 }; - xxh_u64x2 const v47 = { 47, 47 }; + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { @@ -150,7 +162,7 @@ static FORCE_INLINE void XXH3_scrambleAcc_vsx(void * RESTRICT acc, /* xacc[i] *= XXH_PRIME32_1 */ /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ - xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); xacc[i] = prod_odd + (prod_even << v32); diff --git a/hashes/xxhash/xxh3-sse2.h b/hashes/xxhash/xxh3-sse2.h index 90c42df9..afbbc907 100644 --- a/hashes/xxhash/xxh3-sse2.h +++ b/hashes/xxhash/xxh3-sse2.h @@ -30,66 +30,70 @@ * - 
xxHash homepage: https://www.xxhash.com * - xxHash source repository: https://github.com/Cyan4973/xxHash */ -template < bool bswap > -static FORCE_INLINE void XXH3_accumulate_512_sse2( - void * RESTRICT acc, const void * RESTRICT input, - const void * RESTRICT secret) { +template +static FORCE_INLINE void XXH3_accumulate_512_sse2( void * RESTRICT acc, const void * RESTRICT input, + const void * RESTRICT secret ) { /* SSE2 is just a half-scale version of the AVX2 version. */ - __m128i* const xacc = (__m128i *) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ - const __m128i* const xinput = (const __m128i *) input; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ - const __m128i* const xsecret = (const __m128i *) secret; + __m128i * const xacc = (__m128i * )acc; + /* + * Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. + */ + const __m128i * const xinput = (const __m128i *)input; + /* + * Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. + */ + const __m128i * const xsecret = (const __m128i *)secret; - for (size_t i = 0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) { /* data_vec = xinput[i]; */ - __m128i const data_vec = bswap ? - mm_bswap64(_mm_loadu_si128(xinput+i)) : - _mm_loadu_si128(xinput+i); + __m128i const data_vec = bswap ? + mm_bswap64(_mm_loadu_si128(xinput + i)) : + _mm_loadu_si128(xinput + i); /* key_vec = xsecret[i]; */ - __m128i const key_vec = bswap ? - mm_bswap64(_mm_loadu_si128(xsecret+i)) : - _mm_loadu_si128(xsecret+i); + __m128i const key_vec = bswap ? 
+ mm_bswap64(_mm_loadu_si128(xsecret + i)) : + _mm_loadu_si128(xsecret + i); /* data_key = data_vec ^ key_vec; */ - __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + __m128i const data_key = _mm_xor_si128(data_vec, key_vec); /* data_key_lo = data_key >> 32; */ - __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const data_key_lo = _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1)); /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ - __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + __m128i const product = _mm_mul_epu32(data_key, data_key_lo); /* xacc[i] += swap(data_vec); */ - __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); - __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); /* xacc[i] += product; */ xacc[i] = _mm_add_epi64(product, sum); } } -template < bool bswap > -static FORCE_INLINE void XXH3_scrambleAcc_sse2(void * RESTRICT acc, - const void * RESTRICT secret) { - __m128i* const xacc = (__m128i*) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ - const __m128i* const xsecret = (const __m128i *) secret; - const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); +template +static FORCE_INLINE void XXH3_scrambleAcc_sse2( void * RESTRICT acc, const void * RESTRICT secret ) { + __m128i * const xacc = (__m128i * )acc; + /* + * Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
+ */ + const __m128i * const xsecret = (const __m128i *)secret; + const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); - for (size_t i = 0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) { /* xacc[i] ^= (xacc[i] >> 47) */ - __m128i const acc_vec = xacc[i]; - __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); - __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64(acc_vec, 47); + __m128i const data_vec = _mm_xor_si128(acc_vec , shifted); /* xacc[i] ^= xsecret[i]; */ - __m128i const key_vec = bswap ? - mm_bswap64(_mm_loadu_si128(xsecret+i)) : - _mm_loadu_si128(xsecret+i); - __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + __m128i const key_vec = bswap ? + mm_bswap64(_mm_loadu_si128(xsecret + i)) : + _mm_loadu_si128(xsecret + i); + __m128i const data_key = _mm_xor_si128(data_vec, key_vec); /* xacc[i] *= XXH_PRIME32_1; */ - __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); - __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); - __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + __m128i const data_key_hi = _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32(data_key , prime32); + __m128i const prod_hi = _mm_mul_epu32(data_key_hi, prime32); xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); } } @@ -99,21 +103,21 @@ static FORCE_INLINE void XXH3_scrambleAcc_sse2(void * RESTRICT acc, * - not extract the secret from sse registers in the internal loop * - use less common registers, and avoid pushing these reg into stack */ -template < bool bswap > -static FORCE_INLINE void XXH3_initCustomSecret_sse2(void * RESTRICT customSecret, - uint64_t seed64) { +template +static FORCE_INLINE void XXH3_initCustomSecret_sse2( void * RESTRICT customSecret, uint64_t seed64 ) { int const nbRounds = XXH3_SECRET_DEFAULT_SIZE / 
sizeof(__m128i); /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 alignas(16) const uint64_t seed64x2[2] = { - (uint64_t)seed64, (uint64_t)(UINT64_C(0) - seed64) }; - __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); + (uint64_t)seed64, (uint64_t)(UINT64_C(0) - seed64) + }; + __m128i const seed = _mm_load_si128((__m128i const *)seed64x2); #else - __m128i const seed = _mm_set_epi64x((uint64_t)(UINT64_C(0) - seed64), (uint64_t)seed64); + __m128i const seed = _mm_set_epi64x((uint64_t)(UINT64_C(0) - seed64), (uint64_t)seed64); #endif const void * const src16 = XXH3_kSecret; - __m128i* dst16 = (__m128i*) customSecret; + __m128i * dst16 = (__m128i *)customSecret; #if defined(__GNUC__) || defined(__clang__) XXH_COMPILER_GUARD(dst16); @@ -121,9 +125,9 @@ static FORCE_INLINE void XXH3_initCustomSecret_sse2(void * RESTRICT customSecret for (int i = 0; i < nbRounds; ++i) { if (bswap) { - dst16[i] = mm_bswap64(_mm_add_epi64(mm_bswap64(_mm_load_si128((const __m128i *)src16+i)), seed)); + dst16[i] = mm_bswap64(_mm_add_epi64(mm_bswap64(_mm_load_si128((const __m128i *)src16 + i)), seed)); } else { - dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); + dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16 + i), seed); } } } diff --git a/include/common/Hashinfo.h b/include/common/Hashinfo.h index b11ebfdf..146f7bd3 100644 --- a/include/common/Hashinfo.h +++ b/include/common/Hashinfo.h @@ -19,50 +19,50 @@ #define HAVE_HASHINFO #include -#define HASH_FLAGS \ - FLAG_EXPAND(HASH_MOCK) \ - FLAG_EXPAND(HASH_CRYPTOGRAPHIC) \ - FLAG_EXPAND(HASH_CRYPTOGRAPHIC_WEAK) \ - FLAG_EXPAND(HASH_CRC_BASED) \ - FLAG_EXPAND(HASH_AES_BASED) \ - FLAG_EXPAND(HASH_CLMUL_BASED) \ - FLAG_EXPAND(HASH_LOOKUP_TABLE) \ - FLAG_EXPAND(HASH_XL_SEED) \ - FLAG_EXPAND(HASH_SMALL_SEED) \ - FLAG_EXPAND(HASH_NO_SEED) \ - FLAG_EXPAND(HASH_SYSTEM_SPECIFIC) \ - FLAG_EXPAND(HASH_ENDIAN_INDEPENDENT) \ 
+#define HASH_FLAGS \ + FLAG_EXPAND(HASH_MOCK) \ + FLAG_EXPAND(HASH_CRYPTOGRAPHIC) \ + FLAG_EXPAND(HASH_CRYPTOGRAPHIC_WEAK) \ + FLAG_EXPAND(HASH_CRC_BASED) \ + FLAG_EXPAND(HASH_AES_BASED) \ + FLAG_EXPAND(HASH_CLMUL_BASED) \ + FLAG_EXPAND(HASH_LOOKUP_TABLE) \ + FLAG_EXPAND(HASH_XL_SEED) \ + FLAG_EXPAND(HASH_SMALL_SEED) \ + FLAG_EXPAND(HASH_NO_SEED) \ + FLAG_EXPAND(HASH_SYSTEM_SPECIFIC) \ + FLAG_EXPAND(HASH_ENDIAN_INDEPENDENT) \ FLAG_EXPAND(HASH_FLOATING_POINT) -#define IMPL_FLAGS \ - FLAG_EXPAND(IMPL_SANITY_FAILS) \ - FLAG_EXPAND(IMPL_SLOW) \ - FLAG_EXPAND(IMPL_VERY_SLOW) \ - FLAG_EXPAND(IMPL_READ_PAST_EOB) \ - FLAG_EXPAND(IMPL_TYPE_PUNNING) \ - FLAG_EXPAND(IMPL_INCREMENTAL) \ - FLAG_EXPAND(IMPL_INCREMENTAL_DIFFERENT) \ - FLAG_EXPAND(IMPL_128BIT) \ - FLAG_EXPAND(IMPL_MULTIPLY) \ - FLAG_EXPAND(IMPL_MULTIPLY_64_64) \ - FLAG_EXPAND(IMPL_MULTIPLY_64_128) \ - FLAG_EXPAND(IMPL_MULTIPLY_128_128) \ - FLAG_EXPAND(IMPL_ROTATE) \ - FLAG_EXPAND(IMPL_ROTATE_VARIABLE) \ - FLAG_EXPAND(IMPL_SHIFT_VARIABLE) \ - FLAG_EXPAND(IMPL_DIVIDE) \ - FLAG_EXPAND(IMPL_MODULUS) \ - FLAG_EXPAND(IMPL_ASM) \ - FLAG_EXPAND(IMPL_CANONICAL_LE) \ - FLAG_EXPAND(IMPL_CANONICAL_BE) \ - FLAG_EXPAND(IMPL_SEED_WITH_HINT) \ - FLAG_EXPAND(IMPL_LICENSE_PUBLIC_DOMAIN) \ - FLAG_EXPAND(IMPL_LICENSE_BSD) \ - FLAG_EXPAND(IMPL_LICENSE_MIT) \ - FLAG_EXPAND(IMPL_LICENSE_ZLIB) \ +#define IMPL_FLAGS \ + FLAG_EXPAND(IMPL_SANITY_FAILS) \ + FLAG_EXPAND(IMPL_SLOW) \ + FLAG_EXPAND(IMPL_VERY_SLOW) \ + FLAG_EXPAND(IMPL_READ_PAST_EOB) \ + FLAG_EXPAND(IMPL_TYPE_PUNNING) \ + FLAG_EXPAND(IMPL_INCREMENTAL) \ + FLAG_EXPAND(IMPL_INCREMENTAL_DIFFERENT) \ + FLAG_EXPAND(IMPL_128BIT) \ + FLAG_EXPAND(IMPL_MULTIPLY) \ + FLAG_EXPAND(IMPL_MULTIPLY_64_64) \ + FLAG_EXPAND(IMPL_MULTIPLY_64_128) \ + FLAG_EXPAND(IMPL_MULTIPLY_128_128) \ + FLAG_EXPAND(IMPL_ROTATE) \ + FLAG_EXPAND(IMPL_ROTATE_VARIABLE) \ + FLAG_EXPAND(IMPL_SHIFT_VARIABLE) \ + FLAG_EXPAND(IMPL_DIVIDE) \ + FLAG_EXPAND(IMPL_MODULUS) \ + FLAG_EXPAND(IMPL_ASM) \ + 
FLAG_EXPAND(IMPL_CANONICAL_LE) \ + FLAG_EXPAND(IMPL_CANONICAL_BE) \ + FLAG_EXPAND(IMPL_SEED_WITH_HINT) \ + FLAG_EXPAND(IMPL_LICENSE_PUBLIC_DOMAIN) \ + FLAG_EXPAND(IMPL_LICENSE_BSD) \ + FLAG_EXPAND(IMPL_LICENSE_MIT) \ + FLAG_EXPAND(IMPL_LICENSE_ZLIB) \ FLAG_EXPAND(IMPL_LICENSE_GPL3) -#define FLAG_EXPAND(name) FLAG_ENUM_##name, +#define FLAG_EXPAND(name) FLAG_ENUM_ ## name, typedef enum { HASH_FLAGS } hashflag_enum_t; @@ -71,7 +71,7 @@ typedef enum { } implflag_enum_t; #undef FLAG_EXPAND -#define FLAG_EXPAND(name) FLAG_##name=(1ULL << FLAG_ENUM_##name), +#define FLAG_EXPAND(name) FLAG_ ## name = (1ULL << FLAG_ENUM_ ## name), typedef enum : uint64_t { HASH_FLAGS } HashFlags; @@ -83,16 +83,16 @@ typedef enum : uint64_t { //----------------------------------------------------------------------------- class HashInfo; -typedef bool (*HashInitFn)(void); -typedef seed_t (*HashSeedfixFn)(const HashInfo * hinfo, const seed_t seed); -typedef uintptr_t (*HashSeedFn)(const seed_t seed); -typedef void (*HashFn)(const void * in, const size_t len, const seed_t seed, void * out); +typedef bool (* HashInitFn)( void ); +typedef seed_t (* HashSeedfixFn)( const HashInfo * hinfo, const seed_t seed ); +typedef uintptr_t (* HashSeedFn)( const seed_t seed ); +typedef void (* HashFn)( const void * in, const size_t len, const seed_t seed, void * out ); -seed_t excludeBadseeds(const HashInfo * hinfo, const seed_t seed); -seed_t excludeZeroSeed(const HashInfo * hinfo, const seed_t seed); +seed_t excludeBadseeds( const HashInfo * hinfo, const seed_t seed ); +seed_t excludeZeroSeed( const HashInfo * hinfo, const seed_t seed ); class HashInfo { - friend class HashFamilyInfo; + friend class HashFamilyInfo; public: enum endianness : uint32_t { @@ -105,20 +105,21 @@ class HashInfo { }; protected: - static const char * _fixup_name(const char * in); + static const char * _fixup_name( const char * in ); private: - uint32_t _ComputedVerifyImpl(const HashInfo * hinfo, enum HashInfo::endianness endian) 
const; + uint32_t _ComputedVerifyImpl( const HashInfo * hinfo, enum HashInfo::endianness endian ) const; - bool _is_native(enum endianness e) const { + bool _is_native( enum endianness e ) const { bool is_native = true; - switch(e) { + + switch (e) { case ENDIAN_NATIVE : is_native = true; break; case ENDIAN_BYTESWAPPED: is_native = false; break; case ENDIAN_LITTLE : is_native = isLE(); break; case ENDIAN_BIG : is_native = isBE(); break; case ENDIAN_DEFAULT : /* fallthrough */ - case ENDIAN_NONDEFAULT : + case ENDIAN_NONDEFAULT : { // Compute is_native for the DEFAULT case if (hash_flags & FLAG_HASH_ENDIAN_INDEPENDENT) { if (impl_flags & FLAG_IMPL_CANONICAL_LE) { @@ -133,62 +134,64 @@ class HashInfo { if (e == ENDIAN_NONDEFAULT) { is_native = !is_native; } break; } + } return is_native; } public: - const char * family; - const char * name; - const char * desc; - uint64_t hash_flags; - uint64_t impl_flags; - uint32_t sort_order; - uint32_t bits; - uint32_t verification_LE; - uint32_t verification_BE; - HashInitFn initfn; - HashSeedfixFn seedfixfn; - HashSeedFn seedfn; - HashFn hashfn_native; - HashFn hashfn_bswap; - std::set badseeds; - - HashInfo(const char * n, const char * f) : - name(_fixup_name(n)), family(f), desc(""), - initfn(NULL), seedfixfn(NULL), seedfn(NULL), - hashfn_native(NULL), hashfn_bswap(NULL) { } + const char * family; + const char * name; + const char * desc; + uint64_t hash_flags; + uint64_t impl_flags; + uint32_t sort_order; + uint32_t bits; + uint32_t verification_LE; + uint32_t verification_BE; + HashInitFn initfn; + HashSeedfixFn seedfixfn; + HashSeedFn seedfn; + HashFn hashfn_native; + HashFn hashfn_bswap; + std::set badseeds; + + HashInfo( const char * n, const char * f ) : + name( _fixup_name( n )), family( f ), desc( "" ), + initfn( NULL ), seedfixfn( NULL ), seedfn( NULL ), + hashfn_native( NULL ), hashfn_bswap( NULL ) {} ~HashInfo() { free((char *)name); } // The hash will be seeded with a value of 0 before this fn returns - uint32_t 
ComputedVerify(enum HashInfo::endianness endian) const { + uint32_t ComputedVerify( enum HashInfo::endianness endian ) const { return _ComputedVerifyImpl(this, endian); } - uint32_t ExpectedVerify(enum HashInfo::endianness endian) const { + uint32_t ExpectedVerify( enum HashInfo::endianness endian ) const { const bool wantLE = isBE() ^ _is_native(endian); + return wantLE ? this->verification_LE : this->verification_BE; } - FORCE_INLINE HashFn hashFn(enum HashInfo::endianness endian) const { + FORCE_INLINE HashFn hashFn( enum HashInfo::endianness endian ) const { return _is_native(endian) ? hashfn_native : hashfn_bswap; } - FORCE_INLINE bool Init(void) const { + FORCE_INLINE bool Init( void ) const { if (initfn != NULL) { return initfn(); } return true; } - FORCE_INLINE seed_t Seed(seed_t seed, bool force = false, uint64_t hint = 0) const { + FORCE_INLINE seed_t Seed( seed_t seed, bool force = false, uint64_t hint = 0 ) const { if (unlikely(impl_flags & FLAG_IMPL_SEED_WITH_HINT)) { seedfixfn(NULL, hint); return seed; } - if (!force && seedfixfn != NULL) { + if (!force && (seedfixfn != NULL)) { seed = seedfixfn(this, seed); } if (seedfn != NULL) { @@ -200,46 +203,46 @@ class HashInfo { return seed; } - FORCE_INLINE bool isMock(void) const { + FORCE_INLINE bool isMock( void ) const { return !!(hash_flags & FLAG_HASH_MOCK); } - FORCE_INLINE bool is32BitSeed(void) const { + FORCE_INLINE bool is32BitSeed( void ) const { return !!(hash_flags & FLAG_HASH_SMALL_SEED); } - FORCE_INLINE bool isEndianDefined(void) const { + FORCE_INLINE bool isEndianDefined( void ) const { return !!(hash_flags & FLAG_HASH_ENDIAN_INDEPENDENT); } - FORCE_INLINE bool isCrypto(void) const { + FORCE_INLINE bool isCrypto( void ) const { return !!(hash_flags & FLAG_HASH_CRYPTOGRAPHIC); } - FORCE_INLINE bool isSlow(void) const { + FORCE_INLINE bool isSlow( void ) const { return !!(impl_flags & (FLAG_IMPL_SLOW | FLAG_IMPL_VERY_SLOW)); } - FORCE_INLINE bool isVerySlow(void) const { + FORCE_INLINE bool 
isVerySlow( void ) const { return !!(impl_flags & FLAG_IMPL_VERY_SLOW); } -}; +}; // class HashInfo class HashFamilyInfo { -public: - const char * name; - const char * src_url; - enum SrcStatus : uint32_t { - SRC_UNKNOWN, - SRC_FROZEN, // Very unlikely to change - SRC_STABLEISH, // Fairly unlikely to change - SRC_ACTIVE, // Likely to change - } src_status; - - HashFamilyInfo(const char * n) : - name(_fixup_name(n)), - src_url(NULL), src_status(SRC_UNKNOWN) { } - -private: - static const char * _fixup_name(const char * in); -}; + public: + const char * name; + const char * src_url; + enum SrcStatus : uint32_t { + SRC_UNKNOWN, + SRC_FROZEN, // Very unlikely to change + SRC_STABLEISH, // Fairly unlikely to change + SRC_ACTIVE, // Likely to change + } src_status; + + HashFamilyInfo( const char * n ) : + name( _fixup_name( n )), + src_url( NULL ), src_status( SRC_UNKNOWN ) {} + + private: + static const char * _fixup_name( const char * in ); +}; // class HashFamilyInfo diff --git a/include/common/Intrinsics.h b/include/common/Intrinsics.h index 44922189..5b1ecef4 100644 --- a/include/common/Intrinsics.h +++ b/include/common/Intrinsics.h @@ -28,33 +28,33 @@ #pragma once #if defined(HAVE_X86INTRIN) -# include + #include #elif defined(HAVE_AMMINTRIN) -# include + #include #elif defined(HAVE_IMMINTRIN) -# include + #include #endif #if defined(HAVE_ARM_NEON) - /* circumvent a clang bug */ -# if defined(__GNUC__) || defined(__clang__) -# if defined(__ARM_NEON__) || defined(__ARM_NEON) || \ +/* circumvent a clang bug */ + #if defined(__GNUC__) || defined(__clang__) + #if defined(__ARM_NEON__) || defined(__ARM_NEON) || \ defined(__aarch64__) || defined(_M_ARM) || \ defined(_M_ARM64) || defined(_M_ARM64EC) -# define inline __inline__ -# endif -# endif -# include -# if defined(__GNUC__) || defined(__clang__) -# if defined(__ARM_NEON__) || defined(__ARM_NEON) || \ + #define inline __inline__ + #endif + #endif + #include + #if defined(__GNUC__) || defined(__clang__) + #if 
defined(__ARM_NEON__) || defined(__ARM_NEON) || \ defined(__aarch64__) || defined(_M_ARM) || \ defined(_M_ARM64) || defined(_M_ARM64EC) -# undef inline -# endif -# endif -# if defined(HAVE_ARM_ACLE) -# include -# endif + #undef inline + #endif + #endif + #if defined(HAVE_ARM_ACLE) + #include + #endif #endif @@ -69,46 +69,46 @@ * * We use pragma push_macro/pop_macro to keep the namespace clean. */ -#pragma push_macro("bool") -#pragma push_macro("vector") -#pragma push_macro("pixel") + #pragma push_macro("bool") + #pragma push_macro("vector") + #pragma push_macro("pixel") /* silence potential macro redefined warnings */ -#undef bool -#undef vector -#undef pixel - -#if defined(__s390x__) -# include -#else -# include -#endif + #undef bool + #undef vector + #undef pixel + + #if defined(__s390x__) + #include + #else + #include + #endif /* Restore the original macro values, if applicable. */ -#pragma pop_macro("pixel") -#pragma pop_macro("vector") -#pragma pop_macro("bool") + #pragma pop_macro("pixel") + #pragma pop_macro("vector") + #pragma pop_macro("bool") -#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__)) + #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__)) typedef __vector unsigned char vec_t; -#define vec_encrypt(a,b) __vcipher(a,b); -#define vec_encryptlast(a,b) __vcipherlast(a,b); -#define vec_decrypt(a,b) __vncipher(a,b); -#define vec_encryptlast(a,b) __vncipherlast(a,b); -#elif defined(__clang__) + #define vec_encrypt(a, b) __vcipher(a, b); + #define vec_encryptlast(a, b) __vcipherlast(a, b); + #define vec_decrypt(a, b) __vncipher(a, b); + #define vec_encryptlast(a, b) __vncipherlast(a, b); + #elif defined(__clang__) typedef __vector unsigned long long vec_t; -#define vec_encrypt(a,b) __builtin_altivec_crypto_vcipher(a, b); -#define vec_encryptlast(a,b) __builtin_altivec_crypto_vcipherlast(a, b); -#define vec_decrypt(a,b) __builtin_altivec_crypto_vncipher(a, b); -#define vec_decryptlast(a,b) __builtin_altivec_crypto_vncipherlast(a, b); 
-#elif defined(__GNUC__) + #define vec_encrypt(a, b) __builtin_altivec_crypto_vcipher(a, b); + #define vec_encryptlast(a, b) __builtin_altivec_crypto_vcipherlast(a, b); + #define vec_decrypt(a, b) __builtin_altivec_crypto_vncipher(a, b); + #define vec_decryptlast(a, b) __builtin_altivec_crypto_vncipherlast(a, b); + #elif defined(__GNUC__) typedef __vector unsigned long long vec_t; -#define vec_encrypt(a,b) __builtin_crypto_vcipher(a,b); -#define vec_encryptlast(a,b) __builtin_crypto_vcipherlast(a,b); -#define vec_decrypt(a,b) __builtin_crypto_vncipher(a,b); -#define vec_decryptlast(a,b) __builtin_crypto_vncipherlast(a,b); -#else -#error "PPC AES intrinsic mapping unimplemented" -#endif + #define vec_encrypt(a, b) __builtin_crypto_vcipher(a, b); + #define vec_encryptlast(a, b) __builtin_crypto_vcipherlast(a, b); + #define vec_decrypt(a, b) __builtin_crypto_vncipher(a, b); + #define vec_decryptlast(a, b) __builtin_crypto_vncipherlast(a, b); + #else + #error "PPC AES intrinsic mapping unimplemented" + #endif #endif //------------------------------------------------------------ @@ -117,121 +117,127 @@ typedef __vector unsigned long long vec_t; // prefetch() implementation without this. 
#if defined(HAVE_SSE_2) -#undef prefetch -#define prefetch(x) _mm_prefetch(x, _MM_HINT_T0) + #undef prefetch + #define prefetch(x) _mm_prefetch(x, _MM_HINT_T0) #endif //------------------------------------------------------------ // Vectorized byteswapping #if defined(HAVE_ARM_NEON) -static FORCE_INLINE uint64x2_t Vbswap64_u64(const uint64x2_t v) { + +static FORCE_INLINE uint64x2_t Vbswap64_u64( const uint64x2_t v ) { return vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(v))); } -static FORCE_INLINE uint32x4_t Vbswap32_u32(const uint32x4_t v) { + +static FORCE_INLINE uint32x4_t Vbswap32_u32( const uint32x4_t v ) { return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(v))); } + #endif #if defined(HAVE_AVX512_BW) -static FORCE_INLINE __m512i mm512_bswap64(const __m512i v) { - const __m512i MASK = _mm512_set_epi64(UINT64_C(0x08090a0b0c0d0e0f), - UINT64_C(0x0001020304050607), - UINT64_C(0x08090a0b0c0d0e0f), - UINT64_C(0x0001020304050607), - UINT64_C(0x08090a0b0c0d0e0f), - UINT64_C(0x0001020304050607), - UINT64_C(0x08090a0b0c0d0e0f), - UINT64_C(0x0001020304050607)); + +static FORCE_INLINE __m512i mm512_bswap64( const __m512i v ) { + const __m512i MASK = _mm512_set_epi64(UINT64_C(0x08090a0b0c0d0e0f), UINT64_C(0x0001020304050607), + UINT64_C(0x08090a0b0c0d0e0f), UINT64_C(0x0001020304050607), UINT64_C(0x08090a0b0c0d0e0f), + UINT64_C(0x0001020304050607), UINT64_C(0x08090a0b0c0d0e0f), UINT64_C(0x0001020304050607)); + return _mm512_shuffle_epi8(v, MASK); } -static FORCE_INLINE __m512i mm512_bswap32(const __m512i v) { - const __m512i MASK = _mm512_set_epi64(UINT64_C(0x0c0d0e0f08090a0b), - UINT64_C(0x0405060700010203), - UINT64_C(0x0c0d0e0f08090a0b), - UINT64_C(0x0405060700010203), - UINT64_C(0x0c0d0e0f08090a0b), - UINT64_C(0x0405060700010203), - UINT64_C(0x0c0d0e0f08090a0b), - UINT64_C(0x0405060700010203)); + +static FORCE_INLINE __m512i mm512_bswap32( const __m512i v ) { + const __m512i MASK = _mm512_set_epi64(UINT64_C(0x0c0d0e0f08090a0b), 
UINT64_C(0x0405060700010203), + UINT64_C(0x0c0d0e0f08090a0b), UINT64_C(0x0405060700010203), UINT64_C(0x0c0d0e0f08090a0b), + UINT64_C(0x0405060700010203), UINT64_C(0x0c0d0e0f08090a0b), UINT64_C(0x0405060700010203)); + return _mm512_shuffle_epi8(v, MASK); } + #elif defined(HAVE_AVX512_F) -static FORCE_INLINE __m512i mm512_bswap64(const __m512i v) { + +static FORCE_INLINE __m512i mm512_bswap64( const __m512i v ) { // Byteswapping 256 bits at a time, since _mm512_shuffle_epi8() // requires AVX512-BW in addition to AVX512-F. - const __m256i MASK = _mm256_set_epi64x(UINT64_C(0x08090a0b0c0d0e0f), - UINT64_C(0x0001020304050607), - UINT64_C(0x08090a0b0c0d0e0f), - UINT64_C(0x0001020304050607)); - __m256i blk1 = _mm512_extracti64x4_epi64(v, 0); - __m256i blk2 = _mm512_extracti64x4_epi64(v, 1); + const __m256i MASK = _mm256_set_epi64x(UINT64_C(0x08090a0b0c0d0e0f), UINT64_C(0x0001020304050607), + UINT64_C(0x08090a0b0c0d0e0f), UINT64_C(0x0001020304050607)); + __m256i blk1 = _mm512_extracti64x4_epi64(v, 0); + __m256i blk2 = _mm512_extracti64x4_epi64(v, 1); + blk1 = _mm256_shuffle_epi8(blk1, MASK); blk2 = _mm256_shuffle_epi8(blk2, MASK); - v = _mm512_inserti64x4(v, blk1, 0); - v = _mm512_inserti64x4(v, blk2, 1); + v = _mm512_inserti64x4(v, blk1, 0); + v = _mm512_inserti64x4(v, blk2, 1); return v; } -static FORCE_INLINE __m512i mm512_bswap64(const __m512i v) { + +static FORCE_INLINE __m512i mm512_bswap64( const __m512i v ) { // Byteswapping 256 bits at a time, since _mm512_shuffle_epi8() // requires AVX512-BW in addition to AVX512-F. 
- const __m256i MASK = _mm256_set_epi64x(UINT64_C(0x0c0d0e0f08090a0b), - UINT64_C(0x0405060700010203), - UINT64_C(0x0c0d0e0f08090a0b), - UINT64_C(0x0405060700010203)); - __m256i blk1 = _mm512_extracti64x4_epi64(v, 0); - __m256i blk2 = _mm512_extracti64x4_epi64(v, 1); + const __m256i MASK = _mm256_set_epi64x(UINT64_C(0x0c0d0e0f08090a0b), UINT64_C(0x0405060700010203), + UINT64_C(0x0c0d0e0f08090a0b), UINT64_C(0x0405060700010203)); + __m256i blk1 = _mm512_extracti64x4_epi64(v, 0); + __m256i blk2 = _mm512_extracti64x4_epi64(v, 1); + blk1 = _mm256_shuffle_epi8(blk1, MASK); blk2 = _mm256_shuffle_epi8(blk2, MASK); - v = _mm512_inserti64x4(v, blk1, 0); - v = _mm512_inserti64x4(v, blk2, 1); + v = _mm512_inserti64x4(v, blk1, 0); + v = _mm512_inserti64x4(v, blk2, 1); return v; } + #endif #if defined(HAVE_AVX2) -static FORCE_INLINE __m256i mm256_bswap64(const __m256i v) { - const __m256i MASK = _mm256_set_epi64x(UINT64_C(0x08090a0b0c0d0e0f), - UINT64_C(0x0001020304050607), - UINT64_C(0x08090a0b0c0d0e0f), - UINT64_C(0x0001020304050607)); + +static FORCE_INLINE __m256i mm256_bswap64( const __m256i v ) { + const __m256i MASK = _mm256_set_epi64x(UINT64_C(0x08090a0b0c0d0e0f), UINT64_C(0x0001020304050607), + UINT64_C(0x08090a0b0c0d0e0f), UINT64_C(0x0001020304050607)); + return _mm256_shuffle_epi8(v, MASK); } -static FORCE_INLINE __m256i mm256_bswap32(const __m256i v) { - const __m256i MASK = _mm256_set_epi64x(UINT64_C(0x0c0d0e0f08090a0b), - UINT64_C(0x0405060700010203), - UINT64_C(0x0c0d0e0f08090a0b), - UINT64_C(0x0405060700010203)); + +static FORCE_INLINE __m256i mm256_bswap32( const __m256i v ) { + const __m256i MASK = _mm256_set_epi64x(UINT64_C(0x0c0d0e0f08090a0b), UINT64_C(0x0405060700010203), + UINT64_C(0x0c0d0e0f08090a0b), UINT64_C(0x0405060700010203)); + return _mm256_shuffle_epi8(v, MASK); } + #endif #if defined(HAVE_SSSE_3) -static FORCE_INLINE __m128i mm_bswap64(const __m128i v) { - const __m128i MASK = _mm_set_epi64x(UINT64_C(0x08090a0b0c0d0e0f), - 
UINT64_C(0x0001020304050607)); + +static FORCE_INLINE __m128i mm_bswap64( const __m128i v ) { + const __m128i MASK = _mm_set_epi64x(UINT64_C(0x08090a0b0c0d0e0f), UINT64_C(0x0001020304050607)); + return _mm_shuffle_epi8(v, MASK); } -static FORCE_INLINE __m128i mm_bswap32(const __m128i v) { - const __m128i MASK = _mm_set_epi64x(UINT64_C(0x0c0d0e0f08090a0b), - UINT64_C(0x0405060700010203)); + +static FORCE_INLINE __m128i mm_bswap32( const __m128i v ) { + const __m128i MASK = _mm_set_epi64x(UINT64_C(0x0c0d0e0f08090a0b), UINT64_C(0x0405060700010203)); + return _mm_shuffle_epi8(v, MASK); } + #elif defined(HAVE_SSE_2) -static FORCE_INLINE __m128i mm_bswap64(const __m128i v) { + +static FORCE_INLINE __m128i mm_bswap64( const __m128i v ) { // Swap each pair of bytes - __m128i tmp = _mm_or_si128(_mm_slri_epi16(v, 8), - _mm_slli_epi16(v, 8)); + __m128i tmp = _mm_or_si128(_mm_slri_epi16(v, 8), _mm_slli_epi16(v, 8)); + // Swap 16-bit words tmp = _mm_shufflelo_epi16(tmp, _MM_SHUFFLE(0, 1, 2, 3)); tmp = _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(0, 1, 2, 3)); } -static FORCE_INLINE __m128i mm_bswap32(const __m128i v) { + +static FORCE_INLINE __m128i mm_bswap32( const __m128i v ) { // Swap each pair of bytes - __m128i tmp = _mm_or_si128(_mm_slri_epi16(v, 8), - _mm_slli_epi16(v, 8)); + __m128i tmp = _mm_or_si128(_mm_slri_epi16(v, 8), _mm_slli_epi16(v, 8)); + // Swap 16-bit words tmp = _mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)); tmp = _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)); } + #endif diff --git a/include/hashlib/AES-aesni.h b/include/hashlib/AES-aesni.h index 57fc59a0..7879d30d 100644 --- a/include/hashlib/AES-aesni.h +++ b/include/hashlib/AES-aesni.h @@ -31,6 +31,7 @@ */ static inline __m128i _expand_key_helper( __m128i rkey, __m128i assist ) { __m128i temp; + temp = _mm_slli_si128(rkey, 0x4); rkey = _mm_xor_si128(rkey, temp); temp = _mm_slli_si128(temp, 0x4); @@ -39,16 +40,17 @@ static inline __m128i _expand_key_helper( __m128i rkey, __m128i assist ) { rkey = 
_mm_xor_si128(rkey, temp); temp = _mm_shuffle_epi32(assist, 0xff); - rkey = _mm_xor_si128 (rkey, temp); + rkey = _mm_xor_si128(rkey, temp); return rkey; } #define MKASSIST(x, y) x, _mm_aeskeygenassist_si128(x, y) -static int AES_KeySetup_Enc_AESNI(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t cipherKey[], int keyBits) { +static int AES_KeySetup_Enc_AESNI( uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t cipherKey[], int keyBits ) { __m128i * round_keys = (__m128i *)rk; - round_keys[ 0] = _mm_loadu_si128((__m128i *)cipherKey); + + round_keys[0] = _mm_loadu_si128((__m128i *)cipherKey); round_keys[ 1] = _expand_key_helper(MKASSIST(round_keys[0], 0x01)); round_keys[ 2] = _expand_key_helper(MKASSIST(round_keys[1], 0x02)); round_keys[ 3] = _expand_key_helper(MKASSIST(round_keys[2], 0x04)); @@ -62,19 +64,20 @@ static int AES_KeySetup_Enc_AESNI(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t cip return (keyBits == 128) ? 10 : (keyBits == 192) ? 12 : (keyBits == 256) ? 14 : 0; } -static int AES_KeySetup_Dec_AESNI(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t cipherKey[], int keyBits) { +static int AES_KeySetup_Dec_AESNI( uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t cipherKey[], int keyBits ) { __m128i * round_keys = (__m128i *)rk; + round_keys[10] = _mm_loadu_si128((__m128i *)cipherKey); - round_keys[ 9] = _expand_key_helper(MKASSIST(round_keys[10], 0x01)); - round_keys[ 8] = _expand_key_helper(MKASSIST(round_keys[ 9], 0x02)); - round_keys[ 7] = _expand_key_helper(MKASSIST(round_keys[ 8], 0x04)); - round_keys[ 6] = _expand_key_helper(MKASSIST(round_keys[ 7], 0x08)); - round_keys[ 5] = _expand_key_helper(MKASSIST(round_keys[ 6], 0x10)); - round_keys[ 4] = _expand_key_helper(MKASSIST(round_keys[ 5], 0x20)); - round_keys[ 3] = _expand_key_helper(MKASSIST(round_keys[ 4], 0x40)); - round_keys[ 2] = _expand_key_helper(MKASSIST(round_keys[ 3], 0x80)); - round_keys[ 1] = _expand_key_helper(MKASSIST(round_keys[ 2], 0x1b)); - round_keys[ 0] = _expand_key_helper(MKASSIST(round_keys[ 1], 0x36)); + 
round_keys[9] = _expand_key_helper(MKASSIST(round_keys[10], 0x01)); + round_keys[8] = _expand_key_helper(MKASSIST(round_keys[ 9], 0x02)); + round_keys[7] = _expand_key_helper(MKASSIST(round_keys[ 8], 0x04)); + round_keys[6] = _expand_key_helper(MKASSIST(round_keys[ 7], 0x08)); + round_keys[5] = _expand_key_helper(MKASSIST(round_keys[ 6], 0x10)); + round_keys[4] = _expand_key_helper(MKASSIST(round_keys[ 5], 0x20)); + round_keys[3] = _expand_key_helper(MKASSIST(round_keys[ 4], 0x40)); + round_keys[2] = _expand_key_helper(MKASSIST(round_keys[ 3], 0x80)); + round_keys[1] = _expand_key_helper(MKASSIST(round_keys[ 2], 0x1b)); + round_keys[0] = _expand_key_helper(MKASSIST(round_keys[ 1], 0x36)); for (int i = 1; i < 10; i++) { round_keys[i] = _mm_aesimc_si128(round_keys[i]); } @@ -83,48 +86,54 @@ static int AES_KeySetup_Dec_AESNI(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t cip #undef MKASSIST -template < int Nr > -static inline void AES_Encrypt_AESNI(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_t pt[16], uint8_t ct[16]) { +template +static inline void AES_Encrypt_AESNI( const uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t pt[16], uint8_t ct[16] ) { const __m128i * round_keys = (const __m128i *)rk; - __m128i tmp; - tmp = _mm_loadu_si128((const __m128i*)pt); + __m128i tmp; + + tmp = _mm_loadu_si128((const __m128i *)pt); tmp = _mm_xor_si128(tmp, round_keys[0]); - for (int j = 1; j < Nr; j++) + for (int j = 1; j < Nr; j++) { tmp = _mm_aesenc_si128(tmp, round_keys[j]); + } tmp = _mm_aesenclast_si128(tmp, round_keys[Nr]); - _mm_storeu_si128((((__m128i*)ct)), tmp); + _mm_storeu_si128((((__m128i *)ct)), tmp); } -template < int Nr > -static inline void AES_Decrypt_AESNI(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_t ct[16], uint8_t pt[16]) { +template +static inline void AES_Decrypt_AESNI( const uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t ct[16], uint8_t pt[16] ) { const __m128i * round_keys = (const __m128i *)rk; - __m128i tmp; - tmp = _mm_loadu_si128((const __m128i*)ct); + 
__m128i tmp; + + tmp = _mm_loadu_si128((const __m128i *)ct); tmp = _mm_xor_si128(tmp, round_keys[0]); - for (int j = 1; j < Nr; j++) + for (int j = 1; j < Nr; j++) { tmp = _mm_aesdec_si128(tmp, round_keys[j]); + } tmp = _mm_aesdeclast_si128(tmp, round_keys[Nr]); - _mm_storeu_si128((((__m128i*)pt)), tmp); + _mm_storeu_si128((((__m128i *)pt)), tmp); } -static inline void AES_EncryptRound_AESNI(const uint32_t rk[4], uint8_t block[16]) { +static inline void AES_EncryptRound_AESNI( const uint32_t rk[4], uint8_t block[16] ) { const __m128i round_key = _mm_loadu_si128((const __m128i *)rk); - __m128i tmp = _mm_loadu_si128((__m128i *)block); + __m128i tmp = _mm_loadu_si128((__m128i *)block ); + tmp = _mm_aesenc_si128(tmp, round_key); - _mm_storeu_si128((((__m128i*)block)), tmp); + _mm_storeu_si128((((__m128i *)block)), tmp); } -static void AES_DecryptRound_AESNI(const uint32_t rk[4], uint8_t block[16]) { +static void AES_DecryptRound_AESNI( const uint32_t rk[4], uint8_t block[16] ) { const __m128i round_key = _mm_loadu_si128((const __m128i *)rk); - __m128i tmp = _mm_loadu_si128((__m128i *)block); + __m128i tmp = _mm_loadu_si128((__m128i *)block ); + tmp = _mm_aesdec_si128(tmp, round_key); - _mm_storeu_si128((((__m128i*)block)), tmp); + _mm_storeu_si128((((__m128i *)block)), tmp); } diff --git a/include/hashlib/AES-arm.h b/include/hashlib/AES-arm.h index f1fd7d4d..cbb9ec7c 100644 --- a/include/hashlib/AES-arm.h +++ b/include/hashlib/AES-arm.h @@ -29,57 +29,59 @@ * * For more information, please refer to */ -template < int Nr > -static inline void AES_Encrypt_ARM(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_t pt[16], uint8_t ct[16]) { - uint8x16_t block = vld1q_u8(pt); - const uint8_t * keys = (const uint8_t *)rk; +template +static inline void AES_Encrypt_ARM( const uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t pt[16], uint8_t ct[16] ) { + uint8x16_t block = vld1q_u8(pt); + const uint8_t * keys = (const uint8_t *)rk; // AES single round encryption - block = vaeseq_u8(block, 
vld1q_u8(keys+0*16)); + block = vaeseq_u8(block, vld1q_u8(keys + 0 * 16)); for (int i = 1; i < Nr; i++) { // AES mix columns block = vaesmcq_u8(block); // AES single round encryption - block = vaeseq_u8(block, vld1q_u8(keys+i*16)); + block = vaeseq_u8(block, vld1q_u8(keys + i * 16)); } // Final xor - block = veorq_u8(block, vld1q_u8(keys+Nr*16)); + block = veorq_u8(block, vld1q_u8(keys + Nr * 16)); vst1q_u8(ct, block); } -template < int Nr > -static inline void AES_Decrypt_ARM(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_t ct[16], uint8_t pt[16]) { - uint8x16_t block = vld1q_u8(ct); - const uint8_t * keys = (const uint8_t *)rk; +template +static inline void AES_Decrypt_ARM( const uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t ct[16], uint8_t pt[16] ) { + uint8x16_t block = vld1q_u8(ct); + const uint8_t * keys = (const uint8_t *)rk; // AES single round decryption - block = vaesdq_u8(block, vld1q_u8(keys+0*16)); + block = vaesdq_u8(block, vld1q_u8(keys + 0 * 16)); for (int i = 1; i < Nr; i++) { // AES inverse mix columns block = vaesimcq_u8(block); // AES single round decryption - block = vaesdq_u8(block, vld1q_u8(keys+i*16)); + block = vaesdq_u8(block, vld1q_u8(keys + i * 16)); } // Final xor - block = veorq_u8(block, vld1q_u8(keys+Nr*16)); + block = veorq_u8(block, vld1q_u8(keys + Nr * 16)); vst1q_u8(pt, block); } -static inline void AES_EncryptRound_ARM(const uint32_t rk[4], uint8_t block[16]) { +static inline void AES_EncryptRound_ARM( const uint32_t rk[4], uint8_t block[16] ) { uint8x16_t tmp = vld1q_u8(block); + tmp = vaeseq_u8(tmp, vld1q_u8((const uint8_t *)rk)); tmp = vaesmcq_u8(tmp); vst1q_u8(block, tmp); } -static inline void AES_DecryptRound_ARM(const uint32_t rk[4], uint8_t block[16]) { +static inline void AES_DecryptRound_ARM( const uint32_t rk[4], uint8_t block[16] ) { uint8x16_t tmp = vld1q_u8(block); + tmp = vaesdq_u8(tmp, vld1q_u8((const uint8_t *)rk)); tmp = vaesimcq_u8(tmp); vst1q_u8(block, tmp); diff --git a/include/hashlib/AES-portable.h 
b/include/hashlib/AES-portable.h index 68abdfc3..4d35522d 100644 --- a/include/hashlib/AES-portable.h +++ b/include/hashlib/AES-portable.h @@ -25,10 +25,11 @@ extern const uint32_t Te0[256], Te1[256], Te2[256], Te3[256], Te4[256]; extern const uint32_t Td0[256], Td1[256], Td2[256], Td3[256], Td4[256]; -static const uint32_t rcon[] = { +/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ +static const uint32_t rcon[10] = { 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000, - 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ + 0x1B000000, 0x36000000, }; /* Endian-independent macros */ @@ -37,7 +38,7 @@ static const uint32_t rcon[] = { ((uint32_t)(pt)[1] << 16) ^ \ ((uint32_t)(pt)[2] << 8) ^ \ ((uint32_t)(pt)[3] ) ) -#define PUTU32(ct, st) { \ +#define PUTU32(ct, st) { \ (ct)[0] = (uint8_t)((st) >> 24); \ (ct)[1] = (uint8_t)((st) >> 16); \ (ct)[2] = (uint8_t)((st) >> 8); \ @@ -48,24 +49,24 @@ static const uint32_t rcon[] = { * * Returns the number of rounds for the given cipher key size. 
*/ -static int AES_KeySetup_Enc_portable(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t cipherKey[], int keyBits) { - int i = 0; +static int AES_KeySetup_Enc_portable( uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t cipherKey[], int keyBits ) { + int i = 0; uint32_t temp; rk[0] = GETU32(cipherKey ); - rk[1] = GETU32(cipherKey + 4); - rk[2] = GETU32(cipherKey + 8); + rk[1] = GETU32(cipherKey + 4 ); + rk[2] = GETU32(cipherKey + 8 ); rk[3] = GETU32(cipherKey + 12); if (keyBits == 128) { for (;;) { temp = rk[3]; rk[4] = rk[0] ^ - (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ - (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ - (Te4[(temp ) & 0xff] & 0x0000ff00) ^ - (Te4[(temp >> 24) ] & 0x000000ff) ^ - rcon[i]; + (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ + (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ + (Te4[(temp ) & 0xff] & 0x0000ff00) ^ + (Te4[(temp >> 24) ] & 0x000000ff) ^ + rcon[i]; rk[5] = rk[1] ^ rk[4]; rk[6] = rk[2] ^ rk[5]; rk[7] = rk[3] ^ rk[6]; @@ -81,22 +82,22 @@ static int AES_KeySetup_Enc_portable(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t if (keyBits == 192) { for (;;) { - temp = rk[ 5]; - rk[ 6] = rk[ 0] ^ - (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ - (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ - (Te4[(temp ) & 0xff] & 0x0000ff00) ^ - (Te4[(temp >> 24) ] & 0x000000ff) ^ - rcon[i]; - rk[ 7] = rk[ 1] ^ rk[ 6]; - rk[ 8] = rk[ 2] ^ rk[ 7]; - rk[ 9] = rk[ 3] ^ rk[ 8]; + temp = rk[5]; + rk[6] = rk[0] ^ + (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ + (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ + (Te4[(temp ) & 0xff] & 0x0000ff00) ^ + (Te4[(temp >> 24) ] & 0x000000ff) ^ + rcon[i]; + rk[ 7] = rk[1] ^ rk[ 6]; + rk[ 8] = rk[2] ^ rk[ 7]; + rk[ 9] = rk[3] ^ rk[ 8]; if (++i == 8) { return 12; } - rk[10] = rk[ 4] ^ rk[ 9]; - rk[11] = rk[ 5] ^ rk[10]; - rk += 6; + rk[10] = rk[4] ^ rk[ 9]; + rk[11] = rk[5] ^ rk[10]; + rk += 6; } } @@ -105,30 +106,30 @@ static int AES_KeySetup_Enc_portable(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t if (keyBits == 256) { for (;;) { - temp = rk[ 7]; - rk[ 8] = rk[ 0] ^ - 
(Te4[(temp >> 16) & 0xff] & 0xff000000) ^ - (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ - (Te4[(temp ) & 0xff] & 0x0000ff00) ^ - (Te4[(temp >> 24) ] & 0x000000ff) ^ - rcon[i]; - rk[ 9] = rk[ 1] ^ rk[ 8]; - rk[10] = rk[ 2] ^ rk[ 9]; - rk[11] = rk[ 3] ^ rk[10]; + temp = rk[7]; + rk[8] = rk[0] ^ + (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ + (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ + (Te4[(temp ) & 0xff] & 0x0000ff00) ^ + (Te4[(temp >> 24) ] & 0x000000ff) ^ + rcon[i]; + rk[ 9] = rk[1] ^ rk[ 8]; + rk[10] = rk[2] ^ rk[ 9]; + rk[11] = rk[3] ^ rk[10]; if (++i == 7) { return 14; } - temp = rk[11]; - rk[12] = rk[ 4] ^ - (Te4[(temp >> 24) ] & 0xff000000) ^ - (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^ - (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^ - (Te4[(temp ) & 0xff] & 0x000000ff); - rk[13] = rk[ 5] ^ rk[12]; - rk[14] = rk[ 6] ^ rk[13]; - rk[15] = rk[ 7] ^ rk[14]; - - rk += 8; + temp = rk[11]; + rk[12] = rk[4] ^ + (Te4[(temp >> 24) ] & 0xff000000) ^ + (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[(temp ) & 0xff] & 0x000000ff); + rk[13] = rk[5] ^ rk[12]; + rk[14] = rk[6] ^ rk[13]; + rk[15] = rk[7] ^ rk[14]; + + rk += 8; } } @@ -140,15 +141,15 @@ static int AES_KeySetup_Enc_portable(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t * * Returns the number of rounds for the given cipher key size. 
*/ -static int AES_KeySetup_Dec_portable(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t cipherKey[], int keyBits) { - int Nr, i, j; +static int AES_KeySetup_Dec_portable( uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t cipherKey[], int keyBits ) { + int Nr, i, j; uint32_t temp; /* expand the cipher key: */ Nr = AES_KeySetup_Dec_portable(rk, cipherKey, keyBits); /* invert the order of the round keys: */ - for (i = 0, j = 4*Nr; i < j; i += 4, j -= 4) { + for (i = 0, j = 4 * Nr; i < j; i += 4, j -= 4) { temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp; temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp; temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp; @@ -157,35 +158,35 @@ static int AES_KeySetup_Dec_portable(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t /* apply the inverse MixColumn transform to all round keys but the first and the last: */ for (i = 1; i < Nr; i++) { - rk += 4; + rk += 4; rk[0] = - Td0[Te4[(rk[0] >> 24) ] & 0xff] ^ - Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^ - Td2[Te4[(rk[0] >> 8) & 0xff] & 0xff] ^ - Td3[Te4[(rk[0] ) & 0xff] & 0xff]; + Td0[Te4[(rk[0] >> 24) ] & 0xff] ^ + Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^ + Td2[Te4[(rk[0] >> 8) & 0xff] & 0xff] ^ + Td3[Te4[(rk[0] ) & 0xff] & 0xff]; rk[1] = - Td0[Te4[(rk[1] >> 24) ] & 0xff] ^ - Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^ - Td2[Te4[(rk[1] >> 8) & 0xff] & 0xff] ^ - Td3[Te4[(rk[1] ) & 0xff] & 0xff]; + Td0[Te4[(rk[1] >> 24) ] & 0xff] ^ + Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^ + Td2[Te4[(rk[1] >> 8) & 0xff] & 0xff] ^ + Td3[Te4[(rk[1] ) & 0xff] & 0xff]; rk[2] = - Td0[Te4[(rk[2] >> 24) ] & 0xff] ^ - Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^ - Td2[Te4[(rk[2] >> 8) & 0xff] & 0xff] ^ - Td3[Te4[(rk[2] ) & 0xff] & 0xff]; + Td0[Te4[(rk[2] >> 24) ] & 0xff] ^ + Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^ + Td2[Te4[(rk[2] >> 8) & 0xff] & 0xff] ^ + Td3[Te4[(rk[2] ) & 0xff] & 0xff]; rk[3] = - Td0[Te4[(rk[3] >> 24) ] & 0xff] ^ - Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^ - Td2[Te4[(rk[3] >> 8) & 0xff] & 0xff] ^ - 
Td3[Te4[(rk[3] ) & 0xff] & 0xff]; + Td0[Te4[(rk[3] >> 24) ] & 0xff] ^ + Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^ + Td2[Te4[(rk[3] >> 8) & 0xff] & 0xff] ^ + Td3[Te4[(rk[3] ) & 0xff] & 0xff]; } return Nr; } -template < int Nr > -static void AES_Encrypt_portable(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_t pt[16], uint8_t ct[16]) { - //STATIC_ASSERT(Nr >=1 && Nr <= 14); +template +static void AES_Encrypt_portable( const uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t pt[16], uint8_t ct[16] ) { + // STATIC_ASSERT(Nr >=1 && Nr <= 14); uint32_t s0, s1, s2, s3, t0, t1, t2, t3; /* @@ -199,94 +200,94 @@ static void AES_Encrypt_portable(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_ /* round 1: */ if (Nr > 1) { - t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4]; - t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5]; - t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6]; - t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7]; + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[4]; + t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[5]; + t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[6]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[7]; } /* round 2: */ if (Nr > 2) { - s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8]; - s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9]; - s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10]; - s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11]; + s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8]; 
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9]; + s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10]; + s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11]; } /* round 3: */ if (Nr > 3) { - t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12]; - t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13]; - t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14]; - t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15]; + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12]; + t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13]; + t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15]; } /* round 4: */ if (Nr > 4) { - s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16]; - s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17]; - s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18]; - s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19]; + s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16]; + s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17]; + s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18]; + s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19]; } /* round 5: */ if (Nr > 5) { - t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 
0xff] ^ Te3[s3 & 0xff] ^ rk[20]; - t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21]; - t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22]; - t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23]; + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20]; + t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21]; + t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23]; } /* round 6: */ if (Nr > 6) { - s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24]; - s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25]; - s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26]; - s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27]; + s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24]; + s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25]; + s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26]; + s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27]; } /* round 7: */ if (Nr > 7) { - t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28]; - t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29]; - t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30]; - t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31]; + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) 
& 0xff] ^ Te3[s3 & 0xff] ^ rk[28]; + t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29]; + t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31]; } /* round 8: */ if (Nr > 8) { - s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32]; - s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33]; - s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34]; - s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35]; + s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32]; + s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33]; + s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34]; + s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35]; } /* round 9: */ if (Nr > 9) { - t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36]; - t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37]; - t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38]; - t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39]; + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36]; + t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37]; + t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39]; } /* round 10: */ if (Nr > 10) { - s0 = Te0[t0 >> 24] ^ 
Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40]; - s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41]; - s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42]; - s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43]; + s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40]; + s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41]; + s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42]; + s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43]; } /* round 11: */ if (Nr > 11) { - t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44]; - t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45]; - t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46]; - t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47]; + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44]; + t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45]; + t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47]; } /* round 12: */ if (Nr > 12) { - s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48]; - s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49]; - s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50]; - s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51]; + s0 = Te0[t0 >> 
24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48]; + s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49]; + s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50]; + s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51]; } /* round 13: */ if (Nr > 13) { - t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52]; - t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53]; - t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54]; - t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55]; + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52]; + t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53]; + t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55]; } rk += Nr << 2; @@ -303,38 +304,39 @@ static void AES_Encrypt_portable(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_ * map cipher state to byte array block: */ s0 = - (Te4[(t0 >> 24) ] & 0xff000000) ^ - (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^ - (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^ - (Te4[(t3 ) & 0xff] & 0x000000ff) ^ - rk[0]; - PUTU32(ct , s0); + (Te4[(t0 >> 24) ] & 0xff000000) ^ + (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[(t3 ) & 0xff] & 0x000000ff) ^ + rk[0]; s1 = - (Te4[(t1 >> 24) ] & 0xff000000) ^ - (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^ - (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^ - (Te4[(t0 ) & 0xff] & 0x000000ff) ^ - rk[1]; - PUTU32(ct + 4, s1); + (Te4[(t1 >> 24) ] & 0xff000000) ^ + (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(t3 >> 8) & 0xff] & 
0x0000ff00) ^ + (Te4[(t0 ) & 0xff] & 0x000000ff) ^ + rk[1]; s2 = - (Te4[(t2 >> 24) ] & 0xff000000) ^ - (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^ - (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^ - (Te4[(t1 ) & 0xff] & 0x000000ff) ^ - rk[2]; - PUTU32(ct + 8, s2); + (Te4[(t2 >> 24) ] & 0xff000000) ^ + (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[(t1 ) & 0xff] & 0x000000ff) ^ + rk[2]; s3 = - (Te4[(t3 >> 24) ] & 0xff000000) ^ - (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^ - (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^ - (Te4[(t2 ) & 0xff] & 0x000000ff) ^ - rk[3]; + (Te4[(t3 >> 24) ] & 0xff000000) ^ + (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[(t2 ) & 0xff] & 0x000000ff) ^ + rk[3]; + + PUTU32(ct , s0); + PUTU32(ct + 4, s1); + PUTU32(ct + 8, s2); PUTU32(ct + 12, s3); } -template < int Nr > -static void AES_Decrypt_portable(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_t ct[16], uint8_t pt[16]) { - //STATIC_ASSERT(Nr >=1 && Nr <= 14); +template +static void AES_Decrypt_portable( const uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t ct[16], uint8_t pt[16] ) { + // STATIC_ASSERT(Nr >=1 && Nr <= 14); uint32_t s0, s1, s2, s3, t0, t1, t2, t3; /* @@ -348,94 +350,94 @@ static void AES_Decrypt_portable(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_ /* round 1: */ if (Nr > 1) { - t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4]; - t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5]; - t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6]; - t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7]; + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[4]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[5]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ 
Td3[s3 & 0xff] ^ rk[6]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[7]; } /* round 2: */ if (Nr > 2) { - s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8]; - s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9]; - s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10]; - s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11]; + s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8]; + s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9]; + s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10]; + s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11]; } /* round 3: */ if (Nr > 3) { - t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12]; - t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13]; - t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14]; - t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15]; + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15]; } /* round 4: */ if (Nr > 4) { - s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16]; - s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17]; - s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 
0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18]; - s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19]; + s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16]; + s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17]; + s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18]; + s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19]; } /* round 5: */ if (Nr > 5) { - t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20]; - t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21]; - t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22]; - t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23]; + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23]; } /* round 6: */ if (Nr > 6) { - s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24]; - s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25]; - s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26]; - s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27]; + s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24]; + s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25]; + s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) 
& 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26]; + s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27]; } /* round 7: */ if (Nr > 7) { - t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28]; - t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29]; - t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30]; - t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31]; + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31]; } /* round 8: */ if (Nr > 8) { - s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32]; - s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33]; - s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34]; - s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35]; + s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32]; + s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33]; + s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34]; + s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35]; } /* round 9: */ if (Nr > 9) { - t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36]; - t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37]; - t2 
= Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38]; - t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39]; + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39]; } /* round 10: */ if (Nr > 10) { - s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40]; - s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41]; - s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42]; - s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43]; + s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40]; + s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41]; + s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42]; + s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43]; } /* round 11: */ if (Nr > 11) { - t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44]; - t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45]; - t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46]; - t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47]; + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ 
rk[45]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47]; } /* round 12: */ if (Nr > 12) { - s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48]; - s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49]; - s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50]; - s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51]; + s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48]; + s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49]; + s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50]; + s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51]; } /* round 13: */ if (Nr > 13) { - t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52]; - t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53]; - t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54]; - t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55]; + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55]; } rk += Nr << 2; @@ -452,36 +454,37 @@ static void AES_Decrypt_portable(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_ * map cipher state to byte array block: */ s0 = - (Td4[(t0 
>> 24) ] & 0xff000000) ^ - (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^ - (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^ - (Td4[(t1 ) & 0xff] & 0x000000ff) ^ - rk[0]; - PUTU32(pt , s0); + (Td4[(t0 >> 24) ] & 0xff000000) ^ + (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^ + (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^ + (Td4[(t1 ) & 0xff] & 0x000000ff) ^ + rk[0]; s1 = - (Td4[(t1 >> 24) ] & 0xff000000) ^ - (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^ - (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^ - (Td4[(t2 ) & 0xff] & 0x000000ff) ^ - rk[1]; - PUTU32(pt + 4, s1); + (Td4[(t1 >> 24) ] & 0xff000000) ^ + (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^ + (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^ + (Td4[(t2 ) & 0xff] & 0x000000ff) ^ + rk[1]; s2 = - (Td4[(t2 >> 24) ] & 0xff000000) ^ - (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^ - (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^ - (Td4[(t3 ) & 0xff] & 0x000000ff) ^ - rk[2]; - PUTU32(pt + 8, s2); + (Td4[(t2 >> 24) ] & 0xff000000) ^ + (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^ + (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^ + (Td4[(t3 ) & 0xff] & 0x000000ff) ^ + rk[2]; s3 = - (Td4[(t3 >> 24) ] & 0xff000000) ^ - (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^ - (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^ - (Td4[(t0 ) & 0xff] & 0x000000ff) ^ - rk[3]; + (Td4[(t3 >> 24) ] & 0xff000000) ^ + (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^ + (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^ + (Td4[(t0 ) & 0xff] & 0x000000ff) ^ + rk[3]; + + PUTU32(pt , s0); + PUTU32(pt + 4, s1); + PUTU32(pt + 8, s2); PUTU32(pt + 12, s3); } -static void AES_EncryptRound_portable(const uint32_t rk[4], uint8_t block[16]) { +static void AES_EncryptRound_portable( const uint32_t rk[4], uint8_t block[16] ) { uint32_t s0, s1, s2, s3, t0, t1, t2, t3; s0 = GETU32(block ); @@ -490,29 +493,29 @@ static void AES_EncryptRound_portable(const uint32_t rk[4], uint8_t block[16]) { s3 = GETU32(block + 12); t0 = - (Te0[(s0 >> 24) ] & 0xff000000) ^ - (Te1[(s1 >> 16) & 0xff] & 0x00ff0000) ^ - (Te2[(s2 >> 8) & 0xff] & 0x0000ff00) ^ - (Te3[(s3 ) & 0xff] & 0x000000ff) ^ - 
rk[0]; + (Te0[(s0 >> 24) ] & 0xff000000) ^ + (Te1[(s1 >> 16) & 0xff] & 0x00ff0000) ^ + (Te2[(s2 >> 8) & 0xff] & 0x0000ff00) ^ + (Te3[(s3 ) & 0xff] & 0x000000ff) ^ + rk[0]; t1 = - (Te0[(s1 >> 24) ] & 0xff000000) ^ - (Te1[(s2 >> 16) & 0xff] & 0x00ff0000) ^ - (Te2[(s3 >> 8) & 0xff] & 0x0000ff00) ^ - (Te3[(s0 ) & 0xff] & 0x000000ff) ^ - rk[1]; + (Te0[(s1 >> 24) ] & 0xff000000) ^ + (Te1[(s2 >> 16) & 0xff] & 0x00ff0000) ^ + (Te2[(s3 >> 8) & 0xff] & 0x0000ff00) ^ + (Te3[(s0 ) & 0xff] & 0x000000ff) ^ + rk[1]; t2 = - (Te0[(s2 >> 24) ] & 0xff000000) ^ - (Te1[(s3 >> 16) & 0xff] & 0x00ff0000) ^ - (Te2[(s0 >> 8) & 0xff] & 0x0000ff00) ^ - (Te3[(s1 ) & 0xff] & 0x000000ff) ^ - rk[2]; + (Te0[(s2 >> 24) ] & 0xff000000) ^ + (Te1[(s3 >> 16) & 0xff] & 0x00ff0000) ^ + (Te2[(s0 >> 8) & 0xff] & 0x0000ff00) ^ + (Te3[(s1 ) & 0xff] & 0x000000ff) ^ + rk[2]; t3 = - (Te0[(s3 >> 24) ] & 0xff000000) ^ - (Te1[(s0 >> 16) & 0xff] & 0x00ff0000) ^ - (Te2[(s1 >> 8) & 0xff] & 0x0000ff00) ^ - (Te3[(s2 ) & 0xff] & 0x000000ff) ^ - rk[3]; + (Te0[(s3 >> 24) ] & 0xff000000) ^ + (Te1[(s0 >> 16) & 0xff] & 0x00ff0000) ^ + (Te2[(s1 >> 8) & 0xff] & 0x0000ff00) ^ + (Te3[(s2 ) & 0xff] & 0x000000ff) ^ + rk[3]; PUTU32(block , t0); PUTU32(block + 4, t1); @@ -520,7 +523,7 @@ static void AES_EncryptRound_portable(const uint32_t rk[4], uint8_t block[16]) { PUTU32(block + 12, t3); } -static void AES_DecryptRound_portable(const uint32_t rk[4], uint8_t block[16]) { +static void AES_DecryptRound_portable( const uint32_t rk[4], uint8_t block[16] ) { uint32_t s0, s1, s2, s3, t0, t1, t2, t3; s0 = GETU32(block ); @@ -529,29 +532,29 @@ static void AES_DecryptRound_portable(const uint32_t rk[4], uint8_t block[16]) { s3 = GETU32(block + 12); t0 = - Td0[(s0 >> 24) ] ^ - Td1[(s3 >> 16) & 0xff] ^ - Td2[(s2 >> 8) & 0xff] ^ - Td3[(s1 ) & 0xff] ^ - rk[0]; + Td0[(s0 >> 24) ] ^ + Td1[(s3 >> 16) & 0xff] ^ + Td2[(s2 >> 8) & 0xff] ^ + Td3[(s1 ) & 0xff] ^ + rk[0]; t1 = - Td0[(s1 >> 24) ] ^ - Td1[(s0 >> 16) & 0xff] ^ - Td2[(s3 >> 8) & 0xff] ^ - 
Td3[(s2 ) & 0xff] ^ - rk[1]; + Td0[(s1 >> 24) ] ^ + Td1[(s0 >> 16) & 0xff] ^ + Td2[(s3 >> 8) & 0xff] ^ + Td3[(s2 ) & 0xff] ^ + rk[1]; t2 = - Td0[(s2 >> 24) ] ^ - Td1[(s1 >> 16) & 0xff] ^ - Td2[(s0 >> 8) & 0xff] ^ - Td3[(s3 ) & 0xff] ^ - rk[2]; + Td0[(s2 >> 24) ] ^ + Td1[(s1 >> 16) & 0xff] ^ + Td2[(s0 >> 8) & 0xff] ^ + Td3[(s3 ) & 0xff] ^ + rk[2]; t3 = - Td0[(s3 >> 24) ] ^ - Td1[(s2 >> 16) & 0xff] ^ - Td2[(s1 >> 8) & 0xff] ^ - Td3[(s0 ) & 0xff] ^ - rk[3]; + Td0[(s3 >> 24) ] ^ + Td1[(s2 >> 16) & 0xff] ^ + Td2[(s1 >> 8) & 0xff] ^ + Td3[(s0 ) & 0xff] ^ + rk[3]; PUTU32(block , t0); PUTU32(block + 4, t1); diff --git a/include/hashlib/AES-ppc.h b/include/hashlib/AES-ppc.h index 693c5ab2..b2e3730c 100644 --- a/include/hashlib/AES-ppc.h +++ b/include/hashlib/AES-ppc.h @@ -18,46 +18,50 @@ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -template < int Nr > -static inline void AES_Encrypt_PPC(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_t pt[16], uint8_t ct[16]) { +template +static inline void AES_Encrypt_PPC( const uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t pt[16], uint8_t ct[16] ) { const uint8_t * keys = (const uint8_t *)rk; vec_t block = (vec_t)vec_vsx_ld(0, pt); + block = vec_xor(block, (vec_t)vec_vsx_ld(0, keys)); for (int i = 1; i < Nr; i++) { - block = vec_encrypt(block, (vec_t)vec_vsx_ld(i*16, keys)); + block = vec_encrypt(block, (vec_t)vec_vsx_ld(i * 16, keys)); } - block = vec_encryptlast(block, (vec_t)vec_vsx_ld(Nr*16, keys)); + block = vec_encryptlast(block, (vec_t)vec_vsx_ld(Nr * 16, keys)); vec_vsx_st((__vector unsigned char)block, 0, ct); } -template < int Nr > -static inline void AES_Decrypt_PPC(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_t ct[16], uint8_t pt[16]) { +template +static inline void AES_Decrypt_PPC( const uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t ct[16], uint8_t pt[16] ) { const uint8_t * keys = (const uint8_t *)rk; vec_t block = (vec_t)vec_vsx_ld(0, ct); + block 
= vec_xor(block, (vec_t)vec_vsx_ld(0, keys)); for (int i = 1; i < Nr; i++) { - block = vec_decrypt(block, (vec_t)vec_vsx_ld(i*16, keys)); + block = vec_decrypt(block, (vec_t)vec_vsx_ld(i * 16, keys)); } - block = vec_decryptlast(block, (vec_t)vec_vsx_ld(Nr*16, keys)); + block = vec_decryptlast(block, (vec_t)vec_vsx_ld(Nr * 16, keys)); vec_vsx_st((__vector unsigned char)block, 0, pt); } -static inline void AES_EncryptRound_PPC(const uint32_t rk[4], uint8_t block[16]) { +static inline void AES_EncryptRound_PPC( const uint32_t rk[4], uint8_t block[16] ) { vec_t tmp = (vec_t)vec_vsx_ld(0, block); + tmp = vec_encrypt(tmp, (vec_t)vec_vsx_ld(0, (const uint8_t *)rk)); vec_vsx_st((__vector unsigned char)tmp, 0, block); } -static inline void AES_DecryptRound_PPC(const uint32_t rk[4], uint8_t block[16]) { +static inline void AES_DecryptRound_PPC( const uint32_t rk[4], uint8_t block[16] ) { vec_t tmp = (vec_t)vec_vsx_ld(0, block); + tmp = vec_decrypt(tmp, (vec_t)vec_vsx_ld(0, (const uint8_t *)rk)); vec_vsx_st((__vector unsigned char)tmp, 0, block); } diff --git a/include/hashlib/AES.h b/include/hashlib/AES.h index b5d55b14..1cde9b89 100644 --- a/include/hashlib/AES.h +++ b/include/hashlib/AES.h @@ -35,31 +35,31 @@ #include "Intrinsics.h" #if defined(HAVE_X86_64_AES) -# include "AES-aesni.h" + #include "AES-aesni.h" #elif defined(HAVE_ARM_AES) -# include "AES-arm.h" -# include "AES-portable.h" // ARM doesn't have any AES keygen intrinsics + #include "AES-arm.h" + #include "AES-portable.h" // ARM doesn't have any AES keygen intrinsics #elif defined(HAVE_PPC_AES) -# include "AES-ppc.h" -# include "AES-portable.h" // PPC doesn't really have any AES keygen intrinsics + #include "AES-ppc.h" + #include "AES-portable.h" // PPC doesn't really have any AES keygen intrinsics #else -# include "AES-portable.h" + #include "AES-portable.h" #endif -static inline void _bswap_subkeys(uint32_t rk[], int subkeys) { +static inline void _bswap_subkeys( uint32_t rk[], int subkeys ) { for (int i = 0; 
i < subkeys; i++) { rk[i] = COND_BSWAP(rk[i], true); } } -static int AES_KeySetup_Enc(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t cipherKey[], int keyBits) { +static int AES_KeySetup_Enc( uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t cipherKey[], int keyBits ) { // STATIC_ASSERT(keyBits == 128); #if defined(HAVE_X86_64_AES) return AES_KeySetup_Enc_AESNI(rk, cipherKey, keyBits); #elif defined(HAVE_ARM_AES) int Nr = AES_KeySetup_Enc_portable(rk, cipherKey, keyBits); if (isLE()) { - _bswap_subkeys(rk, 4*(Nr+1)); + _bswap_subkeys(rk, 4 * (Nr + 1)); } return Nr; #else @@ -67,14 +67,14 @@ static int AES_KeySetup_Enc(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t cipherKey #endif } -static int AES_KeySetup_Dec(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t cipherKey[], int keyBits) { +static int AES_KeySetup_Dec( uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t cipherKey[], int keyBits ) { // STATIC_ASSERT(keyBits == 128); #if defined(HAVE_X86_64_AES) return AES_KeySetup_Dec_AESNI(rk, cipherKey, keyBits); #elif defined(HAVE_ARM_AES) int Nr = AES_KeySetup_Dec_portable(rk, cipherKey, keyBits); if (isLE()) { - _bswap_subkeys(rk, 4*(Nr+1)); + _bswap_subkeys(rk, 4 * (Nr + 1)); } return Nr; #else @@ -82,8 +82,8 @@ static int AES_KeySetup_Dec(uint32_t rk[/*4*(Nr + 1)*/], const uint8_t cipherKey #endif } -template < int Nr > -static void AES_Encrypt(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_t pt[16], uint8_t ct[16]) { +template +static void AES_Encrypt( const uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t pt[16], uint8_t ct[16] ) { #if defined(HAVE_X86_64_AES) AES_Encrypt_AESNI(rk, pt, ct); #elif defined(HAVE_ARM_AES) @@ -95,8 +95,8 @@ static void AES_Encrypt(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_t pt[16], #endif } -template < int Nr > -static void AES_Decrypt(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_t ct[16], uint8_t pt[16]) { +template +static void AES_Decrypt( const uint32_t rk[] /*4*(Nr + 1)*/, const uint8_t ct[16], uint8_t pt[16] ) { #if defined(HAVE_X86_64_AES) 
AES_Decrypt_AESNI(rk, pt, ct); #elif defined(HAVE_ARM_AES) @@ -108,7 +108,7 @@ static void AES_Decrypt(const uint32_t rk[/*4*(Nr + 1)*/], const uint8_t ct[16], #endif } -static void AES_EncryptRound(const uint32_t rk[4], uint8_t block[16]) { +static void AES_EncryptRound( const uint32_t rk[4], uint8_t block[16] ) { #if defined(HAVE_X86_64_AES) AES_EncryptRound_AESNI(rk, block); #elif defined(HAVE_ARM_AES) @@ -120,7 +120,7 @@ static void AES_EncryptRound(const uint32_t rk[4], uint8_t block[16]) { #endif } -static void AES_DecryptRound(const uint32_t rk[4], uint8_t block[16]) { +static void AES_DecryptRound( const uint32_t rk[4], uint8_t block[16] ) { #if defined(HAVE_X86_64_AES) AES_DecryptRound_AESNI(rk, block); #elif defined(HAVE_ARM_AES) diff --git a/include/hashlib/Hashlib.h b/include/hashlib/Hashlib.h index 5d341a98..339e1a2f 100644 --- a/include/hashlib/Hashlib.h +++ b/include/hashlib/Hashlib.h @@ -20,42 +20,43 @@ #include -unsigned register_hash(const HashInfo * hinfo); +// Interface for hash implementations +unsigned register_hash( const HashInfo * hinfo ); -const HashInfo * findHash(const char * name); -std::vector findAllHashes(void); -void listHashes(bool nameonly); +// Interface for consumer for getting hashes +const HashInfo * findHash( const char * name ); +std::vector findAllHashes( void ); +void listHashes( bool nameonly ); -bool verifyAllHashes(bool verbose); -bool verifyHash(const HashInfo * hinfo, - enum HashInfo::endianness endian, - bool verbose, bool prefix); +// Interface for ensuring hash is giving expected results +bool verifyAllHashes( bool verbose ); +bool verifyHash( const HashInfo * hinfo, enum HashInfo::endianness endian, bool verbose, bool prefix ); //----------------------------------------------------------------------------- -#define CONCAT_INNER(x, y) x##y -#define CONCAT(x,y) CONCAT_INNER(x, y) - -#define REGISTER_FAMILY(N, ...) 
\ - static_assert(sizeof(#N) > 1, \ - "REGISTER_FAMILY() needs a non-empty name"); \ - static HashFamilyInfo THIS_HASH_FAMILY = []{ \ - HashFamilyInfo $(#N); \ - __VA_ARGS__; \ - return $; \ - }(); \ +#define CONCAT_INNER(x, y) x ## y +#define CONCAT(x, y) CONCAT_INNER(x, y) + +#define REGISTER_FAMILY(N, ...) \ + static_assert(sizeof(#N) > 1, \ + "REGISTER_FAMILY() needs a non-empty name"); \ + static HashFamilyInfo THIS_HASH_FAMILY = []{ \ + HashFamilyInfo $(#N); \ + __VA_ARGS__; \ + return $; \ + }(); \ unsigned CONCAT(N,_ref) -#define REGISTER_HASH(N, ...) \ - static_assert(sizeof(#N) > 1, \ - "REGISTER_HASH() needs a non-empty name"); \ - static HashInfo CONCAT(Hash_,N) = []{ \ - HashInfo $(#N, THIS_HASH_FAMILY.name); \ - __VA_ARGS__; \ - register_hash(&$); \ - return $; \ +#define REGISTER_HASH(N, ...) \ + static_assert(sizeof(#N) > 1, \ + "REGISTER_HASH() needs a non-empty name"); \ + static HashInfo CONCAT(Hash_,N) = []{ \ + HashInfo $(#N, THIS_HASH_FAMILY.name); \ + __VA_ARGS__; \ + register_hash(&$); \ + return $; \ }(); -#define USE_FAMILY(N) \ - extern unsigned CONCAT(N,_ref); \ +#define USE_FAMILY(N) \ + extern unsigned CONCAT(N,_ref); \ CONCAT(N,_ref) = 1 diff --git a/include/hashlib/Mathmult.h b/include/hashlib/Mathmult.h index f2d94552..2bc6365d 100644 --- a/include/hashlib/Mathmult.h +++ b/include/hashlib/Mathmult.h @@ -54,19 +54,19 @@ */ // 32x32->64 multiplication [rhi:rlo = a * b] -static FORCE_INLINE void mult32_64(uint32_t& rlo, uint32_t& rhi, uint32_t a, uint32_t b) { +static FORCE_INLINE void mult32_64( uint32_t & rlo, uint32_t & rhi, uint32_t a, uint32_t b ) { // XXX Are either of these asm blocks better than just the plain code? 
#if 0 && defined(HAVE_ARM_ASM) - __asm__("UMULL w%0, w%1, w%2, w%3\n" - : "+r" (rlo), "+r" (rhi) - : "r" (a), "r" (b) - : "cc", "memory" - ); + __asm__ ("UMULL w%0, w%1, w%2, w%3\n" + : "+r" (rlo), "+r" (rhi) + : "r" (a), "r" (b) + : "cc", "memory" + ); #elif 0 && defined(HAVE_X86_64_ASM) - __asm__("mull %[b]\n" - : "=d" (rhi), "=a" (rlo) - : "1" (a), [b] "rm" (b) - ); + __asm__ ("mull %[b]\n" + : "=d" (rhi), "=a" (rlo) + : "1" (a), [b] "rm" (b) + ); #else uint64_t r = (uint64_t)a * (uint64_t)b; rhi = (uint32_t)(r >> 32); @@ -75,7 +75,7 @@ static FORCE_INLINE void mult32_64(uint32_t& rlo, uint32_t& rhi, uint32_t a, uin } // 32x32->64 multiplication [r64 = a32 * b32] -static FORCE_INLINE void mult32_64(uint64_t & r64, uint32_t a32, uint32_t b32) { +static FORCE_INLINE void mult32_64( uint64_t & r64, uint32_t a32, uint32_t b32 ) { #if defined(_MSC_VER) && defined(_M_IX86) r64 = __emulu(a32, b32); #else @@ -84,56 +84,56 @@ static FORCE_INLINE void mult32_64(uint64_t & r64, uint32_t a32, uint32_t b32) { } // 96-bit addition [rhi:rmi:rlo += addhi:addmi:addlo] -static FORCE_INLINE void add96(uint32_t& rlo, uint32_t& rmi, uint32_t& rhi, const uint32_t& addlo, const uint32_t& addmi, const uint32_t& addhi) { +static FORCE_INLINE void add96( uint32_t & rlo, uint32_t & rmi, uint32_t & rhi, const uint32_t & addlo, + const uint32_t & addmi, const uint32_t & addhi ) { #if defined(HAVE_ARM_ASM) - __asm__("ADDS %w0, %w3, %w0\n" - "ADCS %w1, %w4, %w1\n" - "ADC %w2, %w5, %w2\n" - : "+r" (rlo), "+r" (rmi), "+r" (rhi) - : "r" (addlo), "r" (addmi), "r" (addhi) - : "cc" - ); + __asm__ ("ADDS %w0, %w3, %w0\n" + "ADCS %w1, %w4, %w1\n" + "ADC %w2, %w5, %w2\n" + : "+r" (rlo), "+r" (rmi), "+r" (rhi) + : "r" (addlo), "r" (addmi), "r" (addhi) + : "cc" + ); #elif defined(HAVE_X86_64_ASM) - __asm__("addl %3, %0\n" - "adcl %4, %1\n" - "adcl %5, %2\n" - : "+g" (rlo), "+g" (rmi), "+g" (rhi) - : "g" (addlo), "g" (addmi), "g" (addhi) - : "cc" - ); + __asm__ ("addl %3, %0\n" + "adcl %4, %1\n" + "adcl 
%5, %2\n" + : "+g" (rlo), "+g" (rmi), "+g" (rhi) + : "g" (addlo), "g" (addmi), "g" (addhi) + : "cc" + ); #else - uint64_t w = (((uint64_t)rmi) << 32) + ((uint64_t)rlo); + uint64_t w = (((uint64_t)rmi ) << 32) + ((uint64_t)rlo ); uint64_t r = (((uint64_t)addmi) << 32) + ((uint64_t)addlo) + w; - rhi += (r < w); - rhi += addhi; - rmi = (uint32_t)(r >> 32); - rlo = (uint32_t)(r); + rhi += addhi + (r < w); + rmi = (uint32_t)(r >> 32); + rlo = (uint32_t)(r ); #endif } // 96-bit fused multiply addition [rhi:rmi:rlo += a * b] -static FORCE_INLINE void fma32_96(uint32_t& rlo, uint32_t& rmi, uint32_t& rhi, uint32_t a, uint32_t b) { +static FORCE_INLINE void fma32_96( uint32_t & rlo, uint32_t & rmi, uint32_t & rhi, uint32_t a, uint32_t b ) { // These #defines are not correct; some arm seems to not support this #if 0 && defined(HAVE_ARM_ASM) uint32_t tmphi, tmplo; - __asm__("UMULL %w3, %w4, %w5, %w6\n" - "ADDS %w0, %w3, %w0\n" - "ADCS %w1, %w4, %w1\n" - "ADC %w2, %w2, #0x0\n" - : "+r" (rlo), "+r" (rmi), "+r" (rhi), "=r" (tmplo), "=r" (tmphi) - : "r" (a), "r" (b) - : "cc" - ); + __asm__ ("UMULL %w3, %w4, %w5, %w6\n" + "ADDS %w0, %w3, %w0\n" + "ADCS %w1, %w4, %w1\n" + "ADC %w2, %w2, #0x0\n" + : "+r" (rlo), "+r" (rmi), "+r" (rhi), "=r" (tmplo), "=r" (tmphi) + : "r" (a), "r" (b) + : "cc" + ); #elif defined(HAVE_X86_64_ASM) uint32_t tmphi; - __asm__("mull %5\n" - "addl %%eax, %0\n" - "adcl %%edx, %1\n" - "adcl $0, %2\n" - : "+g" (rlo), "+g" (rmi), "+g" (rhi), "=a" (tmphi) - : "a" (a), "g" (b) - : "edx", "cc" - ); + __asm__ ("mull %5\n" + "addl %%eax, %0\n" + "adcl %%edx, %1\n" + "adcl $0, %2\n" + : "+g" (rlo), "+g" (rmi), "+g" (rhi), "=a" (tmphi) + : "a" (a), "g" (b) + : "edx", "cc" + ); #else uint32_t tmplo, tmpmi, tmphi = 0; mult32_64(tmplo, tmpmi, a, b); @@ -142,7 +142,7 @@ static FORCE_INLINE void fma32_96(uint32_t& rlo, uint32_t& rmi, uint32_t& rhi, u } // 64x64->128 multiplication [rhi:rlo = a * b] -static FORCE_INLINE void mult64_128(uint64_t& rlo, uint64_t& rhi, uint64_t a, 
uint64_t b) { +static FORCE_INLINE void mult64_128( uint64_t & rlo, uint64_t & rhi, uint64_t a, uint64_t b ) { #if defined(HAVE_ARM64_ASM) /* * AARCH64 needs 2 insns to calculate 128-bit result of the @@ -151,16 +151,16 @@ static FORCE_INLINE void mult64_128(uint64_t& rlo, uint64_t& rhi, uint64_t a, ui * is very slow. */ rlo = a * b; - __asm__("umulh %0, %1, %2\n" - : "=r" (rhi) - : "r" (a), "r" (b) - ); + __asm__ ("umulh %0, %1, %2\n" + : "=r" (rhi) + : "r" (a), "r" (b) + ); #elif defined(HAVE_PPC_ASM) rlo = a * b; - __asm__("mulhdu %0, %1, %2\n" - : "=r" (rhi) - : "r" (a), "r" (b) - ); + __asm__ ("mulhdu %0, %1, %2\n" + : "=r" (rhi) + : "r" (a), "r" (b) + ); #elif defined(HAVE_UMUL128) rlo = _umul128(a, b, &rhi); #elif defined(HAVE_UMULH) @@ -173,16 +173,16 @@ static FORCE_INLINE void mult64_128(uint64_t& rlo, uint64_t& rhi, uint64_t a, ui * takes 3-cycles vs. 4 for MULX, MULX permits more freedom in * insn scheduling as it uses less fixed registers. */ - __asm__("mulxq %3,%1,%0\n" - : "=r" (rhi), "=r" (rlo) - : "d" (a), "r" (b) - ); + __asm__ ("mulxq %3,%1,%0\n" + : "=r" (rhi), "=r" (rlo) + : "d" (a), "r" (b) + ); #elif defined(HAVE_X86_64_ASM) - __asm__("mulq %[b]\n" - : "=d" (rhi), "=a" (rlo) - : "1" (a), [b] "rm" (b) - : "cc" - ); + __asm__ ("mulq %[b]\n" + : "=d" (rhi), "=a" (rlo) + : "1" (a), [b] "rm" (b) + : "cc" + ); #elif defined(HAVE_INT128) // Maybe move this before the other x64 ASM methods? // Seems like it's more compiler-friendly, but it produces slower code. 
@@ -202,16 +202,16 @@ static FORCE_INLINE void mult64_128(uint64_t& rlo, uint64_t& rhi, uint64_t a, ui uint64_t tmplo = alo * blo; uint64_t t, carry = 0; - t = tmplo + (tmpmi_0 << 32); - carry += (t < tmplo); - rlo = t + (tmpmi_1 << 32); - carry += (rlo < t); - rhi = tmphi + (tmpmi_0 >> 32) + (tmpmi_1 >> 32) + carry; + t = (tmpmi_0 << 32 ) + tmplo; + carry += (t < tmplo); + rlo = (tmpmi_1 << 32 ) + t; + carry += (rlo < t ); + rhi = (tmpmi_0 >> 32 ) + (tmpmi_1 >> 32) + tmphi + carry; #endif } // 64x64->128 multiplication with no cross-lane carry [rhi:rlo ~= a * b] -static FORCE_INLINE void mult64_128_nocarry(uint64_t& rlo, uint64_t& rhi, uint64_t a, uint64_t b) { +static FORCE_INLINE void mult64_128_nocarry( uint64_t & rlo, uint64_t & rhi, uint64_t a, uint64_t b ) { /* * Implementation of 64x64->128-bit multiplication by four * 32x32->64 bit multiplication, excluding the carry bits. This @@ -230,27 +230,27 @@ static FORCE_INLINE void mult64_128_nocarry(uint64_t& rlo, uint64_t& rhi, uint64 } // 128-bit addition special case [rhi:rlo += 0:addlo] -static FORCE_INLINE void add128(uint64_t& rlo, uint64_t& rhi, uint64_t addlo) { +static FORCE_INLINE void add128( uint64_t & rlo, uint64_t & rhi, uint64_t addlo ) { #if defined(HAVE_X86_64_ASM) - __asm__("addq %2, %0\n" - "adcq $0, %1\n" -#if defined(DEBUG) - : "+r" (rlo), "+r" (rhi) - : "r" (addlo) -#elif defined(__clang__) - // clang cannot work properly with "g" and silently - // produces hardly-workging code, if "g" is specified; - // see, for instance, here: - // http://stackoverflow.com/questions/16850309/clang-llvm-inline-assembly-multiple-constraints-with-useless-spills-reload - // To avoid 3x performance hit we have to specify sources/destinations - : "+r" (rlo), "+r" (rhi) - : "m" (addlo) -#else - : "+g" (rlo), "+g" (rhi) - : "g" (addlo) -#endif - : "cc" - ); + __asm__ ("addq %2, %0\n" + "adcq $0, %1\n" + #if defined(DEBUG) + : "+r" (rlo), "+r" (rhi) + : "r" (addlo) + #elif defined(__clang__) + // clang cannot work 
properly with "g" and silently + // produces hardly-workging code, if "g" is specified; + // see, for instance, here: + // http://stackoverflow.com/questions/16850309/clang-llvm-inline-assembly-multiple-constraints-with-useless-spills-reload + // To avoid 3x performance hit we have to specify sources/destinations + : "+r" (rlo), "+r" (rhi) + : "m" (addlo) + #else + : "+g" (rlo), "+g" (rhi) + : "g" (addlo) + #endif + : "cc" + ); #else rlo += addlo; rhi += (rlo < addlo); @@ -258,33 +258,33 @@ static FORCE_INLINE void add128(uint64_t& rlo, uint64_t& rhi, uint64_t addlo) { } // 128-bit addition [rhi:rlo += addhi:addlo] -static FORCE_INLINE void add128(uint64_t& rlo, uint64_t& rhi, uint64_t addlo, uint64_t addhi) { +static FORCE_INLINE void add128( uint64_t & rlo, uint64_t & rhi, uint64_t addlo, uint64_t addhi ) { #if defined(HAVE_X86_64_ASM) - __asm__("addq %2, %0\n" - "adcq %3, %1\n" -#if defined(DEBUG) - : "+r" (rlo), "+r" (rhi) - : "r" (addlo), "r" (addhi) -#elif defined(__clang__) - // clang cannot work properly with "g" and silently - // produces hardly-workging code, if "g" is specified; - // see, for instance, here: - // http://stackoverflow.com/questions/16850309/clang-llvm-inline-assembly-multiple-constraints-with-useless-spills-reload - // To avoid 3x performance hit we have to specify sources/destinations - : "+r" (rlo), "+r" (rhi) - : "m" (addlo), "m" (addhi) -#else - : "+r" (rlo), "+g" (rhi) - : "g" (addlo), "g" (addhi) -#endif - : "cc" - ); + __asm__ ("addq %2, %0\n" + "adcq %3, %1\n" + #if defined(DEBUG) + : "+r" (rlo), "+r" (rhi) + : "r" (addlo), "r" (addhi) + #elif defined(__clang__) + // clang cannot work properly with "g" and silently + // produces hardly-workging code, if "g" is specified; + // see, for instance, here: + // http://stackoverflow.com/questions/16850309/clang-llvm-inline-assembly-multiple-constraints-with-useless-spills-reload + // To avoid 3x performance hit we have to specify sources/destinations + : "+r" (rlo), "+r" (rhi) + : "m" 
(addlo), "m" (addhi) + #else + : "+r" (rlo), "+g" (rhi) + : "g" (addlo), "g" (addhi) + #endif + : "cc" + ); #elif defined(HAVE_PPC_ASM) - __asm__("addc %1, %1, %3\n" - "adde %0, %0, %2\n" - : "+r" (rhi), "+r" (rlo) - : "r" (addhi), "r" (addlo) - ); + __asm__ ("addc %1, %1, %3\n" + "adde %0, %0, %2\n" + : "+r" (rhi), "+r" (rlo) + : "r" (addhi), "r" (addlo) + ); #else rlo += addlo; rhi += (rlo < addlo); @@ -293,28 +293,29 @@ static FORCE_INLINE void add128(uint64_t& rlo, uint64_t& rhi, uint64_t addlo, ui } // 192-bit addition [rhi:rmi:rlo += addhi:addmi:addlo] -static FORCE_INLINE void add192(uint64_t& rlo, uint64_t& rmi, uint64_t& rhi, const uint64_t& addlo, const uint64_t& addmi, const uint64_t& addhi) { +static FORCE_INLINE void add192( uint64_t & rlo, uint64_t & rmi, uint64_t & rhi, const uint64_t & addlo, + const uint64_t & addmi, const uint64_t & addhi ) { #if defined(HAVE_X86_64_ASM) - __asm__("addq %3, %0\n" - "adcq %4, %1\n" - "adcq %5, %2\n" -#if defined(DEBUG) - : "+r" (rlo), "+r" (rmi), "+r" (rhi) - : "r" (addlo), "r" (addmi), "r" (addhi) -#elif defined(__clang__) - // clang cannot work properly with "g" and silently - // produces hardly-workging code, if "g" is specified; - // see, for instance, here: - // http://stackoverflow.com/questions/16850309/clang-llvm-inline-assembly-multiple-constraints-with-useless-spills-reload - // To avoid 3x performance hit we have to specify sources/destinations - : "+r" (rlo), "+r" (rmi), "+r" (rhi) - : "m" (addlo), "m" (addmi), "m" (addhi) -#else - : "+g" (rlo), "+g" (rmi), "+g" (rhi) - : "rm" (addlo), "rm" (addmi), "rm" (addhi) -#endif - : "cc" - ); + __asm__ ("addq %3, %0\n" + "adcq %4, %1\n" + "adcq %5, %2\n" + #if defined(DEBUG) + : "+r" (rlo), "+r" (rmi), "+r" (rhi) + : "r" (addlo), "r" (addmi), "r" (addhi) + #elif defined(__clang__) + // clang cannot work properly with "g" and silently + // produces hardly-workging code, if "g" is specified; + // see, for instance, here: + // 
http://stackoverflow.com/questions/16850309/clang-llvm-inline-assembly-multiple-constraints-with-useless-spills-reload + // To avoid 3x performance hit we have to specify sources/destinations + : "+r" (rlo), "+r" (rmi), "+r" (rhi) + : "m" (addlo), "m" (addmi), "m" (addhi) + #else + : "+g" (rlo), "+g" (rmi), "+g" (rhi) + : "rm" (addlo), "rm" (addmi), "rm" (addhi) + #endif + : "cc" + ); #else rlo += addlo; rmi += (rlo < addlo); @@ -325,7 +326,7 @@ static FORCE_INLINE void add192(uint64_t& rlo, uint64_t& rmi, uint64_t& rhi, con } // 128-bit fused multiply addition [rhi:rlo += a * b] -static FORCE_INLINE void fma64_128(uint64_t& rlo, uint64_t& rhi, uint64_t a, uint64_t b) { +static FORCE_INLINE void fma64_128( uint64_t & rlo, uint64_t & rhi, uint64_t a, uint64_t b ) { #if defined(HAVE_X86_64_ASM) /* * Dummy variable to tell the compiler that the register rax is @@ -333,25 +334,25 @@ static FORCE_INLINE void fma64_128(uint64_t& rlo, uint64_t& rhi, uint64_t a, uin * below. Better syntactic expression is very welcome. 
*/ uint64_t dummy; - __asm__("mulq %4\n" - "addq %%rax, %0\n" - "adcq %%rdx, %1\n" -#if defined(DEBUG) - : "+r" (rlo), "+r" (rhi), "=a" (dummy) - : "a" (a), "r" (b) -#elif defined(__clang__) - // clang cannot work properly with "g" and silently - // produces hardly-workging code, if "g" is specified; - // see, for instance, here: - // http://stackoverflow.com/questions/16850309/clang-llvm-inline-assembly-multiple-constraints-with-useless-spills-reload - // To avoid 3x performance hit we have to specify sources/destinations - : "+r" (rlo), "+r" (rhi), "=a" (dummy) - : "a" (a), "m" (b) -#else - : "+g" (rlo), "+g" (rhi), "=a" (dummy) - : "a" (a), "g" (b) -#endif - : "rdx", "cc"); + __asm__ ("mulq %4\n" + "addq %%rax, %0\n" + "adcq %%rdx, %1\n" + #if defined(DEBUG) + : "+r" (rlo), "+r" (rhi), "=a" (dummy) + : "a" (a), "r" (b) + #elif defined(__clang__) + // clang cannot work properly with "g" and silently + // produces hardly-workging code, if "g" is specified; + // see, for instance, here: + // http://stackoverflow.com/questions/16850309/clang-llvm-inline-assembly-multiple-constraints-with-useless-spills-reload + // To avoid 3x performance hit we have to specify sources/destinations + : "+r" (rlo), "+r" (rhi), "=a" (dummy) + : "a" (a), "m" (b) + #else + : "+g" (rlo), "+g" (rhi), "=a" (dummy) + : "a" (a), "g" (b) + #endif + : "rdx", "cc"); #else uint64_t tmplo, tmphi; mult64_128(tmplo, tmphi, a, b); @@ -360,7 +361,7 @@ static FORCE_INLINE void fma64_128(uint64_t& rlo, uint64_t& rhi, uint64_t a, uin } // 192-bit fused multiply addition [rhi:rmi:rlo += a * b] -static FORCE_INLINE void fma64_192(uint64_t& rlo, uint64_t& rmi, uint64_t& rhi, uint64_t a, uint64_t b) { +static FORCE_INLINE void fma64_192( uint64_t & rlo, uint64_t & rmi, uint64_t & rhi, uint64_t a, uint64_t b ) { #if defined(HAVE_X86_64_ASM) /* * Dummy variable to tell the compiler that the register rax is @@ -368,26 +369,26 @@ static FORCE_INLINE void fma64_192(uint64_t& rlo, uint64_t& rmi, uint64_t& rhi, * 
below. Better syntactic expression is very welcome. */ uint64_t dummy; - __asm__("mulq %5\n" - "addq %%rax, %0\n" - "adcq %%rdx, %1\n" - "adcq $0, %2\n" -#if defined(DEBUG) - : "+r" (rlo), "+r" (rmi), "+r" (rhi), "=a" (dummy) - : "a" (a), "r" (b) -#elif defined(__clang__) - // clang cannot work properly with "g" and silently - // produces hardly-workging code, if "g" is specified; - // see, for instance, here: - // http://stackoverflow.com/questions/16850309/clang-llvm-inline-assembly-multiple-constraints-with-useless-spills-reload - // To avoid 3x performance hit we have to specify sources/destinations - : "+r" (rlo), "+r" (rmi), "+r" (rhi), "=a" (dummy) - : "a" (a), "m" (b) -#else - : "+g" (rlo), "+g" (rmi), "+g" (rhi), "=a" (dummy) - : "a" (a), "g" (b) -#endif - : "rdx", "cc" ); + __asm__ ("mulq %5\n" + "addq %%rax, %0\n" + "adcq %%rdx, %1\n" + "adcq $0, %2\n" + #if defined(DEBUG) + : "+r" (rlo), "+r" (rmi), "+r" (rhi), "=a" (dummy) + : "a" (a), "r" (b) + #elif defined(__clang__) + // clang cannot work properly with "g" and silently + // produces hardly-workging code, if "g" is specified; + // see, for instance, here: + // http://stackoverflow.com/questions/16850309/clang-llvm-inline-assembly-multiple-constraints-with-useless-spills-reload + // To avoid 3x performance hit we have to specify sources/destinations + : "+r" (rlo), "+r" (rmi), "+r" (rhi), "=a" (dummy) + : "a" (a), "m" (b) + #else + : "+g" (rlo), "+g" (rmi), "+g" (rhi), "=a" (dummy) + : "a" (a), "g" (b) + #endif + : "rdx", "cc"); #else uint64_t tmplo, tmpmi, tmphi = 0; mult64_128(tmplo, tmpmi, a, b); @@ -396,11 +397,12 @@ static FORCE_INLINE void fma64_192(uint64_t& rlo, uint64_t& rmi, uint64_t& rhi, } // 128x128->128 multiplication [rhi:rlo = a * bhi:blo] -static FORCE_INLINE void mult128_128(uint64_t& rlo, uint64_t& rhi, uint64_t alo, uint64_t ahi, uint64_t blo, uint64_t bhi) { +static FORCE_INLINE void mult128_128( uint64_t & rlo, uint64_t & rhi, uint64_t alo, + uint64_t ahi, uint64_t blo, uint64_t 
bhi ) { #if defined(HAVE_INT128) uint128_t r = (((uint128_t)ahi) << 64) + (uint128_t)alo; uint128_t c = (((uint128_t)bhi) << 64) + (uint128_t)blo; - r = r * c; + r = r * c; rhi = (uint64_t)(r >> 64); rlo = (uint64_t)r; #else diff --git a/lib/Hashinfo.cpp b/lib/Hashinfo.cpp index 02aae7aa..17c9b542 100644 --- a/lib/Hashinfo.cpp +++ b/lib/Hashinfo.cpp @@ -24,21 +24,22 @@ #include #include -const char * HashInfo::_fixup_name(const char * in) { +const char * HashInfo::_fixup_name( const char * in ) { // Since dashes can't be in C/C++ identifiers, but humans want // them in names, replace underscores with dashes. Similarly, // replace double underscores with dots. - std::string out(in); + std::string out( in ); + do { size_t p = out.find("__"); - if (p == std::string::npos) break; + if (p == std::string::npos) { break; } out.replace(p, 2, "."); - } while(true); + } while (true); std::replace(&out[0], &out[out.length()], '_', '-'); return strdup(out.c_str()); } -const char * HashFamilyInfo::_fixup_name(const char * in) { +const char * HashFamilyInfo::_fixup_name( const char * in ) { return HashInfo::_fixup_name(in); } @@ -46,56 +47,57 @@ const char * HashFamilyInfo::_fixup_name(const char * in) { // This should hopefully be a thorough and uambiguous test of whether a hash // is correctly implemented on a given platform. -uint32_t HashInfo::_ComputedVerifyImpl(const HashInfo * hinfo, enum HashInfo::endianness endian) const { - const HashFn hash = hinfo->hashFn(endian); - const uint32_t hashbits = hinfo->bits; - const uint32_t hashbytes = hashbits / 8; - - uint8_t * key = new uint8_t[256]; - uint8_t * hashes = new uint8_t[hashbytes * 256]; - uint8_t * total = new uint8_t[hashbytes]; - - memset(key,0,256); - memset(hashes,0,hashbytes*256); - memset(total,0,hashbytes); - - // Hash keys of the form {}, {0}, {0,1}, {0,1,2}... 
up to N=255, using - // 256-N as the seed - for(int i = 0; i < 256; i++) { - seed_t seed = 256 - i; - seed = hinfo->Seed(seed, true, 1); - hash(key, i, seed, &hashes[i*hashbytes]); - addVCodeInput(key, i); - key[i] = (uint8_t)i; - } - - // Then hash the result array - seed_t seed = 0; - seed = hinfo->Seed(0, true, 1); - hash(hashes, hashbytes*256, seed, total); - addVCodeOutput(hashes, 256*hashbytes); - addVCodeOutput(total, hashbytes); - - // The first four bytes of that hash, interpreted as a little-endian - // integer, is our verification value - uint32_t verification = (total[0] << 0) | (total[1] << 8) | - (total[2] << 16) | (total[3] << 24) ; - addVCodeResult(verification); - - delete [] total; - delete [] hashes; - delete [] key; - - return verification; +uint32_t HashInfo::_ComputedVerifyImpl( const HashInfo * hinfo, enum HashInfo::endianness endian ) const { + const HashFn hash = hinfo->hashFn(endian); + const uint32_t hashbits = hinfo->bits; + const uint32_t hashbytes = hashbits / 8; + + uint8_t * key = new uint8_t[256 ]; + uint8_t * hashes = new uint8_t[hashbytes * 256]; + uint8_t * total = new uint8_t[hashbytes ]; + + memset(key , 0, 256); + memset(hashes, 0, hashbytes * 256); + memset(total , 0, hashbytes); + + // Hash keys of the form {}, {0}, {0,1}, {0,1,2}... 
up to N=255, using + // 256-N as the seed + for (int i = 0; i < 256; i++) { + seed_t seed = 256 - i; + seed = hinfo->Seed(seed, true, 1); + hash(key, i, seed, &hashes[i * hashbytes]); + addVCodeInput(key, i); + key[i] = (uint8_t)i; + } + + // Then hash the result array + seed_t seed = 0; + seed = hinfo->Seed(0, true, 1); + hash(hashes, hashbytes * 256, seed, total); + addVCodeOutput(hashes, 256 * hashbytes); + addVCodeOutput(total , hashbytes ); + + // The first four bytes of that hash, interpreted as a little-endian + // integer, is our verification value + uint32_t verification = (total[0] << 0) | (total[1] << 8) | + (total[2] << 16) | (total[3] << 24); + addVCodeResult(verification); + + delete [] total; + delete [] hashes; + delete [] key; + + return verification; } //----------------------------------------------------------------------------- // Utility function for hashes to easily specify that any seeds in // their badseed set should be excluded when their FixupSeed() method // is called. -seed_t excludeBadseeds(const HashInfo * hinfo, const seed_t seed) { +seed_t excludeBadseeds( const HashInfo * hinfo, const seed_t seed ) { seed_t newseed = seed; - auto endp = hinfo->badseeds.end(); + auto endp = hinfo->badseeds.end(); + while (hinfo->badseeds.find(newseed) != endp) { newseed++; } @@ -104,6 +106,6 @@ seed_t excludeBadseeds(const HashInfo * hinfo, const seed_t seed) { // Utility function for hashes to easily specify that the seed value // should not be 0. -seed_t excludeZeroSeed(const HashInfo * hinfo, const seed_t seed) { +seed_t excludeZeroSeed( const HashInfo * hinfo, const seed_t seed ) { return (seed == 0) ? 
1 : seed; } diff --git a/lib/Hashlib.cpp b/lib/Hashlib.cpp index 5344ebda..f948a3aa 100644 --- a/lib/Hashlib.cpp +++ b/lib/Hashlib.cpp @@ -25,54 +25,56 @@ #include //----------------------------------------------------------------------------- -typedef std::unordered_map HashMap; -typedef std::vector HashMapOrder; +typedef std::unordered_map HashMap; +typedef std::vector HashMapOrder; -static HashMap& hashMap() { - static HashMap * map = new HashMap; - return *map; +static HashMap & hashMap() { + static HashMap * map = new HashMap; + + return *map; } //----------------------------------------------------------------------------- // Add a hash to the hashMap list of all hashes. // // FIXME Verify hinfo is all filled out. -unsigned register_hash(const HashInfo * hinfo) { - static std::unordered_map hashcodes; - std::string name = hinfo->name; - // Allow users to lookup hashes by any case - std::transform(name.begin(), name.end(), name.begin(), ::tolower); - - if (hashMap().find(name) != hashMap().end()) { - printf("Hash names must be unique.\n"); - printf("\"%s\" (\"%s\") was added multiple times.\n", hinfo->name, name.c_str()); - printf("Note that hash names are using a case-insensitive comparison.\n"); - exit(1); - } - - if (hinfo->verification_LE != 0) { - const auto it_LE = hashcodes.find(hinfo->verification_LE); - if (it_LE == hashcodes.end()) { - hashcodes[hinfo->verification_LE] = hinfo; - } else { - printf("WARNING: Hash with verification code %08x was already registered: %s\n", - hinfo->verification_LE, it_LE->second->name); - printf(" Are you certain %s is a unique implementation?\n", hinfo->name); - } - } - if ((hinfo->verification_BE != 0) && (hinfo->verification_BE != hinfo->verification_LE)) { - const auto it_BE = hashcodes.find(hinfo->verification_BE); - if (it_BE == hashcodes.end()) { - hashcodes[hinfo->verification_BE] = hinfo; - } else { - printf("WARNING: Hash with verification code %08x was already registered: %s\n", - hinfo->verification_BE, 
it_BE->second->name); - printf(" Are you certain %s is a unique implementation?\n", hinfo->name); - } - } - - hashMap()[name] = hinfo; - return hashMap().size(); +unsigned register_hash( const HashInfo * hinfo ) { + static std::unordered_map hashcodes; + std::string name = hinfo->name; + + // Allow users to lookup hashes by any case + std::transform(name.begin(), name.end(), name.begin(), ::tolower); + + if (hashMap().find(name) != hashMap().end()) { + printf("Hash names must be unique.\n"); + printf("\"%s\" (\"%s\") was added multiple times.\n", hinfo->name, name.c_str()); + printf("Note that hash names are using a case-insensitive comparison.\n"); + exit(1); + } + + if (hinfo->verification_LE != 0) { + const auto it_LE = hashcodes.find(hinfo->verification_LE); + if (it_LE == hashcodes.end()) { + hashcodes[hinfo->verification_LE] = hinfo; + } else { + printf("WARNING: Hash with verification code %08x was already registered: %s\n", + hinfo->verification_LE, it_LE->second->name); + printf(" Are you certain %s is a unique implementation?\n", hinfo->name); + } + } + if ((hinfo->verification_BE != 0) && (hinfo->verification_BE != hinfo->verification_LE)) { + const auto it_BE = hashcodes.find(hinfo->verification_BE); + if (it_BE == hashcodes.end()) { + hashcodes[hinfo->verification_BE] = hinfo; + } else { + printf("WARNING: Hash with verification code %08x was already registered: %s\n", + hinfo->verification_BE, it_BE->second->name); + printf(" Are you certain %s is a unique implementation?\n", hinfo->name); + } + } + + hashMap()[name] = hinfo; + return hashMap().size(); } //----------------------------------------------------------------------------- @@ -84,75 +86,80 @@ unsigned register_hash(const HashInfo * hinfo) { // // This is overloaded for mock hashes to also override the sorting for // _family name_, which is not something general users should do. 
-static HashMapOrder defaultSort(HashMap & map) { +static HashMapOrder defaultSort( HashMap & map ) { HashMapOrder hashes; + hashes.reserve(map.size()); - for (auto kv : map) { + for (auto kv: map) { hashes.push_back(kv.second); } - std::sort(hashes.begin(), hashes.end(), - [](const HashInfo * a, const HashInfo * b) { - int r; - // Mock hashes go before others - if (a->isMock() != b->isMock()) - return a->isMock(); - // Mock hashes use sort_order over all other criteria - if (a->isMock() && (a->sort_order != b->sort_order)) - return (a->sort_order < b->sort_order); - // Cryptographic hashes go before non-crypto - if (a->isCrypto() != b->isCrypto()) - return a->isCrypto(); - // Then sort by family (case-insensitive) - if ((r = strcasecmp(a->family, b->family)) != 0) - return (r < 0); - // Then by hash output size (smaller first) - if (a->bits != b->bits) - return (a->bits < b->bits); - // Then by explicit sort_order - if (a->sort_order != b->sort_order) - return (a->sort_order < b->sort_order); - // And finally by hash name (case-insensitive) - if ((r = strcasecmp(a->name, b->name)) != 0) - return (r < 0); - return false; - }); + std::sort(hashes.begin(), hashes.end(), []( const HashInfo * a, const HashInfo * b ) { + int r; + // Mock hashes go before others + if (a->isMock() != b->isMock()) { + return a->isMock(); + } + // Mock hashes use sort_order over all other criteria + if (a->isMock() && (a->sort_order != b->sort_order)) { + return a->sort_order < b->sort_order; + } + // Cryptographic hashes go before non-crypto + if (a->isCrypto() != b->isCrypto()) { + return a->isCrypto(); + } + // Then sort by family (case-insensitive) + if ((r = strcasecmp(a->family, b->family)) != 0) { + return r < 0; + } + // Then by hash output size (smaller first) + if (a->bits != b->bits) { + return a->bits < b->bits; + } + // Then by explicit sort_order + if (a->sort_order != b->sort_order) { + return a->sort_order < b->sort_order; + } + // And finally by hash name (case-insensitive) 
+ if ((r = strcasecmp(a->name, b->name)) != 0) { + return r < 0; + } + return false; + }); return hashes; } -std::vector findAllHashes(void) { +std::vector findAllHashes( void ) { HashMapOrder hashes; + hashes = defaultSort(hashMap()); return hashes; } -const HashInfo * findHash(const char * name) { - std::string n = name; - // Search without regards to case - std::transform(n.begin(), n.end(), n.begin(), ::tolower); - // Since underscores can't be in names, the user must have meant a dash - std::replace(n.begin(), n.end(), '_', '-'); - - const auto it = hashMap().find(n); - if (it == hashMap().end()) { - return NULL; - } - return it->second; +const HashInfo * findHash( const char * name ) { + std::string n = name; + + // Search without regards to case + std::transform(n.begin(), n.end(), n.begin(), ::tolower); + // Since underscores can't be in names, the user must have meant a dash + std::replace(n.begin(), n.end(), '_', '-'); + + const auto it = hashMap().find(n); + if (it == hashMap().end()) { + return NULL; + } + return it->second; } -void listHashes(bool nameonly) { +void listHashes( bool nameonly ) { if (!nameonly) { printf("Hashnames can be supplied using any case letters.\n\n"); - printf("%-25s %4s %6s %-60s\n", - "Name", "Bits", "Type", "Description"); - printf("%-25s %4s %6s %-60s\n", - "----", "----", "----", "-----------"); + printf("%-25s %4s %6s %-60s\n", "Name", "Bits", "Type", "Description"); + printf("%-25s %4s %6s %-60s\n", "----", "----", "----", "-----------"); } - for (const HashInfo * h : defaultSort(hashMap())) { + for (const HashInfo * h: defaultSort(hashMap())) { if (!nameonly) { - printf("%-25s %4d %6s %-60s\n", - h->name, h->bits, - h->isMock() ? "MOCK" : (h->isCrypto() ? "CRYPTO" : ""), - h->desc); + printf("%-25s %4d %6s %-60s\n", h->name, h->bits, + h->isMock() ? "MOCK" : (h->isCrypto() ? 
"CRYPTO" : ""), h->desc); } else { printf("%s\n", h->name); } @@ -162,16 +169,14 @@ void listHashes(bool nameonly) { //----------------------------------------------------------------------------- // Hash verification routines -static void reportInitFailure(const HashInfo * hinfo) { - printf("%25s - Hash initialization failed! ...... FAIL!\n", - hinfo->name); +static void reportInitFailure( const HashInfo * hinfo ) { + printf("%25s - Hash initialization failed! ...... FAIL!\n", hinfo->name); } -static bool compareVerification(uint32_t expected, uint32_t actual, - const char * endstr, const char * name, - bool verbose, bool prefix) { +static bool compareVerification( uint32_t expected, uint32_t actual, const char * endstr, + const char * name, bool verbose, bool prefix ) { const char * result_str; - bool result = true; + bool result = true; if (expected == actual) { result_str = (actual != 0) ? "PASS\n" : "INSECURE (should not be 0)\n"; @@ -179,7 +184,7 @@ static bool compareVerification(uint32_t expected, uint32_t actual, result_str = "SKIP (unverifiable)\n"; } else { result_str = "FAIL! (Expected 0x%08x)\n"; - result = false; + result = false; } if (verbose) { @@ -193,8 +198,8 @@ static bool compareVerification(uint32_t expected, uint32_t actual, return result; } -static const char * endianstr(enum HashInfo::endianness e) { - switch(e) { +static const char * endianstr( enum HashInfo::endianness e ) { + switch (e) { case HashInfo::ENDIAN_LITTLE : return "LE"; // "Little endian" case HashInfo::ENDIAN_BIG : return "BE"; // "Big endian" case HashInfo::ENDIAN_NATIVE : return isLE() ? 
"LE" : "BE"; @@ -205,21 +210,20 @@ static const char * endianstr(enum HashInfo::endianness e) { return NULL; /* unreachable */ } -bool verifyHash(const HashInfo * hinfo, enum HashInfo::endianness endian, - bool verbose, bool prefix = true) { +bool verifyHash( const HashInfo * hinfo, enum HashInfo::endianness endian, bool verbose, bool prefix = true ) { bool result = true; const uint32_t actual = hinfo->ComputedVerify(endian); const uint32_t expect = hinfo->ExpectedVerify(endian); - result &= compareVerification(expect, actual, endianstr(endian), - hinfo->name, verbose, prefix); + result &= compareVerification(expect, actual, endianstr(endian), hinfo->name, verbose, prefix); return result; } -bool verifyAllHashes(bool verbose) { +bool verifyAllHashes( bool verbose ) { bool result = true; - for (const HashInfo * h : defaultSort(hashMap())) { + + for (const HashInfo * h: defaultSort(hashMap())) { if (!h->Init()) { if (verbose) { reportInitFailure(h); @@ -228,13 +232,13 @@ bool verifyAllHashes(bool verbose) { } else if (h->isEndianDefined()) { // Verify the hash the canonical way first, and then the // other way. - result &= verifyHash(h, HashInfo::ENDIAN_DEFAULT, verbose); + result &= verifyHash(h, HashInfo::ENDIAN_DEFAULT , verbose); result &= verifyHash(h, HashInfo::ENDIAN_NONDEFAULT, verbose); } else { // Always verify little-endian first, just for consistency // for humans looking at the results. 
result &= verifyHash(h, HashInfo::ENDIAN_LITTLE, verbose); - result &= verifyHash(h, HashInfo::ENDIAN_BIG, verbose); + result &= verifyHash(h, HashInfo::ENDIAN_BIG , verbose); } } printf("\n"); @@ -243,10 +247,12 @@ bool verifyAllHashes(bool verbose) { //----------------------------------------------------------------------------- // Run Mathmult unit tests via global constructor -int Mathmult_selftest(void); +int Mathmult_selftest( void ); + static int selftest_result = Mathmult_selftest(); //----------------------------------------------------------------------------- // See Hashrefs.cpp.in for why these exist. You can very likely just ignore them. unsigned refs(); + static unsigned dummy = refs(); diff --git a/lib/Mathmult.cpp b/lib/Mathmult.cpp index 9314734e..0ad101e6 100644 --- a/lib/Mathmult.cpp +++ b/lib/Mathmult.cpp @@ -21,16 +21,15 @@ #include "Mathmult.h" -template < typename T > -static void fail(const char * test, int idx, const T * expected, - std::initializer_list actual) { +template +static void fail( const char * test, int idx, const T * expected, std::initializer_list actual ) { if (idx >= 0) { printf("Test %s #%d failed!\n\tGot :", test, idx); } else { printf("Test %s failed!\n\tGot :", test); } int count = 0; - for (auto val : actual) { + for (auto val: actual) { if (sizeof(T) == 4) { printf(" %08x", val); } else { @@ -49,7 +48,7 @@ static void fail(const char * test, int idx, const T * expected, printf("\n\n"); } -static bool test_32(void) { +static bool test_32( void ) { bool passed = true; const uint32_t tests[14][4] = { { 0x1, 0x1, 0x0, 0x1 }, @@ -67,32 +66,33 @@ static bool test_32(void) { { 0xFFFFFFFF, 0x11111111, 0x11111110, 0xEEEEEEEF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x1 }, }; - const uint32_t testsum[3] = { 0x33058587, 0x416D9DEB, 0x2580A632}; + const uint32_t testsum[3] = { 0x33058587, 0x416D9DEB, 0x2580A632 }; uint32_t sum1_lo, sum1_mi, sum1_hi, sum2_lo, sum2_mi, sum2_hi; uint32_t r1_lo, r1_hi, r2_lo, r2_hi; uint64_t r1_64, 
r2_64; + sum1_lo = sum1_mi = sum1_hi = sum2_lo = sum2_mi = sum2_hi = 0; for (int i = 0; i < 14; i++) { - mult32_64(r1_lo, r1_hi, tests[i][0], tests[i][1]); + mult32_64(r1_lo, r1_hi , tests[i][0], tests[i][1]); mult32_64(r1_64, tests[i][0], tests[i][1]); - mult32_64(r2_lo, r2_hi, tests[i][1], tests[i][0]); + mult32_64(r2_lo, r2_hi , tests[i][1], tests[i][0]); mult32_64(r2_64, tests[i][1], tests[i][0]); if ((r1_hi != tests[i][2]) || (r1_lo != tests[i][3])) { - fail("mult32_64, r1, rhi:rlo", i, &tests[i][2], {r1_hi, r1_lo}); + fail("mult32_64, r1, rhi:rlo", i, &tests[i][2], { r1_hi, r1_lo }); passed = false; } if (((r1_64 >> 32) != tests[i][2]) || (((uint32_t)r1_64) != tests[i][3])) { - fail("mult32_64, r1, r64", i, &tests[i][2], {(uint32_t)(r1_64 >> 32), (uint32_t)r1_64}); + fail("mult32_64, r1, r64", i, &tests[i][2], { (uint32_t)(r1_64 >> 32), (uint32_t)r1_64 }); passed = false; } if ((r2_hi != tests[i][2]) || (r2_lo != tests[i][3])) { - fail("mult32_64, r2, rhi:rlo", i, &tests[i][2], {r2_hi, r2_lo}); + fail("mult32_64, r2, rhi:rlo", i, &tests[i][2], { r2_hi, r2_lo }); passed = false; } if (((r2_64 >> 32) != tests[i][2]) || (((uint32_t)r2_64) != tests[i][3])) { - fail("mult32_64, r2, r64", i, &tests[i][2], {(uint32_t)(r2_64 >> 32), (uint32_t)r2_64}); + fail("mult32_64, r2, r64", i, &tests[i][2], { (uint32_t)(r2_64 >> 32), (uint32_t)r2_64 }); passed = false; } add96(sum1_lo, sum1_mi, sum1_hi, tests[i][3], tests[i][2], 0x38ADE957); @@ -102,80 +102,115 @@ static bool test_32(void) { } if ((sum1_hi != testsum[0]) || (sum1_mi != testsum[1]) || (sum1_lo != testsum[2])) { - fail("add96", -1, &testsum[0], {sum1_hi, sum1_mi, sum1_lo}); + fail("add96", -1, &testsum[0], { sum1_hi, sum1_mi, sum1_lo }); passed = false; } if ((sum2_hi != testsum[0]) || (sum2_mi != testsum[1]) || (sum2_lo != testsum[2])) { - fail("fma32_96", -1, &testsum[0], {sum2_hi, sum2_mi, sum2_lo}); + fail("fma32_96", -1, &testsum[0], { sum2_hi, sum2_mi, sum2_lo }); passed = false; } return passed; } -static 
bool test_64(void) { +static bool test_64( void ) { bool passed = true; const uint64_t tests[16][6] = { - { 0x1 , 0x1, - 0x0 , 0x1, - 0x0 , 0x1 }, - { UINT64_C(0x2F9AC342168A6741), 0x0, - 0x0 , 0x0, - 0x0 , 0x0 }, + { + 0x1, 0x1, + 0x0, 0x1, + 0x0, 0x1 + }, + { + UINT64_C(0x2F9AC342168A6741), 0x0, + 0x0, 0x0, + 0x0, 0x0 + }, // No cross-lane carry - { UINT64_C(0x418FD883CEB217D8), UINT64_C(0x7213F60E1222CE60), - UINT64_C(0x1D372B1B98652CD8), UINT64_C(0xC1E418E52CA8C100), - UINT64_C(0x1D372B1B98652CD8), UINT64_C(0xC1E418E52CA8C100) }, + { + UINT64_C(0x418FD883CEB217D8), UINT64_C(0x7213F60E1222CE60), + UINT64_C(0x1D372B1B98652CD8), UINT64_C(0xC1E418E52CA8C100), + UINT64_C(0x1D372B1B98652CD8), UINT64_C(0xC1E418E52CA8C100) + }, // 1 cross-lane carry - { UINT64_C(0x477B3604218D2514), UINT64_C(0xA6019680FBEACF3B), - UINT64_C(0x2E5A5688195E73C4), UINT64_C(0x1E1F1A735CCAB79C), - UINT64_C(0x2E5A5688195E73C3), UINT64_C(0x1E1F1A735CCAB79C) }, + { + UINT64_C(0x477B3604218D2514), UINT64_C(0xA6019680FBEACF3B), + UINT64_C(0x2E5A5688195E73C4), UINT64_C(0x1E1F1A735CCAB79C), + UINT64_C(0x2E5A5688195E73C3), UINT64_C(0x1E1F1A735CCAB79C) + }, // 2 cross-lane carries - { UINT64_C(0xA7E5AD86B74C236C), UINT64_C(0x1522F8FF937041C7), - UINT64_C(0x0DDCC70B3782740B), UINT64_C(0x0249EA7D546DF4F4), - UINT64_C(0x0DDCC70B37827409), UINT64_C(0x0249EA7D546DF4F4) }, - { UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C( 0x1), - UINT64_C( 0x0), UINT64_C(0x7FFFFFFFFFFFFFFF), - UINT64_C( 0x0), UINT64_C(0x7FFFFFFFFFFFFFFF) }, - { UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C( 0x2), - UINT64_C( 0x0), UINT64_C(0xFFFFFFFFFFFFFFFE), - UINT64_C( 0x0), UINT64_C(0xFFFFFFFFFFFFFFFE) }, - { UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C( 0x3), - UINT64_C( 0x1), UINT64_C(0x7FFFFFFFFFFFFFFD), - UINT64_C( 0x1), UINT64_C(0x7FFFFFFFFFFFFFFD) }, - { UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C( 0x4), - UINT64_C( 0x1), UINT64_C(0xFFFFFFFFFFFFFFFC), - UINT64_C( 0x1), UINT64_C(0xFFFFFFFFFFFFFFFC) }, - { UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x1), - 
UINT64_C( 0x0), UINT64_C(0xFFFFFFFFFFFFFFFF), - UINT64_C( 0x0), UINT64_C(0xFFFFFFFFFFFFFFFF) }, - { UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x2), - UINT64_C( 0x1), UINT64_C(0xFFFFFFFFFFFFFFFE), - UINT64_C( 0x1), UINT64_C(0xFFFFFFFFFFFFFFFE) }, - { UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x3), - UINT64_C( 0x2), UINT64_C(0xFFFFFFFFFFFFFFFD), - UINT64_C( 0x2), UINT64_C(0xFFFFFFFFFFFFFFFD) }, - { UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x4), - UINT64_C( 0x3), UINT64_C(0xFFFFFFFFFFFFFFFC), - UINT64_C( 0x3), UINT64_C(0xFFFFFFFFFFFFFFFC) }, - { UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x8), - UINT64_C( 0x7), UINT64_C(0xFFFFFFFFFFFFFFF8), - UINT64_C( 0x7), UINT64_C(0xFFFFFFFFFFFFFFF8) }, - { UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0x1111111111111111), - UINT64_C(0x1111111111111110), UINT64_C(0xEEEEEEEEEEEEEEEF), - UINT64_C(0x111111111111110F), UINT64_C(0xEEEEEEEEEEEEEEEF) }, - { UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), - UINT64_C(0xFFFFFFFFFFFFFFFE), UINT64_C( 0x1), - UINT64_C(0xFFFFFFFFFFFFFFFD), UINT64_C( 0x1) }, + { + UINT64_C(0xA7E5AD86B74C236C), UINT64_C(0x1522F8FF937041C7), + UINT64_C(0x0DDCC70B3782740B), UINT64_C(0x0249EA7D546DF4F4), + UINT64_C(0x0DDCC70B37827409), UINT64_C(0x0249EA7D546DF4F4) + }, + { + UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C( 0x1), + UINT64_C( 0x0), UINT64_C(0x7FFFFFFFFFFFFFFF), + UINT64_C( 0x0), UINT64_C(0x7FFFFFFFFFFFFFFF) + }, + { + UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C( 0x2), + UINT64_C( 0x0), UINT64_C(0xFFFFFFFFFFFFFFFE), + UINT64_C( 0x0), UINT64_C(0xFFFFFFFFFFFFFFFE) + }, + { + UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C( 0x3), + UINT64_C( 0x1), UINT64_C(0x7FFFFFFFFFFFFFFD), + UINT64_C( 0x1), UINT64_C(0x7FFFFFFFFFFFFFFD) + }, + { + UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C( 0x4), + UINT64_C( 0x1), UINT64_C(0xFFFFFFFFFFFFFFFC), + UINT64_C( 0x1), UINT64_C(0xFFFFFFFFFFFFFFFC) + }, + { + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x1), + UINT64_C( 0x0), UINT64_C(0xFFFFFFFFFFFFFFFF), + UINT64_C( 0x0), UINT64_C(0xFFFFFFFFFFFFFFFF) + }, + { + 
UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x2), + UINT64_C( 0x1), UINT64_C(0xFFFFFFFFFFFFFFFE), + UINT64_C( 0x1), UINT64_C(0xFFFFFFFFFFFFFFFE) + }, + { + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x3), + UINT64_C( 0x2), UINT64_C(0xFFFFFFFFFFFFFFFD), + UINT64_C( 0x2), UINT64_C(0xFFFFFFFFFFFFFFFD) + }, + { + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x4), + UINT64_C( 0x3), UINT64_C(0xFFFFFFFFFFFFFFFC), + UINT64_C( 0x3), UINT64_C(0xFFFFFFFFFFFFFFFC) + }, + { + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x8), + UINT64_C( 0x7), UINT64_C(0xFFFFFFFFFFFFFFF8), + UINT64_C( 0x7), UINT64_C(0xFFFFFFFFFFFFFFF8) + }, + { + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0x1111111111111111), + UINT64_C(0x1111111111111110), UINT64_C(0xEEEEEEEEEEEEEEEF), + UINT64_C(0x111111111111110F), UINT64_C(0xEEEEEEEEEEEEEEEF) + }, + { + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), + UINT64_C(0xFFFFFFFFFFFFFFFE), UINT64_C( 0x1), + UINT64_C(0xFFFFFFFFFFFFFFFD), UINT64_C( 0x1) + }, + }; + const uint64_t testsum[3] = { + UINT64_C(0x92791E340E9CF671), + UINT64_C(0xD4FEB37FF4AE4B9B), + UINT64_C(0xA278198999A0B8CA) }; - const uint64_t testsum[3] = { UINT64_C(0x92791E340E9CF671), - UINT64_C(0xD4FEB37FF4AE4B9B), - UINT64_C(0xA278198999A0B8CA) }; uint64_t sum1_lo, sum1_mi, sum1_hi, sum2_lo, sum2_mi, sum2_hi; uint64_t sum3_lo, sum3_mi, sum3_hi; uint64_t r1_lo, r1_hi, r2_lo, r2_hi; + sum1_lo = sum1_mi = sum1_hi = sum2_lo = sum2_mi = sum2_hi = 0; sum3_lo = sum3_mi = sum3_hi = 0; @@ -183,28 +218,27 @@ static bool test_64(void) { mult64_128_nocarry(r1_lo, r1_hi, tests[i][0], tests[i][1]); mult64_128_nocarry(r2_lo, r2_hi, tests[i][1], tests[i][0]); if ((r1_hi != tests[i][4]) || (r1_lo != tests[i][5])) { - fail("mult64_128_nocarry, r1, rhi:rlo", i, &tests[i][4], {r1_hi, r1_lo}); + fail("mult64_128_nocarry, r1, rhi:rlo", i, &tests[i][4], { r1_hi, r1_lo }); passed = false; } if ((r2_hi != tests[i][4]) || (r2_lo != tests[i][5])) { - fail("mult64_128_nocarry, r2, rhi:rlo", i, &tests[i][4], {r2_hi, r2_lo}); + 
fail("mult64_128_nocarry, r2, rhi:rlo", i, &tests[i][4], { r2_hi, r2_lo }); passed = false; } mult64_128(r1_lo, r1_hi, tests[i][0], tests[i][1]); mult64_128(r2_lo, r2_hi, tests[i][1], tests[i][0]); if ((r1_hi != tests[i][2]) || (r1_lo != tests[i][3])) { - fail("mult64_128, r1, rhi:rlo", i, &tests[i][0], {r1_hi, r1_lo}); + fail("mult64_128, r1, rhi:rlo", i, &tests[i][0], { r1_hi, r1_lo }); passed = false; } if ((r2_hi != tests[i][2]) || (r2_lo != tests[i][3])) { - fail("mult64_128, r2, rhi:rlo", i, &tests[i][0], {r2_hi, r2_lo}); + fail("mult64_128, r2, rhi:rlo", i, &tests[i][0], { r2_hi, r2_lo }); passed = false; } add128(sum1_lo, sum1_mi, tests[i][3], tests[i][2]); - add192(sum1_lo, sum1_mi, sum1_hi, - tests[i][3], tests[i][2], UINT64_C(0x192791e340e9cf67)); + add192(sum1_lo, sum1_mi, sum1_hi, tests[i][3], tests[i][2], UINT64_C(0x192791e340e9cf67)); fma64_128(sum2_lo, sum2_mi, tests[i][0], tests[i][1]); fma64_128(sum3_lo, sum3_mi, tests[i][1], tests[i][0]); fma64_192(sum2_lo, sum2_mi, sum2_hi, tests[i][0], tests[i][1]); @@ -214,73 +248,105 @@ static bool test_64(void) { } if ((sum1_hi != testsum[0]) || (sum1_mi != testsum[1]) || (sum1_lo != testsum[2])) { - fail("add128/add192", -1, &testsum[0], {sum1_hi, sum1_mi, sum1_lo}); + fail("add128/add192", -1, &testsum[0], { sum1_hi, sum1_mi, sum1_lo }); passed = false; } if ((sum2_hi != testsum[0]) || (sum2_mi != testsum[1]) || (sum2_lo != testsum[2])) { - fail("fma64_128/fma64_192", 1, &testsum[0], {sum2_hi, sum2_mi, sum2_lo}); + fail("fma64_128/fma64_192", 1, &testsum[0], { sum2_hi, sum2_mi, sum2_lo }); passed = false; } if ((sum3_hi != testsum[0]) || (sum3_mi != testsum[1]) || (sum3_lo != testsum[2])) { - fail("fma64_128/fma64_192", 2, &testsum[0], {sum3_hi, sum3_mi, sum3_lo}); + fail("fma64_128/fma64_192", 2, &testsum[0], { sum3_hi, sum3_mi, sum3_lo }); passed = false; } return passed; } -static bool test_128(void) { +static bool test_128( void ) { bool passed = true; const uint64_t tests[16][6] = { - { 0x0 , 0x1, - 
0x0 , 0x1, - 0x0 , 0x1 }, - { UINT64_C(0xAF756DACBD453D68), UINT64_C(0xE5915DA08FF8BFD9), - 0x0 , 0x0, - 0x0 , 0x0 }, - { UINT64_C(0xAF756DACBD453D68), UINT64_C(0xE5915DA08FF8BFD9), - UINT64_C(0x2C297F5B51B1274F), UINT64_C(0x2A51DC0FB3F6EA0A), - UINT64_C(0xB9E5265202949E5E), UINT64_C(0x96526CC31499D87A) }, - { UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), - UINT64_C( 0x0), UINT64_C( 0x1), - UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF) }, - { UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), - UINT64_C( 0x0), UINT64_C( 0x2), - UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFE) }, - { UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), - UINT64_C( 0x0), UINT64_C( 0x3), - UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFD) }, - { UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), - UINT64_C( 0x0), UINT64_C( 0x4), - UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFC) }, - { UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x1), - UINT64_C( 0x0), UINT64_C( 0x1), - UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x1) }, - { UINT64_C(0xFFFFFFFFFFFFFFFE), UINT64_C( 0x1), - UINT64_C( 0x0), UINT64_C( 0x2), - UINT64_C(0xFFFFFFFFFFFFFFFC), UINT64_C( 0x2) }, - { UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), - UINT64_C( 0x0), UINT64_C( 0x3), - UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFD) }, - { UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), - UINT64_C( 0x0), UINT64_C( 0x4), - UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFC) }, - { UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), - UINT64_C( 0x0), UINT64_C( 0x8), - UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFF8) }, - { UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0x1111111111111111), - UINT64_C(0x1111111111111110), UINT64_C(0xEEEEEEEEEEEEEEEE), - UINT64_C(0x1FDB97530ECA8642), UINT64_C(0xDF0123456789ABCE) }, - { UINT64_C(0xAAAAAAAAAAAAAAAA), UINT64_C(0xFFFFFFFFFFFFFFFF), - UINT64_C(0xFFFFFFFFFFFFFFFE), 
UINT64_C( 0x1), - UINT64_C(0xAAAAAAAAAAAAAAAC), UINT64_C(0xFFFFFFFFFFFFFFFF) }, - { UINT64_C(0xAAAAAAAAAAAAAAAA), UINT64_C(0x5555555555555555), - UINT64_C(0xFFFFFFFFFFFFFFFE), UINT64_C( 0x1), - UINT64_C( 0x0), UINT64_C(0x5555555555555555) }, - { UINT64_C(0xAAAAAAAAAAAAAAAA), UINT64_C(0x5555555555555555), - UINT64_C(0xFFFFFFFFFFFFFFFE), UINT64_C( 0x0), - UINT64_C(0x5555555555555556), UINT64_C( 0x0) }, + { + 0x0, 0x1, + 0x0, 0x1, + 0x0, 0x1 + }, + { + UINT64_C(0xAF756DACBD453D68), UINT64_C(0xE5915DA08FF8BFD9), + 0x0, 0x0, + 0x0, 0x0 + }, + { + UINT64_C(0xAF756DACBD453D68), UINT64_C(0xE5915DA08FF8BFD9), + UINT64_C(0x2C297F5B51B1274F), UINT64_C(0x2A51DC0FB3F6EA0A), + UINT64_C(0xB9E5265202949E5E), UINT64_C(0x96526CC31499D87A) + }, + { + UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), + UINT64_C( 0x0), UINT64_C( 0x1), + UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF) + }, + { + UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), + UINT64_C( 0x0), UINT64_C( 0x2), + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFE) + }, + { + UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), + UINT64_C( 0x0), UINT64_C( 0x3), + UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFD) + }, + { + UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), + UINT64_C( 0x0), UINT64_C( 0x4), + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFC) + }, + { + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x1), + UINT64_C( 0x0), UINT64_C( 0x1), + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C( 0x1) + }, + { + UINT64_C(0xFFFFFFFFFFFFFFFE), UINT64_C( 0x1), + UINT64_C( 0x0), UINT64_C( 0x2), + UINT64_C(0xFFFFFFFFFFFFFFFC), UINT64_C( 0x2) + }, + { + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), + UINT64_C( 0x0), UINT64_C( 0x3), + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFD) + }, + { + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), + UINT64_C( 0x0), UINT64_C( 0x4), + UINT64_C(0xFFFFFFFFFFFFFFFF), 
UINT64_C(0xFFFFFFFFFFFFFFFC) + }, + { + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF), + UINT64_C( 0x0), UINT64_C( 0x8), + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFF8) + }, + { + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0x1111111111111111), + UINT64_C(0x1111111111111110), UINT64_C(0xEEEEEEEEEEEEEEEE), + UINT64_C(0x1FDB97530ECA8642), UINT64_C(0xDF0123456789ABCE) + }, + { + UINT64_C(0xAAAAAAAAAAAAAAAA), UINT64_C(0xFFFFFFFFFFFFFFFF), + UINT64_C(0xFFFFFFFFFFFFFFFE), UINT64_C( 0x1), + UINT64_C(0xAAAAAAAAAAAAAAAC), UINT64_C(0xFFFFFFFFFFFFFFFF) + }, + { + UINT64_C(0xAAAAAAAAAAAAAAAA), UINT64_C(0x5555555555555555), + UINT64_C(0xFFFFFFFFFFFFFFFE), UINT64_C( 0x1), + UINT64_C( 0x0), UINT64_C(0x5555555555555555) + }, + { + UINT64_C(0xAAAAAAAAAAAAAAAA), UINT64_C(0x5555555555555555), + UINT64_C(0xFFFFFFFFFFFFFFFE), UINT64_C( 0x0), + UINT64_C(0x5555555555555556), UINT64_C( 0x0) + }, }; uint64_t r1_lo, r1_hi, r2_lo, r2_hi; @@ -289,11 +355,11 @@ static bool test_128(void) { mult128_128(r1_lo, r1_hi, tests[i][1], tests[i][0], tests[i][3], tests[i][2]); mult128_128(r2_lo, r2_hi, tests[i][3], tests[i][2], tests[i][1], tests[i][0]); if ((r1_hi != tests[i][4]) || (r1_lo != tests[i][5])) { - fail("mult128_128, r1, rhi:rlo", i, &tests[i][4], {r1_hi, r1_lo}); + fail("mult128_128, r1, rhi:rlo", i, &tests[i][4], { r1_hi, r1_lo }); passed = false; } if ((r2_hi != tests[i][4]) || (r2_lo != tests[i][5])) { - fail("mult128_128, r2, rhi:rlo", i, &tests[i][4], {r2_hi, r2_lo}); + fail("mult128_128, r2, rhi:rlo", i, &tests[i][4], { r2_hi, r2_lo }); passed = false; } } @@ -301,7 +367,7 @@ static bool test_128(void) { return passed; } -int Mathmult_selftest(void) { +int Mathmult_selftest( void ) { bool passed = true; passed &= test_32(); diff --git a/main.cpp b/main.cpp index 8e659604..b7425da7 100644 --- a/main.cpp +++ b/main.cpp @@ -106,16 +106,16 @@ const char * g_failstr = "*********FAIL*********\n"; //-------- // Overall log2-p-value statistics and test pass/fail counts 
-uint32_t g_log2pValueCounts[COUNT_MAX_PVALUE+2]; +uint32_t g_log2pValueCounts[COUNT_MAX_PVALUE + 2]; uint32_t g_testPass, g_testFail; -std::vector< std::pair > g_testFailures; +std::vector> g_testFailures; //----------------------------------------------------------------------------- // Locally-visible configuration static bool g_drawDiagram = false; // excessive torture tests: Sparse, Avalanche, DiffDist, scan all seeds -static bool g_testExtra = false; +static bool g_testExtra = false; static bool g_testAll; static bool g_testVerifyAll; @@ -142,39 +142,39 @@ static bool g_testBIC; static bool g_testBadSeeds; struct TestOpts { - bool &var; - bool defaultvalue; // What "All" sets the test to - bool testspeedonly; // If true, then disabling test doesn't affect "All" testing - const char* name; + bool & var; + bool defaultvalue; // What "All" sets the test to + bool testspeedonly; // If true, then disabling test doesn't affect "All" testing + const char * name; }; -static TestOpts g_testopts[] = -{ - { g_testVerifyAll, false, false, "VerifyAll" }, // Overrides all others below - { g_testSanityAll, false, false, "SanityAll" }, // Overrides all others below - { g_testSpeedAll, false, false, "SpeedAll" }, // Overrides all others below - { g_testAll, true, false, "All" }, - { g_testSanity, true, false, "Sanity" }, - { g_testSpeed, true, true, "Speed" }, - { g_testHashmap, true, true, "Hashmap" }, - { g_testAvalanche, true, false, "Avalanche" }, - { g_testSparse, true, false, "Sparse" }, - { g_testPermutation, true, false, "Permutation" }, - { g_testWindow, true, false, "Window" }, - { g_testCyclic, true, false, "Cyclic" }, - { g_testTwoBytes, true, false, "TwoBytes" }, - { g_testText, true, false, "Text" }, - { g_testZeroes, true, false, "Zeroes" }, - { g_testSeed, true, false, "Seed" }, - { g_testPerlinNoise, true, false, "PerlinNoise" }, - { g_testDiff, true, false, "Diff" }, - { g_testDiffDist, true, false, "DiffDist" }, - { g_testPopcount, true, false, "Popcount" 
}, - { g_testPrng, true, false, "Prng" }, - { g_testBIC, false, false, "BIC" }, - { g_testBadSeeds, false, false, "BadSeeds" }, +// These first 3 override all other selections +static TestOpts g_testopts[] = { + { g_testVerifyAll, false, false, "VerifyAll" }, + { g_testSanityAll, false, false, "SanityAll" }, + { g_testSpeedAll, false, false, "SpeedAll" }, + { g_testAll, true, false, "All" }, + { g_testSanity, true, false, "Sanity" }, + { g_testSpeed, true, true, "Speed" }, + { g_testHashmap, true, true, "Hashmap" }, + { g_testAvalanche, true, false, "Avalanche" }, + { g_testSparse, true, false, "Sparse" }, + { g_testPermutation, true, false, "Permutation" }, + { g_testWindow, true, false, "Window" }, + { g_testCyclic, true, false, "Cyclic" }, + { g_testTwoBytes, true, false, "TwoBytes" }, + { g_testText, true, false, "Text" }, + { g_testZeroes, true, false, "Zeroes" }, + { g_testSeed, true, false, "Seed" }, + { g_testPerlinNoise, true, false, "PerlinNoise" }, + { g_testDiff, true, false, "Diff" }, + { g_testDiffDist, true, false, "DiffDist" }, + { g_testPopcount, true, false, "Popcount" }, + { g_testPrng, true, false, "Prng" }, + { g_testBIC, false, false, "BIC" }, + { g_testBadSeeds, false, false, "BadSeeds" }, }; -static void set_default_tests(bool enable) { +static void set_default_tests( bool enable ) { for (size_t i = 0; i < sizeof(g_testopts) / sizeof(TestOpts); i++) { if (enable) { g_testopts[i].var = g_testopts[i].defaultvalue; @@ -184,10 +184,10 @@ static void set_default_tests(bool enable) { } } -static void parse_tests(const char * str, bool enable_tests) { +static void parse_tests( const char * str, bool enable_tests ) { while (*str != '\0') { - size_t len; - const char *p = strchr(str, ','); + size_t len; + const char * p = strchr(str, ','); if (p == NULL) { len = strlen(str); } else { @@ -195,7 +195,7 @@ static void parse_tests(const char * str, bool enable_tests) { } struct TestOpts * found = NULL; - bool foundmultiple = false; + bool foundmultiple = 
false; for (size_t i = 0; i < sizeof(g_testopts) / sizeof(TestOpts); i++) { const char * testname = g_testopts[i].name; // Allow the user to specify test names by case-agnostic @@ -205,38 +205,33 @@ static void parse_tests(const char * str, bool enable_tests) { foundmultiple = true; } found = &g_testopts[i]; - // Exact match found, don't bother looking further, - // and don't error out. if (testname[len] == '\0') { + // Exact match found, don't bother looking further, and + // don't error out. foundmultiple = false; break; } } } if (foundmultiple) { - printf("Ambiguous test name: --%stest=%*s\n", - enable_tests ? "" : "no", (int)len, str); + printf("Ambiguous test name: --%stest=%*s\n", enable_tests ? "" : "no", (int)len, str); goto error; } if (found == NULL) { - printf("Invalid option: --%stest=%*s\n", - enable_tests ? "" : "no", (int)len, str); + printf("Invalid option: --%stest=%*s\n", enable_tests ? "" : "no", (int)len, str); goto error; } - //printf("%sabling test %s\n", enable_tests ? "en" : "dis", testname); + // printf("%sabling test %s\n", enable_tests ? "en" : "dis", testname); found->var = enable_tests; - // If "All" tests are being enabled or disabled, then - // adjust the individual test variables as - // instructed. Otherwise, if a material "All" test - // (not just a speed-testing test) is being - // specifically disabled, then don't consider "All" - // tests as being run. + + // If "All" tests are being enabled or disabled, then adjust the individual + // test variables as instructed. Otherwise, if a material "All" test (not + // just a speed-testing test) is being specifically disabled, then don't + // consider "All" tests as being run. 
if (&found->var == &g_testAll) { set_default_tests(enable_tests); - } else if (!enable_tests && - found->defaultvalue && - !found->testspeedonly) { + } else if (!enable_tests && found->defaultvalue && !found->testspeedonly) { g_testAll = false; } @@ -248,17 +243,18 @@ static void parse_tests(const char * str, bool enable_tests) { return; - error: + error: printf("Valid tests: --test=%s", g_testopts[0].name); - for(size_t i = 1; i < sizeof(g_testopts) / sizeof(TestOpts); i++) { + for (size_t i = 1; i < sizeof(g_testopts) / sizeof(TestOpts); i++) { printf(",%s", g_testopts[i].name); } printf(" \n"); exit(1); } -static void usage(void); -static HashInfo::endianness parse_endian(const char * str) { +static void usage( void ); + +static HashInfo::endianness parse_endian( const char * str ) { if (!strcmp(str, "native")) { return HashInfo::ENDIAN_NATIVE; } if (!strcmp(str, "nonnative")) { return HashInfo::ENDIAN_BYTESWAPPED; } if (!strcmp(str, "default")) { return HashInfo::ENDIAN_DEFAULT; } @@ -273,25 +269,25 @@ static HashInfo::endianness parse_endian(const char * str) { //----------------------------------------------------------------------------- // Self-tests - verify that hashes work correctly -static void HashSelfTestAll(bool verbose) { - bool pass = true; +static void HashSelfTestAll( bool verbose ) { + bool pass = true; - printf("[[[ VerifyAll Tests ]]]\n\n"); + printf("[[[ VerifyAll Tests ]]]\n\n"); - pass &= verifyAllHashes(verbose); + pass &= verifyAllHashes(verbose); - if (!pass) { - printf("Self-test FAILED!\n"); - if (!verbose) { - verifyAllHashes(true); + if (!pass) { + printf("Self-test FAILED!\n"); + if (!verbose) { + verifyAllHashes(true); + } + exit(1); } - exit(1); - } - printf("PASS\n\n"); + printf("PASS\n\n"); } -static bool HashSelfTest(const HashInfo * hinfo) { +static bool HashSelfTest( const HashInfo * hinfo ) { bool result = verifyHash(hinfo, g_hashEndian, true, false); recordTestResult(result, "Sanity", "Implementation verification"); @@ 
-299,13 +295,13 @@ static bool HashSelfTest(const HashInfo * hinfo) { return result; } -static void HashSanityTestAll(void) { +static void HashSanityTestAll( void ) { std::vector allHashes = findAllHashes(); printf("[[[ SanityAll Tests ]]]\n\n"); SanityTestHeader(); - for (const HashInfo * h : allHashes) { + for (const HashInfo * h: allHashes) { if (!h->Init()) { printf("%s : hash initialization failed!", h->name); continue; @@ -318,13 +314,13 @@ static void HashSanityTestAll(void) { //----------------------------------------------------------------------------- // Quickly speed test all hashes -static void HashSpeedTestAll(void) { +static void HashSpeedTestAll( void ) { std::vector allHashes = findAllHashes(); printf("[[[ Short Speed Tests ]]]\n\n"); ShortSpeedTestHeader(); - for (const HashInfo * h : allHashes) { + for (const HashInfo * h: allHashes) { if (!h->Init()) { printf("%s : hash initialization failed!", h->name); continue; @@ -335,469 +331,452 @@ static void HashSpeedTestAll(void) { } //----------------------------------------------------------------------------- -void print_pvaluecounts(void) { - printf("Log2(p-value) summary:"); - for (uint32_t lo = 0; lo <= (COUNT_MAX_PVALUE+1); lo += 10) { - printf("\n\t %2d%c", lo, (lo == (COUNT_MAX_PVALUE+1)) ? '+' : ' '); - for (uint32_t i = 1; i < 10; i++) { - printf(" %2d%c", lo+i, ((lo+i) == (COUNT_MAX_PVALUE+1)) ? '+' : ' '); - } - printf("\n\t----"); - for (uint32_t i = 1; i < 10; i++) { - printf(" ----"); - } - printf("\n\t%4d", g_log2pValueCounts[lo+0]); - for (uint32_t i = 1; i < 10; i++) { - printf(" %4d", g_log2pValueCounts[lo+i]); + +static void print_pvaluecounts( void ) { + printf("Log2(p-value) summary:"); + for (uint32_t lo = 0; lo <= (COUNT_MAX_PVALUE + 1); lo += 10) { + printf("\n\t %2d%c", lo, (lo == (COUNT_MAX_PVALUE + 1)) ? '+' : ' '); + for (uint32_t i = 1; i < 10; i++) { + printf(" %2d%c", lo + i, ((lo + i) == (COUNT_MAX_PVALUE + 1)) ? 
'+' : ' '); + } + printf("\n\t----"); + for (uint32_t i = 1; i < 10; i++) { + printf(" ----"); + } + printf("\n\t%4d", g_log2pValueCounts[lo + 0]); + for (uint32_t i = 1; i < 10; i++) { + printf(" %4d", g_log2pValueCounts[lo + i]); + } + printf("\n"); } printf("\n"); - } - printf("\n"); } //----------------------------------------------------------------------------- -template < typename hashtype > -static bool test ( const HashInfo * hInfo ) -{ - const int hashbits = sizeof(hashtype) * 8; - bool result = true; - - if (g_testAll) { - printf("-------------------------------------------------------------------------------\n"); - } - - if (!hInfo->Init()) { - printf("Hash initialization failed! Cannot continue.\n"); - exit(1); - } - - //----------------------------------------------------------------------------- - // Some hashes only take 32-bits of seed data, so there's no way of - // getting big seeds to them at all. - if ((g_seed >= (1ULL << (8 * sizeof(uint32_t)))) && hInfo->is32BitSeed()) { - printf("WARNING: Specified global seed 0x%016" PRIx64 "\n" - " is larger than the specified hash can accept\n", g_seed); - } - - //----------------------------------------------------------------------------- - // Sanity tests - - FILE * outfile; - if (g_testAll || g_testSpeed || g_testHashmap) - outfile = stdout; - else - outfile = stderr; - fprintf(outfile, "--- Testing %s \"%s\" %s", hInfo->name, hInfo->desc, hInfo->isMock() ? 
"MOCK" : ""); - if (g_seed != 0) - fprintf(outfile, " seed 0x%016" PRIx64 "\n\n", g_seed); - else - fprintf(outfile, "\n\n"); - - if(g_testSanity) - { - printf("[[[ Sanity Tests ]]]\n\n"); - - result &= HashSelfTest(hInfo); - result &= (SanityTest(hInfo) || hInfo->isMock()); - printf("\n"); - } +template +static bool test( const HashInfo * hInfo ) { + const int hashbits = sizeof(hashtype) * 8; + bool result = true; + + if (g_testAll) { + printf("-------------------------------------------------------------------------------\n"); + } + + if (!hInfo->Init()) { + printf("Hash initialization failed! Cannot continue.\n"); + exit(1); + } + + //----------------------------------------------------------------------------- + // Some hashes only take 32-bits of seed data, so there's no way of + // getting big seeds to them at all. + if ((g_seed >= (1ULL << (8 * sizeof(uint32_t)))) && hInfo->is32BitSeed()) { + printf("WARNING: Specified global seed 0x%016" PRIx64 "\n" + " is larger than the specified hash can accept\n", g_seed); + } + + //----------------------------------------------------------------------------- + // Sanity tests + + FILE * outfile; + if (g_testAll || g_testSpeed || g_testHashmap) { + outfile = stdout; + } else { + outfile = stderr; + } + fprintf(outfile, "--- Testing %s \"%s\" %s", hInfo->name, hInfo->desc, hInfo->isMock() ? 
"MOCK" : ""); + if (g_seed != 0) { + fprintf(outfile, " seed 0x%016" PRIx64 "\n\n", g_seed); + } else { + fprintf(outfile, "\n\n"); + } + + if (g_testSanity) { + printf("[[[ Sanity Tests ]]]\n\n"); + + result &= HashSelfTest(hInfo); + result &= (SanityTest(hInfo) || hInfo->isMock()); + printf("\n"); + } + + //----------------------------------------------------------------------------- + // Speed tests + + if (g_testSpeed) { + SpeedTest(hInfo); + } + + if (g_testHashmap) { + result &= HashMapTest(hInfo, g_drawDiagram, g_testExtra); + } + + //----------------------------------------------------------------------------- + // Avalanche tests + + if (g_testAvalanche) { + result &= AvalancheTest(hInfo, g_drawDiagram, g_testExtra); + } + + //----------------------------------------------------------------------------- + // Keyset 'Sparse' - keys with all bits 0 except a few + + if (g_testSparse) { + result &= SparseKeyTest(hInfo, g_drawDiagram, g_testExtra); + } + + //----------------------------------------------------------------------------- + // Keyset 'Permutation' - all possible combinations of a set of blocks + + if (g_testPermutation) { + result &= PermutedKeyTest(hInfo, g_drawDiagram, g_testExtra); + } + + //----------------------------------------------------------------------------- + // Keyset 'Window' + + if (g_testWindow) { + result &= WindowedKeyTest(hInfo, g_drawDiagram, g_testExtra); + } + + //----------------------------------------------------------------------------- + // Keyset 'Cyclic' - keys of the form "abcdabcdabcd..." + + if (g_testCyclic) { + result &= CyclicKeyTest(hInfo, g_drawDiagram); + } + + //----------------------------------------------------------------------------- + // Keyset 'TwoBytes' - all keys up to N bytes containing two non-zero bytes + // With --extra this generates some huge keysets, + // 128-bit tests will take ~1.3 gigs of RAM. 
+ + if (g_testTwoBytes) { + result &= TwoBytesKeyTest(hInfo, g_drawDiagram, g_testExtra); + } + + //----------------------------------------------------------------------------- + // Keyset 'Text' + + if (g_testText) { + result &= TextKeyTest(hInfo, g_drawDiagram); + } + + //----------------------------------------------------------------------------- + // Keyset 'Zeroes' + + if (g_testZeroes) { + result &= ZeroKeyTest(hInfo, g_drawDiagram); + } + + //----------------------------------------------------------------------------- + // Keyset 'Seed' + + if (g_testSeed) { + result &= SeedTest(hInfo, g_drawDiagram); + } + + //----------------------------------------------------------------------------- + // Keyset 'PerlinNoise' + + if (g_testPerlinNoise) { + result &= PerlinNoiseTest(hInfo, g_drawDiagram, g_testExtra); + } + + //----------------------------------------------------------------------------- + // Differential tests + + if (g_testDiff) { + result &= DiffTest(hInfo, g_drawDiagram, g_testExtra); + } + + //----------------------------------------------------------------------------- + // Differential-distribution tests + + if (g_testDiffDist) { + result &= DiffDistTest(hInfo, g_drawDiagram); + } + + //----------------------------------------------------------------------------- + // Measuring the distribution of the population count of the + // lowest 32 bits set over the whole key space. + + if (g_testPopcount) { + result &= PopcountTest(hInfo, g_testExtra); + } - //----------------------------------------------------------------------------- - // Speed tests + //----------------------------------------------------------------------------- + // Test the hash function as a PRNG by repeatedly feeding its output + // back into the hash to get the next random number. 
- if(g_testSpeed) - { - SpeedTest(hInfo); - } + if (g_testPrng) { + result &= PRNGTest(hInfo, g_drawDiagram, g_testExtra); + } - if(g_testHashmap) - { - result &= HashMapTest(hInfo, g_drawDiagram, g_testExtra); - } + //----------------------------------------------------------------------------- + // Bit Independence Criteria. Interesting, but doesn't tell us much about + // collision or distribution. For >=128bit hashes, do this only with --extra - //----------------------------------------------------------------------------- - // Avalanche tests + if (g_testAll && g_testExtra && (hInfo->bits >= 128)) { + g_testBIC = true; + } + if (g_testBIC) { + result &= BicTest(hInfo, g_drawDiagram); + } - if(g_testAvalanche) - { - result &= AvalancheTest(hInfo, g_drawDiagram, g_testExtra); - } + //----------------------------------------------------------------------------- + // Test for known or unknown seed values which give bad/suspect hash values. - //----------------------------------------------------------------------------- - // Keyset 'Sparse' - keys with all bits 0 except a few + if (g_testBadSeeds) { + result &= BadSeedsTest(hInfo, g_testExtra); + } - if(g_testSparse) - { - result &= SparseKeyTest(hInfo, g_drawDiagram, g_testExtra); - } + //----------------------------------------------------------------------------- + // If All material tests were done, show a final summary of testing + + if (g_testAll) { + printf("-------------------------------------------------------------------------------\n"); + print_pvaluecounts(); + printf("-------------------------------------------------------------------------------\n"); + printf("Overall result: %s (%d/%d passed)\n", result ? "pass" : "FAIL", + g_testPass, g_testPass + g_testFail); + if (!result) { + const char * prev = ""; + printf("Failures"); + for (auto x: g_testFailures) { + if (strcmp(prev, x.first) != 0) { + printf("%c\n %-20s: [%s", (strlen(prev) == 0) ? ':' : ']', x.first, x.second ? 
x.second : ""); + prev = x.first; + } else { + printf(", %s", x.second); + } + free(x.second); + } + printf("]\n\n"); + } else { + // Sometimes failures are recorded even for overall + // successes. The only example I know of is Mock hashes + // failing sanity tests. + for (auto x: g_testFailures) { + free(x.second); + } + printf("\n"); + } + printf("-------------------------------------------------------------------------------\n"); + } - //----------------------------------------------------------------------------- - // Keyset 'Permutation' - all possible combinations of a set of blocks - - if(g_testPermutation) - { - result &= PermutedKeyTest(hInfo, g_drawDiagram, g_testExtra); - } - - //----------------------------------------------------------------------------- - // Keyset 'Window' - - if(g_testWindow) - { - result &= WindowedKeyTest(hInfo, g_drawDiagram, g_testExtra); - } - - //----------------------------------------------------------------------------- - // Keyset 'Cyclic' - keys of the form "abcdabcdabcd..." - - if (g_testCyclic) - { - result &= CyclicKeyTest(hInfo, g_drawDiagram); - } - - //----------------------------------------------------------------------------- - // Keyset 'TwoBytes' - all keys up to N bytes containing two non-zero bytes - // With --extra this generates some huge keysets, - // 128-bit tests will take ~1.3 gigs of RAM. 
- - if(g_testTwoBytes) - { - result &= TwoBytesKeyTest(hInfo, g_drawDiagram, g_testExtra); - } - - //----------------------------------------------------------------------------- - // Keyset 'Text' - - if(g_testText) - { - result &= TextKeyTest(hInfo, g_drawDiagram); - } - - //----------------------------------------------------------------------------- - // Keyset 'Zeroes' - - if(g_testZeroes) - { - result &= ZeroKeyTest(hInfo, g_drawDiagram); - } - - //----------------------------------------------------------------------------- - // Keyset 'Seed' - - if(g_testSeed) - { - result &= SeedTest(hInfo, g_drawDiagram); - } - - //----------------------------------------------------------------------------- - // Keyset 'PerlinNoise' - - if(g_testPerlinNoise) - { - result &= PerlinNoiseTest(hInfo, g_drawDiagram, g_testExtra); - } - - //----------------------------------------------------------------------------- - // Differential tests - - if(g_testDiff) - { - result &= DiffTest(hInfo, g_drawDiagram, g_testExtra); - } - - //----------------------------------------------------------------------------- - // Differential-distribution tests - - if (g_testDiffDist) - { - result &= DiffDistTest(hInfo, g_drawDiagram); - } - - //----------------------------------------------------------------------------- - // Measuring the distribution of the population count of the - // lowest 32 bits set over the whole key space. - - if (g_testPopcount) - { - result &= PopcountTest(hInfo, g_testExtra); - } - - //----------------------------------------------------------------------------- - // Test the hash function as a PRNG by repeatedly feeding its output - // back into the hash to get the next random number. - - if (g_testPrng) - { - result &= PRNGTest(hInfo, g_drawDiagram, g_testExtra); - } - - //----------------------------------------------------------------------------- - // Bit Independence Criteria. Interesting, but doesn't tell us much about - // collision or distribution. 
For >=128bit hashes, do this only with --extra - - if (g_testAll && g_testExtra && hInfo->bits >= 128) { - g_testBIC = true; - } - if(g_testBIC) - { - result &= BicTest(hInfo, g_drawDiagram); - } - - //----------------------------------------------------------------------------- - // Test for known or unknown seed values which give bad/suspect hash values. - - if (g_testBadSeeds) - { - result &= BadSeedsTest(hInfo, g_testExtra); - } - - //----------------------------------------------------------------------------- - // If All material tests were done, show a final summary of testing - if (g_testAll) { - printf("-------------------------------------------------------------------------------\n"); - print_pvaluecounts(); - printf("-------------------------------------------------------------------------------\n"); - printf("Overall result: %s (%d/%d passed)\n", result ? "pass" : "FAIL", - g_testPass, g_testPass+g_testFail); - if (!result) { - const char * prev = ""; - printf("Failures"); - for (auto x: g_testFailures) { - if (strcmp(prev, x.first) != 0) { - printf("%c\n %-20s: [%s", (strlen(prev) == 0) ? ':' : ']', - x.first, x.second ? x.second : ""); - prev = x.first; - } else { - printf(", %s", x.second); - } - free(x.second); - } - printf("]\n\n"); - } else { - // Sometimes failures are recorded even for overall - // successes. The only example I know of is Mock hashes - // failing sanity tests. 
- for (auto x: g_testFailures) { - free(x.second); - } - printf("\n"); - } - printf("-------------------------------------------------------------------------------\n"); - } - - return result; + return result; } //----------------------------------------------------------------------------- -static bool testHash(const char * name) { - const HashInfo * hInfo; - - if ((hInfo = findHash(name)) == NULL) { - printf("Invalid hash '%s' specified\n", name); - return false; - } - - // If you extend these statements by adding a new bitcount/type, you - // need to adjust HASHTYPELIST in util/Instantiate.h also. - if(hInfo->bits == 32) - return test>( hInfo ); - if(hInfo->bits == 64) - return test>( hInfo ); - if(hInfo->bits == 128) - return test>( hInfo ); - if(hInfo->bits == 160) - return test>( hInfo ); - if(hInfo->bits == 224) - return test>( hInfo ); - if(hInfo->bits == 256) - return test>( hInfo ); - - printf("Invalid hash bit width %d for hash '%s'", - hInfo->bits, hInfo->name); - - return false; +static bool testHash( const char * name ) { + const HashInfo * hInfo; + + if ((hInfo = findHash(name)) == NULL) { + printf("Invalid hash '%s' specified\n", name); + return false; + } + + // If you extend these statements by adding a new bitcount/type, you + // need to adjust HASHTYPELIST in util/Instantiate.h also. 
+ if (hInfo->bits == 32) { + return test>(hInfo); + } + if (hInfo->bits == 64) { + return test>(hInfo); + } + if (hInfo->bits == 128) { + return test>(hInfo); + } + if (hInfo->bits == 160) { + return test>(hInfo); + } + if (hInfo->bits == 224) { + return test>(hInfo); + } + if (hInfo->bits == 256) { + return test>(hInfo); + } + + printf("Invalid hash bit width %d for hash '%s'", hInfo->bits, hInfo->name); + + return false; } //----------------------------------------------------------------------------- -static void usage( void ) -{ +static void usage( void ) { printf("Usage: SMHasher3 [--[no]test=[,...]] [--extra] [--seed=]\n" - " [--endian=default|nondefault|native|nonnative|big|little]\n" - " [--verbose] [--vcode] [--ncpu=N] []\n" - "\n" - " SMHasher3 [--list]|[--listnames]|[--tests]|[--version]\n" - "\n" - " Hashnames can be supplied using any case letters.\n" - ); + " [--endian=default|nondefault|native|nonnative|big|little]\n" + " [--verbose] [--vcode] [--ncpu=N] []\n" + "\n" + " SMHasher3 [--list]|[--listnames]|[--tests]|[--version]\n" + "\n" + " Hashnames can be supplied using any case letters.\n"); } #if defined(DEBUG) extern bool blobsort_test_result; #endif -int main ( int argc, const char ** argv ) -{ - setbuf(stdout, NULL); // Unbuffer stdout always - setbuf(stderr, NULL); // Unbuffer stderr always +int main( int argc, const char ** argv ) { + setbuf(stdout, NULL); // Unbuffer stdout always + setbuf(stderr, NULL); // Unbuffer stderr always - if (!isLE() && !isBE()) { - printf("Runtime endian detection failed! Cannot continue\n"); - exit(1); - } + if (!isLE() && !isBE()) { + printf("Runtime endian detection failed! Cannot continue\n"); + exit(1); + } #if defined(DEBUG) - if (!blobsort_test_result) { - printf("Blobsort self-test failed! Cannot continue\n"); - exit(1); - } + if (!blobsort_test_result) { + printf("Blobsort self-test failed! 
Cannot continue\n"); + exit(1); + } #endif - set_default_tests(true); + set_default_tests(true); #if defined(HAVE_32BIT_PLATFORM) - const char * defaulthash = "wyhash-32"; + const char * defaulthash = "wyhash-32"; #else - const char * defaulthash = "xxh3-64"; + const char * defaulthash = "xxh3-64"; #endif - const char * hashToTest = defaulthash; + const char * hashToTest = defaulthash; - if (argc < 2) { - printf("No test hash given on command line, testing %s.\n", hashToTest); - usage(); - } - - for (int argnb = 1; argnb < argc; argnb++) { - const char* const arg = argv[argnb]; - if (strncmp(arg,"--", 2) == 0) { - // This is a command - if (strcmp(arg,"--help") == 0) { + if (argc < 2) { + printf("No test hash given on command line, testing %s.\n", hashToTest); usage(); - exit(0); - } - if (strcmp(arg,"--list") == 0) { - listHashes(false); - exit(0); - } - if (strcmp(arg,"--listnames") == 0) { - listHashes(true); - exit(0); - } - if (strcmp(arg,"--tests") == 0) { - printf("Valid tests:\n"); - for(size_t i = 0; i < sizeof(g_testopts) / sizeof(TestOpts); i++) { - printf(" %s\n", g_testopts[i].name); - } - exit(0); - } - if (strcmp(arg,"--version") == 0) { - printf("SMHasher3 %s\n", VERSION); - exit(0); - } - if (strcmp(arg,"--verbose") == 0) { - g_drawDiagram = true; - continue; - } - if (strcmp(arg,"--extra") == 0) { - g_testExtra = true; - continue; - } - // VCodes allow easy comparison of test results and hash inputs - // and outputs across SMHasher3 runs, hashes (of the same width), - // and systems. 
- if (strcmp(arg,"--vcode") == 0) { - g_doVCode = 1; - VCODE_INIT(); - continue; - } - if (strncmp(arg,"--endian=", 9) == 0) { - g_hashEndian = parse_endian(&arg[9]); - continue; - } - if (strncmp(arg,"--seed=", 7) == 0) { - errno = 0; - char * endptr; - uint64_t seed = strtol(&arg[7], &endptr, 0); - if ((errno != 0) || (arg[7] == '\0') || (*endptr != '\0')) { - printf("Error parsing global seed value \"%s\"\n", &arg[7]); - exit(1); - } - g_seed = seed; - continue; - } - if (strncmp(arg,"--ncpu=", 7) == 0) { + } + + for (int argnb = 1; argnb < argc; argnb++) { + const char * const arg = argv[argnb]; + if (strncmp(arg, "--", 2) == 0) { + // This is a command + if (strcmp(arg, "--help") == 0) { + usage(); + exit(0); + } + if (strcmp(arg, "--list") == 0) { + listHashes(false); + exit(0); + } + if (strcmp(arg, "--listnames") == 0) { + listHashes(true); + exit(0); + } + if (strcmp(arg, "--tests") == 0) { + printf("Valid tests:\n"); + for (size_t i = 0; i < sizeof(g_testopts) / sizeof(TestOpts); i++) { + printf(" %s\n", g_testopts[i].name); + } + exit(0); + } + if (strcmp(arg, "--version") == 0) { + printf("SMHasher3 %s\n", VERSION); + exit(0); + } + if (strcmp(arg, "--verbose") == 0) { + g_drawDiagram = true; + continue; + } + if (strcmp(arg, "--extra") == 0) { + g_testExtra = true; + continue; + } + // VCodes allow easy comparison of test results and hash inputs + // and outputs across SMHasher3 runs, hashes (of the same width), + // and systems. 
+ if (strcmp(arg, "--vcode") == 0) { + g_doVCode = 1; + VCODE_INIT(); + continue; + } + if (strncmp(arg, "--endian=", 9) == 0) { + g_hashEndian = parse_endian(&arg[9]); + continue; + } + if (strncmp(arg, "--seed=", 7) == 0) { + errno = 0; + char * endptr; + uint64_t seed = strtol(&arg[7], &endptr, 0); + if ((errno != 0) || (arg[7] == '\0') || (*endptr != '\0')) { + printf("Error parsing global seed value \"%s\"\n", &arg[7]); + exit(1); + } + g_seed = seed; + continue; + } + if (strncmp(arg, "--ncpu=", 7) == 0) { #if defined(HAVE_THREADS) - errno = 0; - char * endptr; - long int Ncpu = strtol(&arg[7], &endptr, 0); - if ((errno != 0) || (arg[7] == '\0') || (*endptr != '\0') || (Ncpu < 1)) { - printf("Error parsing cpu number \"%s\"\n", &arg[7]); - exit(1); - } - if (Ncpu > 32) { - printf("WARNING: limiting to 32 threads\n"); - Ncpu = 32; - } - g_NCPU = Ncpu; - continue; + errno = 0; + char * endptr; + long int Ncpu = strtol(&arg[7], &endptr, 0); + if ((errno != 0) || (arg[7] == '\0') || (*endptr != '\0') || (Ncpu < 1)) { + printf("Error parsing cpu number \"%s\"\n", &arg[7]); + exit(1); + } + if (Ncpu > 32) { + printf("WARNING: limiting to 32 threads\n"); + Ncpu = 32; + } + g_NCPU = Ncpu; + continue; #else - printf("WARNING: compiled without threads; ignoring --ncpu\n"); - continue; + printf("WARNING: compiled without threads; ignoring --ncpu\n"); + continue; #endif - } - if (strncmp(arg,"--test=", 6) == 0) { - // If a list of tests is given, only test those - g_testAll = false; - set_default_tests(false); - parse_tests(&arg[7], true); - continue; - } - if (strncmp(arg,"--notest=", 8) == 0) { - parse_tests(&arg[9], false); - continue; - } - if (strcmp(arg,"--EstimateNbCollisions") == 0) { - ReportCollisionEstimates(); - exit(0); - } - // invalid command - printf("Invalid command \n"); - usage(); - exit(1); - } - // Not a command ? 
=> interpreted as hash name - hashToTest = arg; - } - - size_t timeBegin = monotonic_clock(); - - if (g_testVerifyAll) { - HashSelfTestAll(g_drawDiagram); - } else if (g_testSanityAll) { - HashSanityTestAll(); - } else if (g_testSpeedAll) { - HashSpeedTestAll(); - } else { - testHash(hashToTest); - } - - size_t timeEnd = monotonic_clock(); - - uint32_t vcode = VCODE_FINALIZE(); - - FILE * outfile = g_testAll ? stdout : stderr; - - if (g_doVCode) { - fprintf(outfile, - "Input vcode 0x%08x, Output vcode 0x%08x, Result vcode 0x%08x\n", - g_inputVCode, g_outputVCode, g_resultVCode); - } - - fprintf(outfile, - "Verification value is 0x%08x - Testing took %f seconds\n\n", - vcode, (double)(timeEnd-timeBegin)/(double)NSEC_PER_SEC); - - return 0; + } + if (strncmp(arg, "--test=", 6) == 0) { + // If a list of tests is given, only test those + g_testAll = false; + set_default_tests(false); + parse_tests(&arg[7], true); + continue; + } + if (strncmp(arg, "--notest=", 8) == 0) { + parse_tests(&arg[9], false); + continue; + } + if (strcmp(arg, "--EstimateNbCollisions") == 0) { + ReportCollisionEstimates(); + exit(0); + } + // invalid command + printf("Invalid command \n"); + usage(); + exit(1); + } + // Not a command ? => interpreted as hash name + hashToTest = arg; + } + + size_t timeBegin = monotonic_clock(); + + if (g_testVerifyAll) { + HashSelfTestAll(g_drawDiagram); + } else if (g_testSanityAll) { + HashSanityTestAll(); + } else if (g_testSpeedAll) { + HashSpeedTestAll(); + } else { + testHash(hashToTest); + } + + size_t timeEnd = monotonic_clock(); + + uint32_t vcode = VCODE_FINALIZE(); + + FILE * outfile = g_testAll ? 
stdout : stderr; + + if (g_doVCode) { + fprintf(outfile, "Input vcode 0x%08x, Output vcode 0x%08x, Result vcode 0x%08x\n", + g_inputVCode, g_outputVCode, g_resultVCode); + } + + fprintf(outfile, "Verification value is 0x%08x - Testing took %f seconds\n\n", + vcode, (double)(timeEnd - timeBegin) / (double)NSEC_PER_SEC); + + return 0; } diff --git a/misc/exactcoll.c b/misc/exactcoll.c index cda5a0a1..739d2c3b 100644 --- a/misc/exactcoll.c +++ b/misc/exactcoll.c @@ -53,36 +53,36 @@ #define EXTRA_DIGITS 0 #if !defined(USE_MPFI) -#if !defined(USE_MPFR) -#error "Exactly one of USE_MPFI and USE_MPFR must be defined" -#endif + #if !defined(USE_MPFR) + #error "Exactly one of USE_MPFI and USE_MPFR must be defined" + #endif #endif #if defined(USE_MPFI) -#if defined(USE_MPFR) -#error "Exactly one of USE_MPFI and USE_MPFR must be defined" -#endif + #if defined(USE_MPFR) + #error "Exactly one of USE_MPFI and USE_MPFR must be defined" + #endif #endif #if defined(USE_MPFI) -#include -#include + #include + #include typedef mpfi_t mp_t; #else -#include + #include typedef mpfr_t mp_t; #endif -char buf[3*PRECISION]; +char buf[3 * PRECISION]; FILE * membuf; #if defined(USE_MPFI) -#define MP(x,...) mpfi_##x(__VA_ARGS__) + #define MP(x, ...) mpfi_ ## x(__VA_ARGS__) #else -#define MP(x,...) mpfr_##x(__VA_ARGS__, MPFR_RNDN) + #define MP(x, ...) mpfr_ ## x(__VA_ARGS__, MPFR_RNDN) #endif -void printcoll(uint64_t balls, uint64_t log2bins) { +void printcoll( uint64_t balls, uint64_t log2bins ) { mp_t m, n, p, e, f, c; #if defined(USE_MPFI) @@ -155,7 +155,7 @@ void printcoll(uint64_t balls, uint64_t log2bins) { * represent). 
*/ double lb = strtod(&buf[1], NULL); - double ub = strtod(strchr(buf, ',')+1, NULL); + double ub = strtod(strchr(buf, ',') + 1, NULL); if (lb != ub) { printf("BOUNDS DO NOT MATCH TO DOUBLE PRECISION!\n"); printf("Increase PRECISION and recompile.\n"); @@ -190,7 +190,7 @@ void printcoll(uint64_t balls, uint64_t log2bins) { #endif } -int main(void) { +int main( void ) { mpfr_set_default_prec(PRECISION); membuf = fmemopen(buf, sizeof(buf), "w"); @@ -208,9 +208,9 @@ int main(void) { 264097, 204800, 200000, 102774, 100000, 77163, 50643, 6 }; - const uint64_t bits[] = {256, 224, 160, 128, 64, 55, 45, 42, 39, 36, 32, 29, 27, 24, 22, 19, 12, 8}; - const uint64_t keycnt = sizeof(keys)/sizeof(keys[0]); - const uint64_t bitcnt = sizeof(bits)/sizeof(bits[0]); + const uint64_t bits[] = { 256, 224, 160, 128, 64, 55, 45, 42, 39, 36, 32, 29, 27, 24, 22, 19, 12, 8 }; + const uint64_t keycnt = sizeof(keys) / sizeof(keys[0]); + const uint64_t bitcnt = sizeof(bits) / sizeof(bits[0]); printf("double realcoll[%d][%d] = {\n", keycnt, bitcnt); @@ -222,12 +222,11 @@ int main(void) { printcoll(key, bit); if (j == bitcnt - 1) { printf(" },\n"); - } else if ((j%3)==2) { + } else if ((j % 3) == 2) { printf(",\n "); } else { printf(", "); } - } } diff --git a/tests/AvalancheTest.cpp b/tests/AvalancheTest.cpp index 8ed3e45e..dc07dbe4 100644 --- a/tests/AvalancheTest.cpp +++ b/tests/AvalancheTest.cpp @@ -59,11 +59,11 @@ // VCode might have already included this #if defined(HAVE_AVX2) || defined(HAVE_SSE_4_1) -#include "Intrinsics.h" + #include "Intrinsics.h" #endif #if defined(HAVE_THREADS) -#include + #include typedef std::atomic a_int; #else typedef int a_int; @@ -71,52 +71,48 @@ typedef int a_int; //----------------------------------------------------------------------------- -static void PrintAvalancheDiagram ( int x, int y, int reps, double scale, uint32_t * bins ) -{ - const char * symbols = ".123456789X"; +static void PrintAvalancheDiagram( int x, int y, int reps, double scale, uint32_t * 
bins ) { + const char * symbols = ".123456789X"; - for(int i = 0; i < y; i++) - { - printf("["); - for(int j = 0; j < x; j++) - { - int k = (y - i) -1; + for (int i = 0; i < y; i++) { + printf("["); + for (int j = 0; j < x; j++) { + int k = (y - i) - 1; - uint32_t bin = bins[k + (j*y)]; + uint32_t bin = bins[k + (j * y)]; - double b = double(bin) / double(reps); - b = fabs(b*2 - 1); + double b = double(bin) / double(reps); + b = fabs(b * 2 - 1); - b *= scale; + b *= scale; - int s = (int)floor(b*10); + int s = (int)floor(b * 10); - if(s > 10) s = 10; - if(s < 0) s = 0; + if (s > 10) { s = 10; } + if (s < 0) { s = 0; } - printf("%c",symbols[s]); - } + printf("%c", symbols[s]); + } - printf("]\n"); - fflush(NULL); - } + printf("]\n"); + fflush(NULL); + } } //---------------------------------------------------------------------------- -static int maxBias ( uint32_t * counts, int buckets, int reps ) -{ - int expected = reps / 2; - int worst = 0; +static int maxBias( uint32_t * counts, int buckets, int reps ) { + int expected = reps / 2; + int worst = 0; - for(int i = 0; i < buckets; i++) - { - int c = abs((int)counts[i] - expected); - if(worst < c) - worst = c; - } + for (int i = 0; i < buckets; i++) { + int c = abs((int)counts[i] - expected); + if (worst < c) { + worst = c; + } + } - return worst; + return worst; } //----------------------------------------------------------------------------- @@ -127,217 +123,207 @@ static int maxBias ( uint32_t * counts, int buckets, int reps ) // cause "echoes" of the patterns in the output, which in turn can cause the // hash function to fail to create an even, random distribution of hash values. 
-template < typename hashtype > -static void calcBiasRange ( const HashFn hash, const seed_t seed, - std::vector &bins, - const int keybytes, const uint8_t * keys, - a_int & irepp, const int reps, const bool verbose ) -{ - const int keybits = keybytes * 8; - const int hashbytes = sizeof(hashtype); +template +static void calcBiasRange( const HashFn hash, const seed_t seed, std::vector & bins, const int keybytes, + const uint8_t * keys, a_int & irepp, const int reps, const bool verbose ) { + const int keybits = keybytes * 8; + const int hashbytes = sizeof(hashtype); + #if defined(HAVE_AVX2) - const __m256i ONE = _mm256_set1_epi32(1); - const __m256i MASK = _mm256_setr_epi32( - 1 << 0, - 1 << 1, - 1 << 2, - 1 << 3, - 1 << 4, - 1 << 5, - 1 << 6, - 1 << 7); + const __m256i ONE = _mm256_set1_epi32(1); + const __m256i MASK = _mm256_setr_epi32(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7); #elif defined(HAVE_SSE_4_1) - const __m128i ONE = _mm_set1_epi32(1); - const __m128i MASK = _mm_setr_epi32( - 1 << 0, - 1 << 1, - 1 << 2, - 1 << 3); + const __m128i ONE = _mm_set1_epi32(1); + const __m128i MASK = _mm_setr_epi32(1 << 0, 1 << 1, 1 << 2, 1 << 3); #endif - uint8_t buf[keybytes]; - hashtype A,B; - int irep; + uint8_t buf[keybytes]; + hashtype A, B; + int irep; - while ((irep = irepp++) < reps) - { - if(verbose) { - if(irep % (reps/10) == 0) printf("."); - } + while ((irep = irepp++) < reps) { + if (verbose) { + if (irep % (reps / 10) == 0) { printf("."); } + } - ExtBlob K(buf, &keys[keybytes * irep], keybytes); - hash(K, keybytes, seed, &A); + ExtBlob K( buf, &keys[keybytes * irep], keybytes ); + hash(K, keybytes, seed, &A); - uint32_t * cursor = &bins[0]; + uint32_t * cursor = &bins[0]; - for(int iBit = 0; iBit < keybits; iBit++) - { - K.flipbit(iBit); - hash(K, keybytes, seed, &B); - K.flipbit(iBit); + for (int iBit = 0; iBit < keybits; iBit++) { + K.flipbit(iBit); + hash(K, keybytes, seed, &B); + K.flipbit(iBit); - B ^= A; + B ^= A; #if defined(HAVE_AVX2) 
- for(int oWord = 0; oWord < (hashbytes/4); oWord++) { - // Get the next 32-bit chunk of the hash difference - uint32_t word; - memcpy(&word, ((const uint8_t *)&B) + 4*oWord, 4); - - // Expand it out into 4 sets of 8 32-bit integer words, with - // each integer being zero or one. - __m256i base = _mm256_set1_epi32(word); - __m256i incr1 =_mm256_min_epu32(_mm256_and_si256(base, MASK), ONE); - base = _mm256_srli_epi32(base, 8); - __m256i incr2 =_mm256_min_epu32(_mm256_and_si256(base, MASK), ONE); - base = _mm256_srli_epi32(base, 8); - __m256i incr3 =_mm256_min_epu32(_mm256_and_si256(base, MASK), ONE); - base = _mm256_srli_epi32(base, 8); - __m256i incr4 =_mm256_min_epu32(_mm256_and_si256(base, MASK), ONE); - - // Add these into the counts in bins[] - __m256i cnt1 = _mm256_loadu_si256((const __m256i *)cursor); - cnt1 = _mm256_add_epi32(cnt1, incr1); - _mm256_storeu_si256((__m256i *)cursor, cnt1); - cursor += 8; - __m256i cnt2 = _mm256_loadu_si256((const __m256i *)cursor); - cnt2 = _mm256_add_epi32(cnt2, incr2); - _mm256_storeu_si256((__m256i *)cursor, cnt2); - cursor += 8; - __m256i cnt3 = _mm256_loadu_si256((const __m256i *)cursor); - cnt3 = _mm256_add_epi32(cnt3, incr3); - _mm256_storeu_si256((__m256i *)cursor, cnt3); - cursor += 8; - __m256i cnt4 = _mm256_loadu_si256((const __m256i *)cursor); - cnt4 = _mm256_add_epi32(cnt4, incr4); - _mm256_storeu_si256((__m256i *)cursor, cnt4); - cursor += 8; - } + for (int oWord = 0; oWord < (hashbytes / 4); oWord++) { + // Get the next 32-bit chunk of the hash difference + uint32_t word; + memcpy(&word, ((const uint8_t *)&B) + 4 * oWord, 4); + + // Expand it out into 4 sets of 8 32-bit integer words, with + // each integer being zero or one. 
+ __m256i base = _mm256_set1_epi32(word); + __m256i incr1 = _mm256_min_epu32(_mm256_and_si256(base, MASK), ONE); + base = _mm256_srli_epi32(base, 8); + __m256i incr2 = _mm256_min_epu32(_mm256_and_si256(base, MASK), ONE); + base = _mm256_srli_epi32(base, 8); + __m256i incr3 = _mm256_min_epu32(_mm256_and_si256(base, MASK), ONE); + base = _mm256_srli_epi32(base, 8); + __m256i incr4 = _mm256_min_epu32(_mm256_and_si256(base, MASK), ONE); + + // Add these into the counts in bins[] + __m256i cnt1 = _mm256_loadu_si256((const __m256i *)cursor); + cnt1 = _mm256_add_epi32(cnt1, incr1); + _mm256_storeu_si256((__m256i *)cursor, cnt1); + cursor += 8; + __m256i cnt2 = _mm256_loadu_si256((const __m256i *)cursor); + cnt2 = _mm256_add_epi32(cnt2, incr2); + _mm256_storeu_si256((__m256i *)cursor, cnt2); + cursor += 8; + __m256i cnt3 = _mm256_loadu_si256((const __m256i *)cursor); + cnt3 = _mm256_add_epi32(cnt3, incr3); + _mm256_storeu_si256((__m256i *)cursor, cnt3); + cursor += 8; + __m256i cnt4 = _mm256_loadu_si256((const __m256i *)cursor); + cnt4 = _mm256_add_epi32(cnt4, incr4); + _mm256_storeu_si256((__m256i *)cursor, cnt4); + cursor += 8; + } #elif defined(HAVE_SSE_4_1) - for(int oWord = 0; oWord < (hashbytes/4); oWord++) { - // Get the next 32-bit chunk of the hash difference - uint32_t word; - memcpy(&word, ((const uint8_t *)&B) + 4*oWord, 4); - - // Expand it out into 8 sets of 4 32-bit integer words, with - // each integer being zero or one, and add them into the - // counts in bins[]. 
- __m128i base = _mm_set1_epi32(word); - for (int i = 0; i < 8; i++) { - __m128i incr = _mm_min_epu32(_mm_and_si128(base, MASK), ONE); - __m128i cnt = _mm_loadu_si128((const __m128i *)cursor); - cnt = _mm_add_epi32(cnt, incr); - _mm_storeu_si128((__m128i *)cursor, cnt); - base = _mm_srli_epi32(base, 4); - cursor += 4; - } - } + for (int oWord = 0; oWord < (hashbytes / 4); oWord++) { + // Get the next 32-bit chunk of the hash difference + uint32_t word; + memcpy(&word, ((const uint8_t *)&B) + 4 * oWord, 4); + + // Expand it out into 8 sets of 4 32-bit integer words, with + // each integer being zero or one, and add them into the + // counts in bins[]. + __m128i base = _mm_set1_epi32(word); + for (int i = 0; i < 8; i++) { + __m128i incr = _mm_min_epu32(_mm_and_si128(base, MASK), ONE); + __m128i cnt = _mm_loadu_si128((const __m128i *)cursor); + cnt = _mm_add_epi32(cnt, incr); + _mm_storeu_si128((__m128i *)cursor, cnt); + base = _mm_srli_epi32(base, 4); + cursor += 4; + } + } #else - for(int oByte = 0; oByte < hashbytes; oByte++) { - uint8_t byte = B[oByte]; - for(int oBit = 0; oBit < 8; oBit++) { - (*cursor++) += byte & 1; - byte >>= 1; - } - } + for (int oByte = 0; oByte < hashbytes; oByte++) { + uint8_t byte = B[oByte]; + for (int oBit = 0; oBit < 8; oBit++) { + (*cursor++) += byte & 1; + byte >>= 1; + } + } #endif + } } - } } //----------------------------------------------------------------------------- -template < typename hashtype > -static bool AvalancheImpl(HashFn hash, const seed_t seed, const int keybits, - const int reps, bool drawDiagram, bool drawdots) { - Rand r(48273); +template +static bool AvalancheImpl( HashFn hash, const seed_t seed, const int keybits, + const int reps, bool drawDiagram, bool drawdots ) { + Rand r( 48273 ); - assert((keybits & 7)==0); + assert((keybits & 7) == 0); - const int keybytes = keybits / 8; + const int keybytes = keybits / 8; - const int hashbytes = sizeof(hashtype); - const int hashbits = hashbytes * 8; + const int 
hashbytes = sizeof(hashtype); + const int hashbits = hashbytes * 8; - const int arraysize = keybits * hashbits; + const int arraysize = keybits * hashbits; - printf("Testing %4d-bit keys -> %3d-bit hashes, %6d reps", - keybits, hashbits, reps); - //---------- - std::vector keys(reps * keybytes); - for (int i = 0; i < reps; i++) - r.rand_p(&keys[i*keybytes],keybytes); - addVCodeInput(&keys[0], reps * keybytes); + printf("Testing %4d-bit keys -> %3d-bit hashes, %6d reps", keybits, hashbits, reps); + //---------- + std::vector keys( reps * keybytes ); + for (int i = 0; i < reps; i++) { + r.rand_p(&keys[i * keybytes], keybytes); + } + addVCodeInput(&keys[0], reps * keybytes); - a_int irep(0); + a_int irep( 0 ); - std::vector > bins(g_NCPU); - for (unsigned i = 0; i < g_NCPU; i++) { - bins[i].resize(arraysize); - } + std::vector> bins( g_NCPU ); + for (unsigned i = 0; i < g_NCPU; i++) { + bins[i].resize(arraysize); + } - if (g_NCPU == 1) { - calcBiasRange(hash,seed,bins[0],keybytes,&keys[0],irep,reps,drawdots); - } else { + if (g_NCPU == 1) { + calcBiasRange(hash, seed, bins[0], keybytes, &keys[0], irep, reps, drawdots); + } else { #if defined(HAVE_THREADS) - std::thread t[g_NCPU]; - for (int i=0; i < g_NCPU; i++) { - t[i] = std::thread {calcBiasRange,hash,seed,std::ref(bins[i]),keybytes,&keys[0],std::ref(irep),reps,drawdots}; - } - for (int i=0; i < g_NCPU; i++) { - t[i].join(); - } - for (int i=1; i < g_NCPU; i++) - for (int b=0; b < arraysize; b++) - bins[0][b] += bins[i][b]; + std::thread t[g_NCPU]; + for (int i = 0; i < g_NCPU; i++) { + t[i] = std::thread { + calcBiasRange, hash, seed, std::ref(bins[i]), + keybytes, &keys[0], std::ref(irep), reps, drawdots + }; + } + for (int i = 0; i < g_NCPU; i++) { + t[i].join(); + } + for (int i = 1; i < g_NCPU; i++) { + for (int b = 0; b < arraysize; b++) { + bins[0][b] += bins[i][b]; + } + } #endif - } + } - //---------- + //---------- - int bias = maxBias(&bins[0][0], arraysize, reps); - bool result = true; + int bias = 
maxBias(&bins[0][0], arraysize, reps); + bool result = true; - // Due to threading and memory complications, add the summed - // avalanche results instead of the hash values. Not ideal, but the - // "real" way is just too expensive. - addVCodeOutput(&bins[0][0], arraysize * sizeof(bins[0][0])); - addVCodeResult(bias); + // Due to threading and memory complications, add the summed + // avalanche results instead of the hash values. Not ideal, but the + // "real" way is just too expensive. + addVCodeOutput(&bins[0][0], arraysize * sizeof(bins[0][0])); + addVCodeResult(bias); - result &= ReportBias(bias, reps, arraysize, drawDiagram); + result &= ReportBias(bias, reps, arraysize, drawDiagram); - recordTestResult(result, "Avalanche", keybits); + recordTestResult(result, "Avalanche", keybits); - return result; + return result; } //----------------------------------------------------------------------------- -template < typename hashtype > -bool AvalancheTest(const HashInfo * hinfo, const bool verbose, const bool extra) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - bool result = true; - bool drawdots = true; //.......... progress dots +template +bool AvalancheTest( const HashInfo * hinfo, const bool verbose, const bool extra ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + bool result = true; + bool drawdots = true; // .......... 
progress dots printf("[[[ Avalanche Tests ]]]\n\n"); const seed_t seed = hinfo->Seed(g_seed, false, 2); - std::vector testBitsvec = - { 24, 32, 40, 48, 56, 64, 72, 80, 96, 112, 128, 160 }; + std::vector testBitsvec = { 24, 32, 40, 48, 56, 64, 72, 80, 96, 112, 128, 160 }; testBitsvec.reserve(50); // Workaround for GCC bug 100366 if (hinfo->bits <= 64) { testBitsvec.insert(testBitsvec.end(), { 512, 1024 }); } if (extra) { - testBitsvec.insert(testBitsvec.end(), { 192, 224, 256, 320, 384, 448, 512, 640, - 768, 896, 1024, 1280, 1536 }); + testBitsvec.insert(testBitsvec.end(), { + 192, 224, 256, 320, 384, 448, 512, 640, + 768, 896, 1024, 1280, 1536 + }); } std::sort(testBitsvec.begin(), testBitsvec.end()); testBitsvec.erase(std::unique(testBitsvec.begin(), testBitsvec.end()), testBitsvec.end()); - for (int testBits : testBitsvec) { - result &= AvalancheImpl(hash,seed,testBits,300000,verbose,drawdots); + for (int testBits: testBitsvec) { + result &= AvalancheImpl(hash, seed, testBits, 300000, verbose, drawdots); } printf("\n%s\n", result ? "" : g_failstr); diff --git a/tests/AvalancheTest.h b/tests/AvalancheTest.h index 8e4ceb82..a61f87f6 100644 --- a/tests/AvalancheTest.h +++ b/tests/AvalancheTest.h @@ -44,5 +44,5 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -template < typename hashtype > -bool AvalancheTest(const HashInfo * info, const bool verbose, const bool extra); +template +bool AvalancheTest( const HashInfo * info, const bool verbose, const bool extra ); diff --git a/tests/BadSeedsTest.cpp b/tests/BadSeedsTest.cpp index 0ca787b0..e054ca7a 100644 --- a/tests/BadSeedsTest.cpp +++ b/tests/BadSeedsTest.cpp @@ -56,27 +56,28 @@ #include "BadSeedsTest.h" #if defined(HAVE_THREADS) -#include -#include -#include + #include + #include + #include #endif //----------------------------------------------------------------------------- // Find bad seeds, and test against the known secrets/bad seeds. 
-template< typename hashtype > -static bool TestSeed(const HashInfo * hinfo, const seed_t seed) { +template +static bool TestSeed( const HashInfo * hinfo, const seed_t seed ) { const HashFn hash = hinfo->hashFn(g_hashEndian); - const std::vector testlens = {1,2,4,8,12,16,32,64,128}; - const std::vector testbytes = {0,32,'0',127,128,255}; - const unsigned numtestbytes = testbytes.size(); - const hashtype zero = {0}; - std::vector hashes(numtestbytes); - std::set dummy_collisions; + const std::vector testlens = { 1, 2, 4, 8, 12, 16, 32, 64, 128 }; + const std::vector testbytes = { 0, 32, '0', 127, 128, 255 }; + const unsigned numtestbytes = testbytes.size(); + const hashtype zero = { 0 }; + std::vector hashes( numtestbytes ); + std::set dummy_collisions; bool result = true; - if (hinfo->is32BitSeed() && (seed > UINT64_C(0xffffffff))) + if (hinfo->is32BitSeed() && (seed > UINT64_C(0xffffffff))) { return true; + } /* Premake all the test keys */ uint8_t keys[numtestbytes][128]; @@ -87,14 +88,13 @@ static bool TestSeed(const HashInfo * hinfo, const seed_t seed) { printf("0x%" PRIx64 "\n", seed); const seed_t hseed = hinfo->Seed(seed, true); - for (int len : testlens) { + for (int len: testlens) { memset(&hashes[0], 0, numtestbytes * sizeof(hashtype)); for (int i = 0; i < numtestbytes; i++) { hash(&keys[i][0], len, hseed, &hashes[i]); if (hashes[0] == zero) { - printf("Confirmed broken seed 0x%" PRIx64 " => hash of 0" \ - " with key[%d] of all 0x%02x\n", - seed, len, testbytes[i]); + printf("Confirmed broken seed 0x%" PRIx64 " => hash of 0" + " with key[%d] of all 0x%02x\n", seed, len, testbytes[i]); result = false; } } @@ -111,14 +111,14 @@ static bool TestSeed(const HashInfo * hinfo, const seed_t seed) { return result; } -template< typename hashtype > -static bool BadSeedsKnown(const HashInfo * hinfo) { +template +static bool BadSeedsKnown( const HashInfo * hinfo ) { bool result = true; const std::set & seeds = hinfo->badseeds; printf("Testing %" PRIu64 " known bad 
seeds:\n", seeds.size()); - for (seed_t seed : seeds) { + for (seed_t seed: seeds) { bool thisresult = true; thisresult &= TestSeed(hinfo, seed); if (!hinfo->is32BitSeed() && (seed <= 0xffffffff) && (seed != 0)) { @@ -145,38 +145,38 @@ static bool BadSeedsKnown(const HashInfo * hinfo) { #if defined(HAVE_THREADS) // For keeping track of progress printouts across threads static std::atomic seed_progress; -static std::mutex print_mutex; +static std::mutex print_mutex; #else static unsigned seed_progress; #endif // Process part of a 2^32 range, split into g_NCPU threads -template< typename hashtype > -static void TestSeedRangeThread(const HashInfo * hinfo, const uint64_t hi, - const uint32_t start, const uint32_t endlow, - bool &result, bool &newresult) { +template +static void TestSeedRangeThread( const HashInfo * hinfo, const uint64_t hi, const uint32_t start, + const uint32_t endlow, bool & result, bool & newresult ) { const std::set & seeds = hinfo->badseeds; - const HashFn hash = hinfo->hashFn(g_hashEndian); - const seed_t last = hi | endlow; - const hashtype zero = {0}; - //static_assert(testbytes[0] == 0, "Code assumes first test byte is 0"); - const std::vector testbytes = {0,32,127,255}; - const unsigned numtestbytes = testbytes.size(); - std::vector hashes(numtestbytes); - std::set collisions; + const HashFn hash = hinfo->hashFn(g_hashEndian); + const seed_t last = hi | endlow; + const hashtype zero = { 0 }; + // static_assert(testbytes[0] == 0, "Code assumes first test byte is 0"); + const std::vector testbytes = { 0, 32, 127, 255 }; + const unsigned numtestbytes = testbytes.size(); + std::vector hashes( numtestbytes ); + std::set collisions; const char * progress_fmt = - (last <= UINT64_C(0xffffffff)) ? - "%8" PRIx64 "%c" : "%16" PRIx64 "%c"; + (last <= UINT64_C(0xffffffff)) ? + "%8" PRIx64 "%c" : "%16" PRIx64 "%c"; const uint64_t progress_nl_every = - (last <= UINT64_C(0xffffffff)) ? 8 : 4; + (last <= UINT64_C(0xffffffff)) ? 
8 : 4; int fails = 0; + result = true; { #if defined(HAVE_THREADS) - std::lock_guard lock(print_mutex); + std::lock_guard lock( print_mutex ); #endif printf("Testing [0x%016" PRIx64 ", 0x%016" PRIx64 "] ... \n", hi | start, last); } @@ -197,11 +197,11 @@ static void TestSeedRangeThread(const HashInfo * hinfo, const uint64_t hi, */ if ((seed & UINT64_C(0x1ffffff)) == UINT64_C(0x1ffffff)) { #if defined(HAVE_THREADS) - std::lock_guard lock(print_mutex); + std::lock_guard lock( print_mutex ); #endif - unsigned count = ++seed_progress; + unsigned count = ++seed_progress; const char spacer = ((count % progress_nl_every) == 0) ? '\n' : ' '; - printf (progress_fmt, seed, spacer); + printf(progress_fmt, seed, spacer); } /* Test the next seed against 16 copies of each test byte */ @@ -216,7 +216,7 @@ static void TestSeedRangeThread(const HashInfo * hinfo, const uint64_t hi, bool known_seed = (std::find(seeds.begin(), seeds.end(), seed) != seeds.end()); { #if defined(HAVE_THREADS) - std::lock_guard lock(print_mutex); + std::lock_guard lock( print_mutex ); #endif if (known_seed) { printf("\nVerified broken seed 0x%" PRIx64 " => 0 with key[16] of all 0 bytes\n", seed); @@ -234,7 +234,7 @@ static void TestSeedRangeThread(const HashInfo * hinfo, const uint64_t hi, /* Report if any collisions were found */ if (FindCollisions(hashes, collisions, 1000, true) > 0) { #if defined(HAVE_THREADS) - std::lock_guard lock(print_mutex); + std::lock_guard lock( print_mutex ); #endif bool known_seed = (std::find(seeds.begin(), seeds.end(), seed) != seeds.end()); if (known_seed) { @@ -255,75 +255,79 @@ static void TestSeedRangeThread(const HashInfo * hinfo, const uint64_t hi, } collisions.clear(); result = false; - if (!known_seed) + if (!known_seed) { newresult = true; + } } } while (seed++ != last); - out: + out: return; } // Test a full 2**32 range [hi + 0, hi + 0xffffffff]. // If no new bad seed is found, then newresult must be left unchanged. 
-template< typename hashtype > -static bool TestManySeeds(const HashInfo * hinfo, const uint64_t hi, bool &newresult) { - bool result = true; - seed_progress = 0; - - if (g_NCPU == 1) { - TestSeedRangeThread(hinfo, hi, 0x0, 0xffffffff, result, newresult); - printf("\n"); - } else { +template +static bool TestManySeeds( const HashInfo * hinfo, const uint64_t hi, bool & newresult ) { + bool result = true; + + seed_progress = 0; + + if (g_NCPU == 1) { + TestSeedRangeThread(hinfo, hi, 0x0, 0xffffffff, result, newresult); + printf("\n"); + } else { #if defined(HAVE_THREADS) - // split into g_NCPU threads - std::thread t[g_NCPU]; - const uint64_t len = UINT64_C(0x100000000) / g_NCPU; - // Can't make VLAs in C++, so have to use vectors, but can't - // pass a ref of a bool in a vector to a thread... :-< - bool * results = new bool[g_NCPU](); - bool * newresults = new bool[g_NCPU](); - - printf("%d threads starting...\n", g_NCPU); - for (int i=0; i < g_NCPU; i++) { - const uint32_t start = i * len; - const uint32_t end = (i < (g_NCPU - 1)) ? start + (len - 1) : 0xffffffff; - t[i] = std::thread {TestSeedRangeThread, hinfo, hi, start, end, - std::ref(results[i]), std::ref(newresults[i])}; - } - - std::this_thread::sleep_for(std::chrono::seconds(1)); - - for (int i=0; i < g_NCPU; i++) { - t[i].join(); - } - - printf("All %d threads ended\n", g_NCPU); - - for (int i=0; i < g_NCPU; i++) { - result &= results[i]; - newresult |= newresults[i]; - } - - delete [] results; - delete [] newresults; + // split into g_NCPU threads + std::thread t[g_NCPU]; + const uint64_t len = UINT64_C(0x100000000) / g_NCPU; + // Can't make VLAs in C++, so have to use vectors, but can't + // pass a ref of a bool in a vector to a thread... :-< + bool * results = new bool[g_NCPU](); + bool * newresults = new bool[g_NCPU](); + + printf("%d threads starting...\n", g_NCPU); + for (int i = 0; i < g_NCPU; i++) { + const uint32_t start = i * len; + const uint32_t end = (i < (g_NCPU - 1)) ? 
start + (len - 1) : 0xffffffff; + t[i] = std::thread { + TestSeedRangeThread, hinfo, hi, start, end, + std::ref(results[i]), std::ref(newresults[i]) + }; + } + + std::this_thread::sleep_for(std::chrono::seconds(1)); + + for (int i = 0; i < g_NCPU; i++) { + t[i].join(); + } + + printf("All %d threads ended\n", g_NCPU); + + for (int i = 0; i < g_NCPU; i++) { + result &= results[i]; + newresult |= newresults[i]; + } + + delete [] results; + delete [] newresults; #endif - } + } - // Since this can be threaded, just use the test parameters for the - // VCode input data. - addVCodeInput(hi); // hi - addVCodeInput(0); // lo start - addVCodeInput(0xffffffff); // lo end - // Nothing to add to VCodeOutput - addVCodeResult(result); + // Since this can be threaded, just use the test parameters for the + // VCode input data. + addVCodeInput( hi); // hi + addVCodeInput( 0); // lo start + addVCodeInput(0xffffffff); // lo end + // Nothing to add to VCodeOutput + addVCodeResult(result); - return result; + return result; } -template< typename hashtype > -static bool BadSeedsFind(const HashInfo * hinfo) { - bool result = true; +template +static bool BadSeedsFind( const HashInfo * hinfo ) { + bool result = true; bool newresult = false; printf("Testing the first 2**32 seeds ...\n"); @@ -347,10 +351,10 @@ static bool BadSeedsFind(const HashInfo * hinfo) { } //----------------------------------------------------------------------------- -template < typename hashtype > -bool BadSeedsTest(const HashInfo * hinfo, bool find_new_seeds) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - bool result = true; +template +bool BadSeedsTest( const HashInfo * hinfo, bool find_new_seeds ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + bool result = true; // Never find new bad seeds for mock hashes, except for aesrng if (hinfo->isMock() && (strncmp(hinfo->name, "aesrng", 6) != 0)) { diff --git a/tests/BadSeedsTest.h b/tests/BadSeedsTest.h index d500b4ec..29f679f3 100644 --- 
a/tests/BadSeedsTest.h +++ b/tests/BadSeedsTest.h @@ -47,5 +47,5 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -template < typename hashtype > -bool BadSeedsTest(const HashInfo * info, const bool find_new_seeds); +template +bool BadSeedsTest( const HashInfo * info, const bool find_new_seeds ); diff --git a/tests/BitIndependenceTest.cpp b/tests/BitIndependenceTest.cpp index e2d66f4c..c140315a 100644 --- a/tests/BitIndependenceTest.cpp +++ b/tests/BitIndependenceTest.cpp @@ -63,139 +63,130 @@ // The choices for VCode inputs may seem strange here, but they were // chosen in anticipation of threading this test. -template< typename keytype, typename hashtype > -static bool BicTest3(HashFn hash, const seed_t seed, const int reps, bool verbose = false ) -{ - const int keybytes = sizeof(keytype); - const int keybits = keybytes * 8; - const int hashbytes = sizeof(hashtype); - const int hashbits = hashbytes * 8; - const int pagesize = hashbits*hashbits*4; +template +static bool BicTest3( HashFn hash, const seed_t seed, const int reps, bool verbose = false ) { + const int keybytes = sizeof(keytype); + const int keybits = keybytes * 8; + const int hashbytes = sizeof(hashtype); + const int hashbits = hashbytes * 8; + const int pagesize = hashbits * hashbits * 4; - Rand r(11938); + Rand r( 11938 ); - double maxBias = 0; - int maxK = 0; - int maxA = 0; - int maxB = 0; + double maxBias = 0; + int maxK = 0; + int maxA = 0; + int maxB = 0; - keytype key; - hashtype h1,h2; + keytype key; + hashtype h1, h2; - std::vector bins(keybits*pagesize,0); + std::vector bins( keybits * pagesize, 0 ); - for(int keybit = 0; keybit < keybits; keybit++) - { - if(keybit % (keybits/10) == 0) printf("."); + for (int keybit = 0; keybit < keybits; keybit++) { + if (keybit % (keybits / 10) == 0) { printf("."); } - int * page = &bins[keybit*pagesize]; + int * page = &bins[keybit * pagesize]; - for(int irep = 0; irep < reps; irep++) - { - r.rand_p(&key, keybytes); - addVCodeInput(&key, keybytes); - 
addVCodeInput(keybit); + for (int irep = 0; irep < reps; irep++) { + r.rand_p(&key, keybytes); + addVCodeInput(&key , keybytes); + addVCodeInput(keybit); - hash(&key, keybytes, seed, &h1); - key.flipbit(keybit); - hash(&key, keybytes, seed, &h2); + hash(&key, keybytes, seed, &h1); + key.flipbit(keybit); + hash(&key, keybytes, seed, &h2); - hashtype d = h1 ^ h2; + hashtype d = h1 ^ h2; - for(int out1 = 0; out1 < hashbits-1; out1++) - for(int out2 = out1+1; out2 < hashbits; out2++) - { - int * b = &page[(out1*hashbits+out2)*4]; + for (int out1 = 0; out1 < hashbits - 1; out1++) { + for (int out2 = out1 + 1; out2 < hashbits; out2++) { + int * b = &page[(out1 * hashbits + out2) * 4]; - uint32_t x = d.getbit(out1) | (d.getbit(out2) << 1); + uint32_t x = d.getbit(out1) | (d.getbit(out2) << 1); - b[x]++; - } + b[x]++; + } + } + } } - } - printf("\n"); + printf("\n"); - for(int out1 = 0; out1 < hashbits-1; out1++) - { - for(int out2 = out1+1; out2 < hashbits; out2++) - { - if(verbose) printf("(%3d,%3d) - ",out1,out2); + for (int out1 = 0; out1 < hashbits - 1; out1++) { + for (int out2 = out1 + 1; out2 < hashbits; out2++) { + if (verbose) { printf("(%3d,%3d) - ", out1, out2); } - for(int keybit = 0; keybit < keybits; keybit++) - { - int * page = &bins[keybit*pagesize]; - int * bins = &page[(out1*hashbits+out2)*4]; + for (int keybit = 0; keybit < keybits; keybit++) { + int * page = &bins[keybit * pagesize]; + int * bins = &page[(out1 * hashbits + out2) * 4 ]; - double bias = 0; + double bias = 0; - for(int b = 0; b < 4; b++) - { - double b2 = double(bins[b]) / double(reps / 2); - b2 = fabs(b2 * 2 - 1); + for (int b = 0; b < 4; b++) { + double b2 = double(bins[b]) / double(reps / 2); + b2 = fabs(b2 * 2 - 1); - if(b2 > bias) bias = b2; - } + if (b2 > bias) { bias = b2; } + } - if(bias > maxBias) - { - maxBias = bias; - maxK = keybit; - maxA = out1; - maxB = out2; - } + if (bias > maxBias) { + maxBias = bias; + maxK = keybit; + maxA = out1; + maxB = out2; + } - if(verbose) - { - 
if (bias < 0.01) printf("."); - else if(bias < 0.05) printf("o"); - else if(bias < 0.33) printf("O"); - else printf("X"); - } - } + if (verbose) { + if (bias < 0.01) { printf("."); } else if (bias < 0.05) { printf("o"); } else if (bias < 0.33) { + printf("O"); + } else { + printf("X"); + } + } + } - // Finished keybit - if(verbose) printf("\n"); - } + // Finished keybit + if (verbose) { printf("\n"); } + } - if(verbose) - { - for(int i = 0; i < keybits+12; i++) printf("-"); - printf("\n"); + if (verbose) { + for (int i = 0; i < keybits + 12; i++) { printf("-"); } + printf("\n"); + } } - } - addVCodeOutput(&bins[0], keybits*pagesize*sizeof(bins[0])); - addVCodeResult((uint32_t)(maxBias * 1000.0)); - addVCodeResult(maxK); - addVCodeResult(maxA); - addVCodeResult(maxB); + addVCodeOutput(&bins[0], keybits * pagesize * sizeof(bins[0])); + addVCodeResult((uint32_t)(maxBias * 1000.0)); + addVCodeResult(maxK); + addVCodeResult(maxA); + addVCodeResult(maxB); - printf("Max bias %f - (%3d : %3d,%3d)\n",maxBias,maxK,maxA,maxB); + printf("Max bias %f - (%3d : %3d,%3d)\n", maxBias, maxK, maxA, maxB); - // Bit independence is harder to pass than avalanche, so we're a bit more lax here. - bool result = (maxBias < 0.05); - return result; + // Bit independence is harder to pass than avalanche, so we're a bit more lax here. + bool result = (maxBias < 0.05); + return result; } //----------------------------------------------------------------------------- -template < typename hashtype > -bool BicTest(const HashInfo * hinfo, const bool verbose) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - bool result = true; - bool fewerreps = (hinfo->bits > 64 || hinfo->isVerySlow()) ? true : false; +template +bool BicTest( const HashInfo * hinfo, const bool verbose ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + bool result = true; + bool fewerreps = (hinfo->bits > 64 || hinfo->isVerySlow()) ? 
true : false; printf("[[[ BIC 'Bit Independence Criteria' Tests ]]]\n\n"); const seed_t seed = hinfo->Seed(g_seed); if (fewerreps) { - result &= BicTest3,hashtype>(hash,seed,100000,verbose); + result &= BicTest3, hashtype>(hash, seed, 100000, verbose); } else { - const long reps = 64000000/hinfo->bits; - //result &= BicTest(hash,2000000); - result &= BicTest3,hashtype>(hash,seed,(int)reps,verbose); + const long reps = 64000000 / hinfo->bits; + // result &= BicTest(hash,2000000); + result &= BicTest3, hashtype>(hash, seed, (int)reps, verbose); } recordTestResult(result, "BIC", (const char *)NULL); @@ -215,206 +206,192 @@ INSTANTIATE(BicTest, HASHTYPELIST); // Tests the Bit Independence Criteron. Stricter than Avalanche, but slow and // not really all that useful. -template< typename keytype, typename hashtype > -void BicTest1 ( HashFn hash, const int keybit, const int reps, double & maxBias, int & maxA, int & maxB, bool verbose ) -{ - Rand r(11938); +template +void BicTest1( HashFn hash, const int keybit, const int reps, double & maxBias, int & maxA, int & maxB, bool verbose ) { + Rand r( 11938 ); - const int keybytes = sizeof(keytype); - const int hashbytes = sizeof(hashtype); - const int hashbits = hashbytes * 8; + const int keybytes = sizeof(keytype); + const int hashbytes = sizeof(hashtype); + const int hashbits = hashbytes * 8; - std::vector bins(hashbits*hashbits*4,0); + std::vector bins( hashbits * hashbits * 4, 0 ); - keytype key; - hashtype h1,h2; + keytype key; + hashtype h1, h2; - for(int irep = 0; irep < reps; irep++) - { - if(verbose) { - if(irep % (reps/10) == 0) printf("."); - } + for (int irep = 0; irep < reps; irep++) { + if (verbose) { + if (irep % (reps / 10) == 0) { printf("."); } + } - r.rand_p(&key,keybytes); - hash(&key,keybytes,g_seed,&h1); + r.rand_p(&key, keybytes); + hash(&key, keybytes, g_seed, &h1); - key.flipbit(keybit); - hash(&key,keybytes,g_seed,&h2); + key.flipbit(keybit); + hash(&key, keybytes, g_seed, &h2); - hashtype d = h1 ^ h2; 
+ hashtype d = h1 ^ h2; - for(int out1 = 0; out1 < hashbits; out1++) - for(int out2 = 0; out2 < hashbits; out2++) - { - if(out1 == out2) continue; + for (int out1 = 0; out1 < hashbits; out1++) { + for (int out2 = 0; out2 < hashbits; out2++) { + if (out1 == out2) { continue; } - uint32_t b = getbit(d,out1) | (getbit(d,out2) << 1); + uint32_t b = getbit(d, out1) | (getbit(d, out2) << 1); - bins[(out1 * hashbits + out2) * 4 + b]++; - } - } - - if(verbose) printf("\n"); - - maxBias = 0; - - for(int out1 = 0; out1 < hashbits; out1++) - { - for(int out2 = 0; out2 < hashbits; out2++) - { - if(out1 == out2) - { - if(verbose) printf("\\"); - continue; - } - - double bias = 0; - - for(int b = 0; b < 4; b++) - { - double b2 = double(bins[(out1 * hashbits + out2) * 4 + b]) / double(reps / 2); - b2 = fabs(b2 * 2 - 1); - - if(b2 > bias) bias = b2; - } - - if(bias > maxBias) - { - maxBias = bias; - maxA = out1; - maxB = out2; - } - - if(verbose) - { - if (bias < 0.01) printf("."); - else if(bias < 0.05) printf("o"); - else if(bias < 0.33) printf("O"); - else printf("X"); - } + bins[(out1 * hashbits + out2) * 4 + b]++; + } + } } - if(verbose) printf("\n"); - } + if (verbose) { printf("\n"); } + + maxBias = 0; + + for (int out1 = 0; out1 < hashbits; out1++) { + for (int out2 = 0; out2 < hashbits; out2++) { + if (out1 == out2) { + if (verbose) { printf("\\"); } + continue; + } + + double bias = 0; + + for (int b = 0; b < 4; b++) { + double b2 = double(bins[(out1 * hashbits + out2) * 4 + b]) / double(reps / 2); + b2 = fabs(b2 * 2 - 1); + + if (b2 > bias) { bias = b2; } + } + + if (bias > maxBias) { + maxBias = bias; + maxA = out1; + maxB = out2; + } + + if (verbose) { + if (bias < 0.01) { printf("."); } else if (bias < 0.05) { printf("o"); } else if (bias < 0.33) { + printf("O"); + } else { + printf("X"); + } + } + } + + if (verbose) { printf("\n"); } + } } //---------- -template< typename keytype, typename hashtype > -bool BicTest1 ( HashFn hash, const int reps ) -{ - const int 
keybytes = sizeof(keytype); - const int keybits = keybytes * 8; +template +bool BicTest1( HashFn hash, const int reps ) { + const int keybytes = sizeof(keytype); + const int keybits = keybytes * 8; - double maxBias = 0; - int maxK = 0; - int maxA = 0; - int maxB = 0; + double maxBias = 0; + int maxK = 0; + int maxA = 0; + int maxB = 0; - for(int i = 0; i < keybits; i++) - { - if(i % (keybits/10) == 0) printf("."); + for (int i = 0; i < keybits; i++) { + if (i % (keybits / 10) == 0) { printf("."); } - double bias; - int a,b; + double bias; + int a, b; - BicTest1(hash,i,reps,bias,a,b,true); + BicTest1(hash, i, reps, bias, a, b, true); - if(bias > maxBias) - { - maxBias = bias; - maxK = i; - maxA = a; - maxB = b; + if (bias > maxBias) { + maxBias = bias; + maxK = i; + maxA = a; + maxB = b; + } } - } - printf("Max bias %f - (%3d : %3d,%3d)\n",maxBias,maxK,maxA,maxB); + printf("Max bias %f - (%3d : %3d,%3d)\n", maxBias, maxK, maxA, maxB); - // Bit independence is harder to pass than avalanche, so we're a bit more lax here. + // Bit independence is harder to pass than avalanche, so we're a bit more lax here. - bool result = (maxBias < 0.05); + bool result = (maxBias < 0.05); - return result; + return result; } //----------------------------------------------------------------------------- // BIC test variant - iterate over output bits, then key bits. 
No temp storage, // but slooooow -template< typename keytype, typename hashtype > -void BicTest2 ( HashFn hash, const int reps, bool verbose = true ) -{ - const int keybytes = sizeof(keytype); - const int keybits = keybytes * 8; - const int hashbytes = sizeof(hashtype); - const int hashbits = hashbytes * 8; - - Rand r(11938); - - double maxBias = 0; - int maxK = 0; - int maxA = 0; - int maxB = 0; - - keytype key; - hashtype h1,h2; - - for(int out1 = 0; out1 < hashbits-1; out1++) - for(int out2 = out1+1; out2 < hashbits; out2++) - { - if(verbose) printf("(%3d,%3d) - ",out1,out2); - - for(int keybit = 0; keybit < keybits; keybit++) - { - int bins[4] = { 0, 0, 0, 0 }; - - for(int irep = 0; irep < reps; irep++) - { - r.rand_p(&key,keybytes); - hash(&key,keybytes,g_seed,&h1); - key.flipbit(keybit); - hash(&key,keybytes,g_seed,&h2); +template +void BicTest2( HashFn hash, const int reps, bool verbose = true ) { + const int keybytes = sizeof(keytype); + const int keybits = keybytes * 8; + const int hashbytes = sizeof(hashtype); + const int hashbits = hashbytes * 8; - hashtype d = h1 ^ h2; + Rand r( 11938 ); - uint32_t b = getbit(d,out1) | (getbit(d,out2) << 1); + double maxBias = 0; + int maxK = 0; + int maxA = 0; + int maxB = 0; - bins[b]++; - } + keytype key; + hashtype h1, h2; - double bias = 0; + for (int out1 = 0; out1 < hashbits - 1; out1++) { + for (int out2 = out1 + 1; out2 < hashbits; out2++) { + if (verbose) { printf("(%3d,%3d) - ", out1, out2); } - for(int b = 0; b < 4; b++) - { - double b2 = double(bins[b]) / double(reps / 2); - b2 = fabs(b2 * 2 - 1); + for (int keybit = 0; keybit < keybits; keybit++) { + int bins[4] = { 0, 0, 0, 0 }; - if(b2 > bias) bias = b2; - } + for (int irep = 0; irep < reps; irep++) { + r.rand_p(&key, keybytes); + hash(&key, keybytes, g_seed, &h1); + key.flipbit(keybit); + hash(&key, keybytes, g_seed, &h2); - if(bias > maxBias) - { - maxBias = bias; - maxK = keybit; - maxA = out1; - maxB = out2; - } + hashtype d = h1 ^ h2; - if(verbose) 
- { - if (bias < 0.05) printf("."); - else if(bias < 0.10) printf("o"); - else if(bias < 0.50) printf("O"); - else printf("X"); - } - } + uint32_t b = getbit(d, out1) | (getbit(d, out2) << 1); - // Finished keybit + bins[b]++; + } - if(verbose) printf("\n"); - } + double bias = 0; - printf("Max bias %f - (%3d : %3d,%3d)\n",maxBias,maxK,maxA,maxB); + for (int b = 0; b < 4; b++) { + double b2 = double(bins[b]) / double(reps / 2); + b2 = fabs(b2 * 2 - 1); + + if (b2 > bias) { bias = b2; } + } + + if (bias > maxBias) { + maxBias = bias; + maxK = keybit; + maxA = out1; + maxB = out2; + } + + if (verbose) { + if (bias < 0.05) { printf("."); } else if (bias < 0.10) { printf("o"); } else if (bias < 0.50) { + printf("O"); + } else { + printf("X"); + } + } + } + + // Finished keybit + + if (verbose) { printf("\n"); } + } + } + + printf("Max bias %f - (%3d : %3d,%3d)\n", maxBias, maxK, maxA, maxB); } + #endif /* 0 */ diff --git a/tests/BitIndependenceTest.h b/tests/BitIndependenceTest.h index a068729e..7cf6def5 100644 --- a/tests/BitIndependenceTest.h +++ b/tests/BitIndependenceTest.h @@ -45,5 +45,5 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -template < typename hashtype > -bool BicTest(const HashInfo * info, const bool verbose); +template +bool BicTest( const HashInfo * info, const bool verbose ); diff --git a/tests/CyclicKeysetTest.cpp b/tests/CyclicKeysetTest.cpp index 52b17bcf..0b245071 100644 --- a/tests/CyclicKeysetTest.cpp +++ b/tests/CyclicKeysetTest.cpp @@ -49,7 +49,7 @@ #include "Platform.h" #include "Hashinfo.h" #include "TestGlobals.h" -#include "Stats.h" // For EstimateNbCollisions +#include "Stats.h" // For EstimateNbCollisions #include "Random.h" #include "Analyze.h" #include "Instantiate.h" @@ -63,69 +63,68 @@ // // (This keyset type is designed to make MurmurHash2 fail) -static inline uint32_t f3mix ( uint32_t k ) -{ - k ^= k >> 16; - k *= 0x85ebca6b; - k ^= k >> 13; - k *= 0xc2b2ae35; - k ^= k >> 16; +static inline uint32_t f3mix( uint32_t k ) { + k ^= k >> 16; + k *= 0x85ebca6b; + k ^= k >> 13; + k *= 0xc2b2ae35; + k ^= k >> 16; - return k; + return k; } -template < typename hashtype > -static bool CyclicKeyImpl(HashFn hash, const seed_t seed, int cycleLen, int cycleReps, const int keycount, bool drawDiagram) { - printf("Keyset 'Cyclic' - %d cycles of %d bytes - %d keys\n",cycleReps,cycleLen,keycount); +template +static bool CyclicKeyImpl( HashFn hash, const seed_t seed, int cycleLen, + int cycleReps, const int keycount, bool drawDiagram ) { + printf("Keyset 'Cyclic' - %d cycles of %d bytes - %d keys\n", cycleReps, cycleLen, keycount); - Rand r(483723); + Rand r( 483723 ); - std::vector hashes; - hashes.resize(keycount); + std::vector hashes; + hashes.resize(keycount); - int keyLen = cycleLen * cycleReps; + int keyLen = cycleLen * cycleReps; - uint8_t * cycle = new uint8_t[cycleLen + 16]; - uint8_t * key = new uint8_t[keyLen]; + uint8_t * cycle = new uint8_t[cycleLen + 16]; + uint8_t * key = new uint8_t[keyLen ]; - //---------- + //---------- - for(int i = 0; i < keycount; i++) - { - r.rand_p(cycle,cycleLen); + for (int i = 0; i < keycount; i++) { + r.rand_p(cycle, cycleLen); - 
*(uint32_t*)cycle = f3mix(i ^ 0x746a94f1); + *(uint32_t *)cycle = f3mix(i ^ 0x746a94f1); - for(int j = 0; j < keyLen; j++) - { - key[j] = cycle[j % cycleLen]; - } + for (int j = 0; j < keyLen; j++) { + key[j] = cycle[j % cycleLen]; + } - hash(key, keyLen, seed, &hashes[i]); - addVCodeInput(key, keyLen); - } + hash(key, keyLen, seed, &hashes[i]); + addVCodeInput(key, keyLen); + } - //---------- + //---------- - bool result = TestHashList(hashes,drawDiagram); - printf("\n"); + bool result = TestHashList(hashes, drawDiagram); + printf("\n"); - delete [] key; - delete [] cycle; + delete [] key; + delete [] cycle; - addVCodeResult(result); + addVCodeResult(result); - recordTestResult(result, "Cyclic", cycleLen); + recordTestResult(result, "Cyclic", cycleLen); - return result; + return result; } //----------------------------------------------------------------------------- -template < typename hashtype > -bool CyclicKeyTest(const HashInfo * hinfo, const bool verbose) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - bool result = true; +template +bool CyclicKeyTest( const HashInfo * hinfo, const bool verbose ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + bool result = true; + #if defined(DEBUG) const int reps = 2; #else @@ -136,12 +135,12 @@ bool CyclicKeyTest(const HashInfo * hinfo, const bool verbose) { const seed_t seed = hinfo->Seed(g_seed); - result &= CyclicKeyImpl(hash,seed,sizeof(hashtype)+0,8,reps,verbose); - result &= CyclicKeyImpl(hash,seed,sizeof(hashtype)+1,8,reps,verbose); - result &= CyclicKeyImpl(hash,seed,sizeof(hashtype)+2,8,reps,verbose); - result &= CyclicKeyImpl(hash,seed,sizeof(hashtype)+3,8,reps,verbose); - result &= CyclicKeyImpl(hash,seed,sizeof(hashtype)+4,8,reps,verbose); - result &= CyclicKeyImpl(hash,seed,sizeof(hashtype)+8,8,reps,verbose); + result &= CyclicKeyImpl(hash, seed, sizeof(hashtype) + 0, 8, reps, verbose); + result &= CyclicKeyImpl(hash, seed, sizeof(hashtype) + 1, 8, reps, verbose); + result &= CyclicKeyImpl(hash, 
seed, sizeof(hashtype) + 2, 8, reps, verbose); + result &= CyclicKeyImpl(hash, seed, sizeof(hashtype) + 3, 8, reps, verbose); + result &= CyclicKeyImpl(hash, seed, sizeof(hashtype) + 4, 8, reps, verbose); + result &= CyclicKeyImpl(hash, seed, sizeof(hashtype) + 8, 8, reps, verbose); printf("%s\n", result ? "" : g_failstr); diff --git a/tests/CyclicKeysetTest.h b/tests/CyclicKeysetTest.h index 54b72cb4..7119f1c4 100644 --- a/tests/CyclicKeysetTest.h +++ b/tests/CyclicKeysetTest.h @@ -47,5 +47,5 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -template < typename hashtype > -bool CyclicKeyTest(const HashInfo * info, const bool verbose); +template +bool CyclicKeyTest( const HashInfo * info, const bool verbose ); diff --git a/tests/DiffDistributionTest.cpp b/tests/DiffDistributionTest.cpp index d74efcf3..7feda675 100644 --- a/tests/DiffDistributionTest.cpp +++ b/tests/DiffDistributionTest.cpp @@ -59,61 +59,59 @@ // generate random key pairs and run full distribution/collision tests on the // hash differentials -template < typename keytype, typename hashtype > -static bool DiffDistTest2(HashFn hash, const seed_t seed, bool drawDiagram) { - Rand r(857374); +template +static bool DiffDistTest2( HashFn hash, const seed_t seed, bool drawDiagram ) { + Rand r( 857374 ); - int keybits = sizeof(keytype) * 8; - const int keycount = 256*256*32; - keytype k; + int keybits = sizeof(keytype) * 8; + const int keycount = 256 * 256 * 32; + keytype k; - std::vector hashes(keycount); - hashtype h1,h2; + std::vector hashes( keycount ); + hashtype h1, h2; - bool result = true; + bool result = true; - for(int keybit = 0; keybit < keybits; keybit++) - { - printf("Testing bit %d - %d keys\n",keybit, keycount); + for (int keybit = 0; keybit < keybits; keybit++) { + printf("Testing bit %d - %d keys\n", keybit, keycount); - for(int i = 0; i < keycount; i++) - { - r.rand_p(&k, sizeof(keytype)); - hash(&k, sizeof(keytype), seed, &h1); - addVCodeInput(&k, sizeof(keytype)); + for (int i = 0; i < keycount; 
i++) { + r.rand_p(&k, sizeof(keytype)); + hash(&k, sizeof(keytype), seed, &h1); + addVCodeInput(&k, sizeof(keytype)); - k.flipbit(keybit); - hash(&k, sizeof(keytype), seed, &h2); - addVCodeInput(&k, sizeof(keytype)); + k.flipbit(keybit); + hash(&k, sizeof(keytype), seed, &h2); + addVCodeInput(&k, sizeof(keytype)); - hashes[i] = h1 ^ h2; - } + hashes[i] = h1 ^ h2; + } - bool thisresult = TestHashList(hashes,drawDiagram,true,true); - printf("\n"); + bool thisresult = TestHashList(hashes, drawDiagram, true, true); + printf("\n"); - addVCodeResult(thisresult); + addVCodeResult(thisresult); - recordTestResult(thisresult, "DiffDist", keybit); + recordTestResult(thisresult, "DiffDist", keybit); - result &= thisresult; - } + result &= thisresult; + } - return result; + return result; } //---------------------------------------------------------------------------- -template < typename hashtype > -bool DiffDistTest(const HashInfo * hinfo, const bool verbose) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - bool result = true; +template +bool DiffDistTest( const HashInfo * hinfo, const bool verbose ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + bool result = true; printf("[[[ DiffDist 'Differential Distribution' Tests ]]]\n\n"); const seed_t seed = hinfo->Seed(g_seed); - result &= DiffDistTest2,hashtype>(hash, seed, verbose); + result &= DiffDistTest2, hashtype>(hash, seed, verbose); printf("%s\n", result ? "" : g_failstr); @@ -126,7 +124,7 @@ INSTANTIATE(DiffDistTest, HASHTYPELIST); // An old implementation; currently unused. 
#if 0 -#include "SparseKeysetTest.h" // for SparseKeygenRecurse + #include "SparseKeysetTest.h" // for SparseKeygenRecurse //----------------------------------------------------------------------------- // Differential distribution test - for each N-bit input differential, generate // a large set of differential key pairs, hash them, and test the output @@ -143,56 +141,53 @@ INSTANTIATE(DiffDistTest, HASHTYPELIST); // #TODO - put diagram drawing back on -template < typename keytype, typename hashtype > -void DiffDistTest ( HashFn hash, const int diffbits, int trials, double & worst, double & avg ) -{ - std::vector keys(trials); - std::vector A(trials),B(trials); +template +void DiffDistTest( HashFn hash, const int diffbits, int trials, double & worst, double & avg ) { + std::vector keys( trials ); + std::vector A( trials ), B(trials); - //FIXME seedHash(hash, g_seed); - for(int i = 0; i < trials; i++) - { - rand_p(&keys[i],sizeof(keytype)); + // FIXME seedHash(hash, g_seed); + for (int i = 0; i < trials; i++) { + rand_p(&keys[i], sizeof(keytype)); - hash(&keys[i],sizeof(keytype),g_seed,(uint32_t*)&A[i]); - } + hash(&keys[i], sizeof(keytype), g_seed, (uint32_t *)&A[i]); + } - //---------- + //---------- - std::vector diffs; + std::vector diffs; - keytype temp(0); + keytype temp( 0 ); - SparseKeygenRecurse(0,diffbits,true,temp,diffs); + SparseKeygenRecurse(0, diffbits, true, temp, diffs); - //---------- + //---------- - worst = 0; - avg = 0; + worst = 0; + avg = 0; - hashtype h2; + hashtype h2; - for(size_t j = 0; j < diffs.size(); j++) - { - keytype & d = diffs[j]; + for (size_t j = 0; j < diffs.size(); j++) { + keytype & d = diffs[j]; - for(int i = 0; i < trials; i++) - { - keytype k2 = keys[i] ^ d; + for (int i = 0; i < trials; i++) { + keytype k2 = keys[i] ^ d; - hash(&k2,sizeof(k2),g_seed,&h2); + hash(&k2, sizeof(k2), g_seed, &h2); - B[i] = A[i] ^ h2; - } + B[i] = A[i] ^ h2; + } - double dworst,davg; + double dworst, davg; - TestDistributionFast(B,dworst,davg); 
+ TestDistributionFast(B, dworst, davg); - avg += davg; - worst = (dworst > worst) ? dworst : worst; - } + avg += davg; + worst = (dworst > worst) ? dworst : worst; + } - avg /= double(diffs.size()); + avg /= double(diffs.size()); } + #endif /* 0 */ diff --git a/tests/DiffDistributionTest.h b/tests/DiffDistributionTest.h index 6ffb44be..3647d957 100644 --- a/tests/DiffDistributionTest.h +++ b/tests/DiffDistributionTest.h @@ -48,5 +48,5 @@ // Differential distribution tests - generate a bunch of random keys, // see what happens to the hash value when we flip a bit of the key. -template < typename hashtype > -bool DiffDistTest(const HashInfo * info, const bool verbose); +template +bool DiffDistTest( const HashInfo * info, const bool verbose ); diff --git a/tests/DifferentialTest.cpp b/tests/DifferentialTest.cpp index 032d885c..26ef2f2a 100644 --- a/tests/DifferentialTest.cpp +++ b/tests/DifferentialTest.cpp @@ -47,7 +47,7 @@ #include "Platform.h" #include "Hashinfo.h" #include "TestGlobals.h" -#include "Stats.h" // for chooseUpToK +#include "Stats.h" // for chooseUpToK #include "Random.h" #include "Analyze.h" #include "Instantiate.h" @@ -59,7 +59,7 @@ #include #if defined(HAVE_THREADS) -#include + #include typedef std::atomic a_int; #else typedef int a_int; @@ -70,47 +70,45 @@ typedef int a_int; // occured once (these could be false positives). If we find identical // hash counts of 3 or more (2+ collisions), the differential test fails. 
-template < class keytype > -static bool ProcessDifferentials ( std::map & diffcounts, int reps, bool dumpCollisions ) -{ - int totalcount = 0; - int ignore = 0; +template +static bool ProcessDifferentials( std::map & diffcounts, int reps, bool dumpCollisions ) { + int totalcount = 0; + int ignore = 0; - bool result = true; + bool result = true; - if (diffcounts.size()) { - for (std::pair dc : diffcounts) { - uint32_t count = dc.second; + if (diffcounts.size()) { + for (std::pair dc: diffcounts) { + uint32_t count = dc.second; - totalcount += count; + totalcount += count; - if (count == 1) { - ignore++; - } else { - result = false; + if (count == 1) { + ignore++; + } else { + result = false; - if(dumpCollisions) { - double pct = 100 * (double(count) / double(reps)); - dc.first.printbits(""); - printf(" - %4.2f%%\n", pct ); - } - } - } - } + if (dumpCollisions) { + double pct = 100 * (double(count) / double(reps)); + dc.first.printbits(""); + printf(" - %4.2f%%\n", pct); + } + } + } + } - printf("%d total collisions, of which %d single collisions were ignored", - totalcount,ignore); + printf("%d total collisions, of which %d single collisions were ignored", totalcount, ignore); - addVCodeResult(totalcount); - addVCodeResult(ignore); + addVCodeResult(totalcount); + addVCodeResult(ignore ); - if(result == false) { - printf(" !!!!!"); - } + if (result == false) { + printf(" !!!!!"); + } - printf("\n\n"); + printf("\n\n"); - return result; + return result; } //----------------------------------------------------------------------------- @@ -121,144 +119,149 @@ static bool ProcessDifferentials ( std::map & diffcounts, int // 2^32 tests, we'll probably see some spurious random collisions, so don't report // them. 
-template < bool recursemore, typename keytype, typename hashtype > -static void DiffTestRecurse(const HashFn hash, const seed_t seed, keytype & k1, keytype & k2, hashtype & h1, hashtype & h2, int start, int bitsleft, std::map & diffcounts ) -{ - const int bits = sizeof(keytype)*8; +template +static void DiffTestRecurse( const HashFn hash, const seed_t seed, keytype & k1, keytype & k2, hashtype & h1, + hashtype & h2, int start, int bitsleft, std::map & diffcounts ) { + const int bits = sizeof(keytype) * 8; - assume(start < bits); - for(int i = start; i < bits; i++) - { - keytype k2_prev = k2; + assume(start < bits); + for (int i = start; i < bits; i++) { + keytype k2_prev = k2; - k2.flipbit(i); + k2.flipbit(i); - bitsleft--; + bitsleft--; - hash(&k2, sizeof(k2), seed, &h2); + hash(&k2, sizeof(k2), seed, &h2); - if(h1 == h2) - { - ++diffcounts[k1 ^ k2]; - } + if (h1 == h2) { + ++diffcounts[k1 ^ k2]; + } - if(recursemore && likely((i+1) < bits)) - { - if (bitsleft > 1) - DiffTestRecurse(hash,seed,k1,k2,h1,h2,i+1,bitsleft,diffcounts); - else - DiffTestRecurse(hash,seed,k1,k2,h1,h2,i+1,bitsleft,diffcounts); - } + if (recursemore && likely((i + 1) < bits)) { + if (bitsleft > 1) { + DiffTestRecurse(hash, seed, k1, k2, h1, h2, i + 1, bitsleft, diffcounts); + } else { + DiffTestRecurse(hash, seed, k1, k2, h1, h2, i + 1, bitsleft, diffcounts); + } + } - //k2.flipbit(i); - k2 = k2_prev; - bitsleft++; - } + // k2.flipbit(i); + k2 = k2_prev; + bitsleft++; + } } //----------------------------------------------------------------------------- -template < typename keytype, typename hashtype > -static void DiffTestImplThread(const HashFn hash, const seed_t seed, std::map &diffcounts, const uint8_t * keys, int diffbits, a_int & irepp, const int reps) { - const int keybytes = sizeof(keytype); +template +static void DiffTestImplThread( const HashFn hash, const seed_t seed, std::map & diffcounts, + const uint8_t * keys, int diffbits, a_int & irepp, const int reps ) { + const int 
keybytes = sizeof(keytype); + + keytype k1, k2; + hashtype h1, h2; - keytype k1,k2; - hashtype h1,h2; - h1 = h2 = 0; + h1 = h2 = 0; - int irep; - while ((irep = irepp++) < reps) { - if ((reps >= 10) && (irep % (reps/10) == 0)) { printf("."); } + int irep; + while ((irep = irepp++) < reps) { + if ((reps >= 10) && (irep % (reps / 10) == 0)) { printf("."); } - memcpy(&k1, &keys[keybytes * irep], sizeof(k1)); - k2 = k1; + memcpy(&k1, &keys[keybytes * irep], sizeof(k1)); + k2 = k1; - hash(&k1, sizeof(k1), seed, (void*)&h1); + hash(&k1, sizeof(k1), seed, (void *)&h1); - DiffTestRecurse(hash,seed,k1,k2,h1,h2,0,diffbits,diffcounts); - } + DiffTestRecurse(hash, seed, k1, k2, h1, h2, 0, diffbits, diffcounts); + } } //----------------------------------------------------------------------------- -template < typename keytype, typename hashtype > -static bool DiffTestImpl(HashFn hash, const seed_t seed, int diffbits, int reps, bool dumpCollisions) { - const int keybytes = sizeof(keytype); - const int keybits = sizeof(keytype) * 8; - const int hashbits = sizeof(hashtype) * 8; +template +static bool DiffTestImpl( HashFn hash, const seed_t seed, int diffbits, int reps, bool dumpCollisions ) { + const int keybytes = sizeof(keytype); + const int keybits = sizeof(keytype ) * 8; + const int hashbits = sizeof(hashtype) * 8; - double diffcount = chooseUpToK(keybits,diffbits); - double testcount = (diffcount * double(reps)); - double expected = testcount / pow(2.0,double(hashbits)); + double diffcount = chooseUpToK(keybits, diffbits); + double testcount = (diffcount * double(reps)); + double expected = testcount / pow(2.0, double(hashbits)); - printf("Testing %0.f up-to-%d-bit differentials in %d-bit keys -> %d bit hashes.\n", - diffcount,diffbits,keybits,hashbits); - printf("%d reps, %0.f total tests, expecting %2.2f random collisions", - reps,testcount,expected); + printf("Testing %0.f up-to-%d-bit differentials in %d-bit keys -> %d bit hashes.\n", + diffcount, diffbits, keybits, 
hashbits); + printf("%d reps, %0.f total tests, expecting %2.2f random collisions", reps, testcount, expected); - Rand r(100); - std::vector keys(reps * keybytes); + Rand r( 100 ); + std::vector keys( reps * keybytes ); - for (int i = 0; i < reps; i++) - r.rand_p(&keys[i*keybytes],keybytes); - addVCodeInput(&keys[0], reps * keybytes); + for (int i = 0; i < reps; i++) { + r.rand_p(&keys[i * keybytes], keybytes); + } + addVCodeInput(&keys[0], reps * keybytes); - a_int irep(0); + a_int irep( 0 ); - std::vector > diffcounts(g_NCPU); + std::vector> diffcounts( g_NCPU ); - if ((g_NCPU == 1) || (reps < 10)) { - DiffTestImplThread(hash,seed,diffcounts[0],&keys[0],diffbits,irep,reps); - } else { + if ((g_NCPU == 1) || (reps < 10)) { + DiffTestImplThread(hash, seed, diffcounts[0], &keys[0], diffbits, irep, reps); + } else { #if defined(HAVE_THREADS) - std::thread t[g_NCPU]; - for (int i=0; i < g_NCPU; i++) { - t[i] = std::thread {DiffTestImplThread,hash,seed,std::ref(diffcounts[i]),&keys[0],diffbits,std::ref(irep),reps}; - } - for (int i=0; i < g_NCPU; i++) { - t[i].join(); - } - for (int i=1; i < g_NCPU; i++) - for (std::pair dc : diffcounts[i]) - diffcounts[0][dc.first] += dc.second; + std::thread t[g_NCPU]; + for (int i = 0; i < g_NCPU; i++) { + t[i] = std::thread { + DiffTestImplThread, hash, seed, std::ref(diffcounts[i]), + &keys[0], diffbits, std::ref(irep), reps + }; + } + for (int i = 0; i < g_NCPU; i++) { + t[i].join(); + } + for (int i = 1; i < g_NCPU; i++) { + for (std::pair dc: diffcounts[i]) { + diffcounts[0][dc.first] += dc.second; + } + } #endif - } + } - for (std::pair dc : diffcounts[0]) { - addVCodeOutput(&dc.first, sizeof(keytype)); - addVCodeOutput(&dc.second, sizeof(uint32_t)); - } + for (std::pair dc: diffcounts[0]) { + addVCodeOutput(&dc.first , sizeof(keytype) ); + addVCodeOutput(&dc.second, sizeof(uint32_t)); + } - printf("\n"); + printf("\n"); - bool result = true; + bool result = true; - result &= 
ProcessDifferentials(diffcounts[0],reps,dumpCollisions); + result &= ProcessDifferentials(diffcounts[0], reps, dumpCollisions); - recordTestResult(result, "Differential", diffbits); + recordTestResult(result, "Differential", diffbits); - return result; + return result; } //---------------------------------------------------------------------------- -template < typename hashtype > -bool DiffTest(const HashInfo * hinfo, const bool verbose, const bool extra) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - bool dumpCollisions = verbose; - bool result = true; +template +bool DiffTest( const HashInfo * hinfo, const bool verbose, const bool extra ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + bool dumpCollisions = verbose; + bool result = true; // Do fewer reps with slow or very bad hashes bool slowhash = hinfo->bits > 128 || hinfo->isSlow(); - int reps = hinfo->isMock() ? 2 : ((slowhash && !extra) ? 100 : 1000); + int reps = hinfo->isMock() ? 2 : ((slowhash && !extra) ? 100 : 1000); printf("[[[ Diff 'Differential' Tests ]]]\n\n"); const seed_t seed = hinfo->Seed(g_seed); - result &= DiffTestImpl< Blob<64>, hashtype >(hash,seed,5,reps,dumpCollisions); - result &= DiffTestImpl< Blob<128>, hashtype >(hash,seed,4,reps,dumpCollisions); - result &= DiffTestImpl< Blob<256>, hashtype >(hash,seed,3,reps,dumpCollisions); + result &= DiffTestImpl, hashtype>(hash, seed, 5, reps, dumpCollisions); + result &= DiffTestImpl, hashtype >(hash, seed, 4, reps, dumpCollisions); + result &= DiffTestImpl, hashtype >(hash, seed, 3, reps, dumpCollisions); printf("%s\n", result ? "" : g_failstr); diff --git a/tests/DifferentialTest.h b/tests/DifferentialTest.h index c298dd6d..5efede43 100644 --- a/tests/DifferentialTest.h +++ b/tests/DifferentialTest.h @@ -47,5 +47,5 @@ // Differential collision tests - generate a bunch of random keys, // see what happens to the hash value when we flip a few bits of the key. 
-template < typename hashtype > -bool DiffTest(const HashInfo * info, const bool verbose, const bool extra); +template +bool DiffTest( const HashInfo * info, const bool verbose, const bool extra ); diff --git a/tests/HashMapTest.cpp b/tests/HashMapTest.cpp index 52145554..07f2a3b4 100644 --- a/tests/HashMapTest.cpp +++ b/tests/HashMapTest.cpp @@ -46,7 +46,7 @@ #include "Timing.h" #include "Hashinfo.h" #include "TestGlobals.h" -#include "Stats.h" // For FilterOutliers, CalcMean, CalcStdv +#include "Stats.h" // For FilterOutliers, CalcMean, CalcStdv #include "Random.h" #include "HashMapTest.h" @@ -67,216 +67,211 @@ using namespace std; typedef std::unordered_map> std_hashmap; + std::function> std_hashmap; typedef phmap::flat_hash_map> fast_hashmap; + std::function> fast_hashmap; //----------------------------------------------------------------------------- // This should be a realistic I-Cache test, when our hash is used inlined // in a hash table. There the size matters more than the bulk speed. 
-std::vector HashMapInit(bool verbose) { - std::vector wordvec; - std::string line; - unsigned sum = 0; - - const char * ptr = hashmap_words + 1; // Skip over initial newline - while (*ptr != '\0') - { - const char * end = (const char *)rawmemchr(ptr, '\n'); - std::string str (ptr, end - ptr); - wordvec.push_back(str); - std::transform(str.begin(), str.begin()+1, str.begin(), ::toupper); - wordvec.push_back(str); - std::transform(str.begin(), str.end(), str.begin(), ::toupper); - wordvec.push_back(str); - sum += end - ptr; - ptr = end + 1; - } - - if (verbose) { - printf ("Read %" PRId64 " words from internal list, ", wordvec.size()); - printf ("avg len: %0.3f\n\n", (sum+0.0)/wordvec.size()); - } - return wordvec; +std::vector HashMapInit( bool verbose ) { + std::vector wordvec; + std::string line; + unsigned sum = 0; + + const char * ptr = hashmap_words + 1; // Skip over initial newline + + while (*ptr != '\0') { + const char * end = (const char *)rawmemchr(ptr, '\n'); + std::string str( ptr, end - ptr ); + wordvec.push_back(str); + std::transform(str.begin(), str.begin() + 1, str.begin(), ::toupper); + wordvec.push_back(str); + std::transform(str.begin(), str.end(), str.begin(), ::toupper); + wordvec.push_back(str); + sum += end - ptr; + ptr = end + 1; + } + + if (verbose) { + printf("Read %" PRId64 " words from internal list, ", wordvec.size()); + printf("avg len: %0.3f\n\n", (sum + 0.0) / wordvec.size()); + } + return wordvec; } //----------------------------------------------------------------------------- -static double HashMapSpeedTest ( HashFn hash, const int hashbits, - std::vector words, - const seed_t seed, const int trials, bool verbose ) -{ - //using phmap::flat_node_hash_map; - Rand r(82762); - std_hashmap hashmap(words.size(), [=](const std::string &key) - { - // 256 needed for hasshe2, but only size_t used - static char out[256] = { 0 }; - hash(key.c_str(), key.length(), seed, &out); - return *(size_t*)out; - }); - fast_hashmap 
phashmap(words.size(), [=](const std::string &key) - { - static char out[256] = { 0 }; // 256 for hasshe2, but stripped to 64/32 - hash(key.c_str(), key.length(), seed, &out); - return *(size_t*)out; - }); - - std::vector::iterator it; - std::vector times; - double t1; - - printf("std::unordered_map\n"); - printf("Init std HashMapTest: "); - fflush(NULL); - times.reserve(trials); - if (0 /*need_minlen64_align16(pfhash)*/) { - for (it = words.begin(); it != words.end(); it++) { - // requires min len 64, and 16byte key alignment - (*it).resize(64); +static double HashMapSpeedTest( HashFn hash, const int hashbits, std::vector words, + const seed_t seed, const int trials, bool verbose ) { + // using phmap::flat_node_hash_map; + Rand r( 82762 ); + + std_hashmap hashmap( words.size(), [=]( const std::string & key ) { + // 256 needed for hasshe2, but only size_t used + static char out[256] = { 0 }; + hash(key.c_str(), key.length(), seed, &out); + return *(size_t *)out; + } ); + fast_hashmap phashmap( words.size(), [=]( const std::string & key ) { + // 256 for hasshe2, but stripped to 64/32 + static char out[256] = { 0 }; + hash(key.c_str(), key.length(), seed, &out); + return *(size_t *)out; + } ); + + std::vector::iterator it; + std::vector times; + double t1; + + printf("std::unordered_map\n" ); + printf("Init std HashMapTest: "); + fflush(NULL); + times.reserve(trials); + if (0 /*need_minlen64_align16(pfhash)*/) { + for (it = words.begin(); it != words.end(); it++) { + // requires min len 64, and 16byte key alignment + (*it).resize(64); + } } - } - { - // hash inserts plus 1% deletes - volatile int64_t begin, end; - int i = 0; - begin = timer_start(); - for (it = words.begin(); it != words.end(); it++, i++) { - std::string line = *it; - hashmap[line] = 1; - if (i % 100 == 0) - hashmap.erase(line); + { + // hash inserts plus 1% deletes + volatile int64_t begin, end; + int i = 0; + begin = timer_start(); + for (it = words.begin(); it != words.end(); it++, i++) { + 
std::string line = *it; + hashmap[line] = 1; + if (i % 100 == 0) { + hashmap.erase(line); + } + } + end = timer_end(); + t1 = (double)(end - begin) / (double)words.size(); + } + fflush(NULL); + printf("%0.3f cycles/op (%zu inserts, 1%% deletions)\n", t1, words.size()); + printf("Running std HashMapTest: "); + if (t1 > 10000.) { // e.g. multiply_shift 459271.700 + printf("SKIP"); + return 0.; } - end = timer_end(); - t1 = (double)(end - begin) / (double)words.size(); - } - fflush(NULL); - printf("%0.3f cycles/op (%zu inserts, 1%% deletions)\n", - t1, words.size()); - printf("Running std HashMapTest: "); - if (t1 > 10000.) { // e.g. multiply_shift 459271.700 - printf("SKIP"); - return 0.; - } - fflush(NULL); - - for(int itrial = 0; itrial < trials; itrial++) - { // hash query - volatile int64_t begin, end; - int i = 0, found = 0; - double t; - begin = timer_start(); - for ( it = words.begin(); it != words.end(); it++, i++ ) - { - std::string line = *it; - if (hashmap[line]) - found++; + fflush(NULL); + + for (int itrial = 0; itrial < trials; itrial++) { // hash query + volatile int64_t begin, end; + int i = 0, found = 0; + double t; + begin = timer_start(); + for (it = words.begin(); it != words.end(); it++, i++) { + std::string line = *it; + if (hashmap[line]) { + found++; + } } - end = timer_end(); - t = (double)(end - begin) / (double)words.size(); - if(found > 0 && t > 0) times.push_back(t); + end = timer_end(); + t = (double)(end - begin) / (double)words.size(); + if ((found > 0) && (t > 0)) { times.push_back(t); } } - hashmap.clear(); - - std::sort(times.begin(),times.end()); - FilterOutliers(times); - double mean = CalcMean(times); - double stdv = CalcStdv(times); - printf("%0.3f cycles/op", mean); - printf(" (%0.1f stdv)\n", stdv); - - times.clear(); - - printf("\ngreg7mdp/parallel-hashmap\n"); - printf("Init fast HashMapTest: "); - fflush(NULL); - times.reserve(trials); - { // hash inserts and 1% deletes - volatile int64_t begin, end; - int i = 0; - begin = 
timer_start(); - for (it = words.begin(); it != words.end(); it++, i++) { - std::string line = *it; - phashmap[line] = 1; - if (i % 100 == 0) - phashmap.erase(line); + hashmap.clear(); + + std::sort(times.begin(), times.end()); + FilterOutliers(times); + double mean = CalcMean(times); + double stdv = CalcStdv(times); + printf("%0.3f cycles/op", mean); + printf(" (%0.1f stdv)\n", stdv); + + times.clear(); + + printf("\ngreg7mdp/parallel-hashmap\n"); + printf("Init fast HashMapTest: " ); + fflush(NULL); + times.reserve(trials); + { // hash inserts and 1% deletes + volatile int64_t begin, end; + int i = 0; + begin = timer_start(); + for (it = words.begin(); it != words.end(); it++, i++) { + std::string line = *it; + phashmap[line] = 1; + if (i % 100 == 0) { + phashmap.erase(line); + } + } + end = timer_end(); + t1 = (double)(end - begin) / (double)words.size(); + } + fflush(NULL); + printf("%0.3f cycles/op (%zu inserts, 1%% deletions)\n", t1, words.size()); + printf("Running fast HashMapTest: "); + if (t1 > 10000.) { // e.g. multiply_shift 459271.700 + printf("SKIP"); + return 0.; } - end = timer_end(); - t1 = (double)(end - begin) / (double)words.size(); - } - fflush(NULL); - printf("%0.3f cycles/op (%zu inserts, 1%% deletions)\n", - t1, words.size()); - printf("Running fast HashMapTest: "); - if (t1 > 10000.) { // e.g. 
multiply_shift 459271.700 - printf("SKIP"); - return 0.; - } - fflush(NULL); - for(int itrial = 0; itrial < trials; itrial++) - { // hash query - volatile int64_t begin, end; - int i = 0, found = 0; - double t; - begin = timer_start(); - for ( it = words.begin(); it != words.end(); it++, i++ ) - { - std::string line = *it; - if (phashmap[line]) - found++; + fflush(NULL); + for (int itrial = 0; itrial < trials; itrial++) { // hash query + volatile int64_t begin, end; + int i = 0, found = 0; + double t; + begin = timer_start(); + for (it = words.begin(); it != words.end(); it++, i++) { + std::string line = *it; + if (phashmap[line]) { + found++; + } } - end = timer_end(); - t = (double)(end - begin) / (double)words.size(); - if(found > 0 && t > 0) times.push_back(t); + end = timer_end(); + t = (double)(end - begin) / (double)words.size(); + if ((found > 0) && (t > 0)) { times.push_back(t); } } - phashmap.clear(); - fflush(NULL); - - std::sort(times.begin(),times.end()); - FilterOutliers(times); - double mean1 = CalcMean(times); - double stdv1 = CalcStdv(times); - printf("%0.3f cycles/op", mean1); - printf(" (%0.1f stdv) ", stdv1); - fflush(NULL); - - return mean; + phashmap.clear(); + fflush(NULL); + + std::sort(times.begin(), times.end()); + FilterOutliers(times); + double mean1 = CalcMean(times); + double stdv1 = CalcStdv(times); + printf("%0.3f cycles/op", mean1); + printf(" (%0.1f stdv) " , stdv1); + fflush(NULL); + + return mean; } //----------------------------------------------------------------------------- -static bool HashMapImpl ( HashFn hash, - const int hashbits, std::vector words, - const seed_t seed, const int trials, bool verbose ) -{ - double mean = 0.0; - try { - mean = HashMapSpeedTest( hash, hashbits, words, seed, trials, verbose); - } - catch (...) { - printf(" aborted !!!!\n"); - } - // if faster than ~sha1 - if (mean > 5. && mean < 1500.) - printf(" ....... PASS\n"); - else - printf(" ....... 
FAIL\n"); - return true; +static bool HashMapImpl( HashFn hash, const int hashbits, std::vector words, + const seed_t seed, const int trials, bool verbose ) { + double mean = 0.0; + + try { + mean = HashMapSpeedTest(hash, hashbits, words, seed, trials, verbose); + } catch (...) { + printf(" aborted !!!!\n"); + } + // if faster than ~sha1 + if ((mean > 5.) && (mean < 1500.)) { + printf(" ....... PASS\n"); + } else { + printf(" ....... FAIL\n"); + } + return true; } //----------------------------------------------------------------------------- -bool HashMapTest(const HashInfo * hinfo, const bool verbose, const bool extra) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - const int trials = (hinfo->isVerySlow() && !extra) ? 5 : 50; - bool result = true; +bool HashMapTest( const HashInfo * hinfo, const bool verbose, const bool extra ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + const int trials = (hinfo->isVerySlow() && !extra) ? 5 : 50; + bool result = true; printf("[[[ 'Hashmap' Speed Tests ]]]\n\n"); if (hinfo->isMock()) { - printf("Skipping Hashmap test; it is designed for true hashes\n\n"); - return result; + printf("Skipping Hashmap test; it is designed for true hashes\n\n"); + return result; } std::vector words = HashMapInit(verbose); @@ -285,9 +280,9 @@ bool HashMapTest(const HashInfo * hinfo, const bool verbose, const bool extra) { return result; } - Rand r(477537); + Rand r( 477537 ); const seed_t seed = hinfo->Seed(g_seed ^ r.rand_u64()); - result &= HashMapImpl(hash,hinfo->bits,words,seed,trials,verbose); + result &= HashMapImpl(hash, hinfo->bits, words, seed, trials, verbose); printf("\n%s\n", result ? "" : g_failstr); diff --git a/tests/HashMapTest.h b/tests/HashMapTest.h index 3ae36444..482c7ff0 100644 --- a/tests/HashMapTest.h +++ b/tests/HashMapTest.h @@ -43,6 +43,6 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -std::vector HashMapInit(bool verbose); +std::vector HashMapInit( bool verbose ); -bool HashMapTest(const HashInfo * info, const bool verbose, const bool extra); +bool HashMapTest( const HashInfo * info, const bool verbose, const bool extra ); diff --git a/tests/PRNGTest.cpp b/tests/PRNGTest.cpp index 5036d884..364e340f 100644 --- a/tests/PRNGTest.cpp +++ b/tests/PRNGTest.cpp @@ -60,38 +60,38 @@ //----------------------------------------------------------------------------- // Keyset 'Prng' -template< typename hashtype > -static void Prn_gen(int nbRn, HashFn hash, const seed_t seed, std::vector & hashes) { - assert(nbRn > 0); - - printf("Generating random numbers by hashing previous output - %d keys\n", nbRn); - - // Since hash() inputs depend upon previous outputs, we can't use - // that to verify cross-system consistency across hashes, so just - // use the test parameters for the input VCode. - addVCodeInput(nbRn); - addVCodeInput(sizeof(hashtype)); - - hashtype hcopy; - memset(&hcopy, 0, sizeof(hcopy)); - - // a generated random number becomes the input for the next one - for (int i=0; i< nbRn; i++) { - hashtype h; - hash(&hcopy, sizeof(hcopy), seed, &h); - hashes.push_back(h); - memcpy(&hcopy, &h, sizeof(h)); - } +template +static void Prn_gen( int nbRn, HashFn hash, const seed_t seed, std::vector & hashes ) { + assert(nbRn > 0); + + printf("Generating random numbers by hashing previous output - %d keys\n", nbRn); + + // Since hash() inputs depend upon previous outputs, we can't use + // that to verify cross-system consistency across hashes, so just + // use the test parameters for the input VCode. 
+ addVCodeInput(nbRn); + addVCodeInput(sizeof(hashtype)); + + hashtype hcopy; + memset(&hcopy, 0, sizeof(hcopy)); + + // a generated random number becomes the input for the next one + for (int i = 0; i < nbRn; i++) { + hashtype h; + hash(&hcopy, sizeof(hcopy), seed, &h); + hashes.push_back(h); + memcpy(&hcopy, &h, sizeof(h)); + } } //----------------------------------------------------------------------------- -template < typename hashtype > -bool PRNGTest(const HashInfo * hinfo, const bool verbose, const bool extra) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - bool result = true; - bool testCollision = true; - bool testDistribution = extra; +template +bool PRNGTest( const HashInfo * hinfo, const bool verbose, const bool extra ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + bool result = true; + bool testCollision = true; + bool testDistribution = extra; std::vector hashes; printf("[[[ Prng Tests ]]]\n\n"); diff --git a/tests/PRNGTest.h b/tests/PRNGTest.h index 2a8f66b6..8f304336 100644 --- a/tests/PRNGTest.h +++ b/tests/PRNGTest.h @@ -47,5 +47,5 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -template < typename hashtype > -bool PRNGTest(const HashInfo * info, const bool verbose, const bool extra); +template +bool PRNGTest( const HashInfo * info, const bool verbose, const bool extra ); diff --git a/tests/PerlinNoiseTest.cpp b/tests/PerlinNoiseTest.cpp index 62268298..fb0cc4c8 100644 --- a/tests/PerlinNoiseTest.cpp +++ b/tests/PerlinNoiseTest.cpp @@ -60,59 +60,58 @@ //----------------------------------------------------------------------------- // Keyset 'Perlin Noise' - X,Y coordinates on input & seed -template< typename hashtype > -static bool PerlinNoise (int Xbits, int Ybits, int inputLen, int step, - const HashInfo * hinfo, bool testColl, bool testDist, bool drawDiagram) -{ - assert(0 < Ybits && Ybits < 31); - assert(0 < Xbits && Xbits < 31); - assert(Xbits + Ybits < 31); - assert(inputLen*8 > Xbits); // enough space to run the test - - std::vector hashes; - int const xMax = (1 << Xbits); - int const yMax = (1 << Ybits); - const HashFn hash = hinfo->hashFn(g_hashEndian); +template +static bool PerlinNoise( int Xbits, int Ybits, int inputLen, int step, const HashInfo * hinfo, + bool testColl, bool testDist, bool drawDiagram ) { + assert( 0 < Ybits && Ybits < 31); + assert( 0 < Xbits && Xbits < 31); + assert( Xbits + Ybits < 31 ); + assert(inputLen * 8 > Xbits ); // enough space to run the test + + std::vector hashes; + int const xMax = (1 << Xbits); + int const yMax = (1 << Ybits); + const HashFn hash = hinfo->hashFn(g_hashEndian); #define INPUT_LEN_MAX 256 - assert(inputLen <= INPUT_LEN_MAX); - uint8_t key[INPUT_LEN_MAX] = {0}; - - printf("Generating coordinates from %3i-byte keys - %d keys\n", inputLen, xMax * yMax); - - addVCodeInput(yMax); - // Since seeding can be expensive, loop over the seed-dependent - // variable first. 
- for (uint64_t y = 0; y < yMax; y++) { - const seed_t seed = hinfo->Seed(y, true); - for (uint64_t x = 0; x < xMax; x++) { - // Put x in little-endian order - uint64_t xin = COND_BSWAP(x, isBE()); - memcpy(key, &xin, sizeof(xin)); - - hashtype h; - hash(key, inputLen, seed, &h); - addVCodeInput(key, inputLen); - hashes.push_back(h); - } - } - - bool result = TestHashList(hashes,drawDiagram,testColl,testDist); - printf("\n"); - - recordTestResult(result, "PerlinNoise", inputLen); - - addVCodeResult(result); - - return result; + assert(inputLen <= INPUT_LEN_MAX ); + uint8_t key[INPUT_LEN_MAX] = { 0 }; + + printf("Generating coordinates from %3i-byte keys - %d keys\n", inputLen, xMax * yMax); + + addVCodeInput(yMax); + // Since seeding can be expensive, loop over the seed-dependent + // variable first. + for (uint64_t y = 0; y < yMax; y++) { + const seed_t seed = hinfo->Seed(y, true); + for (uint64_t x = 0; x < xMax; x++) { + // Put x in little-endian order + uint64_t xin = COND_BSWAP(x, isBE()); + memcpy(key, &xin, sizeof(xin)); + + hashtype h; + hash(key, inputLen, seed, &h); + addVCodeInput(key, inputLen); + hashes.push_back(h); + } + } + + bool result = TestHashList(hashes, drawDiagram, testColl, testDist); + printf("\n"); + + recordTestResult(result, "PerlinNoise", inputLen); + + addVCodeResult(result); + + return result; } //----------------------------------------------------------------------------- -template< typename hashtype > -bool PerlinNoiseTest (const HashInfo * hinfo, const bool verbose, const bool extra) { - bool result = true; - bool testCollision = true; +template +bool PerlinNoiseTest( const HashInfo * hinfo, const bool verbose, const bool extra ) { + bool result = true; + bool testCollision = true; bool testDistribution = extra; printf("[[[ Keyset 'PerlinNoise' Tests ]]]\n\n"); diff --git a/tests/PerlinNoiseTest.h b/tests/PerlinNoiseTest.h index 866e6061..25483937 100644 --- a/tests/PerlinNoiseTest.h +++ b/tests/PerlinNoiseTest.h @@ -47,5 +47,5 
@@ * OTHER DEALINGS IN THE SOFTWARE. */ -template< typename hashtype > -bool PerlinNoiseTest (const HashInfo * info, const bool verbose, const bool extra); +template +bool PerlinNoiseTest( const HashInfo * info, const bool verbose, const bool extra ); diff --git a/tests/PermutationKeysetTest.cpp b/tests/PermutationKeysetTest.cpp index ce55928c..bbe97074 100644 --- a/tests/PermutationKeysetTest.cpp +++ b/tests/PermutationKeysetTest.cpp @@ -60,182 +60,239 @@ //----------------------------------------------------------------------------- // Keyset 'Combination' - all possible combinations of input blocks -template< typename hashtype > -static void CombinationKeygenRecurse(uint8_t * key, int len, int maxlen, - const uint8_t * blocks, uint32_t blockcount, uint32_t blocksz, - HashFn hash, const seed_t seed, std::vector & hashes) { - if(len == maxlen) return; // end recursion +template +static void CombinationKeygenRecurse( uint8_t * key, int len, int maxlen, const uint8_t * blocks, uint32_t blockcount, + uint32_t blocksz, HashFn hash, const seed_t seed, std::vector & hashes ) { + if (len == maxlen) { return; } // end recursion - for(int i = 0; i < blockcount; i++) - { - memcpy(&key[len * blocksz], &blocks[i * blocksz], blocksz); + for (int i = 0; i < blockcount; i++) { + memcpy(&key[len * blocksz], &blocks[i * blocksz], blocksz); - hashtype h; - hash(key, (len+1) * blocksz, seed, &h); - addVCodeInput(key, (len+1) * blocksz); - hashes.push_back(h); + hashtype h; + hash(key, (len + 1) * blocksz, seed, &h); + addVCodeInput(key, (len + 1) * blocksz); + hashes.push_back(h); - CombinationKeygenRecurse(key,len+1,maxlen,blocks,blockcount,blocksz,hash,seed,hashes); - } + CombinationKeygenRecurse(key, len + 1, maxlen, blocks, blockcount, blocksz, hash, seed, hashes); + } } -template< typename hashtype > -static bool CombinationKeyTest( HashFn hash, const seed_t seed, int maxlen, - const uint8_t * blocks, uint32_t blockcount, uint32_t blocksz, const char * testdesc, - bool 
testColl, bool testDist, bool drawDiagram) { - printf("Keyset 'Combination %s' - up to %d blocks from a set of %d - ",testdesc,maxlen,blockcount); +template +static bool CombinationKeyTest( HashFn hash, const seed_t seed, int maxlen, const uint8_t * blocks, uint32_t blockcount, + uint32_t blocksz, const char * testdesc, bool testColl, bool testDist, bool drawDiagram ) { + printf("Keyset 'Combination %s' - up to %d blocks from a set of %d - ", testdesc, maxlen, blockcount); - //---------- + //---------- - std::vector hashes; + std::vector hashes; - uint8_t * key = new uint8_t[maxlen*blocksz]; + uint8_t * key = new uint8_t[maxlen * blocksz]; - CombinationKeygenRecurse(key,0,maxlen,blocks,blockcount,blocksz,hash,seed,hashes); + CombinationKeygenRecurse(key, 0, maxlen, blocks, blockcount, blocksz, hash, seed, hashes); - delete [] key; + delete [] key; - printf("%d keys\n",(int)hashes.size()); + printf("%d keys\n", (int)hashes.size()); - //---------- + //---------- - bool result = TestHashList(hashes,drawDiagram,testColl,testDist); - printf("\n"); + bool result = TestHashList(hashes, drawDiagram, testColl, testDist); + printf("\n" ); - return result; + return result; } //----------------------------------------------------------------------------- const struct { - const char * desc; - const int maxlen; - const uint32_t nrBlocks; - const uint32_t szBlock; // Verify nrBlocks * szBlock == blocks.size() - const std::vector blocks; + const char * desc; + const int maxlen; + const uint32_t nrBlocks; + const uint32_t szBlock; // Verify nrBlocks * szBlock == blocks.size() + const std::vector blocks; } keytests[] = { // This one breaks lookup3, surprisingly - { "4-bytes [3 low bits]", 7, 8, 4, - { 0, 0, 0, 0, - 1, 0, 0, 0, - 2, 0, 0, 0, - 3, 0, 0, 0, - 4, 0, 0, 0, - 5, 0, 0, 0, - 6, 0, 0, 0, - 7, 0, 0, 0 } }, - { "4-bytes [3 high bits]", 7, 8, 4, - { 0, 0, 0, 0, - 0, 0, 0, 32, - 0, 0, 0, 64, - 0, 0, 0, 96, - 0, 0, 0, 128, - 0, 0, 0, 160, - 0, 0, 0, 192, - 0, 0, 0, 224 } }, - { 
"4-bytes [3 high+low bits]", 6, 15, 4, - { 0, 0, 0, 0, - 1, 0, 0, 0, - 2, 0, 0, 0, - 3, 0, 0, 0, - 4, 0, 0, 0, - 5, 0, 0, 0, - 6, 0, 0, 0, - 7, 0, 0, 0, - 0, 0, 0, 32, - 0, 0, 0, 64, - 0, 0, 0, 96, - 0, 0, 0, 128, - 0, 0, 0, 160, - 0, 0, 0, 192, - 0, 0, 0, 224 } }, - { "4-bytes [0, low bit]", 0, 2, 4, - { 0, 0, 0, 0, - 1, 0, 0, 0 } }, - { "4-bytes [0, high bit]", 0, 2, 4, - { 0, 0, 0, 0, - 0, 0, 0, 128 } }, - { "8-bytes [0, low bit]", 0, 2, 8, - { 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, } }, - { "8-bytes [0, high bit]", 0, 2, 8, - { 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 128, } }, - { "16-bytes [0, low bit]", 0, 2, 16, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } }, - { "16-bytes [0, high bit]", 0, 2, 16, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, } }, - { "32-bytes [0, low bit]", 0, 2, 32, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } }, - { "32-bytes [0, high bit]", 0, 2, 32, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, } }, - { "64-bytes [0, low bit]", 0, 2, 64, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } }, - { "64-bytes [0, high bit]", 0, 2, 64, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, } }, - { "128-bytes [0, low bit]", 0, 2, 128, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } }, - { "128-bytes [0, high bit]", 0, 2, 128, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, } }, + { + 
"4-bytes [3 low bits]", 7, 8, 4, + { + 0, 0, 0, 0, + 1, 0, 0, 0, + 2, 0, 0, 0, + 3, 0, 0, 0, + 4, 0, 0, 0, + 5, 0, 0, 0, + 6, 0, 0, 0, + 7, 0, 0, 0 + } + }, + { + "4-bytes [3 high bits]", 7, 8, 4, + { + 0, 0, 0, 0, + 0, 0, 0, 32, + 0, 0, 0, 64, + 0, 0, 0, 96, + 0, 0, 0, 128, + 0, 0, 0, 160, + 0, 0, 0, 192, + 0, 0, 0, 224 + } + }, + { + "4-bytes [3 high+low bits]", 6, 15, 4, + { + 0, 0, 0, 0, + 1, 0, 0, 0, + 2, 0, 0, 0, + 3, 0, 0, 0, + 4, 0, 0, 0, + 5, 0, 0, 0, + 6, 0, 0, 0, + 7, 0, 0, 0, + 0, 0, 0, 32, + 0, 0, 0, 64, + 0, 0, 0, 96, + 0, 0, 0, 128, + 0, 0, 0, 160, + 0, 0, 0, 192, + 0, 0, 0, 224 + } + }, + { + "4-bytes [0, low bit]", 0, 2, 4, + { + 0, 0, 0, 0, + 1, 0, 0, 0 + } + }, + { + "4-bytes [0, high bit]", 0, 2, 4, + { + 0, 0, 0, 0, + 0, 0, 0, 128 + } + }, + { + "8-bytes [0, low bit]", 0, 2, 8, + { + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, + } + }, + { + "8-bytes [0, high bit]", 0, 2, 8, + { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 128, + } + }, + { + "16-bytes [0, low bit]", 0, 2, 16, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } + }, + { + "16-bytes [0, high bit]", 0, 2, 16, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, + } + }, + { + "32-bytes [0, low bit]", 0, 2, 32, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } + }, + { + "32-bytes [0, high bit]", 0, 2, 32, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, + } + }, + { + "64-bytes [0, low bit]", 0, 2, 64, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } + }, + { + "64-bytes [0, high bit]", 0, 2, 64, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, + } + }, + { + "128-bytes [0, low bit]", 0, 2, 128, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } + }, + { + "128-bytes [0, high bit]", 0, 2, 128, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, + } + }, }; -template < typename hashtype > -bool PermutedKeyTest(const HashInfo * hinfo, const bool verbose, const bool extra) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - const int default_maxlen = extra ? 23 : (hinfo->bits >= 128) ? 17 : 22; - bool result = true; +template +bool PermutedKeyTest( const HashInfo * hinfo, const bool verbose, const bool extra ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + const int default_maxlen = extra ? 23 : (hinfo->bits >= 128) ? 17 : 22; + bool result = true; printf("[[[ Keyset 'Permutation' Tests ]]]\n\n"); @@ -243,12 +300,11 @@ bool PermutedKeyTest(const HashInfo * hinfo, const bool verbose, const bool extr for (auto test: keytests) { bool curresult = true; - int maxlen = test.maxlen > 0 ? test.maxlen : default_maxlen; + int maxlen = test.maxlen > 0 ? test.maxlen : default_maxlen; assert(test.blocks.size() == test.nrBlocks * test.szBlock); - curresult &= CombinationKeyTest(hash, seed, maxlen, - &(test.blocks[0]), test.nrBlocks, test.szBlock, test.desc, - true, true, verbose); + curresult &= CombinationKeyTest(hash, seed, maxlen, &(test.blocks[0]), + test.nrBlocks, test.szBlock, test.desc, true, true, verbose); recordTestResult(curresult, "Permutation", test.desc); diff --git a/tests/PermutationKeysetTest.h b/tests/PermutationKeysetTest.h index 6d1a3b42..90506ddd 100644 --- a/tests/PermutationKeysetTest.h +++ b/tests/PermutationKeysetTest.h @@ -47,5 +47,5 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -template < typename hashtype > -bool PermutedKeyTest(const HashInfo * info, const bool verbose, const bool extra); +template +bool PermutedKeyTest( const HashInfo * info, const bool verbose, const bool extra ); diff --git a/tests/PopcountTest.cpp b/tests/PopcountTest.cpp index 3f907bc6..8fa8407d 100644 --- a/tests/PopcountTest.cpp +++ b/tests/PopcountTest.cpp @@ -78,258 +78,261 @@ typedef uint32_t popcnt_hist[65]; // Copy the results into g_NCPU ranges of 2^32 -static void PopcountThread(const HashInfo * hinfo, const seed_t seed, const int inputSize, - const unsigned start, const unsigned end, const unsigned step, - popcnt_hist &hist1, popcnt_hist &hist2) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - long double const n = (end-(start+1)) / step; - uint64_t previous = 0; +static void PopcountThread( const HashInfo * hinfo, const seed_t seed, const int inputSize, const unsigned start, + const unsigned end, const unsigned step, popcnt_hist & hist1, popcnt_hist & hist2 ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + long double const n = (end - (start + 1)) / step; + uint64_t previous = 0; + #define INPUT_SIZE_MAX 256 - assert(inputSize <= INPUT_SIZE_MAX); - char key[INPUT_SIZE_MAX] = {0}; + assert(inputSize <= INPUT_SIZE_MAX ); + char key[INPUT_SIZE_MAX] = { 0 }; #define HASH_SIZE_MAX 64 - char hbuff[HASH_SIZE_MAX] = {0}; - const int hbits = std::min(hinfo->bits, 64U); // limited due to popcount8 - - assert(sizeof(unsigned) <= inputSize); - assert(start < end); - //assert(step > 0); + char hbuff[HASH_SIZE_MAX] = { 0 }; + const int hbits = std::min(hinfo->bits, 64U); // limited due to popcount8 - uint64_t i = start - step; - memcpy(key, &i, sizeof(i)); - hash(key, inputSize, seed, hbuff); - memcpy(&previous, hbuff, 8); + assert(sizeof(unsigned) <= inputSize); + assert(start < end); + // assert(step > 0); - for (uint64_t i=start; i<=end; i+=step) { + uint64_t i = start - step; memcpy(key, &i, sizeof(i)); hash(key, inputSize, seed, hbuff); + 
memcpy(&previous, hbuff, 8); + + for (uint64_t i = start; i <= end; i += step) { + memcpy(key, &i, sizeof(i)); + hash(key, inputSize, seed, hbuff); - // popcount8 assumed to work on 64-bit - // note : ideally, one should rather popcount the whole hash - uint64_t h; - memcpy(&h, hbuff, 8); + // popcount8 assumed to work on 64-bit + // note : ideally, one should rather popcount the whole hash + uint64_t h; + memcpy(&h, hbuff, 8); - uint64_t setbits = popcount8(h); - hist1[setbits]++; + uint64_t setbits = popcount8(h); + hist1[setbits]++; - // derivative - setbits = popcount8(h ^ previous); - hist2[setbits]++; - previous = h; - } + // derivative + setbits = popcount8(h ^ previous); + hist2[setbits]++; + previous = h; + } } -static bool PopcountResults ( long double srefh, long double srefl, - long double b1h, long double b1l, - long double b0h, long double b0l ) -{ - double worst; - { - double chi2 = (b1h-srefh) * (b1h-srefh) / (b1l+srefl); - printf("From counting 1s : %9.2Lf, %9.2Lf - moment chisq %10.4f\n", - b1h, b1l, chi2); - worst = chi2; - } - { - double chi2 = (b0h-srefh) * (b0h-srefh) / (b0l+srefl); - printf("From counting 0s : %9.2Lf, %9.2Lf - moment chisq %10.4f\n", - b0h, b0l, chi2); - worst = std::max(worst, chi2); - } - - // note : previous threshold : 3.84145882069413 - int const rank = (worst < 500.) + (worst < 50.) 
+ (worst < 5.); - assert(0 <= rank && rank <= 3); - - const char* rankstr[4] = { "FAIL !!!!", "pass", "Good", "Great" }; - printf("Test result: %s\n", rankstr[rank]); - - addVCodeResult((uint32_t)(worst * 1000.0)); - - return (rank > 0); +static bool PopcountResults( long double srefh, long double srefl, long double b1h, + long double b1l, long double b0h, long double b0l ) { + double worst; + { + double chi2 = (b1h - srefh) * (b1h - srefh) / (b1l + srefl); + printf("From counting 1s : %9.2Lf, %9.2Lf - moment chisq %10.4f\n", b1h, b1l, chi2); + worst = chi2; + } + { + double chi2 = (b0h - srefh) * (b0h - srefh) / (b0l + srefl); + printf("From counting 0s : %9.2Lf, %9.2Lf - moment chisq %10.4f\n", b0h, b0l, chi2); + worst = std::max(worst, chi2); + } + + // note : previous threshold : 3.84145882069413 + int const rank = (worst < 500.) + (worst < 50.) + (worst < 5.); + + assert(0 <= rank && rank <= 3); + + const char * rankstr[4] = { "FAIL !!!!", "pass", "Good", "Great" }; + printf("Test result: %s\n", rankstr[rank]); + + addVCodeResult((uint32_t)(worst * 1000.0)); + + return rank > 0; } -static bool PopcountTestImpl(const HashInfo * hinfo, int inputSize, int step) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - const unsigned mx = 0xffffffff; - const long double n = UINT64_C(0x100000000) / step; - const int hbits = std::min(hinfo->bits, 64U); // limited due to popcount8 - - assert(hbits <= HASH_SIZE_MAX*8); - assert(inputSize >= 4); - - printf("\nGenerating hashes from a linear sequence of %i-bit numbers " - "with a step size of %d ... \n", inputSize*8, step); - - /* Notes on the ranking system. - * Ideally, this test should report and sum all popcount values - * and compare the resulting distribution to an ideal distribution. - * - * What happens here is quite simplified : - * the test gives "points" for each popcount, and sum them all. - * The metric (using N^5) is heavily influenced by the largest outliers. 
- * For example, a 64-bit hash should have a popcount close to 32. - * But a popcount==40 will tilt the metric upward - * more than popcount==24 will tilt the metric downward. - * In reality, both situations should be ranked similarly. - * - * To compensate, we measure both popcount1 and popcount0, - * and compare to some pre-calculated "optimal" sums for the hash size. - * - * Another limitation of this test is that it only popcounts the first 64-bit. - * For large hashes, bits beyond this limit are ignored. - * - * Derivative hash testing: - * In this scenario, 2 consecutive hashes are xored, - * and the outcome of this xor operation is then popcount controlled. - * Obviously, the _order_ in which the hash values are generated becomes critical. - * - * This scenario comes from the prng world, - * where derivative of the generated suite of random numbers is analyzed - * to ensure the suite is truly "random". - * - * However, in almost all prng, the seed of next random number is the previous random number. - * - * This scenario is quite different: it introduces a fixed distance between 2 consecutive "seeds". - * This is especially detrimental to algorithms relying on linear operations, such as multiplications. - * - * This scenario is relevant if the hash is used as a prng and generates values from a linearly increasing counter as a seed. - * It is not relevant for scenarios employing the hash as a prng - * with the more classical method of using the previous random number as a seed for the next one. - * This scenario has no relevance for classical usages of hash algorithms, - * such as hash tables, bloom filters and such, were only the raw values are ever used. 
- */ - - long double srefh, srefl; - switch (hbits/8) { - case 8: - srefh = 38918200.; - if (step == 2) - srefl = 273633.333333; - else if (step == 6) - srefl = 820900.0; - else - abort(); - break; - case 4: - srefh = 1391290.; - if (step == 2) - srefl = 686.6666667; - else if (step == 6) - srefl = 2060.0; - else - abort(); - break; - default: - printf("hash size not covered \n"); - abort(); - } - - // Because of threading, the actual inputs can't be hashed into the - // main thread's state, so just hash the parameters of the input data. - addVCodeInput(0); // start - addVCodeInput(0xffffffff); // end - addVCodeInput(step); // step - addVCodeInput(inputSize); // size - - popcnt_hist rawhash[g_NCPU]; - popcnt_hist xorhash[g_NCPU]; - memset(rawhash, 0, sizeof(rawhash)); - memset(xorhash, 0, sizeof(xorhash)); - - const seed_t seed = hinfo->Seed(g_seed, false, 1); - - if (g_NCPU == 1) { - PopcountThread(hinfo, seed, inputSize, 0, 0xffffffff, step, rawhash[0], xorhash[0]); - } else { +static bool PopcountTestImpl( const HashInfo * hinfo, int inputSize, int step ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + const unsigned mx = 0xffffffff; + const long double n = UINT64_C(0x100000000) / step; + const int hbits = std::min(hinfo->bits, 64U); // limited due to popcount8 + + assert(hbits <= HASH_SIZE_MAX * 8); + assert(inputSize >= 4); + + printf("\nGenerating hashes from a linear sequence of %i-bit numbers " + "with a step size of %d ... \n", inputSize * 8, step); + + /* + * Notes on the ranking system. + * Ideally, this test should report and sum all popcount values + * and compare the resulting distribution to an ideal distribution. + * + * What happens here is quite simplified : + * the test gives "points" for each popcount, and sum them all. + * The metric (using N^5) is heavily influenced by the largest outliers. + * For example, a 64-bit hash should have a popcount close to 32. 
+ * But a popcount==40 will tilt the metric upward + * more than popcount==24 will tilt the metric downward. + * In reality, both situations should be ranked similarly. + * + * To compensate, we measure both popcount1 and popcount0, + * and compare to some pre-calculated "optimal" sums for the hash size. + * + * Another limitation of this test is that it only popcounts the first 64-bit. + * For large hashes, bits beyond this limit are ignored. + * + * Derivative hash testing: + * In this scenario, 2 consecutive hashes are xored, + * and the outcome of this xor operation is then popcount controlled. + * Obviously, the _order_ in which the hash values are generated becomes critical. + * + * This scenario comes from the prng world, + * where derivative of the generated suite of random numbers is analyzed + * to ensure the suite is truly "random". + * + * However, in almost all prng, the seed of next random number is the previous random number. + * + * This scenario is quite different: it introduces a fixed distance between 2 consecutive "seeds". + * This is especially detrimental to algorithms relying on linear operations, such as multiplications. + * + * This scenario is relevant if the hash is used as a prng and generates values from a linearly increasing counter + *as a seed. + * It is not relevant for scenarios employing the hash as a prng + * with the more classical method of using the previous random number as a seed for the next one. + * This scenario has no relevance for classical usages of hash algorithms, + * such as hash tables, bloom filters and such, were only the raw values are ever used. 
+ */ + + long double srefh, srefl; + switch (hbits / 8) { + case 8: + srefh = 38918200.; + if (step == 2) { + srefl = 273633.333333; + } else if (step == 6) { + srefl = 820900.0; + } else { + abort(); + } + break; + case 4: + srefh = 1391290.; + if (step == 2) { + srefl = 686.6666667; + } else if (step == 6) { + srefl = 2060.0; + } else { + abort(); + } + break; + default: + printf("hash size not covered \n"); + abort(); + } + + // Because of threading, the actual inputs can't be hashed into the + // main thread's state, so just hash the parameters of the input data. + addVCodeInput( 0); // start + addVCodeInput(0xffffffff); // end + addVCodeInput( step); // step + addVCodeInput( inputSize); // size + + popcnt_hist rawhash[g_NCPU]; + popcnt_hist xorhash[g_NCPU]; + memset(rawhash, 0, sizeof(rawhash)); + memset(xorhash, 0, sizeof(xorhash)); + + const seed_t seed = hinfo->Seed(g_seed, false, 1); + + if (g_NCPU == 1) { + PopcountThread(hinfo, seed, inputSize, 0, 0xffffffff, step, rawhash[0], xorhash[0]); + } else { #if defined(HAVE_THREADS) - // split into g_NCPU threads - std::thread t[g_NCPU]; - printf("%d threads starting... ", g_NCPU); - - const uint64_t len = UINT64_C(0x100000000) / (step * g_NCPU); - for (int i=0; i < g_NCPU; i++) { - const uint32_t start = i * len * step; - const uint32_t end = (i < (g_NCPU - 1)) ? start + (len * step - 1) : 0xffffffff; - //printf("thread[%d]: %d, 0x%x - 0x%x %d\n", i, inputSize, start, end, step); - t[i] = std::thread {PopcountThread, hinfo, seed, inputSize, start, end, step, std::ref(rawhash[i]), std::ref(xorhash[i]) }; - } - - std::this_thread::sleep_for(std::chrono::seconds(1)); - - for (int i=0; i < g_NCPU; i++) { - t[i].join(); - } - - printf(" done\n"); - for (int i = 1; i < g_NCPU; i++) { - for (int j = 0; j <= hbits; j++) { - rawhash[0][j] += rawhash[i][j]; - xorhash[0][j] += xorhash[i][j]; - } - } + // split into g_NCPU threads + std::thread t[g_NCPU]; + printf("%d threads starting... 
", g_NCPU); + + const uint64_t len = UINT64_C(0x100000000) / (step * g_NCPU); + for (int i = 0; i < g_NCPU; i++) { + const uint32_t start = i * len * step; + const uint32_t end = (i < (g_NCPU - 1)) ? start + (len * step - 1) : 0xffffffff; + // printf("thread[%d]: %d, 0x%x - 0x%x %d\n", i, inputSize, start, end, step); + t[i] = std::thread { + PopcountThread, hinfo, seed, inputSize, start, end, step, std::ref(rawhash[i]), std::ref(xorhash[i]) + }; + } + + std::this_thread::sleep_for(std::chrono::seconds(1)); + + for (int i = 0; i < g_NCPU; i++) { + t[i].join(); + } + + printf(" done\n"); + for (int i = 1; i < g_NCPU; i++) { + for (int j = 0; j <= hbits; j++) { + rawhash[0][j] += rawhash[i][j]; + xorhash[0][j] += xorhash[i][j]; + } + } #endif - } - - long double b0h = 0, b0l = 0, db0h = 0, db0l = 0; - long double b1h = 0, b1l = 0, db1h = 0, db1l = 0; - // b1h = SUM[ 1-bits**5 ] - // b0h = SUM[ 0-bits**5 ] - // b1l = SUM[ 1-bits**10 ] - // b0l = SUM[ 0-bits**10 ] + } - for (uint64_t j = 0; j <= hbits; j++) { - long double mult1 = j * j * j * j * j; - long double mult0 = (hbits - j) * (hbits - j) * (hbits - j) * (hbits - j) * (hbits - j); - b1h += mult1 * (long double)rawhash[0][j]; - b0h += mult0 * (long double)rawhash[0][j]; - db1h += mult1 * (long double)xorhash[0][j]; - db0h += mult0 * (long double)xorhash[0][j]; - b1l += mult1 * mult1 * (long double)rawhash[0][j]; - b0l += mult0 * mult0 * (long double)rawhash[0][j]; - db1l += mult1 * mult1 * (long double)xorhash[0][j]; - db0l += mult0 * mult0 * (long double)xorhash[0][j]; - } + long double b0h = 0, b0l = 0, db0h = 0, db0l = 0; + long double b1h = 0, b1l = 0, db1h = 0, db1l = 0; + // b1h = SUM[ 1-bits**5 ] + // b0h = SUM[ 0-bits**5 ] + // b1l = SUM[ 1-bits**10 ] + // b0l = SUM[ 0-bits**10 ] + + for (uint64_t j = 0; j <= hbits; j++) { + long double mult1 = j * j * j * j * j; + long double mult0 = (hbits - j) * (hbits - j) * (hbits - j) * (hbits - j) * (hbits - j); + b1h += mult1 * (long double)rawhash[0][j]; + b0h 
+= mult0 * (long double)rawhash[0][j]; + db1h += mult1 * (long double)xorhash[0][j]; + db0h += mult0 * (long double)xorhash[0][j]; + b1l += mult1 * mult1 * (long double)rawhash[0][j]; + b0l += mult0 * mult0 * (long double)rawhash[0][j]; + db1l += mult1 * mult1 * (long double)xorhash[0][j]; + db0l += mult0 * mult0 * (long double)xorhash[0][j]; + } - b1h /= n; b1l = (b1l/n - b1h*b1h) / n; - db1h /= n; db1l = (db1l/n - db1h*db1h) / n; - b0h /= n; b0l = (b0l/n - b0h*b0h) / n; - db0h /= n; db0l = (db0l/n - db0h*db0h) / n; + b1h /= n; b1l = (b1l / n - b1h * b1h ) / n; + db1h /= n; db1l = (db1l / n - db1h * db1h) / n; + b0h /= n; b0l = (b0l / n - b0h * b0h ) / n; + db0h /= n; db0l = (db0l / n - db0h * db0h) / n; - bool result = true; + bool result = true; - printf("Ideal results : %9.2Lf, %9.2Lf\n", srefh, srefl); + printf("Ideal results : %9.2Lf, %9.2Lf\n", srefh, srefl); - printf("\nResults from literal hashes :\n"); - result &= PopcountResults(srefh, srefl, b1h, b1l, b0h, b0l); + printf("\nResults from literal hashes :\n" ); + result &= PopcountResults(srefh, srefl, b1h, b1l, b0h, b0l); - printf("\nResults from derivative hashes (XOR of 2 consecutive values) :\n"); - result &= PopcountResults(srefh, srefl, db1h, db1l, db0h, db0l); + printf("\nResults from derivative hashes (XOR of 2 consecutive values) :\n"); + result &= PopcountResults(srefh, srefl, db1h, db1l, db0h, db0l); - printf("\n"); + printf("\n"); - // Similar threading problems for the outputs, so just hash in the - // summary data. - addVCodeOutput(&rawhash[0][0], 65 * sizeof(rawhash[0][0])); - addVCodeOutput(&xorhash[0][0], 65 * sizeof(xorhash[0][0])); + // Similar threading problems for the outputs, so just hash in the + // summary data. 
+ addVCodeOutput(&rawhash[0][0], 65 * sizeof(rawhash[0][0])); + addVCodeOutput(&xorhash[0][0], 65 * sizeof(xorhash[0][0])); - recordTestResult(result, "Popcount", inputSize); + recordTestResult(result, "Popcount", inputSize); - return result; + return result; } //----------------------------------------------------------------------------- -template < typename hashtype > -bool PopcountTest(const HashInfo * hinfo, const bool extra) { - const int step = ((hinfo->isVerySlow() || hinfo->bits > 128) && extra) ? 6 : 2; - bool result = true; +template +bool PopcountTest( const HashInfo * hinfo, const bool extra ) { + const int step = ((hinfo->isVerySlow() || hinfo->bits > 128) && extra) ? 6 : 2; + bool result = true; printf("[[[ Popcount Tests ]]]\n"); result &= PopcountTestImpl(hinfo, 4, step); if (extra) { - result &= PopcountTestImpl(hinfo, 8, step); + result &= PopcountTestImpl(hinfo, 8, step); result &= PopcountTestImpl(hinfo, 16, step); } diff --git a/tests/PopcountTest.h b/tests/PopcountTest.h index 33af51c6..623361c7 100644 --- a/tests/PopcountTest.h +++ b/tests/PopcountTest.h @@ -61,5 +61,5 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -template < typename hashtype > -bool PopcountTest(const HashInfo * info, const bool extra); +template +bool PopcountTest( const HashInfo * info, const bool extra ); diff --git a/tests/SanityTest.cpp b/tests/SanityTest.cpp index 6ed17400..01a2e860 100644 --- a/tests/SanityTest.cpp +++ b/tests/SanityTest.cpp @@ -58,24 +58,23 @@ // These sentinel bytes MUST be different values static const uint8_t sentinel1 = 0x5c; static const uint8_t sentinel2 = 0x36; -static_assert(sentinel1 != sentinel2, - "valid sentinel bytes in SanityTest"); +static_assert(sentinel1 != sentinel2, "valid sentinel bytes in SanityTest"); //---------------------------------------------------------------------------- // Helper for printing out the right number of progress dots -static void progressdots(int cur, int min, int max, int totaldots) { +static void progressdots( int cur, int min, int max, int totaldots ) { // cur goes from [min, max]. When cur is max, totaldots should // have been printed. Print out enough dots, assuming either we // were called for cur-1, or that we are being called for the // first time with cur==min. assert(totaldots > 0); - assert(min < max); - assert(cur >= min); - assert(cur <= max); + assert(min < max ); + assert(cur >= min ); + assert(cur <= max ); int count = 0; - int span = max - min + 1; + int span = max - min + 1; if (span > totaldots) { // Possibly zero dots per call. // Always print out one dot the first time through. @@ -91,7 +90,7 @@ static void progressdots(int cur, int min, int max, int totaldots) { } if (count == 0) { int expect = (cur - min + 1) * totaldots / span; - int sofar = (cur - min ) * totaldots / span; + int sofar = (cur - min ) * totaldots / span; count = expect - sofar; } @@ -115,7 +114,7 @@ static void progressdots(int cur, int min, int max, int totaldots) { #define maybeprintf(...) 
if (verbose) { printf(__VA_ARGS__); } -static bool verify_sentinel(const uint8_t * buf, size_t len, const uint8_t sentinel, bool verbose) { +static bool verify_sentinel( const uint8_t * buf, size_t len, const uint8_t sentinel, bool verbose ) { for (size_t i = 0; i < len; i++) { if (buf[i] != sentinel) { maybeprintf(" %" PRIu64 ": 0x%02X != 0x%02X: ", i, buf[i], sentinel); @@ -125,8 +124,8 @@ static bool verify_sentinel(const uint8_t * buf, size_t len, const uint8_t senti return true; } -template < bool checksentinels > -static bool verify_hashmatch(const uint8_t * buf1, const uint8_t * buf2, size_t len, bool verbose) { +template +static bool verify_hashmatch( const uint8_t * buf1, const uint8_t * buf2, size_t len, bool verbose ) { if (likely(memcmp(buf1, buf2, len) == 0)) { return true; } @@ -136,8 +135,7 @@ static bool verify_hashmatch(const uint8_t * buf1, const uint8_t * buf2, size_t (buf1[i] == sentinel1) && (buf2[i] == sentinel2)) { maybeprintf(" output byte %" PRIu64 " not altered:", i); } else { - maybeprintf(" output byte %" PRIu64 " inconsistent (0x%02X != 0x%02X):", - i, buf1[i], buf2[i]); + maybeprintf(" output byte %" PRIu64 " inconsistent (0x%02X != 0x%02X):", i, buf1[i], buf2[i]); } break; } @@ -149,32 +147,32 @@ static bool verify_hashmatch(const uint8_t * buf1, const uint8_t * buf2, size_t // that hashing the same thing gives the same result. // // This test can halt early, so don't add input bytes to the VCode. 
-bool SanityTest1(const HashInfo * hinfo, const seed_t seed, bool verbose) { - Rand r(883743); - bool result = true; - bool danger = false; - - const HashFn hash = hinfo->hashFn(g_hashEndian); - const int hashbytes = hinfo->bits / 8; - const int reps = 10; - const int keymax = 256; - const int pad = 16*3; - const int buflen = keymax + pad; - - uint8_t * buffer1 = new uint8_t[buflen]; - uint8_t * buffer2 = new uint8_t[buflen]; - uint8_t * hash1 = new uint8_t[buflen]; - uint8_t * hash2 = new uint8_t[buflen]; +bool SanityTest1( const HashInfo * hinfo, const seed_t seed, bool verbose ) { + Rand r( 883743 ); + bool result = true; + bool danger = false; + + const HashFn hash = hinfo->hashFn(g_hashEndian); + const int hashbytes = hinfo->bits / 8; + const int reps = 10; + const int keymax = 256; + const int pad = 16 * 3; + const int buflen = keymax + pad; + + uint8_t * buffer1 = new uint8_t[buflen]; + uint8_t * buffer2 = new uint8_t[buflen]; + uint8_t * hash1 = new uint8_t[buflen]; + uint8_t * hash2 = new uint8_t[buflen]; maybeprintf("Running sanity check 1 "); memset(hash1, sentinel1, buflen); memset(hash2, sentinel2, buflen); - for(int irep = 0; irep < reps; irep++) { - if (irep % (reps/10) == 0) maybeprintf("."); + for (int irep = 0; irep < reps; irep++) { + if (irep % (reps / 10) == 0) { maybeprintf("."); } - for(int len = 0; len <= keymax; len++) { + for (int len = 0; len <= keymax; len++) { // Make 2 copies of some random input data, and hash one // of them. 
r.rand_p(buffer1, buflen); @@ -191,8 +189,7 @@ bool SanityTest1(const HashInfo * hinfo, const seed_t seed, bool verbose) { } // See if the hash overflowed its output buffer - if (!verify_sentinel(hash1 + hashbytes, buflen - hashbytes, - sentinel1, verbose)) { + if (!verify_sentinel(hash1 + hashbytes, buflen - hashbytes, sentinel1, verbose)) { maybeprintf(" hash overflowed output buffer (pass 1):"); result = false; danger = true; @@ -203,8 +200,7 @@ bool SanityTest1(const HashInfo * hinfo, const seed_t seed, bool verbose) { hash(buffer1, len, seed, hash2); // See if the hash overflowed output buffer this time - if (!verify_sentinel(hash2 + hashbytes, buflen - hashbytes, - sentinel2, verbose)) { + if (!verify_sentinel(hash2 + hashbytes, buflen - hashbytes, sentinel2, verbose)) { maybeprintf(" hash overflowed output buffer (pass 2):"); result = false; danger = true; @@ -219,8 +215,8 @@ bool SanityTest1(const HashInfo * hinfo, const seed_t seed, bool verbose) { } } - end_sanity: - if(result == false) { + end_sanity: + if (result == false) { printf("%s", verbose ? " FAIL !!!!!\n" : " FAIL"); } else { printf("%s", verbose ? " PASS\n" : " pass"); @@ -254,42 +250,41 @@ bool SanityTest1(const HashInfo * hinfo, const seed_t seed, bool verbose) { // This test is expensive, so only run 1 rep. // // This test can halt early, so don't add input bytes to the VCode. 
-bool SanityTest2(const HashInfo * hinfo, const seed_t seed, bool verbose) { - Rand r(883744); - bool result = true; +bool SanityTest2( const HashInfo * hinfo, const seed_t seed, bool verbose ) { + Rand r( 883744 ); + bool result = true; - const HashFn hash = hinfo->hashFn(g_hashEndian); - const int hashbytes = hinfo->bits / 8; - const int reps = 5; - const int keymax = 128; - const int pad = 16; // Max alignment offset tested - const int buflen = keymax + pad*3; + const HashFn hash = hinfo->hashFn(g_hashEndian); + const int hashbytes = hinfo->bits / 8; + const int reps = 5; + const int keymax = 128; + const int pad = 16;// Max alignment offset tested + const int buflen = keymax + pad * 3; // XXX Check alignment!?! - uint8_t * buffer1 = new uint8_t[buflen]; - uint8_t * buffer2 = new uint8_t[buflen]; - uint8_t * hash1 = new uint8_t[hashbytes]; - uint8_t * hash2 = new uint8_t[hashbytes]; - uint8_t * hash3 = new uint8_t[hashbytes]; + uint8_t * buffer1 = new uint8_t[buflen ]; + uint8_t * buffer2 = new uint8_t[buflen ]; + uint8_t * hash1 = new uint8_t[hashbytes]; + uint8_t * hash2 = new uint8_t[hashbytes]; + uint8_t * hash3 = new uint8_t[hashbytes]; maybeprintf("Running sanity check 2 "); for (int irep = 0; irep < reps; irep++) { - - for(int len = 1; len <= keymax; len++) { - ExtBlob key1(&buffer1[pad], len); + for (int len = 1; len <= keymax; len++) { + ExtBlob key1( &buffer1[pad], len ); // Fill the first buffer with random data r.rand_p(buffer1, buflen); - if (verbose) { progressdots(len + irep*keymax, 1, reps*keymax, 10); } + if (verbose) { progressdots(len + irep * keymax, 1, reps * keymax, 10); } // Record the hash of key1. hash1 becomes the correct // answer that the rest of the loop will test against. 
hash(key1, len, seed, hash1); addVCodeOutput(hash1, hashbytes); // See if the hash behaves sanely using only key1 - for(int bit = 0; bit < (len * 8); bit++) { + for (int bit = 0; bit < (len * 8); bit++) { // Flip a bit, hash the key -> we should get a different result. key1.flipbit(bit); hash(key1, len, seed, hash2); @@ -311,9 +306,9 @@ bool SanityTest2(const HashInfo * hinfo, const seed_t seed, bool verbose) { } } - for(int offset = pad; offset < pad*2; offset++) { + for (int offset = pad; offset < pad * 2; offset++) { // Make key2 have alignment independent of key1 - ExtBlob key2(&buffer2[offset], len); + ExtBlob key2( &buffer2[offset], len ); // Fill the second buffer with different random data r.rand_p(buffer2, buflen); @@ -343,7 +338,7 @@ bool SanityTest2(const HashInfo * hinfo, const seed_t seed, bool verbose) { memcpy(buffer2 + offset - pad, buffer1, len + 2 * pad); uint8_t * const key2_start = buffer2 + offset; uint8_t * const key2_end = buffer2 + offset + len; - for(uint8_t * ptr = key2_start - pad; ptr < key2_end + pad; ptr++) { + for (uint8_t * ptr = key2_start - pad; ptr < key2_end + pad; ptr++) { if ((ptr >= key2_start) && (ptr < key2_end)) { continue; } *ptr ^= 0xFF; hash(key2, len, seed, hash3); @@ -362,8 +357,8 @@ bool SanityTest2(const HashInfo * hinfo, const seed_t seed, bool verbose) { } } - end_sanity: - if(result == false) { + end_sanity: + if (result == false) { printf("%s", verbose ? " FAIL !!!!!\n" : " ... FAIL"); } else { printf("%s", verbose ? " PASS\n" : " ... pass"); @@ -387,18 +382,18 @@ bool SanityTest2(const HashInfo * hinfo, const seed_t seed, bool verbose) { // Seed() is first called once in the main process, and 2) when Seed() // is called per-hash inside each thread. 
-static void hashthings(const HashInfo * hinfo, seed_t seed, - uint32_t reps, uint32_t order, bool reseed, bool verbose, - std::vector &keys, std::vector &hashes) { - const HashFn hash = hinfo->hashFn(g_hashEndian); +static void hashthings( const HashInfo * hinfo, seed_t seed, uint32_t reps, uint32_t order, bool reseed, + bool verbose, std::vector & keys, std::vector & hashes ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); const uint32_t hashbytes = hinfo->bits / 8; // Each thread should hash the keys in a different, random order - std::vector idxs(reps); + std::vector idxs( reps ); + if (order != 0) { - Rand r(46742 + order); + Rand r( 46742 + order ); for (int i = 0; i < reps; i++) { idxs[i] = i; } - for(int i = reps - 1; i > 0; i--) { + for (int i = reps - 1; i > 0; i--) { std::swap(idxs[i], idxs[r.rand_range(i + 1)]); } } @@ -412,19 +407,19 @@ static void hashthings(const HashInfo * hinfo, seed_t seed, if (reseed) { seed = hinfo->Seed(idx * UINT64_C(0xa5), true, 1); } hash(&keys[idx * reps], idx + 1, seed, &hashes[idx * hashbytes]); if (verbose && (order < 2)) { progressdots(i, 0, reps - 1, 4); } - if (order == 0) { addVCodeInput(&keys[idx * reps], idx + 1);} + if (order == 0) { addVCodeInput(&keys[idx * reps], idx + 1); } } } -static bool ThreadingTest (const HashInfo * hinfo, bool seedthread, bool verbose) { - Rand r(609163); +static bool ThreadingTest( const HashInfo * hinfo, bool seedthread, bool verbose ) { + Rand r( 609163 ); - const uint32_t hashbytes = hinfo->bits / 8; - const uint32_t reps = 1024*16; - const uint32_t keybytes = (reps * reps); - std::vector keys(keybytes); - std::vector mainhashes(reps * hashbytes); - const seed_t seed = seedthread ? 0 : hinfo->Seed(0x12345, true, 1); + const uint32_t hashbytes = hinfo->bits / 8; + const uint32_t reps = 1024 * 16; + const uint32_t keybytes = (reps * reps); + std::vector keys( keybytes ); + std::vector mainhashes( reps * hashbytes ); + const seed_t seed = seedthread ? 
0 : hinfo->Seed(0x12345, true, 1); bool result = true; maybeprintf("Running thread-safety test %d ", seedthread ? 2 : 1); @@ -447,10 +442,12 @@ static bool ThreadingTest (const HashInfo * hinfo, bool seedthread, bool verbose if (g_NCPU > 1) { #if defined(HAVE_THREADS) // Compute all the hashes in different random orders in threads - std::vector > threadhashes(g_NCPU, std::vector(reps * hashbytes)); + std::vector> threadhashes( g_NCPU, std::vector(reps * hashbytes)); std::thread t[g_NCPU]; for (int i = 0; i < g_NCPU; i++) { - t[i] = std::thread {hashthings,hinfo,seed,reps,i+1,seedthread,verbose,std::ref(keys),std::ref(threadhashes[i])}; + t[i] = std::thread { + hashthings, hinfo, seed, reps, i + 1, seedthread, verbose, std::ref(keys), std::ref(threadhashes[i]) + }; } for (int i = 0; i < g_NCPU; i++) { t[i].join(); @@ -468,7 +465,7 @@ static bool ThreadingTest (const HashInfo * hinfo, bool seedthread, bool verbose for (int j = 0; j < reps; j++) { if (memcmp(&mainhashes[j * hashbytes], &threadhashes[i][j * hashbytes], hashbytes) != 0) { maybeprintf("\nMismatch between main process and thread #%d at index %d\n", i, j); - if (verbose) { ExtBlob(&mainhashes[j * hashbytes] , hashbytes).printhex(" main :"); } + if (verbose) { ExtBlob(&mainhashes[j * hashbytes], hashbytes).printhex(" main :"); } if (verbose) { ExtBlob(&threadhashes[i][j * hashbytes], hashbytes).printhex(" thread :"); } result = false; break; // Only breaks out of j loop @@ -476,7 +473,7 @@ static bool ThreadingTest (const HashInfo * hinfo, bool seedthread, bool verbose } } - if(result == false) { + if (result == false) { printf("%s", verbose ? " FAIL !!!!!\n\n" : " ... FAIL"); } else { printf("%s", verbose ? " PASS\n" : " ... 
pass"); @@ -500,142 +497,136 @@ static bool ThreadingTest (const HashInfo * hinfo, bool seedthread, bool verbose //---------------------------------------------------------------------------- // Appending zero bytes to a key should always cause it to produce a different // hash value -bool AppendedZeroesTest (const HashInfo * hinfo, const seed_t seed, bool verbose) { - Rand r(173994); +bool AppendedZeroesTest( const HashInfo * hinfo, const seed_t seed, bool verbose ) { + Rand r( 173994 ); - const HashFn hash = hinfo->hashFn(g_hashEndian); - const int hashbytes = hinfo->bits / 8; - bool result = true; + const HashFn hash = hinfo->hashFn(g_hashEndian); + const int hashbytes = hinfo->bits / 8; + bool result = true; - maybeprintf("Running append zeroes test "); + maybeprintf("Running append zeroes test "); - for(int rep = 0; rep < 100; rep++) - { - if(rep % 10 == 0) maybeprintf("."); + for (int rep = 0; rep < 100; rep++) { + if (rep % 10 == 0) { maybeprintf("."); } - unsigned char key[256]; - memset(key,0,sizeof(key)); + unsigned char key[256]; + memset(key, 0, sizeof(key)); - r.rand_p(key,32); - // This test can halt early, so don't add input bytes to the VCode. + r.rand_p(key, 32); + // This test can halt early, so don't add input bytes to the VCode. 
- std::vector> hashes; + std::vector> hashes; - for(int i = 0; i < 32; i++) { - std::vector h(hashbytes); - hash(key,32+i,seed,&h[0]); - hashes.push_back(h); - addVCodeOutput(&h[0], hashbytes); - } + for (int i = 0; i < 32; i++) { + std::vector h( hashbytes ); + hash(key, 32 + i, seed, &h[0]); + hashes.push_back(h); + addVCodeOutput(&h[0], hashbytes); + } - // Sort in little-endian order, for human friendliness - std::sort(hashes.begin(), hashes.end(), - [](const std::vector& a, const std::vector& b) { + // Sort in little-endian order, for human friendliness + std::sort(hashes.begin(), hashes.end(), []( const std::vector & a, const std::vector & b ) { for (int i = a.size() - 1; i >= 0; i--) { if (a[i] != b[i]) { return a[i] < b[i]; } } return false; - } ); + }); - for(int i = 1; i < 32; i++) { - if (memcmp(&hashes[i][0], &hashes[i-1][0], hashbytes) == 0) { - result = false; - goto done; + for (int i = 1; i < 32; i++) { + if (memcmp(&hashes[i][0], &hashes[i - 1][0], hashbytes) == 0) { + result = false; + goto done; + } } } - } - done: - if(result == false) { - printf("%s", verbose ? " FAIL !!!!!\n" : " ... FAIL"); - } else { - printf("%s", verbose ? " PASS\n" : " ... pass"); - } + done: + if (result == false) { + printf("%s", verbose ? " FAIL !!!!!\n" : " ... FAIL"); + } else { + printf("%s", verbose ? " PASS\n" : " ... 
pass"); + } - recordTestResult(result, "Sanity", "Append zeroes"); + recordTestResult(result, "Sanity", "Append zeroes"); - addVCodeResult(result); + addVCodeResult(result); - return result; + return result; } //---------------------------------------------------------------------------- // Prepending zero bytes to a key should also always cause it to // produce a different hash value -bool PrependedZeroesTest (const HashInfo * hinfo, const seed_t seed, bool verbose) { - Rand r(534281); +bool PrependedZeroesTest( const HashInfo * hinfo, const seed_t seed, bool verbose ) { + Rand r( 534281 ); - const HashFn hash = hinfo->hashFn(g_hashEndian); - const int hashbytes = hinfo->bits / 8; - bool result = true; + const HashFn hash = hinfo->hashFn(g_hashEndian); + const int hashbytes = hinfo->bits / 8; + bool result = true; - maybeprintf("Running prepend zeroes test "); + maybeprintf("Running prepend zeroes test "); - for(int rep = 0; rep < 100; rep++) - { - if(rep % 10 == 0) maybeprintf("."); + for (int rep = 0; rep < 100; rep++) { + if (rep % 10 == 0) { maybeprintf("."); } - unsigned char key[256]; - memset(key,0,sizeof(key)); + unsigned char key[256]; + memset(key, 0, sizeof(key)); - r.rand_p(key+32,32); - // This test can halt early, so don't add input bytes to the VCode. + r.rand_p(key + 32, 32); + // This test can halt early, so don't add input bytes to the VCode. 
- std::vector> hashes; + std::vector> hashes; - for(int i = 0; i < 32; i++) { - std::vector h(hashbytes); - hash(key+32-i,32+i,seed,&h[0]); - hashes.push_back(h); - addVCodeOutput(&h[0], hashbytes); - } + for (int i = 0; i < 32; i++) { + std::vector h( hashbytes ); + hash(key + 32 - i, 32 + i, seed, &h[0]); + hashes.push_back(h); + addVCodeOutput(&h[0], hashbytes); + } - // Sort in little-endian order, for human friendliness - std::sort(hashes.begin(), hashes.end(), - [](const std::vector& a, const std::vector& b) { + // Sort in little-endian order, for human friendliness + std::sort(hashes.begin(), hashes.end(), []( const std::vector & a, const std::vector & b ) { for (int i = a.size() - 1; i >= 0; i--) { if (a[i] != b[i]) { return a[i] < b[i]; } } return false; - } ); + }); - for(int i = 1; i < 32; i++) { - if (memcmp(&hashes[i][0], &hashes[i-1][0], hashbytes) == 0) { - result = false; - goto done; + for (int i = 1; i < 32; i++) { + if (memcmp(&hashes[i][0], &hashes[i - 1][0], hashbytes) == 0) { + result = false; + goto done; + } } } - } - done: - if(result == false) { - printf("%s", verbose ? " FAIL !!!!!\n" : " ... FAIL"); - } else { - printf("%s", verbose ? " PASS\n" : " ... pass"); - } + done: + if (result == false) { + printf("%s", verbose ? " FAIL !!!!!\n" : " ... FAIL"); + } else { + printf("%s", verbose ? " PASS\n" : " ... 
pass"); + } - recordTestResult(result, "Sanity", "Prepend zeroes"); + recordTestResult(result, "Sanity", "Prepend zeroes"); - addVCodeResult(result); + addVCodeResult(result); - return result; + return result; } -void SanityTestHeader(void) { - printf("%-25s %13s %13s %13s\n", - "Name", " Sanity 1+2 ", " Zeroes ", " Thread-safe "); - printf("%-25s %13s %13s %13s\n", - "-------------------------", "-------------", - "-------------", "-------------"); +void SanityTestHeader( void ) { + printf("%-25s %13s %13s %13s\n", "Name", " Sanity 1+2 ", " Zeroes ", " Thread-safe "); + printf("%-25s %13s %13s %13s\n", "-------------------------", + "-------------", "-------------", "-------------"); } -bool SanityTest(const HashInfo * hinfo, bool oneline) { - bool verbose = !oneline; - bool result = true; +bool SanityTest( const HashInfo * hinfo, bool oneline ) { + bool verbose = !oneline; + bool result = true; bool threadresult = true; if (oneline) { printf("%-25s ", hinfo->name); } @@ -650,7 +641,7 @@ bool SanityTest(const HashInfo * hinfo, bool oneline) { // These should be last, as they re-seed threadresult &= ThreadingTest(hinfo, false, verbose); - threadresult &= ThreadingTest(hinfo, true, verbose); + threadresult &= ThreadingTest(hinfo, true , verbose); // If threading test cannot give meaningful results, then don't // bother printing them out. :) But still run them above so the @@ -671,7 +662,7 @@ bool SanityTest(const HashInfo * hinfo, bool oneline) { printf("%sSANITY_FAILS unset, but hash failed", oneline ? "\t" : ""); } - out: + out: if (oneline) { printf("\n"); } diff --git a/tests/SanityTest.h b/tests/SanityTest.h index fb9f5b1f..7347f374 100644 --- a/tests/SanityTest.h +++ b/tests/SanityTest.h @@ -46,5 +46,5 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. 
*/ -bool SanityTest(const HashInfo * hinfo, bool oneline = false); -void SanityTestHeader(void); +bool SanityTest( const HashInfo * hinfo, bool oneline = false ); +void SanityTestHeader( void ); diff --git a/tests/SeedTest.cpp b/tests/SeedTest.cpp index 283e2fd0..2af3c57c 100644 --- a/tests/SeedTest.cpp +++ b/tests/SeedTest.cpp @@ -49,7 +49,7 @@ #include "Platform.h" #include "Hashinfo.h" #include "TestGlobals.h" -#include "Stats.h" // For chooseUpToK +#include "Stats.h" // For chooseUpToK #include "Analyze.h" #include "Instantiate.h" #include "VCode.h" @@ -61,122 +61,122 @@ //----------------------------------------------------------------------------- // Keyset 'Seed' - hash "the quick brown fox..." using different seeds -template < typename hashtype, uint32_t seedbits, bool bigseed > -static bool SeedTestImpl(const HashInfo * hinfo, bool drawDiagram) { - assert(seedbits <= 31); - const HashFn hash = hinfo->hashFn(g_hashEndian); - const int totalkeys = 1 << seedbits; - const int hibits = seedbits >> 1; - const int lobits = seedbits - hibits; - const int shiftbits = bigseed ? (64 - hibits) : (32 - hibits); +template +static bool SeedTestImpl( const HashInfo * hinfo, bool drawDiagram ) { + assert(seedbits <= 31); + const HashFn hash = hinfo->hashFn(g_hashEndian); + const int totalkeys = 1 << seedbits; + const int hibits = seedbits >> 1; + const int lobits = seedbits - hibits; + const int shiftbits = bigseed ? 
(64 - hibits) : (32 - hibits); - printf("Keyset 'Seed' - %d keys\n", totalkeys); + printf("Keyset 'Seed' - %d keys\n", totalkeys); - const char text[64] = "The quick brown fox jumps over the lazy dog"; - const int len = (int)strlen(text); + const char text[64] = "The quick brown fox jumps over the lazy dog"; + const int len = (int)strlen(text); - addVCodeInput(text, len); - addVCodeInput(totalkeys); + addVCodeInput(text , len); + addVCodeInput(totalkeys); - //---------- + //---------- - std::vector hashes; + std::vector hashes; - hashes.resize(totalkeys); + hashes.resize(totalkeys); - for(seed_t i = 0; i < (1 << hibits); i++) { - for(seed_t j = 0; j < (1 << lobits); j++) { - const seed_t seed = (i << shiftbits) + j; - const seed_t hseed = hinfo->Seed(seed, true); - hash(text, len, hseed, &hashes[(i << lobits) + j]); - } - } + for (seed_t i = 0; i < (1 << hibits); i++) { + for (seed_t j = 0; j < (1 << lobits); j++) { + const seed_t seed = (i << shiftbits) + j; + const seed_t hseed = hinfo->Seed(seed, true); + hash(text, len, hseed, &hashes[(i << lobits) + j]); + } + } - bool result = TestHashList(hashes,drawDiagram); - printf("\n"); + bool result = TestHashList(hashes, drawDiagram); + printf("\n"); - recordTestResult(result, "Seed", "Seq"); + recordTestResult(result, "Seed", "Seq"); - addVCodeResult(result); + addVCodeResult(result); - return result; + return result; } //----------------------------------------------------------------------------- // Keyset 'SparseSeed' - hash "sphinx of black quartz..." using seeds with few // bits set/cleared -template < typename hashtype, bool bigseed > -static bool SparseSeedTestImpl(const HashInfo * hinfo, uint32_t maxbits, bool drawDiagram) { - assert(maxbits < 16); - const HashFn hash = hinfo->hashFn(g_hashEndian); - uint64_t totalkeys = 2 + 2*chooseUpToK(bigseed ? 
64 : 32, maxbits); - uint64_t cnt = 0; +template +static bool SparseSeedTestImpl( const HashInfo * hinfo, uint32_t maxbits, bool drawDiagram ) { + assert(maxbits < 16); + const HashFn hash = hinfo->hashFn(g_hashEndian); + uint64_t totalkeys = 2 + 2 * chooseUpToK(bigseed ? 64 : 32, maxbits); + uint64_t cnt = 0; - printf("Keyset 'SparseSeed' - %" PRId64 " keys\n", totalkeys); + printf("Keyset 'SparseSeed' - %" PRId64 " keys\n", totalkeys); - const char text[64] = "Sphinx of black quartz, judge my vow"; - const int len = (int)strlen(text); + const char text[64] = "Sphinx of black quartz, judge my vow"; + const int len = (int)strlen(text); - addVCodeInput(text, len); - addVCodeInput(totalkeys); + addVCodeInput(text , len); + addVCodeInput(totalkeys); - //---------- + //---------- - std::vector hashes; - hashes.resize(totalkeys); + std::vector hashes; + hashes.resize(totalkeys); - seed_t seed; + seed_t seed; - seed = hinfo->Seed(0, true); - hash(text, len, seed, &hashes[cnt++]); + seed = hinfo->Seed(0, true); + hash(text, len, seed, &hashes[cnt++]); - seed = hinfo->Seed(~0, true); - hash(text, len, seed, &hashes[cnt++]); + seed = hinfo->Seed(~0, true); + hash(text, len, seed, &hashes[cnt++]); - for(seed_t i = 1; i <= maxbits; i++) { - uint64_t seed = (UINT64_C(1) << i) - 1; - bool done; + for (seed_t i = 1; i <= maxbits; i++) { + uint64_t seed = (UINT64_C(1) << i) - 1; + bool done; - do { - seed_t hseed; - hseed = hinfo->Seed(seed, true); - hash(text, len, hseed, &hashes[cnt++]); + do { + seed_t hseed; + hseed = hinfo->Seed(seed, true); + hash(text, len, hseed, &hashes[cnt++]); - hseed = hinfo->Seed(~seed, true); - hash(text, len, hseed, &hashes[cnt++]); + hseed = hinfo->Seed(~seed, true); + hash(text, len, hseed, &hashes[cnt++]); - /* Next lexicographic bit pattern, from "Bit Twiddling Hacks" */ - uint64_t t = (seed | (seed - 1)) + 1; - seed = t | ((((t & -t) / (seed & -seed)) >> 1) - 1); - done = bigseed ? 
(seed == ~0) : ((seed >> 32) != 0); - } while (!done); - } + /* Next lexicographic bit pattern, from "Bit Twiddling Hacks" */ + uint64_t t = (seed | (seed - 1)) + 1; + seed = t | ((((t & -t) / (seed & -seed)) >> 1) - 1); + done = bigseed ? (seed == ~0) : ((seed >> 32) != 0); + } while (!done); + } - bool result = TestHashList(hashes,drawDiagram); - printf("\n"); + bool result = TestHashList(hashes, drawDiagram); + printf("\n"); - recordTestResult(result, "Seed", "Sparse"); + recordTestResult(result, "Seed", "Sparse"); - addVCodeResult(result); + addVCodeResult(result); - return result; + return result; } //----------------------------------------------------------------------------- -template < typename hashtype > -bool SeedTest(const HashInfo * hinfo, const bool verbose) { +template +bool SeedTest( const HashInfo * hinfo, const bool verbose ) { bool result = true; printf("[[[ Keyset 'Seed' Tests ]]]\n\n"); if (hinfo->is32BitSeed()) { - result &= SeedTestImpl( hinfo, verbose ); - result &= SparseSeedTestImpl( hinfo, 7, verbose ); + result &= SeedTestImpl (hinfo , verbose); + result &= SparseSeedTestImpl(hinfo, 7, verbose); } else { - result &= SeedTestImpl( hinfo, verbose ); - result &= SparseSeedTestImpl( hinfo, 5, verbose ); + result &= SeedTestImpl (hinfo , verbose); + result &= SparseSeedTestImpl(hinfo, 5, verbose); } printf("%s\n", result ? "" : g_failstr); diff --git a/tests/SeedTest.h b/tests/SeedTest.h index 93a62572..6d8546ad 100644 --- a/tests/SeedTest.h +++ b/tests/SeedTest.h @@ -47,5 +47,5 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -template < typename hashtype > -bool SeedTest(const HashInfo * info, const bool verbose); +template +bool SeedTest( const HashInfo * info, const bool verbose ); diff --git a/tests/SparseKeysetTest.cpp b/tests/SparseKeysetTest.cpp index a4d64d16..fa63a9b7 100644 --- a/tests/SparseKeysetTest.cpp +++ b/tests/SparseKeysetTest.cpp @@ -58,12 +58,11 @@ //----------------------------------------------------------------------------- // Keyset 'Sparse' - generate all possible N-bit keys with up to K bits set -template < typename keytype, typename hashtype > -static void SparseKeygenRecurse(HashFn hash, const seed_t seed, - int start, int bitsleft, bool inclusive, - keytype & k, std::vector & hashes) { +template +static void SparseKeygenRecurse( HashFn hash, const seed_t seed, int start, int bitsleft, + bool inclusive, keytype & k, std::vector & hashes ) { const int nbytes = sizeof(keytype); - const int nbits = nbytes * 8; + const int nbits = nbytes * 8; hashtype h; @@ -77,7 +76,7 @@ static void SparseKeygenRecurse(HashFn hash, const seed_t seed, } if (bitsleft > 1) { - SparseKeygenRecurse(hash, seed, i+1, bitsleft-1, inclusive, k, hashes); + SparseKeygenRecurse(hash, seed, i + 1, bitsleft - 1, inclusive, k, hashes); } k.flipbit(i); @@ -85,98 +84,96 @@ static void SparseKeygenRecurse(HashFn hash, const seed_t seed, } //---------- -template < int keybits, typename hashtype > -static bool SparseKeyImpl(HashFn hash, const seed_t seed, - const int setbits, bool inclusive, - bool testColl, bool testDist, bool drawDiagram) { - printf("Keyset 'Sparse' - %d-bit keys with %s %d bits set - ",keybits, - inclusive ? "up to" : "exactly", setbits); +template +static bool SparseKeyImpl( HashFn hash, const seed_t seed, const int setbits, bool inclusive, + bool testColl, bool testDist, bool drawDiagram ) { + printf("Keyset 'Sparse' - %d-bit keys with %s %d bits set - ", keybits, inclusive ? 
"up to" : "exactly", setbits); - typedef Blob keytype; + typedef Blob keytype; - std::vector hashes; + std::vector hashes; - keytype k; - memset(&k,0,sizeof(k)); + keytype k; + memset(&k, 0, sizeof(k)); - if (inclusive) { - hashes.resize(1); - hash(&k, sizeof(keytype), seed, &hashes[0]); - } + if (inclusive) { + hashes.resize(1); + hash(&k, sizeof(keytype), seed, &hashes[0]); + } - SparseKeygenRecurse(hash,seed,0,setbits,inclusive,k,hashes); + SparseKeygenRecurse(hash, seed, 0, setbits, inclusive, k, hashes); - printf("%d keys\n",(int)hashes.size()); + printf("%d keys\n", (int)hashes.size()); - bool result = TestHashList(hashes,drawDiagram,testColl,testDist); - printf("\n"); + bool result = TestHashList(hashes, drawDiagram, testColl, testDist); + printf("\n" ); - recordTestResult(result, "Sparse", keybits); + recordTestResult(result, "Sparse", keybits); - addVCodeResult(result); + addVCodeResult(result); - return result; + return result; } //----------------------------------------------------------------------------- -template < typename hashtype > -bool SparseKeyTest(const HashInfo * hinfo, const bool verbose, const bool extra) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - bool result = true; +template +bool SparseKeyTest( const HashInfo * hinfo, const bool verbose, const bool extra ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + bool result = true; printf("[[[ Keyset 'Sparse' Tests ]]]\n\n"); const seed_t seed = hinfo->Seed(g_seed); - result &= SparseKeyImpl< 16,hashtype>(hash,seed,9,true,true,true,verbose); - result &= SparseKeyImpl< 24,hashtype>(hash,seed,8,true,true,true,verbose); - result &= SparseKeyImpl< 32,hashtype>(hash,seed,7,true,true,true,verbose); - result &= SparseKeyImpl< 40,hashtype>(hash,seed,6,true,true,true,verbose); - result &= SparseKeyImpl< 48,hashtype>(hash,seed,6,true,true,true,verbose); - result &= SparseKeyImpl< 56,hashtype>(hash,seed,5,true,true,true,verbose); - result &= SparseKeyImpl< 
64,hashtype>(hash,seed,5,true,true,true,verbose); - result &= SparseKeyImpl< 72,hashtype>(hash,seed,5,true,true,true,verbose); - result &= SparseKeyImpl< 96,hashtype>(hash,seed,4,true,true,true,verbose); + result &= SparseKeyImpl<16, hashtype>(hash, seed, 9, true, true, true, verbose); + result &= SparseKeyImpl<24, hashtype>(hash, seed, 8, true, true, true, verbose); + result &= SparseKeyImpl<32, hashtype>(hash, seed, 7, true, true, true, verbose); + result &= SparseKeyImpl<40, hashtype>(hash, seed, 6, true, true, true, verbose); + result &= SparseKeyImpl<48, hashtype>(hash, seed, 6, true, true, true, verbose); + result &= SparseKeyImpl<56, hashtype>(hash, seed, 5, true, true, true, verbose); + result &= SparseKeyImpl<64, hashtype>(hash, seed, 5, true, true, true, verbose); + result &= SparseKeyImpl<72, hashtype>(hash, seed, 5, true, true, true, verbose); + result &= SparseKeyImpl<96, hashtype>(hash, seed, 4, true, true, true, verbose); if (extra) { - result &= SparseKeyImpl< 112,hashtype>(hash,seed,4,true,true,true,verbose); - result &= SparseKeyImpl< 128,hashtype>(hash,seed,4,true,true,true,verbose); - result &= SparseKeyImpl< 144,hashtype>(hash,seed,4,true,true,true,verbose); + result &= SparseKeyImpl<112, hashtype>(hash, seed, 4, true, true, true, verbose); + result &= SparseKeyImpl<128, hashtype>(hash, seed, 4, true, true, true, verbose); + result &= SparseKeyImpl<144, hashtype>(hash, seed, 4, true, true, true, verbose); } - result &= SparseKeyImpl< 160,hashtype>(hash,seed,4,true,true,true,verbose); + result &= SparseKeyImpl<160, hashtype>(hash, seed, 4, true, true, true, verbose); if (extra) { - result &= SparseKeyImpl< 192,hashtype>(hash,seed,4,true,true,true,verbose); + result &= SparseKeyImpl<192, hashtype>(hash, seed, 4, true, true, true, verbose); } - result &= SparseKeyImpl< 256,hashtype>(hash,seed,3,true,true,true,verbose); + result &= SparseKeyImpl<256, hashtype>(hash, seed, 3, true, true, true, verbose); if (extra) { - result &= SparseKeyImpl< 
288,hashtype>(hash,seed,3,true,true,true,verbose); - result &= SparseKeyImpl< 320,hashtype>(hash,seed,3,true,true,true,verbose); - result &= SparseKeyImpl< 384,hashtype>(hash,seed,3,true,true,true,verbose); - result &= SparseKeyImpl< 448,hashtype>(hash,seed,3,true,true,true,verbose); + result &= SparseKeyImpl<288, hashtype>(hash, seed, 3, true, true, true, verbose); + result &= SparseKeyImpl<320, hashtype>(hash, seed, 3, true, true, true, verbose); + result &= SparseKeyImpl<384, hashtype>(hash, seed, 3, true, true, true, verbose); + result &= SparseKeyImpl<448, hashtype>(hash, seed, 3, true, true, true, verbose); } else if (hinfo->bits > 64) { goto END_Sparse; } - result &= SparseKeyImpl< 512,hashtype>(hash,seed,3,true,true,true,verbose); + result &= SparseKeyImpl<512, hashtype>(hash, seed, 3, true, true, true, verbose); if (extra) { - result &= SparseKeyImpl< 640,hashtype>(hash,seed,3,true,true,true,verbose); - result &= SparseKeyImpl< 768,hashtype>(hash,seed,3,true,true,true,verbose); - result &= SparseKeyImpl< 896,hashtype>(hash,seed,2,true,true,true,verbose); + result &= SparseKeyImpl<640, hashtype>(hash, seed, 3, true, true, true, verbose); + result &= SparseKeyImpl<768, hashtype>(hash, seed, 3, true, true, true, verbose); + result &= SparseKeyImpl<896, hashtype>(hash, seed, 2, true, true, true, verbose); } - result &= SparseKeyImpl<1024,hashtype>(hash,seed,2,true,true,true,verbose); + result &= SparseKeyImpl<1024, hashtype>(hash, seed, 2, true, true, true, verbose); if (extra) { - result &= SparseKeyImpl<1280,hashtype>(hash,seed,2,true,true,true,verbose); - result &= SparseKeyImpl<1536,hashtype>(hash,seed,2,true,true,true,verbose); + result &= SparseKeyImpl<1280, hashtype>(hash, seed, 2, true, true, true, verbose); + result &= SparseKeyImpl<1536, hashtype>(hash, seed, 2, true, true, true, verbose); } - result &= SparseKeyImpl<2048,hashtype>(hash,seed,2,true,true,true,verbose); + result &= SparseKeyImpl<2048, hashtype>(hash, seed, 2, true, true, true, 
verbose); if (extra) { - result &= SparseKeyImpl<3072,hashtype>(hash,seed,2,true,true,true,verbose); - result &= SparseKeyImpl<4096,hashtype>(hash,seed,2,true,true,true,verbose); - result &= SparseKeyImpl<6144,hashtype>(hash,seed,2,true,true,true,verbose); - result &= SparseKeyImpl<8192,hashtype>(hash,seed,2,true,true,true,verbose); - result &= SparseKeyImpl<9992,hashtype>(hash,seed,2,true,true,true,verbose); + result &= SparseKeyImpl<3072, hashtype>(hash, seed, 2, true, true, true, verbose); + result &= SparseKeyImpl<4096, hashtype>(hash, seed, 2, true, true, true, verbose); + result &= SparseKeyImpl<6144, hashtype>(hash, seed, 2, true, true, true, verbose); + result &= SparseKeyImpl<8192, hashtype>(hash, seed, 2, true, true, true, verbose); + result &= SparseKeyImpl<9992, hashtype>(hash, seed, 2, true, true, true, verbose); } - END_Sparse: + END_Sparse: printf("%s\n", result ? "" : g_failstr); return result; diff --git a/tests/SparseKeysetTest.h b/tests/SparseKeysetTest.h index c3064ee6..676d2ca9 100644 --- a/tests/SparseKeysetTest.h +++ b/tests/SparseKeysetTest.h @@ -47,5 +47,5 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -template < typename hashtype > -bool SparseKeyTest(const HashInfo * info, const bool verbose, const bool extra); +template +bool SparseKeyTest( const HashInfo * info, const bool verbose, const bool extra ); diff --git a/tests/SpeedTest.cpp b/tests/SpeedTest.cpp index b2cf4d20..07bd185e 100644 --- a/tests/SpeedTest.cpp +++ b/tests/SpeedTest.cpp @@ -50,7 +50,7 @@ #include "Timing.h" #include "Hashinfo.h" #include "TestGlobals.h" -#include "Stats.h" // For FilterOutliers, CalcMean, CalcStdv +#include "Stats.h" // For FilterOutliers, CalcMean, CalcStdv #include "Random.h" #include "SpeedTest.h" @@ -61,7 +61,7 @@ constexpr int BULK_TRIALS = 2999; // Timings per hash for large (>=128b) keys constexpr int TINY_TRIALS = 200; // Timings per hash for small (<128b) keys -constexpr int TINY_SAMPLES = 15000;// Samples per timing run for small sizes +constexpr int TINY_SAMPLES = 15000; // Samples per timing run for small sizes //----------------------------------------------------------------------------- // This is functionally a speed test, and so will not inform VCodes, @@ -72,18 +72,17 @@ constexpr int TINY_SAMPLES = 15000;// Samples per timing run for small sizes // as possible, but that's hard to do portably. We'll try and get as close as // possible by marking the function as NEVER_INLINE (to keep the optimizer from // moving it) and marking the timing variables as "volatile register". 
-NEVER_INLINE static int64_t timehash(HashFn hash, const seed_t seed, - const void * const key, int len) { - volatile int64_t begin, end; - uint32_t temp[16]; +NEVER_INLINE static int64_t timehash( HashFn hash, const seed_t seed, const void * const key, int len ) { + volatile int64_t begin, end; + uint32_t temp[16]; - begin = timer_start(); + begin = timer_start(); - hash(key,len,seed,temp); + hash(key, len, seed, temp); - end = timer_end(); + end = timer_end(); - return end - begin; + return end - begin; } //----------------------------------------------------------------------------- @@ -109,177 +108,177 @@ NEVER_INLINE static int64_t timehash(HashFn hash, const seed_t seed, // x64 platforms, which leads to unfairly inflated cycle counts. // // WARNING: This assumes that at least 4 bytes can be written to key! -NEVER_INLINE static uint64_t timehash_small(HashFn hash, const seed_t seed, - uint8_t * const key, int len) { - const uint64_t incr = 0x1000001; - uint64_t maxi = incr * TINY_SAMPLES; - volatile unsigned long long int begin, end; - uint32_t hash_temp[16] = {0}; - - begin = timer_start(); - - for (uint64_t i = 0; i < maxi; i += incr) { - hash(key, len, seed, hash_temp); - // It's possible that even with this loop data dependency that - // hash invocations still would not be fully serialized. Another - // option is to add lfence instruction to enforce serialization - // at the CPU level. It's hard to say which one is the most - // realistic and sensible approach. 
- uint32_t j = i ^ hash_temp[0]; - memcpy(key, &j, 4); - } - - end = timer_end(); - - return end - begin; +NEVER_INLINE static uint64_t timehash_small( HashFn hash, const seed_t seed, uint8_t * const key, int len ) { + const uint64_t incr = 0x1000001; + uint64_t maxi = incr * TINY_SAMPLES; + volatile unsigned long long int begin, end; + uint32_t hash_temp[16] = { 0 }; + + begin = timer_start(); + + for (uint64_t i = 0; i < maxi; i += incr) { + hash(key, len, seed, hash_temp); + // It's possible that even with this loop data dependency that + // hash invocations still would not be fully serialized. Another + // option is to add lfence instruction to enforce serialization + // at the CPU level. It's hard to say which one is the most + // realistic and sensible approach. + uint32_t j = i ^ hash_temp[0]; + memcpy(key, &j, 4); + } + + end = timer_end(); + + return end - begin; } //----------------------------------------------------------------------------- double stddev; -static double SpeedTest(HashFn hash, seed_t seed, const int trials, - const int blocksize, const int align, - const int varysize, const int varyalign) { - Rand r(seed); - uint8_t *buf = new uint8_t[blocksize + 512]; // assumes (align + varyalign) <= 257 - uintptr_t t1 = reinterpret_cast(buf); - - t1 = (t1 + 255) & UINT64_C(0xFFFFFFFFFFFFFF00); - t1 += align; - - uint8_t * block = reinterpret_cast(t1); - - std::vector sizes; - if (varysize > 0) - { - sizes.reserve(trials); - for(int i = 0; i < trials; i++) - sizes.push_back(blocksize - varysize + (i % (varysize + 1))); - for(int i = trials - 1; i > 0; i--) - std::swap(sizes[i], sizes[r.rand_range(i + 1)]); - } - - std::vector alignments; - if (varyalign > 0) - { - alignments.reserve(trials); - for(int i = 0; i < trials; i++) - alignments.push_back((i + 1) % (varyalign + 1)); - for(int i = trials - 1; i > 0; i--) - std::swap(alignments[i], alignments[r.rand_range(i + 1)]); - } - - //---------- - - std::vector times; - times.reserve(trials); - - int 
testsize = blocksize; - for(int itrial = 0; itrial < trials; itrial++) - { - if (varysize > 0) - testsize = sizes[itrial]; - if (varyalign > 0) - block = reinterpret_cast(t1 + alignments[itrial]); - - r.rand_p(block,testsize); - - double t; - if (testsize < 128) { - t = (double)timehash_small(hash,seed,block,testsize)/(double)TINY_SAMPLES; - } else { - t = (double)timehash(hash,seed,block,testsize); +static double SpeedTest( HashFn hash, seed_t seed, const int trials, const int blocksize, + const int align, const int varysize, const int varyalign ) { + Rand r( seed ); + uint8_t * buf = new uint8_t[blocksize + 512]; // assumes (align + varyalign) <= 257 + uintptr_t t1 = reinterpret_cast(buf); + + t1 = (t1 + 255) & UINT64_C(0xFFFFFFFFFFFFFF00); + t1 += align; + + uint8_t * block = reinterpret_cast(t1); + + std::vector sizes; + if (varysize > 0) { + sizes.reserve(trials); + for (int i = 0; i < trials; i++) { + sizes.push_back(blocksize - varysize + (i % (varysize + 1))); + } + for (int i = trials - 1; i > 0; i--) { + std::swap(sizes[i], sizes[r.rand_range(i + 1)]); + } } - if(t > 0) times.push_back(t); - } + std::vector alignments; + if (varyalign > 0) { + alignments.reserve(trials); + for (int i = 0; i < trials; i++) { + alignments.push_back((i + 1) % (varyalign + 1)); + } + for (int i = trials - 1; i > 0; i--) { + std::swap(alignments[i], alignments[r.rand_range(i + 1)]); + } + } + + //---------- - delete [] buf; + std::vector times; + times.reserve(trials); - //---------- + int testsize = blocksize; + for (int itrial = 0; itrial < trials; itrial++) { + if (varysize > 0) { + testsize = sizes[itrial]; + } + if (varyalign > 0) { + block = reinterpret_cast(t1 + alignments[itrial]); + } - std::sort(times.begin(),times.end()); + r.rand_p(block, testsize); - FilterOutliers(times); - stddev = CalcStdv(times); + double t; + if (testsize < 128) { + t = (double)timehash_small(hash, seed, block, testsize) / (double)TINY_SAMPLES; + } else { + t = (double)timehash(hash , seed, 
block, testsize); + } - return CalcMean(times); + if (t > 0) { times.push_back(t); } + } + + delete [] buf; + + //---------- + + std::sort(times.begin(), times.end()); + + FilterOutliers(times); + stddev = CalcStdv(times); + + return CalcMean(times); } //----------------------------------------------------------------------------- // 256k blocks seem to give the best results. -static void BulkSpeedTest ( HashFn hash, seed_t seed, bool vary_align, bool vary_size) -{ - const int blocksize = 256 * 1024; - const int maxvary = vary_size ? 127 : 0; +static void BulkSpeedTest( HashFn hash, seed_t seed, bool vary_align, bool vary_size ) { + const int blocksize = 256 * 1024; + const int maxvary = vary_size ? 127 : 0; - if (vary_size) - printf("Bulk speed test - [%d, %d]-byte keys\n",blocksize - maxvary, blocksize); - else - printf("Bulk speed test - %d-byte keys\n",blocksize); - double sumbpc = 0.0; + if (vary_size) { + printf("Bulk speed test - [%d, %d]-byte keys\n", blocksize - maxvary, blocksize); + } else { + printf("Bulk speed test - %d-byte keys\n", blocksize); + } + double sumbpc = 0.0; - volatile double warmup_cycles = SpeedTest(hash,seed,BULK_TRIALS,blocksize,0,0,0); + volatile double warmup_cycles = SpeedTest(hash, seed, BULK_TRIALS, blocksize, 0, 0, 0); - for(int align = 7; align >= 0; align--) - { - double cycles = SpeedTest(hash,seed,BULK_TRIALS,blocksize,align,maxvary,0); + for (int align = 7; align >= 0; align--) { + double cycles = SpeedTest(hash, seed, BULK_TRIALS, blocksize, align, maxvary, 0); - double bestbpc = ((double)blocksize - ((double)maxvary / 2)) / cycles; + double bestbpc = ((double)blocksize - ((double)maxvary / 2)) / cycles; - double bestbps = (bestbpc * 3000000000.0 / 1048576.0); - printf("Alignment %2d - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n",align,bestbpc,bestbps); - sumbpc += bestbpc; - } - if (vary_align) - { - double cycles = SpeedTest(hash,seed,BULK_TRIALS,blocksize,0,maxvary,7); + double bestbps = (bestbpc * 3000000000.0 / 
1048576.0); + printf("Alignment %2d - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n", align, bestbpc, bestbps); + sumbpc += bestbpc; + } + if (vary_align) { + double cycles = SpeedTest(hash, seed, BULK_TRIALS, blocksize, 0, maxvary, 7); - double bestbpc = ((double)blocksize - ((double)maxvary / 2)) / cycles; + double bestbpc = ((double)blocksize - ((double)maxvary / 2)) / cycles; - double bestbps = (bestbpc * 3000000000.0 / 1048576.0); - printf("Alignment rnd - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n",bestbpc,bestbps); - // Deliberately not counted in the Average stat, so the two can be directly compared - } + double bestbps = (bestbpc * 3000000000.0 / 1048576.0); + printf("Alignment rnd - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n", bestbpc, bestbps); + // Deliberately not counted in the Average stat, so the two can be directly compared + } - sumbpc = sumbpc / 8.0; - printf("Average - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n",sumbpc,(sumbpc * 3000000000.0 / 1048576.0)); - fflush(NULL); + sumbpc = sumbpc / 8.0; + printf("Average - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n", sumbpc, (sumbpc * 3000000000.0 / 1048576.0)); + fflush(NULL); } //----------------------------------------------------------------------------- -static double TinySpeedTest ( HashFn hash, int maxkeysize, seed_t seed, bool verbose, bool include_vary ) -{ - double sum = 0.0; - - printf("Small key speed test - [1, %2d]-byte keys\n",maxkeysize); - - for(int i = 1; i <= maxkeysize; i++) - { - volatile int j = i; - double cycles = SpeedTest(hash,seed,TINY_TRIALS,j,0,0,0); - if(verbose) printf(" %2d-byte keys - %8.2f cycles/hash (%8.6f stdv%8.4f%%)\n",j,cycles,stddev,100.0*stddev/cycles); - sum += cycles; - } - if (include_vary) { - double cycles = SpeedTest(hash,seed,TINY_TRIALS*8,maxkeysize,0,maxkeysize-1,0); - if(verbose) printf(" rnd-byte keys - %8.2f cycles/hash (%8.6f stdv)\n", cycles,stddev); - // Deliberately not counted in the Average stat, so the two can be directly compared - } - - 
sum = sum / (double)maxkeysize; - printf("Average - %8.2f cycles/hash\n",sum); - - return sum; +static double TinySpeedTest( HashFn hash, int maxkeysize, seed_t seed, bool verbose, bool include_vary ) { + double sum = 0.0; + + printf("Small key speed test - [1, %2d]-byte keys\n", maxkeysize); + + for (int i = 1; i <= maxkeysize; i++) { + volatile int j = i; + double cycles = SpeedTest(hash, seed, TINY_TRIALS, j, 0, 0, 0); + if (verbose) { + printf(" %2d-byte keys - %8.2f cycles/hash (%8.6f stdv%8.4f%%)\n", + j, cycles, stddev, 100.0 * stddev / cycles); + } + sum += cycles; + } + if (include_vary) { + double cycles = SpeedTest(hash, seed, TINY_TRIALS * 8, maxkeysize, 0, maxkeysize - 1, 0); + if (verbose) { printf(" rnd-byte keys - %8.2f cycles/hash (%8.6f stdv)\n", cycles, stddev); } + // Deliberately not counted in the Average stat, so the two can be directly compared + } + + sum = sum / (double)maxkeysize; + printf("Average - %8.2f cycles/hash\n", sum); + + return sum; } //----------------------------------------------------------------------------- -bool SpeedTest(const HashInfo * hinfo) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - bool result = true; - Rand r(633692); +bool SpeedTest( const HashInfo * hinfo ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + bool result = true; + Rand r( 633692 ); printf("[[[ Speed Tests ]]]\n\n"); @@ -300,23 +299,20 @@ bool SpeedTest(const HashInfo * hinfo) { //----------------------------------------------------------------------------- // Does 5 different speed tests to try to summarize hash performance -void ShortSpeedTestHeader(void) { +void ShortSpeedTestHeader( void ) { printf("Bulk results are in bytes/cycle, short results are in cycles/hash\n\n"); - printf("%-25s %11s %18s %18s %18s %18s \n", - "Name", " Bulk ", " 1-8 bytes ", "9-16 bytes", - "17-24 bytes", "25-32 bytes"); - printf("%-25s %11s %18s %18s %18s %18s \n", - "-------------------------", "-----------", - "------------------", 
"------------------", - "------------------", "------------------"); + printf("%-25s %11s %18s %18s %18s %18s \n", "Name", " Bulk ", + " 1-8 bytes ", "9-16 bytes", "17-24 bytes", "25-32 bytes"); + printf("%-25s %11s %18s %18s %18s %18s \n", "-------------------------", "-----------", "------------------", + "------------------", "------------------", "------------------"); } -void ShortSpeedTest(const HashInfo * hinfo) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - bool result = true; - Rand r(321321); +void ShortSpeedTest( const HashInfo * hinfo ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + bool result = true; + Rand r( 321321 ); - const int maxvaryalign = 7; + const int maxvaryalign = 7; const int basealignoffset = 0; printf("%-25s", hinfo->name); @@ -324,16 +320,15 @@ void ShortSpeedTest(const HashInfo * hinfo) { const seed_t seed = hinfo->Seed(g_seed ^ r.rand_u64()); { - const int baselen = 256 * 1024; + const int baselen = 256 * 1024; const int maxvarylen = 127; // Do a warmup to get things into cache volatile double warmup_cycles = - SpeedTest(hash,seed,BULK_TRIALS,baselen,0,0,0); + SpeedTest(hash, seed, BULK_TRIALS, baselen, 0, 0, 0); // Do a bulk speed test, varying precise block size and alignment - double cycles = SpeedTest(hash, seed, BULK_TRIALS, - baselen, basealignoffset, maxvarylen, maxvaryalign); + double cycles = SpeedTest(hash, seed, BULK_TRIALS, baselen, basealignoffset, maxvarylen, maxvaryalign); double curbpc = ((double)baselen - ((double)maxvarylen / 2)) / cycles; printf(" %8.2f ", curbpc); } @@ -342,18 +337,18 @@ void ShortSpeedTest(const HashInfo * hinfo) { // group of 8 byte lengths (1-8, 9-16, 17-24, 25-31), varying the // alignment during each test. 
for (int i = 1; i <= 4; i++) { - const int baselen = i * 8; - double cycles = 0.0; - double worstdevpct = 0.0; + const int baselen = i * 8; + double cycles = 0.0; + double worstdevpct = 0.0; for (int j = 0; j < 8; j++) { - double curcyc = SpeedTest(hash, seed, TINY_TRIALS, - baselen + j, basealignoffset, 0, maxvaryalign); - double devpct = 100.0*stddev/curcyc; + double curcyc = SpeedTest(hash, seed, TINY_TRIALS, baselen + j, basealignoffset, 0, maxvaryalign); + double devpct = 100.0 * stddev / curcyc; cycles += curcyc; - if (worstdevpct < devpct) + if (worstdevpct < devpct) { worstdevpct = devpct; + } } - printf(" %7.2f [%5.3f] ", cycles/8.0, worstdevpct); + printf(" %7.2f [%5.3f] ", cycles / 8.0, worstdevpct); } printf("\n"); diff --git a/tests/SpeedTest.h b/tests/SpeedTest.h index 44062814..f6551c7d 100644 --- a/tests/SpeedTest.h +++ b/tests/SpeedTest.h @@ -43,6 +43,6 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ -bool SpeedTest(const HashInfo * info); -void ShortSpeedTest(const HashInfo * hinfo); -void ShortSpeedTestHeader(void); +bool SpeedTest( const HashInfo * info ); +void ShortSpeedTest( const HashInfo * hinfo ); +void ShortSpeedTestHeader( void ); diff --git a/tests/TextKeysetTest.cpp b/tests/TextKeysetTest.cpp index 7e7f676d..37e3e05f 100644 --- a/tests/TextKeysetTest.cpp +++ b/tests/TextKeysetTest.cpp @@ -66,173 +66,171 @@ // where "core" consists of all possible combinations of the given character // set of length N. 
-template < typename hashtype > -static bool TextKeyImpl(HashFn hash, const seed_t seed, const char * prefix, const char * coreset, const int corelen, const char * suffix, bool drawDiagram ) -{ - const int prefixlen = (int)strlen(prefix); - const int suffixlen = (int)strlen(suffix); - const int corecount = (int)strlen(coreset); - - const int keybytes = prefixlen + corelen + suffixlen; - long keycount = (long)pow(double(corecount),double(corelen)); - if (keycount > INT32_MAX / 8) - keycount = INT32_MAX / 8; - - uint8_t * key = new uint8_t[std::min(keybytes+1, 64)]; - memcpy(key,prefix,prefixlen); - memset(key+prefixlen, 'X', corelen); - memcpy(key+prefixlen+corelen,suffix,suffixlen); - key[keybytes] = 0; - - printf("Keyset 'Text' - keys of form \"%s\" - %ld keys\n", key, keycount); - - //---------- - - std::vector hashes; - hashes.resize(keycount); - - for(int i = 0; i < (int)keycount; i++) - { - int t = i; - - for(int j = 0; j < corelen; j++) - { - key[prefixlen+j] = coreset[t % corecount]; t /= corecount; +template +static bool TextKeyImpl( HashFn hash, const seed_t seed, const char * prefix, const char * coreset, + const int corelen, const char * suffix, bool drawDiagram ) { + const int prefixlen = (int)strlen(prefix); + const int suffixlen = (int)strlen(suffix); + const int corecount = (int)strlen(coreset); + + const int keybytes = prefixlen + corelen + suffixlen; + long keycount = (long)pow(double(corecount), double(corelen)); + + if (keycount > INT32_MAX / 8) { + keycount = INT32_MAX / 8; } - hash(key,keybytes,seed,&hashes[i]); - addVCodeInput(key, keybytes); - } + uint8_t * key = new uint8_t[std::min(keybytes + 1, 64)]; + memcpy(key, prefix, prefixlen); + memset(key + prefixlen, 'X', corelen); + memcpy(key + prefixlen + corelen, suffix, suffixlen); + key[keybytes] = 0; - //---------- - bool result = TestHashList(hashes,drawDiagram); - printf("\n"); + printf("Keyset 'Text' - keys of form \"%s\" - %ld keys\n", key, keycount); - memset(key+prefixlen, 'X', 
corelen); - recordTestResult(result, "Text", (const char *)key); + //---------- - addVCodeResult(result); + std::vector hashes; + hashes.resize(keycount); - delete [] key; + for (int i = 0; i < (int)keycount; i++) { + int t = i; - return result; + for (int j = 0; j < corelen; j++) { + key[prefixlen + j] = coreset[t % corecount]; t /= corecount; + } + + hash(key, keybytes, seed, &hashes[i]); + addVCodeInput(key, keybytes); + } + + //---------- + bool result = TestHashList(hashes, drawDiagram); + printf("\n"); + + memset(key + prefixlen, 'X', corelen); + recordTestResult(result, "Text", (const char *)key); + + addVCodeResult(result); + + delete [] key; + + return result; } //----------------------------------------------------------------------------- // Keyset 'Words' - pick random chars from coreset (alnum or password chars) -template < typename hashtype > -static bool WordsKeyImpl(HashFn hash, const seed_t seed, - const long keycount, const int minlen, const int maxlen, - const char * coreset, const char* name, bool drawDiagram) { - const int corecount = (int)strlen(coreset); - printf("Keyset 'Words' - %d-%d random chars from %s charset - %ld keys\n", minlen, maxlen, name, keycount); - assert (minlen >= 0); - assert (maxlen > minlen); - - std::unordered_set words; // need to be unique, otherwise we report collisions - std::vector hashes; - hashes.resize(keycount); - Rand r(483723); - - char* key = new char[std::min(maxlen+1, 64)]; - std::string key_str; - - for(long i = 0; i < keycount; i++) - { - const int len = minlen + (r.rand_u32() % (maxlen - minlen)); - key[len] = 0; - for(int j = 0; j < len; j++) - { - key[j] = coreset[r.rand_u32() % corecount]; - } - key_str = key; - if (words.count(key_str) > 0) { // not unique - i--; - continue; - } - words.insert(key_str); - - hash(key, len, seed, &hashes[i]); - addVCodeInput(key, len); +template +static bool WordsKeyImpl( HashFn hash, const seed_t seed, const long keycount, const int minlen, + const int maxlen, const 
char * coreset, const char * name, bool drawDiagram ) { + const int corecount = (int)strlen(coreset); + + printf("Keyset 'Words' - %d-%d random chars from %s charset - %ld keys\n", minlen, maxlen, name, keycount); + assert(minlen >= 0 ); + assert(maxlen > minlen); + + std::unordered_set words; // need to be unique, otherwise we report collisions + std::vector hashes; + hashes.resize(keycount); + Rand r( 483723 ); + + char * key = new char[std::min(maxlen + 1, 64)]; + std::string key_str; + + for (long i = 0; i < keycount; i++) { + const int len = minlen + (r.rand_u32() % (maxlen - minlen)); + key[len] = 0; + for (int j = 0; j < len; j++) { + key[j] = coreset[r.rand_u32() % corecount]; + } + key_str = key; + if (words.count(key_str) > 0) { // not unique + i--; + continue; + } + words.insert(key_str); + + hash(key, len, seed, &hashes[i]); + addVCodeInput(key, len); #if 0 && defined DEBUG - uint64_t h; - memcpy(&h, &hashes[i], std::max(sizeof(hashtype),8)); - printf("%d %s %lx\n", i, (char*)key, h); + uint64_t h; + memcpy(&h, &hashes[i], std::max(sizeof(hashtype), 8)); + printf("%d %s %lx\n", i, (char *)key, h); #endif - } - delete [] key; + } + delete [] key; - //---------- - bool result = TestHashList(hashes,drawDiagram); - printf("\n"); + //---------- + bool result = TestHashList(hashes, drawDiagram); + printf("\n"); - recordTestResult(result, "Text", name); + recordTestResult(result, "Text", name); - addVCodeResult(result); + addVCodeResult(result); - return result; + return result; } -template < typename hashtype > -static bool WordsStringImpl(HashFn hash, const seed_t seed, - std::vector & words, bool drawDiagram) { - long wordscount = words.size(); - printf("Keyset 'Words' - dictionary words - %ld keys\n", wordscount); - - std::unordered_set wordset; // need to be unique, otherwise we report collisions - std::vector hashes; - hashes.resize(wordscount); - Rand r(483723); - - for(int i = 0; i < (int)wordscount; i++) { - if (wordset.count(words[i]) > 0) { // not 
unique - i--; - continue; +template +static bool WordsStringImpl( HashFn hash, const seed_t seed, std::vector & words, bool drawDiagram ) { + long wordscount = words.size(); + + printf("Keyset 'Words' - dictionary words - %ld keys\n", wordscount); + + std::unordered_set wordset; // need to be unique, otherwise we report collisions + std::vector hashes; + hashes.resize(wordscount); + Rand r( 483723 ); + + for (int i = 0; i < (int)wordscount; i++) { + if (wordset.count(words[i]) > 0) { // not unique + i--; + continue; + } + wordset.insert(words[i]); + const int len = words[i].length(); + const char * key = words[i].c_str(); + hash(key, len, seed, &hashes[i]); + addVCodeInput(key, len); } - wordset.insert(words[i]); - const int len = words[i].length(); - const char *key = words[i].c_str(); - hash(key, len, seed, &hashes[i]); - addVCodeInput(key, len); - } - //---------- - bool result = TestHashList(hashes,drawDiagram); - printf("\n"); + //---------- + bool result = TestHashList(hashes, drawDiagram); + printf("\n"); - recordTestResult(result, "Text", "dictionary"); + recordTestResult(result, "Text", "dictionary"); - addVCodeResult(result); + addVCodeResult(result); - return result; + return result; } //----------------------------------------------------------------------------- -template < typename hashtype > -bool TextKeyTest(const HashInfo * hinfo, const bool verbose) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - const char * alnum = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; +template +bool TextKeyTest( const HashInfo * hinfo, const bool verbose ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + const char * alnum = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; const char * passwordchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" - ".,!?:;-+=()<>/|\"'@#$%&*_^"; + ".,!?:;-+=()<>/|\"'@#$%&*_^"; bool result = true; printf("[[[ Keyset 'Text' Tests ]]]\n\n"); const seed_t seed = 
hinfo->Seed(g_seed); - result &= TextKeyImpl(hash, seed, "Foo", alnum, 4, "Bar", verbose ); - result &= TextKeyImpl(hash, seed, "FooBar", alnum, 4, "", verbose ); - result &= TextKeyImpl(hash, seed, "", alnum, 4, "FooBar", verbose ); + result &= TextKeyImpl(hash, seed, "Foo" , alnum, 4, "Bar" , verbose); + result &= TextKeyImpl(hash, seed, "FooBar", alnum, 4, "" , verbose); + result &= TextKeyImpl(hash, seed, "" , alnum, 4, "FooBar", verbose); // maybe use random-len vector of strings here, from len 6-16 - result &= WordsKeyImpl(hash, seed, 4000000, 6, 16, alnum, "alnum", verbose ); - result &= WordsKeyImpl(hash, seed, 4000000, 6, 16, passwordchars, "password", verbose ); + result &= WordsKeyImpl (hash, seed, 4000000, 6, 16, alnum , "alnum", verbose); + result &= WordsKeyImpl (hash, seed, 4000000, 6, 16, passwordchars, "password", verbose); std::vector words = HashMapInit(verbose); - result &= WordsStringImpl(hash, seed, words, verbose ); + result &= WordsStringImpl(hash, seed, words, verbose); printf("%s\n", result ? "" : g_failstr); diff --git a/tests/TextKeysetTest.h b/tests/TextKeysetTest.h index c97e255f..870261dc 100644 --- a/tests/TextKeysetTest.h +++ b/tests/TextKeysetTest.h @@ -47,5 +47,5 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -template < typename hashtype > -bool TextKeyTest(const HashInfo * info, const bool verbose); +template +bool TextKeyTest( const HashInfo * info, const bool verbose ); diff --git a/tests/TwoBytesKeysetTest.cpp b/tests/TwoBytesKeysetTest.cpp index 4458598e..808f64f3 100644 --- a/tests/TwoBytesKeysetTest.cpp +++ b/tests/TwoBytesKeysetTest.cpp @@ -49,7 +49,7 @@ #include "Platform.h" #include "Hashinfo.h" #include "TestGlobals.h" -#include "Stats.h" // for chooseK +#include "Stats.h" // for chooseK #include "Analyze.h" #include "Instantiate.h" #include "VCode.h" @@ -59,18 +59,18 @@ //----------------------------------------------------------------------------- // Keyset 'TwoBytes' - generate all keys up to length N with two non-zero bytes -template< typename hashtype > -static void TwoBytesKeygen(HashFn hash, const seed_t seed, - int maxlen, std::vector & hashes) { +template +static void TwoBytesKeygen( HashFn hash, const seed_t seed, int maxlen, std::vector & hashes ) { //---------- // Compute # of keys int keycount = 0; + for (int i = 2; i <= maxlen; i++) { - keycount += (int)chooseK(i,2); + keycount += (int)chooseK(i, 2); } - keycount *= 255*255; + keycount *= 255 * 255; for (int i = 2; i <= maxlen; i++) { - keycount += i*255; + keycount += i * 255; } printf("Keyset 'TwoBytes' - up-to-%d-byte keys - %d keys\n", maxlen, keycount); @@ -81,7 +81,7 @@ static void TwoBytesKeygen(HashFn hash, const seed_t seed, memset(key, 0, 256); for (int keylen = 2; keylen <= maxlen; keylen++) { - for (int byteA = 0; byteA < keylen; byteA++){ + for (int byteA = 0; byteA < keylen; byteA++) { for (int valA = 1; valA <= 255; valA++) { hashtype h; key[byteA] = (uint8_t)valA; @@ -96,8 +96,8 @@ static void TwoBytesKeygen(HashFn hash, const seed_t seed, //---------- // Add all keys with two non-zero bytes for (int keylen = 2; keylen <= maxlen; keylen++) { - for (int byteA = 0; byteA < keylen-1; byteA++) { - for (int byteB = byteA+1; byteB < keylen; byteB++) { + for (int byteA = 0; byteA < 
keylen - 1; byteA++) { + for (int byteB = byteA + 1; byteB < keylen; byteB++) { for (int valA = 1; valA <= 255; valA++) { key[byteA] = (uint8_t)valA; for (int valB = 1; valB <= 255; valB++) { @@ -115,28 +115,29 @@ static void TwoBytesKeygen(HashFn hash, const seed_t seed, } } -template < typename hashtype > -static bool TwoBytesTest2(HashFn hash, const seed_t seed, int maxlen, bool drawDiagram) { - std::vector hashes; +template +static bool TwoBytesTest2( HashFn hash, const seed_t seed, int maxlen, bool drawDiagram ) { + std::vector hashes; - TwoBytesKeygen(hash, seed, maxlen, hashes); + TwoBytesKeygen(hash, seed, maxlen, hashes); - bool result = TestHashList(hashes,drawDiagram); - printf("\n"); + bool result = TestHashList(hashes, drawDiagram); + printf("\n"); - recordTestResult(result, "TwoBytes", maxlen); + recordTestResult(result, "TwoBytes", maxlen); - addVCodeResult(result); + addVCodeResult(result); - return result; + return result; } //----------------------------------------------------------------------------- -template < typename hashtype > -bool TwoBytesKeyTest(const HashInfo * hinfo, const bool verbose, const bool extra) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - bool result = true; - int maxlen; +template +bool TwoBytesKeyTest( const HashInfo * hinfo, const bool verbose, const bool extra ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + bool result = true; + int maxlen; + if (extra) { maxlen = 24; } else if (hinfo->isVerySlow()) { diff --git a/tests/TwoBytesKeysetTest.h b/tests/TwoBytesKeysetTest.h index df21eac6..c5789be2 100644 --- a/tests/TwoBytesKeysetTest.h +++ b/tests/TwoBytesKeysetTest.h @@ -47,5 +47,5 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -template < typename hashtype > -bool TwoBytesKeyTest(const HashInfo * info, const bool verbose, const bool extra); +template +bool TwoBytesKeyTest( const HashInfo * info, const bool verbose, const bool extra ); diff --git a/tests/WindowedKeysetTest.cpp b/tests/WindowedKeysetTest.cpp index c92b5a6e..adf3594d 100644 --- a/tests/WindowedKeysetTest.cpp +++ b/tests/WindowedKeysetTest.cpp @@ -49,7 +49,7 @@ #include "Platform.h" #include "Hashinfo.h" #include "TestGlobals.h" -#include "Stats.h" // For EstimateNbCollisions +#include "Stats.h" // For EstimateNbCollisions #include "Analyze.h" #include "Instantiate.h" #include "VCode.h" @@ -62,84 +62,84 @@ // Keyset 'Window' - for all possible N-bit windows of a K-bit key, generate // all possible keys with bits set in that window -template < typename keytype, typename hashtype > -static bool WindowedKeyImpl(HashFn hash, const seed_t seed, int windowbits, - bool testCollision, bool testDistribution, bool drawDiagram) { - const int keybits = sizeof(keytype) * 8; - const int hashbits = sizeof(hashtype) * 8; - // calc keycount to expect min. 0.5 collisions: EstimateNbCollisions, except for 64++bit. 
- // there limit to 2^25 = 33554432 keys - int keycount = 1 << windowbits; - while (EstimateNbCollisions(keycount, hashbits) < 0.5 && windowbits < 25) { - if ((int)log2(2.0 * keycount) < 0) // overflow - break; - keycount *= 2; - windowbits = (int)log2(1.0 * keycount); - //printf (" enlarge windowbits to %d (%d keys)\n", windowbits, keycount); - //fflush (NULL); - } - - std::vector hashes; - hashes.resize(keycount); - - bool result = true; - int testcount = keybits; - - printf("Keyset 'Window' - %3d-bit key, %3d-bit window - %d tests - %d keys\n", - keybits,windowbits,testcount,keycount); - - for(int j = 0; j < testcount; j++) - { - int minbit = j; - keytype key; - - for(int i = 0; i < keycount; i++) - { - key = i; - key.lrot(minbit); - hash(&key, sizeof(keytype), seed, &hashes[i]); - addVCodeInput(&key, sizeof(keytype)); +template +static bool WindowedKeyImpl( HashFn hash, const seed_t seed, int windowbits, + bool testCollision, bool testDistribution, bool drawDiagram ) { + const int keybits = sizeof(keytype ) * 8; + const int hashbits = sizeof(hashtype) * 8; + // calc keycount to expect min. 0.5 collisions: EstimateNbCollisions, except for 64++bit. 
+ // there limit to 2^25 = 33554432 keys + int keycount = 1 << windowbits; + + while (EstimateNbCollisions(keycount, hashbits) < 0.5 && windowbits < 25) { + if ((int)log2(2.0 * keycount) < 0) { // overflow + break; + } + keycount *= 2; + windowbits = (int)log2(1.0 * keycount); + // printf (" enlarge windowbits to %d (%d keys)\n", windowbits, keycount); + // fflush (NULL); } - printf("Window at bit %3d\n",j); + std::vector hashes; + hashes.resize(keycount); - bool thisresult = TestHashList(hashes, drawDiagram, testCollision, testDistribution, - /* do not test high/low bits (to not clobber the screen) */ - false, false, true); + bool result = true; + int testcount = keybits; - recordTestResult(thisresult, "Windowed", j); + printf("Keyset 'Window' - %3d-bit key, %3d-bit window - %d tests - %d keys\n", + keybits, windowbits, testcount, keycount); - addVCodeResult(thisresult); + for (int j = 0; j < testcount; j++) { + int minbit = j; + keytype key; - result &= thisresult; - } + for (int i = 0; i < keycount; i++) { + key = i; + key.lrot(minbit); + hash(&key, sizeof(keytype), seed, &hashes[i]); + addVCodeInput(&key, sizeof(keytype)); + } - return result; + printf("Window at bit %3d\n", j); + + bool thisresult = TestHashList(hashes, drawDiagram, testCollision, testDistribution, + /* do not test high/low bits (to not clobber the screen) */ + false, false, true); + + recordTestResult(thisresult, "Windowed", j); + + addVCodeResult(thisresult); + + result &= thisresult; + } + + return result; } //----------------------------------------------------------------------------- -template < typename hashtype > -bool WindowedKeyTest(const HashInfo * hinfo, const bool verbose, const bool extra) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - bool result = true; - bool testCollision = true; +template +bool WindowedKeyTest( const HashInfo * hinfo, const bool verbose, const bool extra ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + bool result = true; + bool testCollision 
= true; // Skip distribution test for these - they're too easy to // distribute well, and it generates a _lot_ of testing. bool testDistribution = extra; // This value is now adjusted to generate at least 0.5 collisions per window, // except for 64++bit where it unrealistic. There use smaller but more keys, // to get a higher collision percentage. - int windowbits = 20; + int windowbits = 20; constexpr int hashbits = sizeof(hashtype) * 8; - constexpr int keybits = (hashbits >= 64) ? 32 : 72; + constexpr int keybits = (hashbits >= 64) ? 32 : 72; printf("[[[ Keyset 'Window' Tests ]]]\n\n"); const seed_t seed = hinfo->Seed(g_seed); - result &= WindowedKeyImpl< Blob, hashtype >(hash, seed, - windowbits, testCollision, testDistribution, verbose); + result &= + WindowedKeyImpl, hashtype>(hash, seed, windowbits, testCollision, testDistribution, verbose); printf("\n%s\n", result ? "" : g_failstr); diff --git a/tests/WindowedKeysetTest.h b/tests/WindowedKeysetTest.h index 433efb72..cc545848 100644 --- a/tests/WindowedKeysetTest.h +++ b/tests/WindowedKeysetTest.h @@ -47,5 +47,5 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -template < typename hashtype > -bool WindowedKeyTest(const HashInfo * info, const bool verbose, const bool extra); +template +bool WindowedKeyTest( const HashInfo * info, const bool verbose, const bool extra ); diff --git a/tests/ZeroesKeysetTest.cpp b/tests/ZeroesKeysetTest.cpp index 8f7738ad..cb525675 100644 --- a/tests/ZeroesKeysetTest.cpp +++ b/tests/ZeroesKeysetTest.cpp @@ -59,44 +59,44 @@ // Keyset 'Zeroes' - keys consisting of all zeroes, differing only in length // We reuse one block of empty bytes, otherwise the RAM cost is enormous. 
-template < typename hashtype > -static bool ZeroKeyImpl(HashFn hash, const seed_t seed, bool drawDiagram) { - int keycount = 200*1024; +template +static bool ZeroKeyImpl( HashFn hash, const seed_t seed, bool drawDiagram ) { + int keycount = 200 * 1024; - printf("Keyset 'Zeroes' - %d keys\n",keycount); + printf("Keyset 'Zeroes' - %d keys\n", keycount); - uint8_t * nullblock = new uint8_t[keycount]; - memset(nullblock,0,keycount); + uint8_t * nullblock = new uint8_t[keycount]; + memset(nullblock, 0, keycount); - addVCodeInput(nullblock, keycount); + addVCodeInput(nullblock, keycount); - //---------- - std::vector hashes; + //---------- + std::vector hashes; - hashes.resize(keycount); + hashes.resize(keycount); - for(int i = 0; i < keycount; i++) { - hash(nullblock, i, seed, &hashes[i]); - } + for (int i = 0; i < keycount; i++) { + hash(nullblock, i, seed, &hashes[i]); + } - bool result = TestHashList(hashes,drawDiagram); - printf("\n"); + bool result = TestHashList(hashes, drawDiagram); + printf("\n"); - delete [] nullblock; + delete [] nullblock; - recordTestResult(result, "Zeroes", (const char *)NULL); + recordTestResult(result, "Zeroes", (const char *)NULL); - addVCodeResult(result); + addVCodeResult(result); - return result; + return result; } //----------------------------------------------------------------------------- -template < typename hashtype > -bool ZeroKeyTest(const HashInfo * hinfo, const bool verbose) { - const HashFn hash = hinfo->hashFn(g_hashEndian); - bool result = true; +template +bool ZeroKeyTest( const HashInfo * hinfo, const bool verbose ) { + const HashFn hash = hinfo->hashFn(g_hashEndian); + bool result = true; printf("[[[ Keyset 'Zeroes' Tests ]]]\n\n"); diff --git a/tests/ZeroesKeysetTest.h b/tests/ZeroesKeysetTest.h index 6d124dd3..8ff346d8 100644 --- a/tests/ZeroesKeysetTest.h +++ b/tests/ZeroesKeysetTest.h @@ -47,5 +47,5 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -template < typename hashtype > -bool ZeroKeyTest(const HashInfo * info, const bool verbose); +template +bool ZeroKeyTest( const HashInfo * info, const bool verbose ); diff --git a/util/Analyze.cpp b/util/Analyze.cpp index 26c9f657..784adb64 100644 --- a/util/Analyze.cpp +++ b/util/Analyze.cpp @@ -59,7 +59,7 @@ #include #include -#include // for memset +#include // for memset #include #include "Analyze.h" @@ -79,174 +79,174 @@ static const double WARNING_PBOUND = exp2(-12); // 2**-12 == 1/4096 =~ 0.0244%, // (number of excess "heads" or "tails") over all those trials was the // specified worstbiascnt. -bool ReportBias(const int worstbiascnt, const int coinflips, const int trials, const bool drawDiagram ) -{ - double ratio = (double)worstbiascnt / (double)coinflips; - double p1value = 2 * exp(-(double)worstbiascnt * ratio); // two-tailed Chernoff Bound - double p_value = ScalePValue(p1value, trials); - int logp_value = GetLog2PValue(p_value); - bool result = true; - - recordLog2PValue(logp_value); - if (drawDiagram) - printf(" worst bias is %f%% (%6d) (p<%8.6e) (^%2d)", ratio*200.0, worstbiascnt, p_value, logp_value); - else - printf(" worst bias is %f%% (^%2d)", ratio*200.0, logp_value); - - if (p_value < FAILURE_PBOUND) - { - printf(" !!!!!\n"); - result = false; - } - else if (p_value < WARNING_PBOUND) - printf(" !\n"); - else - printf("\n"); +bool ReportBias( const int worstbiascnt, const int coinflips, const int trials, const bool drawDiagram ) { + double ratio = (double)worstbiascnt / (double)coinflips; + double p1value = 2 * exp(-(double)worstbiascnt * ratio); // two-tailed Chernoff Bound + double p_value = ScalePValue(p1value, trials); + int logp_value = GetLog2PValue(p_value); + bool result = true; + + recordLog2PValue(logp_value); + if (drawDiagram) { + printf(" worst bias is %f%% (%6d) (p<%8.6e) (^%2d)", ratio * 200.0, worstbiascnt, p_value, logp_value); + } else { + printf(" worst bias is %f%% (^%2d)", ratio * 200.0, logp_value); + } - return result; 
+ if (p_value < FAILURE_PBOUND) { + printf(" !!!!!\n"); + result = false; + } else if (p_value < WARNING_PBOUND) { + printf(" !\n"); + } else { + printf("\n"); + } + + return result; } //----------------------------------------------------------------------------- -static bool ReportCollisions(uint64_t const nbH, int collcount, unsigned hashsize, bool maxcoll, bool highbits, bool header, bool verbose, bool drawDiagram ) -{ - bool largehash = hashsize > (8 * sizeof(uint32_t)); - - // The expected number depends on what collision statistic is being - // reported on; "worst of N buckets" is very different than "sum - // over N buckets". - // - // Also determine an upper-bound on the unlikelihood of the observed - // collision count. - double expected, p_value; - if (maxcoll) - { - expected = EstimateMaxCollisions(nbH, hashsize); - p_value = EstimatedBinomialPValue(nbH, hashsize, collcount); - } - else - { - expected = EstimateNbCollisions(nbH, hashsize); - p_value = BoundedPoissonPValue(expected, collcount); - } - int logp_value = GetLog2PValue(p_value); - - // Since p-values are now used to determine pass/warning/failure - // status, ratios are now solely for humans reading the results. - // - // If there were no collisions and none were expected, for a - // suitably fuzzy value of "none", then a ratio of 1.00 ("test - // exactly met expectations") is most sensible. - // - // If there were no collisions and there was a decent chance of - // seeing one, then a ratio of 0.00 ("test saw 0% of expected - // collisions") seems best. - // - // If there were any collisions, and the odds of seeing one were - // quite low (arbitrarily chosen to be 0.01), then a ratio isn't - // really meaningful, so we use +inf. - // - // A collision count matching the rounded expectation value is - // treated as "exactly expected". 
For small hash sizes, if the - // expected count has more than 0.1 after the decimal place and the - // actual collision count is the next integer above the expected - // one, then that case is also treated as "exactly expected". - // - // In all other cases, the true ratio is computed, but the value - // will be bounded to not clutter the output in failure cases. - double ratio; - if (collcount == 0) - ratio = (expected < 0.1) ? 1.00 : 0.00; - else if (expected < 0.01) - ratio = INFINITY; - else if (collcount == (int)round(expected)) - ratio = 1.00; - else if (!largehash && (collcount == (int)round(expected+0.4))) - ratio = 1.00; - else { - ratio = double(collcount) / expected; - if (ratio >= 999.95) - ratio = INFINITY; - } - - bool warning = false, failure = false; - if (p_value < FAILURE_PBOUND) - failure = true; - else if (p_value < WARNING_PBOUND) - warning = true; - else if (isnan(ratio)) - warning = true; - - if (verbose) - { - if (header) - printf("Testing %s collisions (%s %3i-bit)", maxcoll ? "max" : "all", - highbits ? "high" : "low ", hashsize); - - // 8 integer digits would match the 10.1 float specifier - // (10 characters - 1 decimal point - 1 digit after the decimal), - // but some hashes greatly exceed expected collision counts. - if (!finite(ratio)) - printf(" - Expected %10.1f, actual %10i (------) ", expected, collcount); - else if (ratio < 9.0) - printf(" - Expected %10.1f, actual %10i (%5.3fx) ", expected, collcount, ratio); - else - printf(" - Expected %10.1f, actual %10i (%#.4gx) ", expected, collcount, ratio); - - // Since ratios and p-value summaries are most important to humans, - // and deltas and exact p-values add visual noise and variable line - // widths and possibly field counts, they are now only printed out - // in --verbose mode. 
- recordLog2PValue(logp_value); - if (drawDiagram) - printf("(%+i) (p<%8.6f) (^%2d)", collcount - (int)round(expected), p_value, logp_value); - else - printf("(^%2d)", logp_value); - - if (failure) - printf(" !!!!!\n"); - else if (warning) - printf(" !\n"); - else - printf("\n"); - } - - return !failure; +static bool ReportCollisions( uint64_t const nbH, int collcount, unsigned hashsize, bool maxcoll, + bool highbits, bool header, bool verbose, bool drawDiagram ) { + bool largehash = hashsize > (8 * sizeof(uint32_t)); + + // The expected number depends on what collision statistic is being + // reported on; "worst of N buckets" is very different than "sum + // over N buckets". + // + // Also determine an upper-bound on the unlikelihood of the observed + // collision count. + double expected, p_value; + + if (maxcoll) { + expected = EstimateMaxCollisions(nbH, hashsize); + p_value = EstimatedBinomialPValue(nbH, hashsize, collcount); + } else { + expected = EstimateNbCollisions(nbH, hashsize); + p_value = BoundedPoissonPValue(expected, collcount); + } + int logp_value = GetLog2PValue(p_value); + + // Since p-values are now used to determine pass/warning/failure + // status, ratios are now solely for humans reading the results. + // + // If there were no collisions and none were expected, for a + // suitably fuzzy value of "none", then a ratio of 1.00 ("test + // exactly met expectations") is most sensible. + // + // If there were no collisions and there was a decent chance of + // seeing one, then a ratio of 0.00 ("test saw 0% of expected + // collisions") seems best. + // + // If there were any collisions, and the odds of seeing one were + // quite low (arbitrarily chosen to be 0.01), then a ratio isn't + // really meaningful, so we use +inf. + // + // A collision count matching the rounded expectation value is + // treated as "exactly expected". 
For small hash sizes, if the + // expected count has more than 0.1 after the decimal place and the + // actual collision count is the next integer above the expected + // one, then that case is also treated as "exactly expected". + // + // In all other cases, the true ratio is computed, but the value + // will be bounded to not clutter the output in failure cases. + double ratio; + if (collcount == 0) { + ratio = (expected < 0.1) ? 1.00 : 0.00; + } else if (expected < 0.01) { + ratio = INFINITY; + } else if (collcount == (int)round(expected)) { + ratio = 1.00; + } else if (!largehash && (collcount == (int)round(expected + 0.4))) { + ratio = 1.00; + } else { + ratio = double(collcount) / expected; + if (ratio >= 999.95) { + ratio = INFINITY; + } + } + + bool warning = false, failure = false; + if (p_value < FAILURE_PBOUND) { + failure = true; + } else if (p_value < WARNING_PBOUND) { + warning = true; + } else if (isnan(ratio)) { + warning = true; + } + + if (verbose) { + if (header) { + printf("Testing %s collisions (%s %3i-bit)", maxcoll ? "max" : "all", highbits ? "high" : "low ", hashsize); + } + + // 8 integer digits would match the 10.1 float specifier + // (10 characters - 1 decimal point - 1 digit after the decimal), + // but some hashes greatly exceed expected collision counts. + if (!finite(ratio)) { + printf(" - Expected %10.1f, actual %10i (------) ", expected, collcount); + } else if (ratio < 9.0) { + printf(" - Expected %10.1f, actual %10i (%5.3fx) ", expected, collcount, ratio); + } else { + printf(" - Expected %10.1f, actual %10i (%#.4gx) ", expected, collcount, ratio); + } + + // Since ratios and p-value summaries are most important to humans, + // and deltas and exact p-values add visual noise and variable line + // widths and possibly field counts, they are now only printed out + // in --verbose mode. 
+ recordLog2PValue(logp_value); + if (drawDiagram) { + printf("(%+i) (p<%8.6f) (^%2d)", collcount - (int)round(expected), p_value, logp_value); + } else { + printf("(^%2d)", logp_value); + } + + if (failure) { + printf(" !!!!!\n"); + } else if (warning) { + printf(" !\n"); + } else { + printf("\n"); + } + } + + return !failure; } //---------------------------------------------------------------------------- -static void plot ( double n ) -{ - int ni = (int)floor(n); - - // Less than [0,3) sigma is fine, [3, 12) sigma is notable, 12+ sigma is pretty bad - if(ni <= 2) - putchar('.'); - else if (ni <= 11) - putchar('1' + ni - 3); - else - putchar('X'); +static void plot( double n ) { + int ni = (int)floor(n); + + // Less than [0,3) sigma is fine, [3, 12) sigma is notable, 12+ sigma is pretty bad + if (ni <= 2) { + putchar('.'); + } else if (ni <= 11) { + putchar('1' + ni - 3); + } else { + putchar('X'); + } } //----------------------------------------------------------------------------- // Sort the hash list, count the total number of collisions and return // the first N collisions for further processing -template< typename hashtype > -unsigned int FindCollisions(std::vector & hashes, - std::set & collisions, - int maxCollisions, - bool drawDiagram) { +template +unsigned int FindCollisions( std::vector & hashes, std::set & collisions, + int maxCollisions, bool drawDiagram ) { unsigned int collcount = 0; - blobsort(hashes.begin(),hashes.end()); + + blobsort(hashes.begin(), hashes.end()); const size_t sz = hashes.size(); for (size_t hnb = 1; hnb < sz; hnb++) { - if(hashes[hnb] == hashes[hnb-1]) { + if (hashes[hnb] == hashes[hnb - 1]) { collcount++; - if(collcount < maxCollisions) { + if (collcount < maxCollisions) { #if 0 - printf (" %zu: ", hnb); + printf(" %zu: ", hnb); hashes[hnb].printhex(""); #endif if (drawDiagram) { @@ -257,8 +257,9 @@ unsigned int FindCollisions(std::vector & hashes, } #if 0 && defined(DEBUG) - if (collcount) - printf ("\n"); + if (collcount) { 
+ printf("\n"); + } #endif return collcount; @@ -266,12 +267,12 @@ unsigned int FindCollisions(std::vector & hashes, INSTANTIATE(FindCollisions, HASHTYPELIST); -template < typename hashtype > -void PrintCollisions(std::set & collisions) { +template +void PrintCollisions( std::set & collisions ) { printf("\nCollisions:\n"); for (auto it = collisions.begin(); it != collisions.end(); ++it) { - const hashtype &hash = *it; + const hashtype & hash = *it; hash.printhex(" "); } printf("\n"); @@ -295,260 +296,262 @@ INSTANTIATE(PrintCollisions, HASHTYPELIST); // since a collision for N bits is also a collision for N-k bits. // // This requires the vector of hashes to be sorted. -template< typename hashtype > -static void CountRangedNbCollisions ( std::vector & hashes, uint64_t const nbH, int minHBits, int maxHBits, int threshHBits, int * collcounts) -{ - const int origBits = sizeof(hashtype) * 8; - assert(minHBits >= 1); - assert(minHBits <= maxHBits); - assert(origBits >= maxHBits); - assert((threshHBits == 0) || (threshHBits >= minHBits)); - assert((threshHBits == 0) || (threshHBits <= maxHBits)); - - const int collbins = maxHBits - minHBits + 1; - const int maxcollbins = (threshHBits == 0) ? 0 : threshHBits - minHBits + 1; - int prevcoll[maxcollbins + 1]; - int maxcoll[maxcollbins + 1]; - - memset(collcounts, 0, sizeof(collcounts[0])*collbins); - memset(prevcoll, 0, sizeof(prevcoll[0])*maxcollbins); - memset(maxcoll, 0, sizeof(maxcoll[0])*maxcollbins); - - for (uint64_t hnb = 1; hnb < nbH; hnb++) { - hashtype hdiff = hashes[hnb-1] ^ hashes[hnb]; - int hzb = hdiff.highzerobits(); - if (hzb > maxHBits) - hzb = maxHBits; - if (hzb >= minHBits) - collcounts[hzb - minHBits]++; - // If we don't care about maximum collision counts, or if this - // hash is a collision for *all* bit widths where we do care about - // maximums, then this is all that need be done for this hash. 
- if (hzb >= threshHBits) - continue; - // If we do care about maximum collision counts, then any window - // sizes which are strictly larger than hzb have just encountered - // a non-collision. For each of those window sizes, see how many - // collisions there have been since the last non-collision, and - // record it if that's the new peak. - if (hzb < minHBits - 1) - hzb = minHBits - 1; - // coll is the total number of collisions so far, for the window - // width corresponding to index i - int coll = 0; - for (int i = collbins - 1; i >= maxcollbins; i--) - coll += collcounts[i]; - for (int i = maxcollbins - 1; i > hzb - minHBits; i--) - { - coll += collcounts[i]; - // See if this is the new peak for this window width - maxcoll[i] = std::max(maxcoll[i], coll - prevcoll[i]); - // Record the total number of collisions seen so far at this - // non-collision, so that when the next non-collision happens we - // can compute how many collisions there have been since this one. - prevcoll[i] = coll; - } - } - - for (int i = collbins - 2; i >= 0; i--) - collcounts[i] += collcounts[i + 1]; - for (int i = maxcollbins - 1; i >= 0; i--) - collcounts[i] = std::max(maxcoll[i], collcounts[i] - prevcoll[i]); +template +static void CountRangedNbCollisions( std::vector & hashes, uint64_t const nbH, + int minHBits, int maxHBits, int threshHBits, int * collcounts ) { + const int origBits = sizeof(hashtype) * 8; + + assert(minHBits >= 1 ); + assert(minHBits <= maxHBits); + assert(origBits >= maxHBits); + assert((threshHBits == 0) || (threshHBits >= minHBits)); + assert((threshHBits == 0) || (threshHBits <= maxHBits)); + + const int collbins = maxHBits - minHBits + 1; + const int maxcollbins = (threshHBits == 0) ? 
0 : threshHBits - minHBits + 1; + int prevcoll[maxcollbins + 1]; + int maxcoll[maxcollbins + 1]; + + memset(collcounts, 0, sizeof(collcounts[0]) * collbins ); + memset(prevcoll , 0, sizeof(prevcoll[0]) * maxcollbins); + memset(maxcoll , 0, sizeof(maxcoll[0]) * maxcollbins ); + + for (uint64_t hnb = 1; hnb < nbH; hnb++) { + hashtype hdiff = hashes[hnb - 1] ^ hashes[hnb]; + int hzb = hdiff.highzerobits(); + if (hzb > maxHBits) { + hzb = maxHBits; + } + if (hzb >= minHBits) { + collcounts[hzb - minHBits]++; + } + // If we don't care about maximum collision counts, or if this + // hash is a collision for *all* bit widths where we do care about + // maximums, then this is all that need be done for this hash. + if (hzb >= threshHBits) { + continue; + } + // If we do care about maximum collision counts, then any window + // sizes which are strictly larger than hzb have just encountered + // a non-collision. For each of those window sizes, see how many + // collisions there have been since the last non-collision, and + // record it if that's the new peak. + if (hzb < minHBits - 1) { + hzb = minHBits - 1; + } + // coll is the total number of collisions so far, for the window + // width corresponding to index i + int coll = 0; + for (int i = collbins - 1; i >= maxcollbins; i--) { + coll += collcounts [i]; + } + for (int i = maxcollbins - 1; i > hzb - minHBits; i--) { + coll += collcounts[i]; + // See if this is the new peak for this window width + maxcoll[i] = std::max(maxcoll[i], coll - prevcoll[i]); + // Record the total number of collisions seen so far at this + // non-collision, so that when the next non-collision happens we + // can compute how many collisions there have been since this one. 
+ prevcoll[i] = coll; + } + } + + for (int i = collbins - 2; i >= 0; i--) { + collcounts[i] += collcounts[i + 1]; + } + for (int i = maxcollbins - 1; i >= 0; i--) { + collcounts[i] = std::max(maxcoll[i], collcounts[i] - prevcoll[i]); + } } //----------------------------------------------------------------------------- // -static bool ReportBitsCollisions (uint64_t nbH, int * collcounts, int minBits, int maxBits, bool highbits, bool drawDiagram ) -{ - if (maxBits <= 1 || minBits > maxBits) return true; - - int spacelen = 80; - spacelen -= printf("Testing all collisions (%s %2i..%2i bits) - ", - highbits ? "high" : "low ", minBits, maxBits); - - double maxCollDev = 0.0; - int maxCollDevBits = 0; - int maxCollDevNb = 0; - double maxCollDevExp = 1.0; - double maxPValue = INFINITY; - - for (int b = minBits; b <= maxBits; b++) { - int const nbColls = collcounts[b - minBits]; - double const expected = EstimateNbCollisions(nbH, b); - assert(expected > 0.0); - double const dev = (double)nbColls / expected; - double const p_value = BoundedPoissonPValue(expected, nbColls); - //printf("%d bits, %d/%f, p %f\n", b, nbColls, expected, p_value); - if (p_value < maxPValue) { - maxPValue = p_value; - maxCollDev = dev; - maxCollDevBits = b; - maxCollDevNb = nbColls; - maxCollDevExp = expected; - } - } - - const char * spaces = " "; - int i_maxCollDevExp = (int)round(maxCollDevExp); - spacelen -= printf("Worst is %2i bits: %i/%i ", maxCollDevBits, maxCollDevNb, i_maxCollDevExp); - if (spacelen < 0) - spacelen = 0; - else if (spacelen > strlen(spaces)) - spacelen = strlen(spaces); - - if (maxCollDev >= 999.95) - maxCollDev = INFINITY; - - if (!finite(maxCollDev)) - printf("%.*s(------) ", spacelen, spaces); - else if (maxCollDev < 9.0) - printf("%.*s(%5.3fx) ", spacelen, spaces, maxCollDev); - else - printf("%.*s(%#.4gx) ", spacelen, spaces, maxCollDev); - - - double p_value = ScalePValue(maxPValue, maxBits - minBits + 1); - int logp_value = GetLog2PValue(p_value); - - 
recordLog2PValue(logp_value); - if (drawDiagram) - printf("(%+i) (p<%8.6f) (^%2d)", maxCollDevNb - i_maxCollDevExp, p_value, logp_value); - else - printf("(^%2d)", logp_value); - - if (p_value < FAILURE_PBOUND) - { - printf(" !!!!!\n"); - return false; - } - else if (p_value < WARNING_PBOUND) - printf(" !\n"); - else - printf("\n"); - return true; +static bool ReportBitsCollisions( uint64_t nbH, int * collcounts, int minBits, + int maxBits, bool highbits, bool drawDiagram ) { + if ((maxBits <= 1) || (minBits > maxBits)) { return true; } + + int spacelen = 80; + spacelen -= printf("Testing all collisions (%s %2i..%2i bits) - ", highbits ? "high" : "low ", minBits, maxBits); + + double maxCollDev = 0.0; + int maxCollDevBits = 0; + int maxCollDevNb = 0; + double maxCollDevExp = 1.0; + double maxPValue = INFINITY; + + for (int b = minBits; b <= maxBits; b++) { + int const nbColls = collcounts[b - minBits]; + double const expected = EstimateNbCollisions(nbH, b); + assert(expected > 0.0); + double const dev = (double)nbColls / expected; + double const p_value = BoundedPoissonPValue(expected, nbColls); + // printf("%d bits, %d/%f, p %f\n", b, nbColls, expected, p_value); + if (p_value < maxPValue) { + maxPValue = p_value; + maxCollDev = dev; + maxCollDevBits = b; + maxCollDevNb = nbColls; + maxCollDevExp = expected; + } + } + + const char * spaces = " "; + int i_maxCollDevExp = (int)round(maxCollDevExp); + spacelen -= printf("Worst is %2i bits: %i/%i ", maxCollDevBits, maxCollDevNb, i_maxCollDevExp); + if (spacelen < 0) { + spacelen = 0; + } else if (spacelen > strlen(spaces)) { + spacelen = strlen(spaces); + } + + if (maxCollDev >= 999.95) { + maxCollDev = INFINITY; + } + + if (!finite(maxCollDev)) { + printf("%.*s(------) ", spacelen, spaces); + } else if (maxCollDev < 9.0) { + printf("%.*s(%5.3fx) ", spacelen, spaces, maxCollDev); + } else { + printf("%.*s(%#.4gx) ", spacelen, spaces, maxCollDev); + } + + double p_value = ScalePValue(maxPValue, maxBits - minBits + 1); 
+ int logp_value = GetLog2PValue(p_value); + + recordLog2PValue(logp_value); + if (drawDiagram) { + printf("(%+i) (p<%8.6f) (^%2d)", maxCollDevNb - i_maxCollDevExp, p_value, logp_value); + } else { + printf("(^%2d)", logp_value); + } + + if (p_value < FAILURE_PBOUND) { + printf(" !!!!!\n"); + return false; + } else if (p_value < WARNING_PBOUND) { + printf(" !\n"); + } else { + printf("\n"); + } + return true; } //---------------------------------------------------------------------------- // Measure the distribution "score" for each possible N-bit span, with // N going from 8 to 20 inclusive. -static int MaxDistBits ( const uint64_t nbH ) -{ - // If there aren't 5 keys per bin over 8 bins, then don't bother - // testing distribution at all. - if (nbH < (5 * 8)) - return 0; - int maxwidth = 20; - // We need at least 5 keys per bin to reliably test distribution biases - // down to 1%, so don't bother to test sparser distributions than that - while(double(nbH) / double(1 << maxwidth) < 5.0) - --maxwidth; - return maxwidth; +static int MaxDistBits( const uint64_t nbH ) { + // If there aren't 5 keys per bin over 8 bins, then don't bother + // testing distribution at all. 
+ if (nbH < (5 * 8)) { + return 0; + } + int maxwidth = 20; + // We need at least 5 keys per bin to reliably test distribution biases + // down to 1%, so don't bother to test sparser distributions than that + while (double(nbH) / double(1 << maxwidth) < 5.0) { + --maxwidth; + } + return maxwidth; } -template< typename hashtype > -static bool TestDistribution ( std::vector & hashes, bool drawDiagram ) -{ - const int hashbits = sizeof(hashtype) * 8; - const uint64_t nbH = hashes.size(); - int maxwidth = MaxDistBits(nbH); - int minwidth = 8; +template +static bool TestDistribution( std::vector & hashes, bool drawDiagram ) { + const int hashbits = sizeof(hashtype) * 8; + const uint64_t nbH = hashes.size(); + int maxwidth = MaxDistBits(nbH); + int minwidth = 8; - if (maxwidth < minwidth) return true; + if (maxwidth < minwidth) { return true; } - printf("Testing distribution (any %2i..%2i bits)%s", minwidth, maxwidth, drawDiagram ? "\n[" : " - "); + printf("Testing distribution (any %2i..%2i bits)%s", minwidth, maxwidth, drawDiagram ? 
"\n[" : " - "); - std::vector bins; - bins.resize(1 << maxwidth); + std::vector bins; + bins.resize(1 << maxwidth); - double worstN = 0; // Only report on biases above 0 - int worstStart = -1; - int worstWidth = -1; - int tests = 0; + double worstN = 0; // Only report on biases above 0 + int worstStart = -1; + int worstWidth = -1; + int tests = 0; - for(int start = 0; start < hashbits; start++) - { - int width = maxwidth; - int bincount = (1 << width); + for (int start = 0; start < hashbits; start++) { + int width = maxwidth; + int bincount = (1 << width); - memset(&bins[0],0,sizeof(int)*bincount); + memset(&bins[0], 0, sizeof(int) * bincount); - for(uint64_t j = 0; j < nbH; j++) - { - uint32_t index = hashes[j].window(start,width); + for (uint64_t j = 0; j < nbH; j++) { + uint32_t index = hashes[j].window(start, width); - bins[index]++; - } + bins[index]++; + } - // Test the distribution, then fold the bins in half, - // repeat until we're down to 256 bins + // Test the distribution, then fold the bins in half, + // repeat until we're down to 256 bins - while(bincount >= 256) - { - double n = calcScore(&bins[0],bincount,nbH); + while (bincount >= 256) { + double n = calcScore(&bins[0], bincount, nbH); - tests++; + tests++; - if(drawDiagram) plot(n); + if (drawDiagram) { plot(n); } - if(n > worstN) - { - worstN = n; - worstStart = start; - worstWidth = width; - } + if (n > worstN) { + worstN = n; + worstStart = start; + worstWidth = width; + } - width--; - bincount /= 2; + width--; + bincount /= 2; - if(width < minwidth) break; + if (width < minwidth) { break; } - // To allow the compiler to parallelize this loop - assume((bincount % 8) == 0); + // To allow the compiler to parallelize this loop + assume((bincount % 8) == 0); - for(int i = 0; i < bincount; i++) - { - bins[i] += bins[i+bincount]; - } - } + for (int i = 0; i < bincount; i++) { + bins[i] += bins[i + bincount]; + } + } - if(drawDiagram) printf("]\n%s", ((start + 1) == hashbits) ? 
"" : "["); - } + if (drawDiagram) { printf("]\n%s", ((start + 1) == hashbits) ? "" : "["); } + } - addVCodeResult((uint32_t)worstN); - addVCodeResult(worstWidth); - addVCodeResult(worstStart); + addVCodeResult((uint32_t)worstN); + addVCodeResult(worstWidth ); + addVCodeResult(worstStart ); - double p_value = ScalePValue(GetNormalPValue(0, 1, worstN), tests); - int logp_value = GetLog2PValue(p_value); - double mult = normalizeScore(worstN, worstWidth, tests); + double p_value = ScalePValue(GetNormalPValue(0, 1, worstN), tests); + int logp_value = GetLog2PValue(p_value); + double mult = normalizeScore(worstN, worstWidth, tests); - if (worstStart == -1) - printf("No positive bias detected %5.3fx ", 0.0); - else if (mult < 9.0) - printf("Worst bias is %2d bits at bit %3d: %5.3fx ", - worstWidth, worstStart, mult); - else - printf("Worst bias is %2d bits at bit %3d: %#.4gx ", - worstWidth, worstStart, mult); + if (worstStart == -1) { + printf("No positive bias detected %5.3fx ", 0.0); + } else if (mult < 9.0) { + printf("Worst bias is %2d bits at bit %3d: %5.3fx ", worstWidth, worstStart, mult); + } else { + printf("Worst bias is %2d bits at bit %3d: %#.4gx ", worstWidth, worstStart, mult); + } - recordLog2PValue(logp_value); - if (drawDiagram) - printf("(%f) (p<%8.6f) (^%2d)", worstN, p_value, logp_value); - else - printf("(^%2d)", logp_value); + recordLog2PValue(logp_value); + if (drawDiagram) { + printf("(%f) (p<%8.6f) (^%2d)", worstN, p_value, logp_value); + } else { + printf("(^%2d)", logp_value); + } - if (p_value < FAILURE_PBOUND) - { - printf(" !!!!!\n"); - return false; - } - else if (p_value < WARNING_PBOUND) - printf(" !\n"); - else - printf("\n"); - return true; + if (p_value < FAILURE_PBOUND) { + printf(" !!!!!\n"); + return false; + } else if (p_value < WARNING_PBOUND) { + printf(" !\n"); + } else { + printf("\n"); + } + return true; } //----------------------------------------------------------------------------- @@ -556,239 +559,242 @@ static bool 
TestDistribution ( std::vector & hashes, bool drawDiagram // comparing them to a list of i.i.d. random numbers across the full // origBits range. -static void ComputeCollBitBounds ( std::vector & nbBitsvec, int origBits, uint64_t nbH, int & minBits, int & maxBits, int & threshBits ) -{ - const int nlognBits = GetNLogNBound(nbH); - - minBits = origBits + 1; - maxBits = 0; - threshBits = 0; - - for(const int nbBits: nbBitsvec) - { - // If the nbBits value is too large for this hashtype, do nothing. - if (nbBits >= origBits) - continue; - // If many hashes are being tested (compared to the hash width), - // then the expected number of collisions will approach the number - // of keys (indeed, it will converge to every hash bucket being - // full, leaving nbH - 2**nbBits collisions). In those cases, it is - // not very useful to count all collisions, so at some point of high - // expected collisions, it is better to instead count the number of - // keys in the fullest bucket. The cutoff here is if there are - // (n*log(n)) hashes, where n is the number of hash buckets. This - // cutoff is an inflection point where the "balls-into-bins" - // statistics really start changing. ReportCollisions() will - // estimate the correct key count for that differently, as it is a - // different statistic. - if (nbBits < nlognBits) - threshBits = std::max(threshBits, nbBits); - // Record the highest and lowest valid bit widths to test - maxBits = std::max(maxBits, nbBits); - minBits = std::min(minBits, nbBits); - } +static void ComputeCollBitBounds( std::vector & nbBitsvec, int origBits, + uint64_t nbH, int & minBits, int & maxBits, int & threshBits ) { + const int nlognBits = GetNLogNBound(nbH); + + minBits = origBits + 1; + maxBits = 0; + threshBits = 0; + + for (const int nbBits: nbBitsvec) { + // If the nbBits value is too large for this hashtype, do nothing. 
+ if (nbBits >= origBits) { + continue; + } + // If many hashes are being tested (compared to the hash width), + // then the expected number of collisions will approach the number + // of keys (indeed, it will converge to every hash bucket being + // full, leaving nbH - 2**nbBits collisions). In those cases, it is + // not very useful to count all collisions, so at some point of high + // expected collisions, it is better to instead count the number of + // keys in the fullest bucket. The cutoff here is if there are + // (n*log(n)) hashes, where n is the number of hash buckets. This + // cutoff is an inflection point where the "balls-into-bins" + // statistics really start changing. ReportCollisions() will + // estimate the correct key count for that differently, as it is a + // different statistic. + if (nbBits < nlognBits) { + threshBits = std::max(threshBits, nbBits); + } + // Record the highest and lowest valid bit widths to test + maxBits = std::max(maxBits, nbBits); + minBits = std::min(minBits, nbBits); + } } -static int FindMinBits_TargetCollisionShare(uint64_t nbHashes, double share) -{ +static int FindMinBits_TargetCollisionShare( uint64_t nbHashes, double share ) { int nb; - for (nb=2; nb<64; nb++) { + + for (nb = 2; nb < 64; nb++) { double const maxColls = (double)(1ULL << nb) * share; - double const nbColls = EstimateNbCollisions(nbHashes, nb); - if (nbColls < maxColls) return nb; + double const nbColls = EstimateNbCollisions(nbHashes, nb); + if (nbColls < maxColls) { return nb; } } assert(0); return nb; } -static int FindMaxBits_TargetCollisionNb(uint64_t nbHashes, int minCollisions, int maxbits) -{ +static int FindMaxBits_TargetCollisionNb( uint64_t nbHashes, int minCollisions, int maxbits ) { int nb; - for (nb=maxbits; nb>2; nb--) { + + for (nb = maxbits; nb > 2; nb--) { double const nbColls = EstimateNbCollisions(nbHashes, nb); - if (nbColls > minCollisions) return nb; + if (nbColls > minCollisions) { return nb; } } - //assert(0); + // assert(0); 
return nb; } -template < typename hashtype > -bool TestHashList ( std::vector & hashes, bool drawDiagram, - bool testCollision, bool testDist , - bool testHighBits, bool testLowBits , - bool verbose ) -{ - bool result = true; - - if (testCollision) - { - unsigned const hashbits = sizeof(hashtype) * 8; - uint64_t const nbH = hashes.size(); - if (verbose) - printf("Testing all collisions ( %3i-bit)", hashbits); - - addVCodeOutput(&hashes[0], sizeof(hashtype) * nbH); - - std::set collisions; - int collcount = FindCollisions(hashes, collisions, 1000, drawDiagram); - - /* - * Do all other compute-intensive stuff (as requested) before - * displaying any results from FindCollisions, to be a little bit - * more human-friendly. - */ - - std::vector nbBitsvec = { 224, 160, 128, 64, 32, 12, 8, }; - /* - * cyan: The 12- and -8-bit tests are too small : tables are necessarily saturated. - * It would be better to count the nb of collisions per Cell, and - * compared the distribution of values against a random source. - * But that would be a different test. - * - * rurban: No, these tests are for non-prime hash tables, using only - * the lower 5-10 bits - * - * fwojcik: Collision counting did not previously reflect - * rurban's comment, as the code counted the sum of collisions - * across _all_ buckets. So if there are many more hashes than - * 2**nbBits, and the hash is even _slightly_ not broken, then - * every n-bit truncated hash value will appear at least once, in - * which case the "actual" value reported would always be - * (hashes.size() - 2**nbBits). Checking the results in doc/ - * confirms this. cyan's comment was correct. - * - * Collision counting has now been modified to report on the - * single bucket with the most collisions when fuller hash tables - * are being tested, and ReportCollisions() computes an - * appropriate "expected" statistic. - */ - - /* - * Compute the number of bits for a collision count of - * approximately 100. 
- */ - if (testHighBits || testLowBits) - { - int const hundredCollBits = FindMaxBits_TargetCollisionNb(nbH, 100, hashbits); - if (EstimateNbCollisions(nbH, hundredCollBits) >= 100) - nbBitsvec.push_back(hundredCollBits); - std::sort(nbBitsvec.rbegin(), nbBitsvec.rend()); - nbBitsvec.erase(std::unique(nbBitsvec.begin(), nbBitsvec.end()), nbBitsvec.end()); - } - - /* - * Each bit width value in nbBitsvec is explicitly reported on. If - * any of those values are less than the n*log(n) bound, then the - * bin with the most collisions will be reported on, otherwise the - * total sum of collisions across all bins will be reported on. - * - * But there are many more bit widths that a) are probably used in - * the real world, and b) we can now cheaply analyze and report - * on. Any bit width above the n*log(n) bound that has a - * reasonable number of expected collisions is worth analyzing, so - * that range of widths is computed here. - * - * This is slightly complicated by the fact that - * TestDistribution() may also get invoked, which does an - * RMSE-based comparison to the expected distribution over some - * range of bit width values. If that will be invoked, then - * there's no point in doubly-reporting on collision counts for - * those bit widths, so they get excluded here. - */ - std::vector testBitsvec; - int const nlognBits = GetNLogNBound(nbH); - int const minTBits = testDist ? std::max(MaxDistBits(nbH)+1, nlognBits) : nlognBits; - int const maxTBits = FindMaxBits_TargetCollisionNb(nbH, 10, hashbits - 1); - - if (testHighBits || testLowBits) - for (int i = minTBits; i <= maxTBits; i++) - testBitsvec.push_back(i); - - /* - * Given the range of hash sizes we care about, compute all - * collision counts for them, for high- and low-bits as requested. 
- */ - std::vector revhashes; - std::vector collcounts_fwd; - std::vector collcounts_rev; - int minBits, maxBits, threshBits; - - if (testHighBits || testLowBits) - { - std::vector combinedBitsvec; - combinedBitsvec.reserve(200); // Workaround for GCC bug 100366 - combinedBitsvec.insert(combinedBitsvec.begin(), nbBitsvec.begin(), nbBitsvec.end()); - combinedBitsvec.insert(combinedBitsvec.begin(), testBitsvec.begin(), testBitsvec.end()); - std::sort(combinedBitsvec.rbegin(), combinedBitsvec.rend()); - combinedBitsvec.erase(std::unique(combinedBitsvec.begin(), combinedBitsvec.end()), combinedBitsvec.end()); - ComputeCollBitBounds(combinedBitsvec, hashbits, nbH, minBits, maxBits, threshBits); - } - - if (testHighBits && (maxBits > 0)) - { - collcounts_fwd.reserve(maxBits - minBits + 1); - CountRangedNbCollisions(hashes, nbH, minBits, maxBits, threshBits, &collcounts_fwd[0]); - } - - if (testLowBits && (maxBits > 0)) - { - // reverse: bitwise flip the hashes. lowest bits first - revhashes.reserve(hashes.size()); - for(const auto hashval: hashes) - { - hashtype rev = hashval; - rev.reversebits(); - revhashes.push_back(rev); - } - blobsort(revhashes.begin(), revhashes.end()); - - collcounts_rev.reserve(maxBits - minBits + 1); - CountRangedNbCollisions(revhashes, nbH, minBits, maxBits, threshBits, &collcounts_rev[0]); - } - - addVCodeResult(collcount); - if (testHighBits && (collcounts_fwd.size() != 0)) { - addVCodeResult(&collcounts_fwd[0], sizeof(collcounts_fwd[0]) * - collcounts_fwd.size()); - } - if (testLowBits && (collcounts_rev.size() != 0)) { - addVCodeResult(&collcounts_rev[0], sizeof(collcounts_rev[0]) * - collcounts_rev.size()); - } - - // Report on complete collisions, now that the heavy lifting is complete - result &= ReportCollisions(nbH, collcount, hashbits, false, false, false, verbose, drawDiagram); - if(!result && drawDiagram) - { - PrintCollisions(collisions); - } - - if (testHighBits || testLowBits) - for(const int nbBits: nbBitsvec) - { - if ((nbBits 
< minBits) || (nbBits > maxBits)) - continue; - bool maxcoll = (nbBits <= threshBits) ? true : false; - if (testHighBits) - result &= ReportCollisions(nbH, collcounts_fwd[nbBits - minBits], nbBits, - maxcoll, true, true, true, drawDiagram); - if (testLowBits) - result &= ReportCollisions(nbH, collcounts_rev[nbBits - minBits], nbBits, - maxcoll, false, true, true, drawDiagram); - } - - if (testHighBits) - result &= ReportBitsCollisions(nbH, &collcounts_fwd[minTBits - minBits], minTBits, maxTBits, true, drawDiagram); - if (testLowBits) - result &= ReportBitsCollisions(nbH, &collcounts_rev[minTBits - minBits], minTBits, maxTBits, false, drawDiagram); - } - - //---------- - - if(testDist) - result &= TestDistribution(hashes,drawDiagram); - - return result; +template +bool TestHashList( std::vector & hashes, bool drawDiagram, bool testCollision, + bool testDist, bool testHighBits, bool testLowBits, bool verbose ) { + bool result = true; + + if (testCollision) { + unsigned const hashbits = sizeof(hashtype) * 8; + uint64_t const nbH = hashes.size(); + if (verbose) { + printf("Testing all collisions ( %3i-bit)", hashbits); + } + + addVCodeOutput(&hashes[0], sizeof(hashtype) * nbH); + + std::set collisions; + int collcount = FindCollisions(hashes, collisions, 1000, drawDiagram); + + /* + * Do all other compute-intensive stuff (as requested) before + * displaying any results from FindCollisions, to be a little bit + * more human-friendly. + */ + + std::vector nbBitsvec = { 224, 160, 128, 64, 32, 12, 8, }; + /* + * cyan: The 12- and -8-bit tests are too small : tables are necessarily saturated. + * It would be better to count the nb of collisions per Cell, and + * compared the distribution of values against a random source. + * But that would be a different test. 
+ * + * rurban: No, these tests are for non-prime hash tables, using only + * the lower 5-10 bits + * + * fwojcik: Collision counting did not previously reflect + * rurban's comment, as the code counted the sum of collisions + * across _all_ buckets. So if there are many more hashes than + * 2**nbBits, and the hash is even _slightly_ not broken, then + * every n-bit truncated hash value will appear at least once, in + * which case the "actual" value reported would always be + * (hashes.size() - 2**nbBits). Checking the results in doc/ + * confirms this. cyan's comment was correct. + * + * Collision counting has now been modified to report on the + * single bucket with the most collisions when fuller hash tables + * are being tested, and ReportCollisions() computes an + * appropriate "expected" statistic. + */ + + /* + * Compute the number of bits for a collision count of + * approximately 100. + */ + if (testHighBits || testLowBits) { + int const hundredCollBits = FindMaxBits_TargetCollisionNb(nbH, 100, hashbits); + if (EstimateNbCollisions(nbH, hundredCollBits) >= 100) { + nbBitsvec.push_back(hundredCollBits); + } + std::sort(nbBitsvec.rbegin(), nbBitsvec.rend()); + nbBitsvec.erase(std::unique(nbBitsvec.begin(), nbBitsvec.end()), nbBitsvec.end()); + } + + /* + * Each bit width value in nbBitsvec is explicitly reported on. If + * any of those values are less than the n*log(n) bound, then the + * bin with the most collisions will be reported on, otherwise the + * total sum of collisions across all bins will be reported on. + * + * But there are many more bit widths that a) are probably used in + * the real world, and b) we can now cheaply analyze and report + * on. Any bit width above the n*log(n) bound that has a + * reasonable number of expected collisions is worth analyzing, so + * that range of widths is computed here. 
+ * + * This is slightly complicated by the fact that + * TestDistribution() may also get invoked, which does an + * RMSE-based comparison to the expected distribution over some + * range of bit width values. If that will be invoked, then + * there's no point in doubly-reporting on collision counts for + * those bit widths, so they get excluded here. + */ + std::vector testBitsvec; + int const nlognBits = GetNLogNBound(nbH); + int const minTBits = testDist ? std::max(MaxDistBits(nbH) + 1, nlognBits) : nlognBits; + int const maxTBits = FindMaxBits_TargetCollisionNb(nbH, 10, hashbits - 1); + + if (testHighBits || testLowBits) { + for (int i = minTBits; i <= maxTBits; i++) { + testBitsvec.push_back(i); + } + } + + /* + * Given the range of hash sizes we care about, compute all + * collision counts for them, for high- and low-bits as requested. + */ + std::vector revhashes; + std::vector collcounts_fwd; + std::vector collcounts_rev; + int minBits, maxBits, threshBits; + + if (testHighBits || testLowBits) { + std::vector combinedBitsvec; + combinedBitsvec.reserve(200); // Workaround for GCC bug 100366 + combinedBitsvec.insert(combinedBitsvec.begin(), nbBitsvec.begin() , nbBitsvec.end() ); + combinedBitsvec.insert(combinedBitsvec.begin(), testBitsvec.begin(), testBitsvec.end()); + std::sort(combinedBitsvec.rbegin(), combinedBitsvec.rend()); + combinedBitsvec.erase(std::unique(combinedBitsvec.begin(), combinedBitsvec.end()), combinedBitsvec.end()); + ComputeCollBitBounds(combinedBitsvec, hashbits, nbH, minBits, maxBits, threshBits); + } + + if (testHighBits && (maxBits > 0)) { + collcounts_fwd.reserve(maxBits - minBits + 1); + CountRangedNbCollisions(hashes, nbH, minBits, maxBits, threshBits, &collcounts_fwd[0]); + } + + if (testLowBits && (maxBits > 0)) { + // reverse: bitwise flip the hashes. 
lowest bits first + revhashes.reserve(hashes.size()); + for (const auto hashval: hashes) { + hashtype rev = hashval; + rev.reversebits(); + revhashes.push_back(rev); + } + blobsort(revhashes.begin(), revhashes.end()); + + collcounts_rev.reserve(maxBits - minBits + 1); + CountRangedNbCollisions(revhashes, nbH, minBits, maxBits, threshBits, &collcounts_rev[0]); + } + + addVCodeResult(collcount); + if (testHighBits && (collcounts_fwd.size() != 0)) { + addVCodeResult(&collcounts_fwd[0], sizeof(collcounts_fwd[0]) * + collcounts_fwd.size()); + } + if (testLowBits && (collcounts_rev.size() != 0)) { + addVCodeResult(&collcounts_rev[0], sizeof(collcounts_rev[0]) * + collcounts_rev.size()); + } + + // Report on complete collisions, now that the heavy lifting is complete + result &= ReportCollisions(nbH, collcount, hashbits, false, false, false, verbose, drawDiagram); + if (!result && drawDiagram) { + PrintCollisions(collisions); + } + + if (testHighBits || testLowBits) { + for (const int nbBits: nbBitsvec) { + if ((nbBits < minBits) || (nbBits > maxBits)) { + continue; + } + bool maxcoll = (nbBits <= threshBits) ? 
true : false; + if (testHighBits) { + result &= ReportCollisions(nbH, collcounts_fwd[nbBits - minBits], + nbBits, maxcoll, true, true, true, drawDiagram); + } + if (testLowBits) { + result &= ReportCollisions(nbH, collcounts_rev[nbBits - minBits], + nbBits, maxcoll, false, true, true, drawDiagram); + } + } + } + + if (testHighBits) { + result &= ReportBitsCollisions(nbH, &collcounts_fwd[minTBits - minBits], + minTBits, maxTBits, true, drawDiagram); + } + if (testLowBits) { + result &= ReportBitsCollisions(nbH, &collcounts_rev[minTBits - minBits], + minTBits, maxTBits, false, drawDiagram); + } + } + + //---------- + + if (testDist) { + result &= TestDistribution(hashes, drawDiagram); + } + + return result; } INSTANTIATE(TestHashList, HASHTYPELIST); @@ -804,89 +810,81 @@ INSTANTIATE(TestHashList, HASHTYPELIST); // I'm not sure it's that useful (and hash functions that fail this test but // pass the normal distribution test still work well in practice) -template < typename hashtype > -double TestDistributionBytepairs ( std::vector & hashes, bool drawDiagram ) -{ - const int nbytes = sizeof(hashtype); - const int hashbits = nbytes * 8; +template +double TestDistributionBytepairs( std::vector & hashes, bool drawDiagram ) { + const int nbytes = sizeof(hashtype); + const int hashbits = nbytes * 8; + + const int nbins = 65536; - const int nbins = 65536; + std::vector bins( nbins, 0 ); - std::vector bins(nbins,0); + double worst = 0; - double worst = 0; + for (int a = 0; a < hashbits; a++) { + if (drawDiagram) { if ((a % 8 == 0) && (a > 0)) { printf("\n"); } } - for(int a = 0; a < hashbits; a++) - { - if(drawDiagram) if((a % 8 == 0) && (a > 0)) printf("\n"); + if (drawDiagram) { printf("["); } - if(drawDiagram) printf("["); + for (int b = 0; b < hashbits; b++) { + if (drawDiagram) { if ((b % 8 == 0) && (b > 0)) { printf(" "); } } - for(int b = 0; b < hashbits; b++) - { - if(drawDiagram) if((b % 8 == 0) && (b > 0)) printf(" "); + bins.clear(); + bins.resize(nbins, 0); - 
bins.clear(); - bins.resize(nbins,0); + for (uint64_t i = 0; i < hashes.size(); i++) { + uint32_t pa = window(hashes[i], a, 8); + uint32_t pb = window(hashes[i], b, 8); - for(uint64_t i = 0; i < hashes.size(); i++) - { - uint32_t pa = window(hashes[i],a,8); - uint32_t pb = window(hashes[i],b,8); + bins[pa | (pb << 8)]++; + } - bins[pa | (pb << 8)]++; - } + double s = calcScore(bins, nbins, hashes.size()); - double s = calcScore(bins,nbins,hashes.size()); + if (drawDiagram) { plot(s); } - if(drawDiagram) plot(s); + if (s > worst) { + worst = s; + } + } - if(s > worst) - { - worst = s; - } + if (drawDiagram) { printf("]\n"); } } - if(drawDiagram) printf("]\n"); - } - - return worst; + return worst; } //----------------------------------------------------------------------------- // Simplified test - only check 64k distributions, and only on byte boundaries -template < typename hashtype > -void TestDistributionFast ( std::vector & hashes, double & dworst, double & davg ) -{ - const int hashbits = sizeof(hashtype) * 8; - const int nbins = 65536; +template +void TestDistributionFast( std::vector & hashes, double & dworst, double & davg ) { + const int hashbits = sizeof(hashtype) * 8; + const int nbins = 65536; - std::vector bins(nbins,0); + std::vector bins( nbins, 0 ); - dworst = -1.0e90; - davg = 0; + dworst = -1.0e90; + davg = 0; - for(int start = 0; start < hashbits; start += 8) - { - bins.clear(); - bins.resize(nbins,0); + for (int start = 0; start < hashbits; start += 8) { + bins.clear(); + bins.resize(nbins, 0); - for(uint64_t j = 0; j < hashes.size(); j++) - { - uint32_t index = window(hashes[j],start,16); + for (uint64_t j = 0; j < hashes.size(); j++) { + uint32_t index = window(hashes[j], start, 16); - bins[index]++; - } + bins[index]++; + } - double n = calcScore(&bins.front(),nbins,(int)hashes.size()); + double n = calcScore(&bins.front(), nbins, (int)hashes.size()); - davg += n; + davg += n; - if(n > dworst) dworst = n; - } + if (n > dworst) { dworst = n; } 
+ } - davg /= double(hashbits/8); + davg /= double(hashbits / 8); } //----------------------------------------------------------------------------- diff --git a/util/Analyze.h b/util/Analyze.h index c5abdd78..7991f544 100644 --- a/util/Analyze.h +++ b/util/Analyze.h @@ -50,19 +50,15 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ -bool ReportBias(const int biascnt, const int coinflips, const int trials, const bool drawDiagram); +bool ReportBias( const int biascnt, const int coinflips, const int trials, const bool drawDiagram ); -template < typename hashtype > -unsigned int FindCollisions ( std::vector & hashes, - std::set & collisions, - int maxCollisions = 1000, - bool drawDiagram = false); +template +unsigned int FindCollisions( std::vector & hashes, std::set & collisions, + int maxCollisions = 1000, bool drawDiagram = false ); -template < typename hashtype > -void PrintCollisions(std::set & collisions); +template +void PrintCollisions( std::set & collisions ); -template < typename hashtype > -bool TestHashList ( std::vector & hashes, bool drawDiagram, - bool testCollision = true, bool testDist = true, - bool testHighBits = true, bool testLowBits = true, - bool verbose = true ); +template +bool TestHashList( std::vector & hashes, bool drawDiagram, bool testCollision = true, bool testDist = true, + bool testHighBits = true, bool testLowBits = true, bool verbose = true ); diff --git a/util/Blob.h b/util/Blob.h index 344775e3..50f667f4 100644 --- a/util/Blob.h +++ b/util/Blob.h @@ -49,302 +49,311 @@ extern const uint32_t hzb[256]; //----------------------------------------------------------------------------- -#define _bytes ((_bits+7)/8) -template < unsigned _bits > +#define _bytes ((_bits + 7) / 8) +template class Blob { + public: + //---------- + // constructors -public: - //---------- - // constructors + Blob() { + memset(bytes, 0, sizeof(bytes)); + } + + Blob( const void * p, size_t len ) { + len = 
std::min(len, sizeof(bytes)); + memcpy(bytes, p, len); + memset(&bytes[len], 0, sizeof(bytes) - len); + } + + Blob( uint64_t x ) : + Blob((x = COND_BSWAP( x, isBE()), &x), sizeof(x)) {} + + //---------- + // unary operators + + uint8_t & operator [] ( int i ) { + // assert(i < sizeof(bytes)); + return bytes[i]; + } + + const uint8_t & operator [] ( int i ) const { + // assert(i < sizeof(bytes)); + return bytes[i]; + } + + Blob & operator = ( const Blob & k ) { + memcpy(bytes, k.bytes, sizeof(bytes)); + return *this; + } + + Blob & operator = ( const uint32_t & x ) { + const uint32_t y = COND_BSWAP(x, isBE()); + + memcpy(bytes, &y, sizeof(y)); + return *this; + } + + //---------- + // boolean operators + + bool operator < ( const Blob & k ) const { + for (int i = sizeof(bytes) - 1; i >= 0; i--) { + if (bytes[i] < k.bytes[i]) { return true; } + if (bytes[i] > k.bytes[i]) { return false; } + } + return false; + } + + bool operator == ( const Blob & k ) const { + int r = memcmp(&bytes[0], &k.bytes[0], sizeof(bytes)); + + return (r == 0) ? 
true : false; + } + + bool operator != ( const Blob & k ) const { + return !(*this == k); + } + + //---------- + // bitwise operations + + Blob operator ^ ( const Blob & k ) const { + Blob t; + + for (size_t i = 0; i < sizeof(bytes); i++) { + t.bytes[i] = bytes[i] ^ k.bytes[i]; + } + + return t; + } + + Blob & operator ^= ( const Blob & k ) { + for (size_t i = 0; i < sizeof(bytes); i++) { + bytes[i] ^= k.bytes[i]; + } + return *this; + } + + //---------- + // interface + + FORCE_INLINE uint8_t getbit( size_t bit ) const { + return _getbit(bit, bytes, sizeof(bytes)); + } + + FORCE_INLINE void printhex( const char * prefix = "" ) const { + _printhex(prefix, bytes, sizeof(bytes)); + } + + FORCE_INLINE void printbits( const char * prefix = "" ) const { + _printbits(prefix, bytes, sizeof(bytes)); + } + + FORCE_INLINE uint32_t highzerobits( void ) const { + return _highzerobits(bytes, sizeof(bytes)); + } - Blob() { - memset(bytes, 0, sizeof(bytes)); - } + FORCE_INLINE uint32_t window( size_t start, size_t count ) const { + return _window(start, count, bytes, sizeof(bytes)); + } + + FORCE_INLINE void flipbit( size_t bit ) { + _flipbit(bit, bytes, sizeof(bytes)); + } + + FORCE_INLINE void reversebits( void ) { + _reversebits(bytes, sizeof(bytes)); + } - Blob(const void * p, size_t len) { - len = std::min(len, sizeof(bytes)); - memcpy(bytes, p, len); - memset(&bytes[len], 0, sizeof(bytes) - len); - } + FORCE_INLINE void lrot( size_t c ) { + _lrot(c, bytes, sizeof(bytes)); + } + + protected: + //---------- + // implementations + + static FORCE_INLINE uint8_t _getbit( size_t bit, const uint8_t * bytes, const size_t len ) { + size_t byte = bit >> 3; + + bit &= 7; + if (byte > len) { return 0; } + return (bytes[byte] >> bit) & 1; + } + + static void _printhex( const char * prefix, const uint8_t * bytes, const size_t len ) { + const size_t buflen = 4 + 2 * len + ((len + 3) / 4); + char buf[buflen]; + char * p; + + buf[0] = '['; + buf[1] = ' '; + // Space preceding the closing 
']' gets added by the loop below + buf[buflen - 2] = ']'; + buf[buflen - 1] = '\0'; + + // Print using MSB-first notation + p = &buf[2]; + for (size_t i = len; i != 0; i--) { + uint8_t vh = (bytes[i - 1] >> 4); + uint8_t vl = (bytes[i - 1] & 15); + *p++ = vh + ((vh <= 9) ? '0' : 'W'); // 'W' + 10 == 'a' + *p++ = vl + ((vl <= 9) ? '0' : 'W'); + if ((i & 3) == 1) { + *p++ = ' '; + } + } + + printf("%s%s\n", prefix, buf); + } + + static void _printbits( const char * prefix, const uint8_t * bytes, const size_t len ) { + const size_t buflen = 4 + 9 * len; + char buf[buflen]; + char * p; + + buf[0] = '['; + buf[1] = ' '; + // Space preceding the closing ']' gets added by the loop below + buf[buflen - 2] = ']'; + buf[buflen - 1] = '\0'; + + // Print using MSB-first notation + p = &buf[2]; + for (size_t i = len; i != 0; i--) { + uint8_t v = bytes[i - 1]; + for (int j = 7; j >= 0; j--) { + *p++ = (v & (1 << j)) ? '1' : '0'; + } + *p++ = ' '; + } + + printf("%s%s\n", prefix, buf); + } + + static FORCE_INLINE uint32_t _highzerobits( const uint8_t * bytes, const size_t len ) { + uint32_t zb = 0; + + for (ssize_t i = len - 1; i >= 0; i--) { + zb += hzb[bytes[i]]; + if (bytes[i] != 0) { + break; + } + } + return zb; + } + + // Bit-windowing function. + // Select some N-bit subset of the Blob, where N <= 24. 
+ static FORCE_INLINE uint32_t _window( size_t start, size_t count, const uint8_t * bytes, const size_t len ) { + assume(count <= 24); + const size_t bitlen = 8 * len; + const uint32_t mask = (1 << count) - 1; + uint32_t v; + + if (start <= (bitlen - 25)) { + memcpy(&v, &bytes[start >> 3], 4); + v = COND_BSWAP(v, isBE()); + v >>= (start & 7 ); + } else { + memcpy(&v, &bytes[len - 4], 4); + v = COND_BSWAP(v, isBE()); + v >>= 32 + start - bitlen; + if ((start + count) > bitlen) { + uint32_t v2; + memcpy(&v2, bytes, 4); + v2 = COND_BSWAP(v2, isBE()); + v2 <<= bitlen - start; + v |= v2; + } + } + return v & mask; + } - Blob(uint64_t x) : - Blob((x = COND_BSWAP(x, isBE()), &x), sizeof(x)) {}; + static FORCE_INLINE void _flipbit( size_t bit, uint8_t * bytes, const size_t len ) { + const size_t byteoffset = bit >> 3; + const size_t bitoffset = bit & 7; - //---------- - // unary operators + if (byteoffset < len) { + bytes[byteoffset] ^= (1 << bitoffset); + } + } - uint8_t & operator [] (int i) { - //assert(i < sizeof(bytes)); - return bytes[i]; - } + // from the "Bit Twiddling Hacks" webpage + static FORCE_INLINE uint8_t _byterev( uint8_t b ) { + return ((b * UINT64_C(0x0802) & UINT64_C(0x22110)) | + (b * UINT64_C(0x8020) & UINT64_C(0x88440))) * UINT64_C(0x10101) >> 16; + } - const uint8_t & operator [](int i) const { - //assert(i < sizeof(bytes)); - return bytes[i]; - } + // 0xf00f1001 => 0x8008f00f + static FORCE_INLINE void _reversebits( uint8_t * bytes, const size_t len ) { + uint8_t tmp[len]; - Blob & operator = (const Blob & k) { - memcpy(bytes, k.bytes, sizeof(bytes)); - return *this; - } + for (size_t i = 0; i < len; i++) { + tmp[len - i - 1] = _byterev(bytes[i]); + } + memcpy(bytes, tmp, len); + } - Blob & operator = (const uint32_t & x) { - const uint32_t y = COND_BSWAP(x, isBE()); - memcpy(bytes, &y, sizeof(y)); - return *this; - } + static void _lrot( size_t c, uint8_t * bytes, const size_t len ) { + const size_t byteoffset = c >> 3; + const size_t bitoffset = 
c & 7; + uint8_t tmp[len]; + + for (size_t i = 0; i < len; i++) { + tmp[(i + byteoffset) % len] = bytes[i]; + } + if (bitoffset == 0) { + memcpy(bytes, tmp, len); + } else { + for (size_t i = 0; i < len; i++) { + uint8_t a = tmp[i]; + uint8_t b = (i == 0) ? tmp[len - 1] : tmp[i - 1]; + bytes[i] = (a << bitoffset) | (b >> (8 - bitoffset)); + } + } + } - //---------- - // boolean operators - - bool operator < (const Blob & k) const { - for(int i = sizeof(bytes) -1; i >= 0; i--) { - if(bytes[i] < k.bytes[i]) return true; - if(bytes[i] > k.bytes[i]) return false; - } - return false; - } - - bool operator == ( const Blob & k ) const { - int r = memcmp(&bytes[0], &k.bytes[0], sizeof(bytes)); - return (r == 0) ? true : false; - } - - bool operator != ( const Blob & k ) const { - return !(*this == k); - } - - //---------- - // bitwise operations - - Blob operator ^ (const Blob & k) const { - Blob t; - - for(size_t i = 0; i < sizeof(bytes); i++) { - t.bytes[i] = bytes[i] ^ k.bytes[i]; - } - - return t; - } - - Blob & operator ^= (const Blob & k) { - for(size_t i = 0; i < sizeof(bytes); i++) { - bytes[i] ^= k.bytes[i]; - } - return *this; - } - - //---------- - // interface - - FORCE_INLINE uint8_t getbit(size_t bit) const { - return _getbit(bit, bytes, sizeof(bytes)); - } - - FORCE_INLINE void printhex(const char * prefix = "") const { - _printhex(prefix, bytes, sizeof(bytes)); - } - - FORCE_INLINE void printbits(const char * prefix = "") const { - _printbits(prefix, bytes, sizeof(bytes)); - } - - FORCE_INLINE uint32_t highzerobits(void) const { - return _highzerobits(bytes, sizeof(bytes)); - } - - FORCE_INLINE uint32_t window(size_t start, size_t count) const { - return _window(start, count, bytes, sizeof(bytes)); - } - - FORCE_INLINE void flipbit(size_t bit) { - _flipbit(bit, bytes, sizeof(bytes)); - } - - FORCE_INLINE void reversebits(void) { - _reversebits(bytes, sizeof(bytes)); - } - - FORCE_INLINE void lrot(size_t c) { - _lrot(c, bytes, sizeof(bytes)); - } - 
-protected: - //---------- - // implementations - - static FORCE_INLINE uint8_t _getbit(size_t bit, const uint8_t * bytes, const size_t len) { - size_t byte = bit >> 3; - bit &= 7; - if (byte > len) return 0; - return (bytes[byte] >> bit) & 1; - } - - static void _printhex(const char * prefix, const uint8_t * bytes, const size_t len) { - const size_t buflen = 4 + 2 * len + ((len + 3) / 4); - char buf[buflen]; - char * p; - - buf[0] = '['; - buf[1] = ' '; - // Space preceding the closing ']' gets added by the loop below - buf[buflen - 2] = ']'; - buf[buflen - 1] = '\0'; - - // Print using MSB-first notation - p = &buf[2]; - for (size_t i = len; i != 0; i--) { - uint8_t vh = (bytes[i - 1] >> 4); - uint8_t vl = (bytes[i - 1] & 15); - *p++ = vh + ((vh <= 9) ? '0' : 'W'); // 'W' + 10 == 'a' - *p++ = vl + ((vl <= 9) ? '0' : 'W'); - if ((i & 3) == 1) { - *p++ = ' '; - } - } - - printf("%s%s\n", prefix, buf); - } - - static void _printbits(const char * prefix, const uint8_t * bytes, const size_t len) { - const size_t buflen = 4 + 9 * len; - char buf[buflen]; - char * p; - - buf[0] = '['; - buf[1] = ' '; - // Space preceding the closing ']' gets added by the loop below - buf[buflen - 2] = ']'; - buf[buflen - 1] = '\0'; - - // Print using MSB-first notation - p = &buf[2]; - for (size_t i = len; i != 0; i--) { - uint8_t v = bytes[i - 1]; - for (int j = 7; j >= 0; j--) { - *p++ = (v & (1 << j)) ? '1' : '0'; - } - *p++ = ' '; - } - - printf("%s%s\n", prefix, buf); - } - - static FORCE_INLINE uint32_t _highzerobits(const uint8_t * bytes, const size_t len) { - uint32_t zb = 0; - for (ssize_t i = len - 1; i >= 0; i--) { - zb += hzb[bytes[i]]; - if (bytes[i] != 0) { - break; - } - } - return zb; - } - - // Bit-windowing function. - // Select some N-bit subset of the Blob, where N <= 24. 
- static FORCE_INLINE uint32_t _window(size_t start, size_t count, const uint8_t * bytes, const size_t len) { - assume(count <= 24); - const size_t bitlen = 8 * len; - const uint32_t mask = (1 << count) - 1; - uint32_t v; - - if (start <= (bitlen - 25)) { - memcpy(&v, &bytes[start >> 3], 4); - v = COND_BSWAP(v, isBE()); - v >>= (start & 7); - } else { - memcpy(&v, &bytes[len - 4], 4); - v = COND_BSWAP(v, isBE()); - v >>= 32 + start - bitlen; - if ((start + count) > bitlen) { - uint32_t v2; - memcpy(&v2, bytes, 4); - v2 = COND_BSWAP(v2, isBE()); - v2 <<= bitlen - start; - v |= v2; - } - } - return v & mask; - } - - static FORCE_INLINE void _flipbit(size_t bit, uint8_t * bytes, const size_t len) { - const size_t byteoffset = bit >> 3; - const size_t bitoffset = bit & 7; - if (byteoffset < len) { - bytes[byteoffset] ^= (1 << bitoffset); - } - } - - // from the "Bit Twiddling Hacks" webpage - static FORCE_INLINE uint8_t _byterev(uint8_t b) { - return ((b * UINT64_C(0x0802) & UINT64_C(0x22110)) | - (b * UINT64_C(0x8020) & UINT64_C(0x88440))) * UINT64_C(0x10101) >> 16; - } - - // 0xf00f1001 => 0x8008f00f - static FORCE_INLINE void _reversebits(uint8_t * bytes, const size_t len) { - uint8_t tmp[len]; - - for (size_t i = 0; i < len; i++) - tmp[len - i - 1] = _byterev(bytes[i]); - memcpy(bytes, tmp, len); - } - - static void _lrot(size_t c, uint8_t * bytes, const size_t len) { - const size_t byteoffset = c >> 3; - const size_t bitoffset = c & 7; - uint8_t tmp[len]; - - for (size_t i = 0; i < len; i++) { - tmp[(i + byteoffset) % len] = bytes[i]; - } - if (bitoffset == 0) { - memcpy(bytes, tmp, len); - } else { - for (size_t i = 0; i < len; i++) { - uint8_t a = tmp[i]; - uint8_t b = (i == 0) ? 
tmp[len - 1] : tmp[i - 1]; - bytes[i] = (a << bitoffset) | (b >> (8 - bitoffset)); - } - } - } - -private: - //---------- - uint8_t bytes[_bytes]; -}; + private: + //---------- + uint8_t bytes[_bytes]; +}; // class Blob // from the "Bit Twiddling Hacks" webpage -template<> FORCE_INLINE void Blob<32>::reversebits(void) { +template <> +FORCE_INLINE void Blob<32>::reversebits( void ) { uint32_t v = GET_U32(bytes, 0); + // swap odd and even bits - v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); + v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); // swap consecutive pairs - v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); + v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); // swap nibbles ... - v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); + v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); // swap bytes - v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); + v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); // swap 2-byte long pairs - v = ( v >> 16 ) | ( v << 16); + v = ((v >> 16) ) | ((v) << 16); PUT_U32(v, bytes, 0); } -template<> FORCE_INLINE void Blob<64>::reversebits(void) { +template <> +FORCE_INLINE void Blob<64>::reversebits( void ) { uint64_t v = GET_U64(bytes, 0); + // swap odd and even bits - v = ((v >> 1) & UINT64_C(0x5555555555555555)) | ((v & UINT64_C(0x5555555555555555)) << 1); + v = ((v >> 1) & UINT64_C(0x5555555555555555)) | ((v & UINT64_C(0x5555555555555555)) << 1); // swap consecutive pairs - v = ((v >> 2) & UINT64_C(0x3333333333333333)) | ((v & UINT64_C(0x3333333333333333)) << 2); + v = ((v >> 2) & UINT64_C(0x3333333333333333)) | ((v & UINT64_C(0x3333333333333333)) << 2); // swap nibbles ... 
- v = ((v >> 4) & UINT64_C(0x0F0F0F0F0F0F0F0F)) | ((v & UINT64_C(0x0F0F0F0F0F0F0F0F)) << 4); + v = ((v >> 4) & UINT64_C(0x0F0F0F0F0F0F0F0F)) | ((v & UINT64_C(0x0F0F0F0F0F0F0F0F)) << 4); // swap bytes - v = ((v >> 8) & UINT64_C(0x00FF00FF00FF00FF)) | ((v & UINT64_C(0x00FF00FF00FF00FF)) << 8); + v = ((v >> 8) & UINT64_C(0x00FF00FF00FF00FF)) | ((v & UINT64_C(0x00FF00FF00FF00FF)) << 8); // swap 2-byte long pairs v = ((v >> 16) & UINT64_C(0x0000FFFF0000FFFF)) | ((v & UINT64_C(0x0000FFFF0000FFFF)) << 16); // swap 4-byte long pairs - v = ( v >> 32 ) | ( v << 32); + v = ((v >> 32)) | ((v) << 32); PUT_U64(v, bytes, 0); } @@ -352,73 +361,72 @@ template<> FORCE_INLINE void Blob<64>::reversebits(void) { // Blob-like class for externally managed buffers. // The operator overloads of Blob<> are made private, and so are not exposed. -typedef void * voidptr_t; +typedef void * voidptr_t; typedef const void * constvoidptr_t; class ExtBlob : private Blob<0> { + public: + //---------- + // constructors -public: - //---------- - // constructors - - ExtBlob(uint8_t * p, size_t l) { - ptr = p; - len = l; - } + ExtBlob( uint8_t * p, size_t l ) { + ptr = p; + len = l; + } - ExtBlob(uint8_t * p, const uint8_t * i, size_t l) { - ptr = p; - len = l; - memcpy(ptr, i, len); - } + ExtBlob( uint8_t * p, const uint8_t * i, size_t l ) { + ptr = p; + len = l; + memcpy(ptr, i, len); + } - //---------- - // conversion operators + //---------- + // conversion operators - operator voidptr_t () const { - return (voidptr_t)ptr; - } + operator voidptr_t () const { + return (voidptr_t)ptr; + } - operator constvoidptr_t () const { - return (constvoidptr_t)ptr; - } + operator constvoidptr_t () const { + return (constvoidptr_t)ptr; + } - //---------- - // interface + //---------- + // interface - FORCE_INLINE uint8_t getbit(size_t bit) const { - return _getbit(bit, ptr, len); - } + FORCE_INLINE uint8_t getbit( size_t bit ) const { + return _getbit(bit, ptr, len); + } - FORCE_INLINE void printhex(const char * 
prefix = "") const { - _printhex(prefix, ptr, len); - } + FORCE_INLINE void printhex( const char * prefix = "" ) const { + _printhex(prefix, ptr, len); + } - FORCE_INLINE void printbits(const char * prefix = "") const { - _printbits(prefix, ptr, len); - } + FORCE_INLINE void printbits( const char * prefix = "" ) const { + _printbits(prefix, ptr, len); + } - FORCE_INLINE uint32_t highzerobits(void) const { - return _highzerobits(ptr, len); - } + FORCE_INLINE uint32_t highzerobits( void ) const { + return _highzerobits(ptr, len); + } - FORCE_INLINE uint32_t window(size_t start, size_t count) const { - return _window(start, count, ptr, len); - } + FORCE_INLINE uint32_t window( size_t start, size_t count ) const { + return _window(start, count, ptr, len); + } - FORCE_INLINE void flipbit(size_t bit) { - _flipbit(bit, ptr, len); - } + FORCE_INLINE void flipbit( size_t bit ) { + _flipbit(bit, ptr, len); + } - FORCE_INLINE void reversebits(void) { - _reversebits(ptr, len); - } + FORCE_INLINE void reversebits( void ) { + _reversebits(ptr, len); + } - FORCE_INLINE void lrot(size_t c) { - _lrot(c, ptr, len); - } + FORCE_INLINE void lrot( size_t c ) { + _lrot(c, ptr, len); + } -private: + private: uint8_t * ptr; - size_t len; -}; + size_t len; +}; // class ExtBlob diff --git a/util/Blobsort.cpp b/util/Blobsort.cpp index f2942095..2b180098 100644 --- a/util/Blobsort.cpp +++ b/util/Blobsort.cpp @@ -29,177 +29,177 @@ // Blob sorting routine unit tests static const uint32_t SORT_TESTS = 19; -static const uint32_t TEST_SIZE = 100000; +static const uint32_t TEST_SIZE = 100000; -template < typename blobtype > -static void blobfill(std::vector & blobs, int testnum) { - if (testnum >= SORT_TESTS) { return ; } +template +static void blobfill( std::vector & blobs, int testnum ) { + if (testnum >= SORT_TESTS) { return; } - Rand r(testnum + 0xb840a149); + Rand r( testnum + 0xb840a149 ); - switch(testnum) { - case 0: // Consecutive numbers - case 1: // Consecutive numbers, sorted almost - 
case 2: // Consecutive numbers, scrambled - { - for (uint32_t n = 0; n < TEST_SIZE; n++) { - blobs[n] = n; - } - break; + switch (testnum) { + case 0: // Consecutive numbers + case 1: // Consecutive numbers, sorted almost + case 2: // Consecutive numbers, scrambled + { + for (uint32_t n = 0; n < TEST_SIZE; n++) { + blobs[n] = n; } - case 3: // Consecutive numbers, backwards - { - for (uint32_t n = 0; n < TEST_SIZE; n++) { - blobs[n] = TEST_SIZE - 1 - n; - } - break; + break; + } + case 3: // Consecutive numbers, backwards + { + for (uint32_t n = 0; n < TEST_SIZE; n++) { + blobs[n] = TEST_SIZE - 1 - n; } - case 4: // Random numbers - case 5: // Random numbers, sorted - case 6: // Random numbers, sorted almost - case 7: // Random numbers, sorted backwards + break; + } + case 4: // Random numbers + case 5: // Random numbers, sorted + case 6: // Random numbers, sorted almost + case 7: // Random numbers, sorted backwards case 10: // All zero bytes in LSB position case 11: // All zero bytes in MSB position case 12: // All zero bytes in LSB+1 position case 13: // All zero bytes in MSB-1 position case 14: // Random numbers, except each position has some missing bytes - { - for (uint32_t n = 0; n < TEST_SIZE; n++) { - r.rand_p(&blobs[n], sizeof(blobtype)); - } - break; + { + for (uint32_t n = 0; n < TEST_SIZE; n++) { + r.rand_p(&blobs[n], sizeof(blobtype)); } - case 8: // Many duplicates - { - uint32_t x = 0; - do { - r.rand_p(&blobs[x], sizeof(blobtype)); - uint32_t count = 1 + r.rand_range(TEST_SIZE - 1 - x); - for (uint32_t i = 1; i < count; i++) { - blobs[x + i] = blobs[x]; - } - x += count; - } while (x < TEST_SIZE); - break; - } - case 9: // All duplicates - { - r.rand_p(&blobs[0], sizeof(blobtype)); - for (uint32_t i = 1; i < TEST_SIZE; i++) { - blobs[i] = blobs[0]; + break; + } + case 8: // Many duplicates + { + uint32_t x = 0; + do { + r.rand_p(&blobs[x], sizeof(blobtype)); + uint32_t count = 1 + r.rand_range(TEST_SIZE - 1 - x); + for (uint32_t i = 1; i < count; 
i++) { + blobs[x + i] = blobs[x]; } - break; + x += count; + } while (x < TEST_SIZE); + break; + } + case 9: // All duplicates + { + r.rand_p(&blobs[0], sizeof(blobtype)); + for (uint32_t i = 1; i < TEST_SIZE; i++) { + blobs[i] = blobs[0]; } + break; + } case 15: // All zeroes - { - memset(&blobs[0], 0, TEST_SIZE * sizeof(blobtype)); - break; - } + { + memset(&blobs[0], 0, TEST_SIZE * sizeof(blobtype)); + break; + } case 16: // All ones - { - for (uint32_t i = 0; i < TEST_SIZE; i++) { - blobs[i] = 1; - } - break; + { + for (uint32_t i = 0; i < TEST_SIZE; i++) { + blobs[i] = 1; } + break; + } case 17: // All Fs - { - memset(&blobs[0], 0xFF, TEST_SIZE * sizeof(blobtype)); - break; - } + { + memset(&blobs[0], 0xFF, TEST_SIZE * sizeof(blobtype)); + break; + } case 18: // All 0xAAA and 0x555 - { - uint32_t i = 0; - do { - uint64_t rndnum = r.rand_u64(); - for (int j = 0; j < 64; j++) { - if (rndnum & 1) { - memset(&blobs[i], 0xAA, sizeof(blobtype)); - } else { - memset(&blobs[i], 0x55, sizeof(blobtype)); - } - i++; - rndnum >>= 1; - if (i == TEST_SIZE) { break; } + { + uint32_t i = 0; + do { + uint64_t rndnum = r.rand_u64(); + for (int j = 0; j < 64; j++) { + if (rndnum & 1) { + memset(&blobs[i], 0xAA, sizeof(blobtype)); + } else { + memset(&blobs[i], 0x55, sizeof(blobtype)); } - } while (i < TEST_SIZE); - break; - } + i++; + rndnum >>= 1; + if (i == TEST_SIZE) { break; } + } + } while (i < TEST_SIZE); + break; + } default: unreachable(); break; } - switch(testnum) { - // Sorted backwards - case 7: - { - std::sort(blobs.rbegin(), blobs.rend()); - break; - } - // Sorted - case 5: - case 6: - { - std::sort(blobs.begin(), blobs.end()); - if (testnum == 5) break; - } - // 6 is fallthrough to... 
- // "Almost sorted" == mix up a few entries - case 1: - { - for (uint32_t n = 0; n < TEST_SIZE / 1000; n++) { - std::swap(blobs[r.rand_range(TEST_SIZE)], - blobs[r.rand_range(TEST_SIZE)]); - } - break; + switch (testnum) { + // Sorted backwards + case 7: + { + std::sort(blobs.rbegin(), blobs.rend()); + break; + } + // Sorted + case 5: + case 6: + { + std::sort(blobs.begin(), blobs.end()); + if (testnum == 5) { break; } + } + // 6 is fallthrough to... + // "Almost sorted" == mix up a few entries + case 1: + { + for (uint32_t n = 0; n < TEST_SIZE / 1000; n++) { + std::swap(blobs[r.rand_range(TEST_SIZE)], blobs[r.rand_range(TEST_SIZE)]); } - // "Scrambled" == shuffle all the entries - case 2: - { - for (uint32_t n = TEST_SIZE - 1; n > 0; n--) { - std::swap(blobs[n], blobs[r.rand_range(n + 1)]); - } - break; + break; + } + // "Scrambled" == shuffle all the entries + case 2: + { + for (uint32_t n = TEST_SIZE - 1; n > 0; n--) { + std::swap(blobs[n], blobs[r.rand_range(n + 1)]); } - // Zero out bytes in some position + break; + } + // Zero out bytes in some position case 10: case 11: case 12: case 13: - { - uint32_t offset = (testnum == 10) ? 0 : - ((testnum == 11) ? (sizeof(blobtype) - 1) : - ((testnum == 12) ? 1 : (sizeof(blobtype) - 2))); - for (uint32_t n = 0; n < TEST_SIZE; n++) { - blobs[n][offset] = 0; - } - break; + { + uint32_t offset = (testnum == 10) ? 0 : + ((testnum == 11) ? (sizeof(blobtype) - 1) : + ((testnum == 12) ? 
1 : (sizeof(blobtype) - 2))); + for (uint32_t n = 0; n < TEST_SIZE; n++) { + blobs[n][offset] = 0; } - // Exclude a byte value from each position + break; + } + // Exclude a byte value from each position case 14: - { - uint8_t excludes[sizeof(blobtype)]; - r.rand_p(excludes, sizeof(excludes)); - for (uint32_t n = 0; n < TEST_SIZE; n++) { - for (uint32_t i = 0; i < sizeof(blobtype); i++) { - if (blobs[n][i] == excludes[i]) { - blobs[n][i] = ~excludes[i]; - } + { + uint8_t excludes[sizeof(blobtype)]; + r.rand_p(excludes, sizeof(excludes)); + for (uint32_t n = 0; n < TEST_SIZE; n++) { + for (uint32_t i = 0; i < sizeof(blobtype); i++) { + if (blobs[n][i] == excludes[i]) { + blobs[n][i] = ~excludes[i]; } } - break; } + break; + } default: break; } } -template < typename blobtype > -static bool blobverify(std::vector & blobs) { - bool passed = true; +template +static bool blobverify( std::vector & blobs ) { + bool passed = true; const size_t sz = blobs.size(); + for (size_t nb = 1; nb < sz; nb++) { if (!((blobs[nb - 1] < blobs[nb]) || - (blobs[nb - 1] == blobs[nb]))) { + (blobs[nb - 1] == blobs[nb]))) { passed = false; } if (blobs[nb] < blobs[nb - 1]) { @@ -210,16 +210,16 @@ static bool blobverify(std::vector & blobs) { return passed; } -template < typename blobtype > -static bool test_blobsort_type(void) { +template +static bool test_blobsort_type( void ) { bool passed = true; - std::vector blobs(TEST_SIZE); + std::vector blobs( TEST_SIZE ); for (int i = 0; i < SORT_TESTS; i++) { blobfill(blobs, i); blobsort(blobs.begin(), blobs.end()); passed &= blobverify(blobs); - //printf("After test %d: %s\n", i, passed ? "ok" : "no"); + // printf("After test %d: %s\n", i, passed ? "ok" : "no"); } return passed; @@ -239,14 +239,15 @@ static bool test_blobsort_type(void) { // the list, which means the first template function gets called, // which ignores that type and just passes its input through. 
-template < typename T > -static bool AND(bool in) { +template +static bool AND( bool in ) { return in; } -template < typename T, typename... More > +template typename std::enable_if::value, bool>::type -static AND(bool in) { + +static AND( bool in ) { return test_blobsort_type() && AND(in); } @@ -256,4 +257,4 @@ static AND(bool in) { // cause it to run during startup, which takes a few seconds. // So this is only referenced in DEBUG mode. extern bool blobsort_test_result; -bool blobsort_test_result = AND(true); +bool blobsort_test_result = AND(true); diff --git a/util/Blobsort.h b/util/Blobsort.h index 154fa669..bee308a0 100644 --- a/util/Blobsort.h +++ b/util/Blobsort.h @@ -19,149 +19,154 @@ //----------------------------------------------------------------------------- // Blob sorting routines -static const uint32_t RADIX_BITS = 8; -static const uint32_t RADIX_SIZE = (uint32_t)1 << RADIX_BITS; -static const uint32_t RADIX_MASK = RADIX_SIZE - 1; - -template< typename T > -static void radixsort( T * begin, T * end ) -{ - const uint32_t RADIX_LEVELS = sizeof(T); - const size_t count = end - begin; - - size_t freqs [RADIX_LEVELS][RADIX_SIZE] = {}; - T * ptr = begin; - // Record byte frequencies in each position over all items except - // the last one. - do { +static const uint32_t RADIX_BITS = 8; +static const uint32_t RADIX_SIZE = (uint32_t)1 << RADIX_BITS; +static const uint32_t RADIX_MASK = RADIX_SIZE - 1; + +template +static void radixsort( T * begin, T * end ) { + const uint32_t RADIX_LEVELS = sizeof(T); + const size_t count = end - begin; + + size_t freqs[RADIX_LEVELS][RADIX_SIZE] = {}; + T * ptr = begin; + + // Record byte frequencies in each position over all items except + // the last one. 
+ do { + for (uint32_t pass = 0; pass < RADIX_LEVELS; pass++) { + uint8_t value = (*ptr)[pass]; + ++freqs[pass][value]; + } + } while (++ptr < (end - 1)); + // Process the last item separately, so that we can record which + // passes (if any) would do no reordering of items, and which can + // therefore be skipped entirely. + uint32_t trivial_passes = 0; for (uint32_t pass = 0; pass < RADIX_LEVELS; pass++) { - uint8_t value = (*ptr)[pass]; - ++freqs[pass][value]; - } - } while (++ptr < (end - 1)); - // Process the last item separately, so that we can record which - // passes (if any) would do no reordering of items, and which can - // therefore be skipped entirely. - uint32_t trivial_passes = 0; - for (uint32_t pass = 0; pass < RADIX_LEVELS; pass++) { - uint8_t value = (*ptr)[pass]; - if (++freqs[pass][value] == count) - trivial_passes |= 1UL << pass; - } - - std::unique_ptr queue_area(new T[count]); - T * from = begin; - T * to = queue_area.get(); - - for (uint32_t pass = 0; pass < RADIX_LEVELS; pass++) { - // If this pass would do nothing, just skip it. - if (trivial_passes & (1UL << pass)) - continue; - - // Array of pointers to the current position in each queue, - // pre-arranged based on the known final sizes of each queue. This - // way all the entries end up contiguous with no gaps. - T * queue_ptrs[RADIX_SIZE]; - T * next = to; - for (size_t i = 0; i < RADIX_SIZE; i++) { - queue_ptrs[i] = next; - next += freqs[pass][i]; + uint8_t value = (*ptr)[pass]; + if (++freqs[pass][value] == count) { + trivial_passes |= 1UL << pass; + } } - // Copy each element into its queue based on the current byte. 
- for (size_t i = 0; i < count; i++) { - uint8_t index = from[i][pass]; - *queue_ptrs[index]++ = from[i]; - __builtin_prefetch(queue_ptrs[index] + 1); - } + std::unique_ptr queue_area( new T[count] ); + T * from = begin; + T * to = queue_area.get(); - std::swap(from, to); - } + for (uint32_t pass = 0; pass < RADIX_LEVELS; pass++) { + // If this pass would do nothing, just skip it. + if (trivial_passes & (1UL << pass)) { + continue; + } + + // Array of pointers to the current position in each queue, + // pre-arranged based on the known final sizes of each queue. This + // way all the entries end up contiguous with no gaps. + T * queue_ptrs[RADIX_SIZE]; + T * next = to; + for (size_t i = 0; i < RADIX_SIZE; i++) { + queue_ptrs[i] = next; + next += freqs[pass][i]; + } + + // Copy each element into its queue based on the current byte. + for (size_t i = 0; i < count; i++) { + uint8_t index = from[i][pass]; + *queue_ptrs[index]++ = from[i]; + __builtin_prefetch(queue_ptrs[index] + 1); + } + + std::swap(from, to); + } - // Because the swap always happens in the above loop, the "from" - // area has the sorted payload. If that's not the original array, - // then do a final copy. - if (from != begin) - std::copy(from, from + count, begin); + // Because the swap always happens in the above loop, the "from" + // area has the sorted payload. If that's not the original array, + // then do a final copy. + if (from != begin) { + std::copy(from, from + count, begin); + } } //----------------------------------------------------------------------------- -static const uint32_t SORT_CUTOFF = 60; +static const uint32_t SORT_CUTOFF = 60; // This is an in-place MSB radix sort that recursively sorts each // block, sometimes known as an "American Flag Sort". Testing shows // that performance increases by devolving to std::sort once we get // down to small block sizes. Both 40 and 60 items are best on my // system, but there could be a better value for the general case. 
-template< typename T > -static void flagsort( T * begin, T * end, int idx ) -{ - const uint32_t DIGITS = sizeof(T); - const size_t count = end - begin; - assume(idx >= 0); - assume(idx < DIGITS); - - // Each pass must compute its own frequency table, because the - // counts depend on all previous bytes, since each pass operates on - // a successively smaller subset of the total list to sort. - size_t freqs[RADIX_SIZE] = {}; - T * ptr = begin; - do { - ++freqs[(*ptr)[idx]]; - } while (++ptr < (end - 1)); - // As in radix sort, if this pass would do no rearrangement, then - // there's no need to iterate over every item. Since this case is - // only likely to hit in degenerate cases (e.g. donothing64), just - // devolve into radixsort since that performs better on lists of - // many similar values. - if (++freqs[(*ptr)[idx]] == count) { - // If there are no more passes, then we're just done. - if (idx == 0) { - return; - } - return radixsort(begin, end); - } - - T * block_ptrs[RADIX_SIZE]; - ptr = begin; - for (size_t i = 0; i < RADIX_SIZE; i++) { - block_ptrs[i] = ptr; - ptr += freqs[i]; - } - - // Move all values into their correct block, maintaining a stable - // sort ordering inside each block. - ptr = begin; - T * nxt = begin + freqs[0]; - uint8_t curblock = 0; - while (curblock < (RADIX_SIZE - 1)) { - if (expectp(ptr >= nxt, 0.0944)) { - curblock++; - nxt += freqs[curblock]; - continue; +template +static void flagsort( T * begin, T * end, int idx ) { + const uint32_t DIGITS = sizeof(T); + const size_t count = end - begin; + + assume(idx >= 0 ); + assume(idx < DIGITS); + + // Each pass must compute its own frequency table, because the + // counts depend on all previous bytes, since each pass operates on + // a successively smaller subset of the total list to sort. 
+ size_t freqs[RADIX_SIZE] = {}; + T * ptr = begin; + do { + ++freqs[(*ptr)[idx]]; + } while (++ptr < (end - 1)); + // As in radix sort, if this pass would do no rearrangement, then + // there's no need to iterate over every item. Since this case is + // only likely to hit in degenerate cases (e.g. donothing64), just + // devolve into radixsort since that performs better on lists of + // many similar values. + if (++freqs[(*ptr)[idx]] == count) { + // If there are no more passes, then we're just done. + if (idx == 0) { + return; + } + return radixsort(begin, end); } - uint8_t value = (*ptr)[idx]; - if (unpredictable(value == curblock)) { // p ~= 0.501155 - ptr++; - continue; + + T * block_ptrs[RADIX_SIZE]; + ptr = begin; + for (size_t i = 0; i < RADIX_SIZE; i++) { + block_ptrs[i] = ptr; + ptr += freqs[i]; + } + + // Move all values into their correct block, maintaining a stable + // sort ordering inside each block. + ptr = begin; + T * nxt = begin + freqs[0]; + uint8_t curblock = 0; + while (curblock < (RADIX_SIZE - 1)) { + if (expectp((ptr >= nxt), 0.0944)) { + curblock++; + nxt += freqs[curblock]; + continue; + } + uint8_t value = (*ptr)[idx]; + if (unpredictable(value == curblock)) { // p ~= 0.501155 + ptr++; + continue; + } + // assert(block_ptrs[value] < end); + std::swap(*ptr, *block_ptrs[value]++); // MAYBE do this better manually? + } + + if (idx == 0) { + return; + } + + // Sort each block by the next less-significant byte, or by + // std::sort if there are only a few entries in the block. + ptr = begin; + for (int i = 0; i < RADIX_SIZE; i++) { + if (expectp((freqs[i] > SORT_CUTOFF), 0.00390611)) { + flagsort(ptr, ptr + freqs[i], idx - 1); + } else if (expectp((freqs[i] > 1), 0.3847)) { + std::sort(ptr, ptr + freqs[i]); + } + ptr += freqs[i]; } - //assert(block_ptrs[value] < end); - std::swap(*ptr, *block_ptrs[value]++); // MAYBE do this better manually? 
- } - - if (idx == 0) - return; - - // Sort each block by the next less-significant byte, or by - // std::sort if there are only a few entries in the block. - ptr = begin; - for (int i = 0; i < RADIX_SIZE; i++) { - if (expectp(freqs[i] > SORT_CUTOFF, 0.00390611)) - flagsort(ptr, ptr + freqs[i], idx - 1); - else if (expectp(freqs[i] > 1, 0.3847)) - std::sort(ptr, ptr + freqs[i]); - ptr += freqs[i]; - } } //----------------------------------------------------------------------------- @@ -170,20 +175,21 @@ static void flagsort( T * begin, T * end, int idx ) // that is, so some effort into finding the right cutoff might be // appropriate. This approach handily beats just using std::sort, at // least on my system (526 seconds vs 1430). -template< class Iter > -static void blobsort ( Iter iter_begin, Iter iter_end ) -{ - typedef typename std::iterator_traits::value_type T; - // Nothing to sort if there are 0 or 1 items - if ((iter_end - iter_begin) < 2) - return; - else if ((iter_end - iter_begin) <= SORT_CUTOFF) - return std::sort(iter_begin, iter_end); - - T * begin = &(*iter_begin); - T * end = &(*iter_end); - if (sizeof(T) > 4) - flagsort(begin, end, sizeof(T) - 1); - else - radixsort(begin, end); +template +static void blobsort( Iter iter_begin, Iter iter_end ) { + typedef typename std::iterator_traits::value_type T; + // Nothing to sort if there are 0 or 1 items + if ((iter_end - iter_begin) < 2) { + return; + } else if ((iter_end - iter_begin) <= SORT_CUTOFF) { + return std::sort(iter_begin, iter_end); + } + + T * begin = &(*iter_begin); + T * end = &(*iter_end ); + if (sizeof(T) > 4) { + flagsort(begin, end, sizeof(T) - 1); + } else { + radixsort(begin, end); + } } diff --git a/util/Instantiate.h b/util/Instantiate.h index 6964ded1..2cb8b840 100644 --- a/util/Instantiate.h +++ b/util/Instantiate.h @@ -24,13 +24,13 @@ #if defined(__cplusplus) && (__cplusplus >= 201402L) // C++14 allows auto variables to determine function return types -#define INSTANTIATE(FN, 
TYPELIST) \ - template < typename ... Types> \ - auto FN ## _instantiator() { \ - static auto instances = \ - std::tuple_cat(std::make_tuple(FN)...); \ - return &instances; \ - } \ +#define INSTANTIATE(FN, TYPELIST) \ + template < typename ... Types> \ + auto FN ## _instantiator() { \ + static auto instances = \ + std::tuple_cat(std::make_tuple(FN)...); \ + return &instances; \ + } \ template auto FN ## _instantiator(); #else // C++11 doesn't, so YOU get a void*, and YOU get a void*,.... diff --git a/util/Platform.cpp b/util/Platform.cpp index bfb7e9bb..f99b6851 100644 --- a/util/Platform.cpp +++ b/util/Platform.cpp @@ -19,12 +19,12 @@ #include "Platform.h" #if defined(HAVE_THREADS) -unsigned g_NCPU = 4; +unsigned g_NCPU = 4; #else -const unsigned g_NCPU = 1; +const unsigned g_NCPU = 1; #endif -void DisableThreads(void) { +void DisableThreads( void ) { #if defined(HAVE_THREADS) printf("WARNING: disabling threaded mode\n"); g_NCPU = 1; diff --git a/util/Random.h b/util/Random.h index 9cff1735..01a26c1c 100644 --- a/util/Random.h +++ b/util/Random.h @@ -46,105 +46,96 @@ // Xorshift RNG based on code by George Marsaglia // http://en.wikipedia.org/wiki/Xorshift -class Rand -{ - private: - uint32_t x; - uint32_t y; - uint32_t z; - uint32_t w; - - public: - Rand() - { - reseed(uint32_t(0)); - } - - Rand( uint32_t seed ) - { - reseed(seed); - } - - void reseed ( uint32_t seed ) - { - x = 0x498b3bc5 ^ seed; - y = 0; - z = 0; - w = 0; - - for(int i = 0; i < 10; i++) mix(); - } - - void reseed ( uint64_t seed ) - { - x = 0x498b3bc5 ^ (uint32_t)(seed >> 0); - y = 0x5a05089a ^ (uint32_t)(seed >> 32); - z = 0; - w = 0; - - for(int i = 0; i < 10; i++) mix(); - } - - //----------------------------------------------------------------------------- - - void mix ( void ) - { - uint32_t t = x ^ (x << 11); - x = y; y = z; z = w; - w = w ^ (w >> 19) ^ t ^ (t >> 8); - } - - uint32_t rand_u32 ( void ) - { - mix(); - - return x; - } - - uint64_t rand_u64 ( void ) - { - mix(); - - uint64_t 
a = x; - uint64_t b = y; - - return (a << 32) | b; - } +class Rand { + private: + uint32_t x; + uint32_t y; + uint32_t z; + uint32_t w; + + public: + Rand() { + reseed(uint32_t(0)); + } + + Rand( uint32_t seed ) { + reseed(seed); + } + + void reseed( uint32_t seed ) { + x = 0x498b3bc5 ^ seed; + y = 0; + z = 0; + w = 0; + + for (int i = 0; i < 10; i++) { mix(); } + } + + void reseed( uint64_t seed ) { + x = 0x498b3bc5 ^ (uint32_t)(seed >> 0); + y = 0x5a05089a ^ (uint32_t)(seed >> 32); + z = 0; + w = 0; + + for (int i = 0; i < 10; i++) { mix(); } + } + + //----------------------------------------------------------------------------- + + void mix( void ) { + uint32_t t = x ^ (x << 11); + + x = y; y = z; z = w; + w = w ^ (w >> 19) ^ t ^ (t >> 8); + } + + uint32_t rand_u32( void ) { + mix(); + + return x; + } + + uint64_t rand_u64( void ) { + mix(); + + uint64_t a = x; + uint64_t b = y; + + return (a << 32) | b; + } #if defined(HAVE_INT128) - uint128_t rand_u128 ( void ) - { - uint128_t a = rand_u64(); - return (a << 64) | rand_u64(); - } + + uint128_t rand_u128( void ) { + uint128_t a = rand_u64(); + + return (a << 64) | rand_u64(); + } + #endif - // Returns a value in the range [0, max) - uint32_t rand_range ( uint32_t max ) - { - uint64_t r = rand_u32(); - return (r * max) >> 32; - - } - - void rand_p ( void * blob, int bytes ) - { - uint8_t * blocks = reinterpret_cast(blob); - int i; - - while(bytes >= 4) - { - uint32_t r = COND_BSWAP(rand_u32(), isBE()); - memcpy(blocks, &r, 4); - blocks += 4; - bytes -= 4; + // Returns a value in the range [0, max) + uint32_t rand_range( uint32_t max ) { + uint64_t r = rand_u32(); + + return (r * max) >> 32; } - for (i = 0; i < bytes; i++) - { - blocks[i] = (uint8_t)rand_u32(); + void rand_p( void * blob, int bytes ) { + uint8_t * blocks = reinterpret_cast(blob); + int i; + + while (bytes >= 4) { + uint32_t r = COND_BSWAP(rand_u32(), isBE()); + memcpy(blocks, &r, 4); + blocks += 4; + bytes -= 4; + } + + for (i = 0; i < bytes; i++) 
{ + blocks[i] = (uint8_t)rand_u32(); + } } - } -}; +}; // class Rand //----------------------------------------------------------------------------- diff --git a/util/Stats.cpp b/util/Stats.cpp index 89fa5c35..8f04b9ee 100644 --- a/util/Stats.cpp +++ b/util/Stats.cpp @@ -60,169 +60,153 @@ //----------------------------------------------------------------------------- -double CalcMean(std::vector & v) { - const size_t sz = v.size(); - double mean = 0; +double CalcMean( std::vector & v ) { + const size_t sz = v.size(); + double mean = 0; - for(size_t i = 0; i < sz; i++) { - mean += v[i]; - } + for (size_t i = 0; i < sz; i++) { + mean += v[i]; + } - mean /= double(sz); + mean /= double(sz); - return mean; + return mean; } -double CalcMean ( std::vector & v, int a, int b ) -{ - double mean = 0; +double CalcMean( std::vector & v, int a, int b ) { + double mean = 0; - for(int i = a; i <= b; i++) - { - mean += v[i]; - } + for (int i = a; i <= b; i++) { + mean += v[i]; + } - mean /= (b-a+1); + mean /= (b - a + 1); - return mean; + return mean; } -double CalcStdv ( std::vector & v, int a, int b ) -{ - double mean = CalcMean(v,a,b); +double CalcStdv( std::vector & v, int a, int b ) { + double mean = CalcMean(v, a, b); - double stdv = 0; + double stdv = 0; - for(int i = a; i <= b; i++) - { - double x = v[i] - mean; + for (int i = a; i <= b; i++) { + double x = v[i] - mean; - stdv += x*x; - } + stdv += x * x; + } - stdv = sqrt(stdv / (b-a+1)); + stdv = sqrt(stdv / (b - a + 1)); - return stdv; + return stdv; } -double CalcStdv ( std::vector & v ) -{ - return CalcStdv(v, 0, v.size()); +double CalcStdv( std::vector & v ) { + return CalcStdv(v, 0, v.size()); } // Return true if the largest value in v[0,len) is more than three // standard deviations from the mean -bool ContainsOutlier ( std::vector & v, size_t len ) -{ - double mean = 0; +bool ContainsOutlier( std::vector & v, size_t len ) { + double mean = 0; - for(size_t i = 0; i < len; i++) - { - mean += v[i]; - } + for (size_t 
i = 0; i < len; i++) { + mean += v[i]; + } - mean /= double(len); + mean /= double(len); - double stdv = 0; + double stdv = 0; - for(size_t i = 0; i < len; i++) - { - double x = v[i] - mean; - stdv += x*x; - } + for (size_t i = 0; i < len; i++) { + double x = v[i] - mean; + stdv += x * x; + } - stdv = sqrt(stdv / double(len)); + stdv = sqrt(stdv / double(len)); - double cutoff = mean + stdv*3; + double cutoff = mean + stdv * 3; - return v[len-1] > cutoff; + return v[len - 1] > cutoff; } // Do a binary search to find the largest subset of v that does not contain // outliers. -void FilterOutliers ( std::vector & v ) -{ - std::sort(v.begin(),v.end()); +void FilterOutliers( std::vector & v ) { + std::sort(v.begin(), v.end()); - size_t len = 0; - const size_t sz = v.size(); + size_t len = 0; + const size_t sz = v.size(); - for(size_t x = 0x40000000; x; x = x >> 1 ) - { - if((len | x) >= sz) continue; + for (size_t x = 0x40000000; x; x = x >> 1) { + if ((len | x) >= sz) { continue; } - if(!ContainsOutlier(v,len | x)) - { - len |= x; + if (!ContainsOutlier(v, len | x)) { + len |= x; + } } - } - v.resize(len); + v.resize(len); } #if 0 // Iteratively tighten the set to find a subset that does not contain // outliers. I'm not positive this works correctly in all cases. 
-void FilterOutliers2 ( std::vector & v ) -{ - std::sort(v.begin(),v.end()); +void FilterOutliers2( std::vector & v ) { + std::sort(v.begin(), v.end()); - int a = 0; - int b = (int)(v.size() - 1); + int a = 0; + int b = (int)(v.size() - 1); - for(int i = 0; i < 10; i++) - { - //printf("%d %d\n",a,b); + for (int i = 0; i < 10; i++) { + // printf("%d %d\n",a,b); - double mean = CalcMean(v,a,b); - double stdv = CalcStdv(v,a,b); + double mean = CalcMean(v, a, b); + double stdv = CalcStdv(v, a, b); - double cutA = mean - stdv*3; - double cutB = mean + stdv*3; + double cutA = mean - stdv * 3; + double cutB = mean + stdv * 3; - while((a < b) && (v[a] < cutA)) a++; - while((b > a) && (v[b] > cutB)) b--; - } + while ((a < b) && (v[a] < cutA)) { a++; } + while ((b > a) && (v[b] > cutB)) { b--; } + } - std::vector v2; + std::vector v2; - v2.insert(v2.begin(),v.begin()+a,v.begin()+b+1); + v2.insert(v2.begin(), v.begin() + a, v.begin() + b + 1); - v.swap(v2); + v.swap(v2); } + #endif //----------------------------------------------------------------------------- -double chooseK ( int n, int k ) -{ - if(k > (n - k)) k = n - k; +double chooseK( int n, int k ) { + if (k > (n - k)) { k = n - k; } - double c = 1; + double c = 1; - for(int i = 0; i < k; i++) - { - double t = double(n-i) / double(i+1); + for (int i = 0; i < k; i++) { + double t = double(n - i) / double(i + 1); - c *= t; - } + c *= t; + } return c; } -double chooseUpToK ( int n, int k ) -{ - double c = 0; +double chooseUpToK( int n, int k ) { + double c = 0; - for(int i = 1; i <= k; i++) - { - c += chooseK(n,i); - } + for (int i = 1; i <= k; i++) { + c += chooseK(n, i); + } - return c; + return c; } //----------------------------------------------------------------------------- @@ -232,49 +216,49 @@ double chooseUpToK ( int n, int k ) // Note: with 32bit 77163 keys will get a 50% probability of one collision. 
// Naive multiplication, no accuracy at all -static double ExpectedNBCollisions_Slow ( const double nbH, const double nbBits ) -{ - long balls = nbH; - long double bins = nbBits; - long double result = 1.0; - for (long i = 1; i < balls / 2; i++) { - // take a pair from the front and the end to minimize errors - result *= ((bins - i) / bins) * ((bins - (nbH - i)) / bins); - } - return (double)(nbH * result); +static double ExpectedNBCollisions_Slow( const double nbH, const double nbBits ) { + long balls = nbH; + long double bins = nbBits; + long double result = 1.0; + + for (long i = 1; i < balls / 2; i++) { + // take a pair from the front and the end to minimize errors + result *= ((bins - i) / bins) * ((bins - (nbH - i)) / bins); + } + return (double)(nbH * result); } // Still too inaccurate: https://preshing.com/20110504/hash-collision-probabilities/ -static double EstimateNbCollisions_Taylor(const double nbH, const double nbBits) -{ - const long double k = nbH; - const long double b = nbBits; - return (double)(k * (1.0 - expl(-0.5 * k * (k - 1.0) / b))); +static double EstimateNbCollisions_Taylor( const double nbH, const double nbBits ) { + const long double k = nbH; + const long double b = nbBits; + + return (double)(k * (1.0 - expl(-0.5 * k * (k - 1.0) / b))); } // demerphq: (double(count) * double(count-1)) / pow(2.0,double(sizeof(hashtype) * 8 + 1)); // the very same as our calc. pow 2 vs exp2. Just the high cutoff is missing here. -static double EstimateNbCollisions_Demerphq(const double nbH, const double nbBits) -{ - return (nbH * (nbH - 1)) / pow(2.0, nbBits + 1); +static double EstimateNbCollisions_Demerphq( const double nbH, const double nbBits ) { + return (nbH * (nbH - 1)) / pow(2.0, nbBits + 1); } // The previous best calculation, highly prone to inaccuracies with low results (1.0 - 10.0) // TODO: return also the error. 
-static double EstimateNbCollisions_previmpl(const double nbH, const double nbBits) -{ - double exp = exp2(nbBits); // 2 ^ bits - double result = (nbH * (nbH-1)) / (2.0 * exp); - if (result > nbH) - result = nbH; - // improved floating point accuracy - if (result <= exp || nbBits > 32) - return result; - return result - exp; +static double EstimateNbCollisions_previmpl( const double nbH, const double nbBits ) { + double exp = exp2(nbBits); // 2 ^ bits + double result = (nbH * (nbH - 1)) / (2.0 * exp); + + if (result > nbH) { + result = nbH; + } + // improved floating point accuracy + if ((result <= exp) || (nbBits > 32)) { + return result; + } + return result - exp; } -static double EstimateNbCollisions_fwojcik(const double nbH, const int nbBits) -{ +static double EstimateNbCollisions_fwojcik( const double nbH, const int nbBits ) { // If the probability that there are 1 or more collisions (p(C >= // 1)) is not much higher than the probability of exactly 1 // collision (p(C == 1)), then the classically-good approximation @@ -285,8 +269,8 @@ static double EstimateNbCollisions_fwojcik(const double nbH, const int nbBits) // of p(C >= 1)/p(C == 1) is about 1/(1-2**(n-2r-1)). This uses // the new estimator if that ratio is > 1 + 2**-8. That cutoff // minimizes the error around the values we care about. - if (nbBits - 2.0*log2(nbH) >= 8 - 1) { - return nbH * (nbH - 1) * exp2(-nbBits-1); + if (nbBits - 2.0 * log2(nbH) >= 8 - 1) { + return nbH * (nbH - 1) * exp2(-nbBits - 1); } // The probability that any given hash bucket is empty after nbH @@ -316,26 +300,25 @@ static double EstimateNbCollisions_fwojcik(const double nbH, const int nbBits) // m/n and pF at the same general orders of magnitude, so it tends // to have very good precision. At low hash occupancy, pF is too // close to m/n for this formula to work well. 
- double logpE = (double)nbH * log1p(-exp2(-nbBits)); + double logpE = (double)nbH * log1p(-exp2(-nbBits)); double result = exp2(nbBits) * (exp2(-nbBits) * (double)nbH + expm1(logpE)); return result; } -double EstimateNbCollisions(const unsigned long nbH, const int nbBits) -{ - return EstimateNbCollisions_fwojcik((const double)nbH, (const double)nbBits); +double EstimateNbCollisions( const unsigned long nbH, const int nbBits ) { + return EstimateNbCollisions_fwojcik((const double)nbH, (const double)nbBits); } #define COLLISION_ESTIMATORS 3 -static double EstimateNbCollisionsCand(const unsigned long nbH, const int nbBits, const int estimator) -{ - switch(estimator) { + +static double EstimateNbCollisionsCand( const unsigned long nbH, const int nbBits, const int estimator ) { + switch (estimator) { case 0: return EstimateNbCollisions_fwojcik((const double)nbH, (const double)nbBits); case 1: return EstimateNbCollisions_previmpl((const double)nbH, (const double)nbBits); case 2: return EstimateNbCollisions_Demerphq((const double)nbH, (const double)nbBits); - //case 3: return EstimateNbCollisions_Taylor((const double)nbH, (const double)nbBits); - //case 4: return ExpectedNBCollisions_Slow((const double)nbH, (const double)nbBits); + // case 3: return EstimateNbCollisions_Taylor((const double)nbH, (const double)nbBits); + // case 4: return ExpectedNBCollisions_Slow((const double)nbH, (const double)nbBits); default: { printf("Invalid estimator requested\n"); exit(1); } } return NAN; @@ -352,471 +335,591 @@ static double EstimateNbCollisionsCand(const unsigned long nbH, const int nbBits */ static double realcoll[58][18] = { /* 149633745 */ - { 9.66830188511513408e-62, 4.15250404044246501e-52, 7.66001792990870096e-33, - 3.28995264957314909e-23, 6.06889145411344312e-04, 3.10727242021280714e-01, - 3.18184245207177412e+02, 2.54544870233834445e+03, 2.03619731305636706e+04, - 1.62792385217456205e+05, 2.57656049031511368e+06, 1.90430490019698478e+07, - 5.94342984822125658e+07, 
1.32858774460385174e+08, 1.45439441000000000e+08, - 1.49109457000000000e+08, 1.49629649000000000e+08, 1.49633489000000000e+08 }, + { + 9.66830188511513408e-62, 4.15250404044246501e-52, 7.66001792990870096e-33, + 3.28995264957314909e-23, 6.06889145411344312e-04, 3.10727242021280714e-01, + 3.18184245207177412e+02, 2.54544870233834445e+03, 2.03619731305636706e+04, + 1.62792385217456205e+05, 2.57656049031511368e+06, 1.90430490019698478e+07, + 5.94342984822125658e+07, 1.32858774460385174e+08, 1.45439441000000000e+08, + 1.49109457000000000e+08, 1.49629649000000000e+08, 1.49633489000000000e+08 + }, /* 86536545 */ - { 3.23362916384237121e-62, 1.38883315060948101e-52, 2.56194496903768089e-33, - 1.10034698561685720e-23, 2.02978192359201898e-04, 1.03924834404869174e-01, - 1.06418943269388180e+02, 8.51346660380768071e+02, 6.81046060560096157e+03, - 5.44636796883101269e+04, 8.65959061394601478e+05, 6.61418293104189448e+06, - 2.27556140267314911e+07, 6.98558535013311207e+07, 8.23422410045954734e+07, - 8.60122570000000000e+07, 8.65324490000000000e+07, 8.65362890000000000e+07 }, + { + 3.23362916384237121e-62, 1.38883315060948101e-52, 2.56194496903768089e-33, + 1.10034698561685720e-23, 2.02978192359201898e-04, 1.03924834404869174e-01, + 1.06418943269388180e+02, 8.51346660380768071e+02, 6.81046060560096157e+03, + 5.44636796883101269e+04, 8.65959061394601478e+05, 6.61418293104189448e+06, + 2.27556140267314911e+07, 6.98558535013311207e+07, 8.23422410045954734e+07, + 8.60122570000000000e+07, 8.65324490000000000e+07, 8.65362890000000000e+07 + }, /* 75498113 */ - { 2.46129292104772484e-62, 1.05711726017762883e-52, 1.95003715543977527e-33, - 8.37534580859870329e-24, 1.54497860659825494e-04, 7.91029046026853616e-02, - 8.10013164325720538e+01, 6.48007286993706316e+02, 5.18385065708740240e+03, - 4.14575199616562895e+04, 6.59692186580697889e+05, 5.06817564395631664e+06, - 1.77549757986361682e+07, 5.89072678887400925e+07, 7.13038090638692677e+07, - 7.49738250000000000e+07, 
7.54940170000000000e+07, 7.54978570000000000e+07 }, + { + 2.46129292104772484e-62, 1.05711726017762883e-52, 1.95003715543977527e-33, + 8.37534580859870329e-24, 1.54497860659825494e-04, 7.91029046026853616e-02, + 8.10013164325720538e+01, 6.48007286993706316e+02, 5.18385065708740240e+03, + 4.14575199616562895e+04, 6.59692186580697889e+05, 5.06817564395631664e+06, + 1.77549757986361682e+07, 5.89072678887400925e+07, 7.13038090638692677e+07, + 7.49738250000000000e+07, 7.54940170000000000e+07, 7.54978570000000000e+07 + }, /* 56050289 */ - { 1.35658440124283578e-62, 5.82648563760172142e-53, 1.07479689405983373e-33, - 4.61621750982936253e-24, 8.51541829923128089e-05, 4.35989416694992429e-02, - 4.46452925853961631e+01, 3.57161013077325094e+02, 2.85720313997638277e+03, - 2.28521884740198511e+04, 3.64148636055323470e+05, 2.82665629721443821e+06, - 1.02311598958176058e+07, 3.98670968021314815e+07, 5.18559915916659608e+07, - 5.55260010000000000e+07, 5.60461930000000000e+07, 5.60500330000000000e+07 }, + { + 1.35658440124283578e-62, 5.82648563760172142e-53, 1.07479689405983373e-33, + 4.61621750982936253e-24, 8.51541829923128089e-05, 4.35989416694992429e-02, + 4.46452925853961631e+01, 3.57161013077325094e+02, 2.85720313997638277e+03, + 2.28521884740198511e+04, 3.64148636055323470e+05, 2.82665629721443821e+06, + 1.02311598958176058e+07, 3.98670968021314815e+07, 5.18559915916659608e+07, + 5.55260010000000000e+07, 5.60461930000000000e+07, 5.60500330000000000e+07 + }, /* 49925029 */ - { 1.07628616390943998e-62, 4.62261387512834023e-53, 8.52721751060712554e-34, - 3.66241203339361373e-24, 6.75595774724252468e-05, 3.45905036499356000e-02, - 3.54206590004570572e+01, 2.83364333813803171e+02, 2.26685462770169033e+03, - 1.81309949687949847e+04, 2.89045130868813896e+05, 2.25101610920316912e+06, - 8.23359498302312009e+06, 3.40035930111785606e+07, 4.57307533941198885e+07, - 4.94007410000000000e+07, 4.99209330000000000e+07, 4.99247730000000000e+07 }, + { + 1.07628616390943998e-62, 
4.62261387512834023e-53, 8.52721751060712554e-34, + 3.66241203339361373e-24, 6.75595774724252468e-05, 3.45905036499356000e-02, + 3.54206590004570572e+01, 2.83364333813803171e+02, 2.26685462770169033e+03, + 1.81309949687949847e+04, 2.89045130868813896e+05, 2.25101610920316912e+06, + 8.23359498302312009e+06, 3.40035930111785606e+07, 4.57307533941198885e+07, + 4.94007410000000000e+07, 4.99209330000000000e+07, 4.99247730000000000e+07 + }, /* 44251425 */ - { 8.45562327779528750e-63, 3.63166254454270828e-53, 6.69923495212561545e-34, - 2.87729950275996440e-24, 5.30768075507823733e-05, 2.71753254548965095e-02, - 2.78275216109708978e+01, 2.22619519580197675e+02, 1.78091434578536018e+03, - 1.42446392954819730e+04, 2.27182256963651860e+05, 1.77461480911257491e+06, - 6.55507402957992628e+06, 2.86743406137902029e+07, 4.00572308235341832e+07, - 4.37271370000000000e+07, 4.42473290000000000e+07, 4.42511690000000000e+07 }, + { + 8.45562327779528750e-63, 3.63166254454270828e-53, 6.69923495212561545e-34, + 2.87729950275996440e-24, 5.30768075507823733e-05, 2.71753254548965095e-02, + 2.78275216109708978e+01, 2.22619519580197675e+02, 1.78091434578536018e+03, + 1.42446392954819730e+04, 2.27182256963651860e+05, 1.77461480911257491e+06, + 6.55507402957992628e+06, 2.86743406137902029e+07, 4.00572308235341832e+07, + 4.37271370000000000e+07, 4.42473290000000000e+07, 4.42511690000000000e+07 + }, /* 43691201 */ - { 8.24288176206433810e-63, 3.54029075928611856e-53, 6.53068375830698963e-34, - 2.80490731624468888e-24, 5.17414074132004304e-05, 2.64916005848709717e-02, - 2.71273877811360791e+01, 2.17018473441357912e+02, 1.73610754462317163e+03, - 1.38862852138241597e+04, 2.21476017148987623e+05, 1.73055958502948540e+06, - 6.39857166559864674e+06, 2.81548679497163482e+07, 3.94970225171834230e+07, - 4.31669130000000000e+07, 4.36871050000000000e+07, 4.36909450000000000e+07 }, + { + 8.24288176206433810e-63, 3.54029075928611856e-53, 6.53068375830698963e-34, + 2.80490731624468888e-24, 
5.17414074132004304e-05, 2.64916005848709717e-02, + 2.71273877811360791e+01, 2.17018473441357912e+02, 1.73610754462317163e+03, + 1.38862852138241597e+04, 2.21476017148987623e+05, 1.73055958502948540e+06, + 6.39857166559864674e+06, 2.81548679497163482e+07, 3.94970225171834230e+07, + 4.31669130000000000e+07, 4.36871050000000000e+07, 4.36909450000000000e+07 + }, /* 33558529 */ - { 4.86291784915122170e-63, 2.08860731252391586e-53, 3.85280045646069782e-34, - 1.65476519585125690e-24, 3.05250300699314860e-05, 1.56288153909619858e-02, - 1.60039018771892643e+01, 1.28030930083075560e+02, 1.02422920513447593e+03, - 8.19266670739054098e+03, 1.30763213462519823e+05, 1.02731598739112553e+06, - 3.86648187299589021e+06, 1.90513077430028245e+07, 2.93656306571820080e+07, - 3.30342410000000000e+07, 3.35544330000000000e+07, 3.35582730000000000e+07 }, + { + 4.86291784915122170e-63, 2.08860731252391586e-53, 3.85280045646069782e-34, + 1.65476519585125690e-24, 3.05250300699314860e-05, 1.56288153909619858e-02, + 1.60039018771892643e+01, 1.28030930083075560e+02, 1.02422920513447593e+03, + 8.19266670739054098e+03, 1.30763213462519823e+05, 1.02731598739112553e+06, + 3.86648187299589021e+06, 1.90513077430028245e+07, 2.93656306571820080e+07, + 3.30342410000000000e+07, 3.35544330000000000e+07, 3.35582730000000000e+07 + }, /* 33554432 */ - { 4.86173054093815170e-63, 2.08809736752937507e-53, 3.85185977398010151e-34, - 1.65436117580224877e-24, 3.05175772154867956e-05, 1.56249995294880754e-02, - 1.59999944369014884e+01, 1.27999670665119382e+02, 1.02397913646883865e+03, - 8.19066658538974480e+03, 1.30731328417170167e+05, 1.02706774802737299e+06, - 3.86557557111472497e+06, 1.90477651439465471e+07, 2.93615350309002101e+07, - 3.30301440000000000e+07, 3.35503360000000000e+07, 3.35541760000000000e+07 }, + { + 4.86173054093815170e-63, 2.08809736752937507e-53, 3.85185977398010151e-34, + 1.65436117580224877e-24, 3.05175772154867956e-05, 1.56249995294880754e-02, + 1.59999944369014884e+01, 
1.27999670665119382e+02, 1.02397913646883865e+03, + 8.19066658538974480e+03, 1.30731328417170167e+05, 1.02706774802737299e+06, + 3.86557557111472497e+06, 1.90477651439465471e+07, 2.93615350309002101e+07, + 3.30301440000000000e+07, 3.35503360000000000e+07, 3.35541760000000000e+07 + }, /* 26977161 */ - { 3.14256005499304537e-63, 1.34971926619110914e-53, 2.48979258747824472e-34, - 1.06935777370422802e-24, 1.97261691747440925e-05, 1.00997986149531007e-02, - 1.03421911410463228e+01, 8.27373811067683533e+01, 6.61889575586005321e+02, - 5.29451037409544824e+03, 8.45461443414444802e+04, 6.66574543746769894e+05, - 2.53827383658029372e+06, 1.35603369840820655e+07, 2.27896075604615994e+07, - 2.64528730000000000e+07, 2.69730650000000000e+07, 2.69769050000000000e+07 }, + { + 3.14256005499304537e-63, 1.34971926619110914e-53, 2.48979258747824472e-34, + 1.06935777370422802e-24, 1.97261691747440925e-05, 1.00997986149531007e-02, + 1.03421911410463228e+01, 8.27373811067683533e+01, 6.61889575586005321e+02, + 5.29451037409544824e+03, 8.45461443414444802e+04, 6.66574543746769894e+05, + 2.53827383658029372e+06, 1.35603369840820655e+07, 2.27896075604615994e+07, + 2.64528730000000000e+07, 2.69730650000000000e+07, 2.69769050000000000e+07 + }, /* 22370049 */ - { 2.16085171788696973e-63, 9.28078745982995323e-54, 1.71200311073976113e-34, - 7.35299737127754043e-25, 1.35638860682561044e-05, 6.94470966551262447e-03, - 7.11138119182984063e+00, 5.68909651356401653e+01, 4.55122319603302856e+02, - 3.64063288968196957e+03, 5.81554370404469810e+04, 4.59645385789985245e+05, - 1.76481282635707408e+06, 1.00151462171464767e+07, 1.81959928124494441e+07, - 2.18457610000000000e+07, 2.23659530000000000e+07, 2.23697930000000000e+07 }, + { + 2.16085171788696973e-63, 9.28078745982995323e-54, 1.71200311073976113e-34, + 7.35299737127754043e-25, 1.35638860682561044e-05, 6.94470966551262447e-03, + 7.11138119182984063e+00, 5.68909651356401653e+01, 4.55122319603302856e+02, + 3.64063288968196957e+03, 
5.81554370404469810e+04, 4.59645385789985245e+05, + 1.76481282635707408e+06, 1.00151462171464767e+07, 1.81959928124494441e+07, + 2.18457610000000000e+07, 2.23659530000000000e+07, 2.23697930000000000e+07 + }, /* 18877441 */ - { 1.53878283990836292e-63, 6.60902197305242237e-54, 1.21914936914420980e-34, - 5.23620666941341261e-25, 9.65909643476873488e-06, 4.94545737373954832e-03, - 5.06414744590625077e+00, 4.05131288488040155e+01, 3.24101784837318064e+02, - 2.59260655174234762e+03, 4.14247903550759002e+04, 3.28028082683300890e+05, - 1.26742600458991365e+06, 7.54599182152087614e+06, 1.47296973581916802e+07, - 1.83531530000000000e+07, 1.88733450000000000e+07, 1.88771850000000000e+07 }, + { + 1.53878283990836292e-63, 6.60902197305242237e-54, 1.21914936914420980e-34, + 5.23620666941341261e-25, 9.65909643476873488e-06, 4.94545737373954832e-03, + 5.06414744590625077e+00, 4.05131288488040155e+01, 3.24101784837318064e+02, + 2.59260655174234762e+03, 4.14247903550759002e+04, 3.28028082683300890e+05, + 1.26742600458991365e+06, 7.54599182152087614e+06, 1.47296973581916802e+07, + 1.83531530000000000e+07, 1.88733450000000000e+07, 1.88771850000000000e+07 + }, /* 18616785 */ - { 1.49658179329122305e-63, 6.42776985797483522e-54, 1.18571425534766178e-34, - 5.09260394911920045e-25, 9.39419617181328754e-06, 4.80982843914157677e-03, - 4.92526345384282216e+00, 3.94020589843511928e+01, 3.15213358531706945e+02, - 2.52150762757849679e+03, 4.02895318773614636e+04, 3.19083263398166222e+05, - 1.23344671390196425e+06, 7.37060359433948807e+06, 1.44720266633904669e+07, - 1.80924970000000000e+07, 1.86126890000000000e+07, 1.86165290000000000e+07 }, + { + 1.49658179329122305e-63, 6.42776985797483522e-54, 1.18571425534766178e-34, + 5.09260394911920045e-25, 9.39419617181328754e-06, 4.80982843914157677e-03, + 4.92526345384282216e+00, 3.94020589843511928e+01, 3.15213358531706945e+02, + 2.52150762757849679e+03, 4.02895318773614636e+04, 3.19083263398166222e+05, + 1.23344671390196425e+06, 
7.37060359433948807e+06, 1.44720266633904669e+07, + 1.80924970000000000e+07, 1.86126890000000000e+07, 1.86165290000000000e+07 + }, /* 17676661 */ - { 1.34924729526152486e-63, 5.79497300736470505e-54, 1.06898383980911691e-34, - 4.59125063193266000e-25, 8.46936253854919755e-06, 4.33631361902940549e-03, - 4.44038440299461268e+00, 3.55230335814082565e+01, 2.84181603549241117e+02, - 2.27328227266108661e+03, 3.63257830806934944e+04, 2.87837384102243173e+05, - 1.11455845455760439e+06, 6.74926355401089974e+06, 1.35443510115238819e+07, - 1.71523730000000000e+07, 1.76725650000000000e+07, 1.76764050000000000e+07 }, + { + 1.34924729526152486e-63, 5.79497300736470505e-54, 1.06898383980911691e-34, + 4.59125063193266000e-25, 8.46936253854919755e-06, 4.33631361902940549e-03, + 4.44038440299461268e+00, 3.55230335814082565e+01, 2.84181603549241117e+02, + 2.27328227266108661e+03, 3.63257830806934944e+04, 2.87837384102243173e+05, + 1.11455845455760439e+06, 6.74926355401089974e+06, 1.35443510115238819e+07, + 1.71523730000000000e+07, 1.76725650000000000e+07, 1.76764050000000000e+07 + }, /* 16777216 */ - { 1.21543259901182161e-63, 5.22024326324805573e-54, 9.62964914796432828e-35, - 4.13590281624610549e-25, 7.62939407650033587e-06, 3.90624976656302669e-03, - 3.99999912579873262e+00, 3.19999574025932816e+01, 2.55997380594878024e+02, - 2.04783322146484898e+03, 3.27253730219586105e+04, 2.59434518880420335e+05, - 1.00621717678566615e+06, 6.17199266255285591e+06, 1.26597333208222985e+07, - 1.62529280000000075e+07, 1.67731200000000000e+07, 1.67769600000000000e+07 }, + { + 1.21543259901182161e-63, 5.22024326324805573e-54, 9.62964914796432828e-35, + 4.13590281624610549e-25, 7.62939407650033587e-06, 3.90624976656302669e-03, + 3.99999912579873262e+00, 3.19999574025932816e+01, 2.55997380594878024e+02, + 2.04783322146484898e+03, 3.27253730219586105e+04, 2.59434518880420335e+05, + 1.00621717678566615e+06, 6.17199266255285591e+06, 1.26597333208222985e+07, + 1.62529280000000075e+07, 
1.67731200000000000e+07, 1.67769600000000000e+07 + }, /* 16777214 */ - { 1.21543230923011700e-63, 5.22024201864511143e-54, 9.62964685207712960e-35, - 4.13590183017006213e-25, 7.62939225751109495e-06, 3.90624883524053534e-03, - 3.99999817212472886e+00, 3.19999497732139844e+01, 2.55997319560658525e+02, - 2.04783273324324227e+03, 3.27253652246982456e+04, 2.59434457346894662e+05, - 1.00621694177949021e+06, 6.17199139831178170e+06, 1.26597313574535716e+07, - 1.62529260000000075e+07, 1.67731180000000000e+07, 1.67769580000000000e+07 }, + { + 1.21543230923011700e-63, 5.22024201864511143e-54, 9.62964685207712960e-35, + 4.13590183017006213e-25, 7.62939225751109495e-06, 3.90624883524053534e-03, + 3.99999817212472886e+00, 3.19999497732139844e+01, 2.55997319560658525e+02, + 2.04783273324324227e+03, 3.27253652246982456e+04, 2.59434457346894662e+05, + 1.00621694177949021e+06, 6.17199139831178170e+06, 1.26597313574535716e+07, + 1.62529260000000075e+07, 1.67731180000000000e+07, 1.67769580000000000e+07 + }, /* 15082603 */ - { 9.82298962180288047e-64, 4.21894191745907802e-54, 7.78257418132130597e-35, - 3.34259015874689832e-25, 6.16599052016874108e-06, 3.15698714588672326e-03, - 3.23275437590726122e+00, 2.58620091390967453e+01, 2.06894417561625545e+02, - 1.65504939094220754e+03, 2.64517551029136412e+04, 2.09891694997857179e+05, - 8.16575685588646214e+05, 5.13336480662504770e+06, 1.10033654155580010e+07, - 1.45583150000001676e+07, 1.50785070000000000e+07, 1.50823470000000000e+07 }, + { + 9.82298962180288047e-64, 4.21894191745907802e-54, 7.78257418132130597e-35, + 3.34259015874689832e-25, 6.16599052016874108e-06, 3.15698714588672326e-03, + 3.23275437590726122e+00, 2.58620091390967453e+01, 2.06894417561625545e+02, + 1.65504939094220754e+03, 2.64517551029136412e+04, 2.09891694997857179e+05, + 8.16575685588646214e+05, 5.13336480662504770e+06, 1.10033654155580010e+07, + 1.45583150000001676e+07, 1.50785070000000000e+07, 1.50823470000000000e+07 + }, /* 14986273 */ - { 9.69791481108703163e-64, 
4.16522269530128191e-54, 7.68347970702294475e-35, - 3.30002940611432092e-25, 6.08747978902901173e-06, 3.11678965155155231e-03, - 3.19159215049388845e+00, 2.55327118282773071e+01, 2.04260070593989951e+02, - 1.63397663226719487e+03, 2.61151435765585957e+04, 2.07231508480752498e+05, - 8.06367654055638355e+05, 5.07635187903902307e+06, 1.09097087114329021e+07, - 1.44619850000002030e+07, 1.49821770000000000e+07, 1.49860170000000000e+07 }, + { + 9.69791481108703163e-64, 4.16522269530128191e-54, 7.68347970702294475e-35, + 3.30002940611432092e-25, 6.08747978902901173e-06, 3.11678965155155231e-03, + 3.19159215049388845e+00, 2.55327118282773071e+01, 2.04260070593989951e+02, + 1.63397663226719487e+03, 2.61151435765585957e+04, 2.07231508480752498e+05, + 8.06367654055638355e+05, 5.07635187903902307e+06, 1.09097087114329021e+07, + 1.44619850000002030e+07, 1.49821770000000000e+07, 1.49860170000000000e+07 + }, /* 14776336 */ - { 9.42810913278675722e-64, 4.04934203884380436e-54, 7.46971762574649011e-35, - 3.20821929129359426e-25, 5.91812001988149620e-06, 3.03007744976589765e-03, - 3.10279887462500303e+00, 2.48223666728909436e+01, 1.98577376650443540e+02, - 1.58851938758362576e+03, 2.53890076205234654e+04, 2.01492261805796676e+05, - 7.84335037057878566e+05, 4.95288674782931432e+06, 1.07058149018839840e+07, - 1.42520480000003017e+07, 1.47722400000000000e+07, 1.47760800000000000e+07 }, + { + 9.42810913278675722e-64, 4.04934203884380436e-54, 7.46971762574649011e-35, + 3.20821929129359426e-25, 5.91812001988149620e-06, 3.03007744976589765e-03, + 3.10279887462500303e+00, 2.48223666728909436e+01, 1.98577376650443540e+02, + 1.58851938758362576e+03, 2.53890076205234654e+04, 2.01492261805796676e+05, + 7.84335037057878566e+05, 4.95288674782931432e+06, 1.07058149018839840e+07, + 1.42520480000003017e+07, 1.47722400000000000e+07, 1.47760800000000000e+07 + }, /* 14196869 */ - { 8.70314528971027262e-64, 3.73797243916420662e-54, 6.89534209398419660e-35, - 2.96152687883942827e-25, 
5.46305284013487504e-06, 2.79708305378238405e-03, - 2.86421266221348869e+00, 2.29136797245160615e+01, 1.83308057120624454e+02, - 1.46637609822502554e+03, 2.34378018895664463e+04, 1.86065371296118683e+05, - 7.25048552277948707e+05, 4.61779125281785242e+06, 1.01446868737243451e+07, - 1.36725810000009108e+07, 1.41927730000000000e+07, 1.41966130000000000e+07 }, + { + 8.70314528971027262e-64, 3.73797243916420662e-54, 6.89534209398419660e-35, + 2.96152687883942827e-25, 5.46305284013487504e-06, 2.79708305378238405e-03, + 2.86421266221348869e+00, 2.29136797245160615e+01, 1.83308057120624454e+02, + 1.46637609822502554e+03, 2.34378018895664463e+04, 1.86065371296118683e+05, + 7.25048552277948707e+05, 4.61779125281785242e+06, 1.01446868737243451e+07, + 1.36725810000009108e+07, 1.41927730000000000e+07, 1.41966130000000000e+07 + }, /* 12204240 */ - { 6.43150420527001539e-64, 2.76231002257211870e-54, 5.09556260386307283e-35, - 2.18852747383125011e-25, 4.03712062080382464e-06, 2.06700575761862432e-03, - 2.11661365131384116e+00, 1.69328955058294497e+01, 1.35462286951825348e+02, - 1.08364216400000464e+03, 1.73228893695771148e+04, 1.37669261714004766e+05, - 5.38415595845002681e+05, 3.53292539626187785e+06, 8.23848823565938789e+06, - 1.16799520000407528e+07, 1.22001440000000000e+07, 1.22039840000000000e+07 }, + { + 6.43150420527001539e-64, 2.76231002257211870e-54, 5.09556260386307283e-35, + 2.18852747383125011e-25, 4.03712062080382464e-06, 2.06700575761862432e-03, + 2.11661365131384116e+00, 1.69328955058294497e+01, 1.35462286951825348e+02, + 1.08364216400000464e+03, 1.73228893695771148e+04, 1.37669261714004766e+05, + 5.38415595845002681e+05, 3.53292539626187785e+06, 8.23848823565938789e+06, + 1.16799520000407528e+07, 1.22001440000000000e+07, 1.22039840000000000e+07 + }, /* 11017633 */ - { 5.24164589759972754e-64, 2.25126977074033947e-54, 4.15285973017258180e-35, - 1.78363967259666233e-25, 3.29023445600991739e-06, 1.68460004130569592e-03, - 1.72503026241426105e+00, 
1.38002320160382475e+01, 1.10401210801834779e+02, - 8.83168387150024387e+02, 1.41193736003445592e+04, 1.12282200585662198e+05, - 4.40082662240044388e+05, 2.94038767245387891e+06, 7.12661430867962260e+06, - 1.04933450003918260e+07, 1.10135370000000000e+07, 1.10173770000000000e+07 }, + { + 5.24164589759972754e-64, 2.25126977074033947e-54, 4.15285973017258180e-35, + 1.78363967259666233e-25, 3.29023445600991739e-06, 1.68460004130569592e-03, + 1.72503026241426105e+00, 1.38002320160382475e+01, 1.10401210801834779e+02, + 8.83168387150024387e+02, 1.41193736003445592e+04, 1.12282200585662198e+05, + 4.40082662240044388e+05, 2.94038767245387891e+06, 7.12661430867962260e+06, + 1.04933450003918260e+07, 1.10135370000000000e+07, 1.10173770000000000e+07 + }, /* 9437505 */ - { 3.84596615253128342e-64, 1.65182988466448099e-54, 3.04708831357108469e-35, - 1.30871446548116017e-25, 2.41415208102884383e-06, 1.23604586537905408e-03, - 1.26571085309146980e+00, 1.01256804873721595e+01, 8.10050383096763937e+01, - 6.48014349639423358e+02, 1.03611138831922271e+04, 8.24657129882121953e+04, - 3.24156550320632989e+05, 2.21947546481000213e+06, 5.68524343875118531e+06, - 8.91321700797987171e+06, 9.43340900000000000e+06, 9.43724900000000000e+06 }, + { + 3.84596615253128342e-64, 1.65182988466448099e-54, 3.04708831357108469e-35, + 1.30871446548116017e-25, 2.41415208102884383e-06, 1.23604586537905408e-03, + 1.26571085309146980e+00, 1.01256804873721595e+01, 8.10050383096763937e+01, + 6.48014349639423358e+02, 1.03611138831922271e+04, 8.24657129882121953e+04, + 3.24156550320632989e+05, 2.21947546481000213e+06, 5.68524343875118531e+06, + 8.91321700797987171e+06, 9.43340900000000000e+06, 9.43724900000000000e+06 + }, /* 8390657 */ - { 3.04006590453258966e-64, 1.30569836376521308e-54, 2.40858835538382027e-35, - 1.03448082158999336e-25, 1.90828029650285053e-06, 9.77039511733760911e-04, - 1.00048838056196132e+00, 8.00390259075751231e+00, 6.40309356878872933e+01, - 5.12229243608175807e+02, 
8.19066683023702899e+03, 6.52277588009487954e+04, - 2.56891072309514391e+05, 1.78809403153571300e+06, 4.76371295024558529e+06, - 7.86636905876981001e+06, 8.38656100000000000e+06, 8.39040100000000000e+06 }, + { + 3.04006590453258966e-64, 1.30569836376521308e-54, 2.40858835538382027e-35, + 1.03448082158999336e-25, 1.90828029650285053e-06, 9.77039511733760911e-04, + 1.00048838056196132e+00, 8.00390259075751231e+00, 6.40309356878872933e+01, + 5.12229243608175807e+02, 8.19066683023702899e+03, 6.52277588009487954e+04, + 2.56891072309514391e+05, 1.78809403153571300e+06, 4.76371295024558529e+06, + 7.86636905876981001e+06, 8.38656100000000000e+06, 8.39040100000000000e+06 + }, /* 8388608 */ - { 3.03858131641597245e-64, 1.30506073802432296e-54, 2.40741214349811932e-35, - 1.03397564243176815e-25, 1.90734840543853551e-06, 9.76562383508887020e-04, - 9.99999801317883907e-01, 7.99999396006690677e+00, 6.39996668511303071e+01, - 5.11979106274727883e+02, 8.18666829515939844e+03, 6.51959881527814287e+04, - 2.56766914989349432e+05, 1.78728773698867904e+06, 4.76194118448516913e+06, - 7.86432005899994168e+06, 8.38451200000000000e+06, 8.38835200000000000e+06 }, + { + 3.03858131641597245e-64, 1.30506073802432296e-54, 2.40741214349811932e-35, + 1.03397564243176815e-25, 1.90734840543853551e-06, 9.76562383508887020e-04, + 9.99999801317883907e-01, 7.99999396006690677e+00, 6.39996668511303071e+01, + 5.11979106274727883e+02, 8.18666829515939844e+03, 6.51959881527814287e+04, + 2.56766914989349432e+05, 1.78728773698867904e+06, 4.76194118448516913e+06, + 7.86432005899994168e+06, 8.38451200000000000e+06, 8.38835200000000000e+06 + }, /* 8303633 */ - { 2.97733261180485959e-64, 1.27875461970161355e-54, 2.35888592027094511e-35, - 1.01313378825585727e-25, 1.86890197043808392e-06, 9.56877808790931330e-04, - 9.79842799195114300e-01, 7.83873807696676383e+00, 6.27096283547353366e+01, - 5.01659346659709513e+02, 8.02170245095559312e+03, 6.38851939022925071e+04, - 2.51643815255051391e+05, 
1.75398342366120382e+06, 4.68858358349586092e+06, - 7.77934506938103493e+06, 8.29953700000000000e+06, 8.30337700000000000e+06 }, + { + 2.97733261180485959e-64, 1.27875461970161355e-54, 2.35888592027094511e-35, + 1.01313378825585727e-25, 1.86890197043808392e-06, 9.56877808790931330e-04, + 9.79842799195114300e-01, 7.83873807696676383e+00, 6.27096283547353366e+01, + 5.01659346659709513e+02, 8.02170245095559312e+03, 6.38851939022925071e+04, + 2.51643815255051391e+05, 1.75398342366120382e+06, 4.68858358349586092e+06, + 7.77934506938103493e+06, 8.29953700000000000e+06, 8.30337700000000000e+06 + }, /* 6445069 */ - { 1.79368505410408035e-64, 7.70381864670101568e-55, 1.42110370965965099e-35, - 6.10359395721248029e-26, 1.12591435658525644e-06, 5.76468150537344320e-04, - 5.90303350141551664e-01, 4.72242478267542509e+00, 3.77792690805288558e+01, - 3.02225885259077643e+02, 4.83334738231306892e+03, 3.85317788870130607e+04, - 1.52297025401436375e+05, 1.09355884627841157e+06, 3.15298493161437940e+06, - 5.92078340317591745e+06, 6.44097300000000000e+06, 6.44481300000000000e+06 }, + { + 1.79368505410408035e-64, 7.70381864670101568e-55, 1.42110370965965099e-35, + 6.10359395721248029e-26, 1.12591435658525644e-06, 5.76468150537344320e-04, + 5.90303350141551664e-01, 4.72242478267542509e+00, 3.77792690805288558e+01, + 3.02225885259077643e+02, 4.83334738231306892e+03, 3.85317788870130607e+04, + 1.52297025401436375e+05, 1.09355884627841157e+06, 3.15298493161437940e+06, + 5.92078340317591745e+06, 6.44097300000000000e+06, 6.44481300000000000e+06 + }, /* 5471025 */ - { 1.29249369610449219e-64, 5.55121815505495657e-55, 1.02401900603628891e-35, - 4.39812814140828746e-26, 8.11311442279305058e-07, 4.15391458426019348e-04, - 4.25360831402496142e-01, 3.40288541657277221e+00, 2.72230043153551051e+01, - 2.17778977519387723e+02, 3.48307701466327671e+03, 2.77819973005047868e+04, - 1.10006032571945238e+05, 8.02497636826934526e+05, 2.41479032500354247e+06, - 4.94675240411104914e+06, 
5.46692900000000000e+06, 5.47076900000000000e+06 }, + { + 1.29249369610449219e-64, 5.55121815505495657e-55, 1.02401900603628891e-35, + 4.39812814140828746e-26, 8.11311442279305058e-07, 4.15391458426019348e-04, + 4.25360831402496142e-01, 3.40288541657277221e+00, 2.72230043153551051e+01, + 2.17778977519387723e+02, 3.48307701466327671e+03, 2.77819973005047868e+04, + 1.10006032571945238e+05, 8.02497636826934526e+05, 2.41479032500354247e+06, + 4.94675240411104914e+06, 5.46692900000000000e+06, 5.47076900000000000e+06 + }, /* 5461601 */ - { 1.28804481454968919e-64, 5.53211035427330002e-55, 1.02049423892798245e-35, - 4.38298938195209473e-26, 8.08518834066487105e-07, 4.13961643021164814e-04, - 4.23896700541549154e-01, 3.39117237605436062e+00, 2.71293003988329815e+01, - 2.17029372274540748e+02, 3.47109048311671313e+03, 2.76865308479067826e+04, - 1.09629930206165693e+05, 7.99877169687261223e+05, 2.40792627883238578e+06, - 4.93732868350143358e+06, 5.45750500000000000e+06, 5.46134500000000000e+06 }, + { + 1.28804481454968919e-64, 5.53211035427330002e-55, 1.02049423892798245e-35, + 4.38298938195209473e-26, 8.08518834066487105e-07, 4.13961643021164814e-04, + 4.23896700541549154e-01, 3.39117237605436062e+00, 2.71293003988329815e+01, + 2.17029372274540748e+02, 3.47109048311671313e+03, 2.76865308479067826e+04, + 1.09629930206165693e+05, 7.99877169687261223e+05, 2.40792627883238578e+06, + 4.93732868350143358e+06, 5.45750500000000000e+06, 5.46134500000000000e+06 + }, /* 5000000 */ - { 1.07952085348259170e-64, 4.63650676105773906e-55, 8.55284536172561161e-36, - 3.67341911163567920e-26, 6.77626222278107512e-07, 3.46944625790372989e-04, - 3.55271279996754563e-01, 2.84216929754907532e+00, 2.27372940653300759e+01, - 1.81894492427756745e+02, 2.90925341562651647e+03, 2.32109475844556837e+04, - 9.19864480283138982e+04, 6.76244582431662595e+05, 2.07902454915874335e+06, - 4.47574982779582217e+06, 4.99590400000000000e+06, 4.99974400000000000e+06 }, + { + 1.07952085348259170e-64, 
4.63650676105773906e-55, 8.55284536172561161e-36, + 3.67341911163567920e-26, 6.77626222278107512e-07, 3.46944625790372989e-04, + 3.55271279996754563e-01, 2.84216929754907532e+00, 2.27372940653300759e+01, + 1.81894492427756745e+02, 2.90925341562651647e+03, 2.32109475844556837e+04, + 9.19864480283138982e+04, 6.76244582431662595e+05, 2.07902454915874335e+06, + 4.47574982779582217e+06, 4.99590400000000000e+06, 4.99974400000000000e+06 + }, /* 4720129 */ - { 9.62052468491602810e-65, 4.13198388920750452e-55, 7.62216493209018785e-36, - 3.27369491080454178e-26, 6.03890121950116545e-07, 3.09191742424983634e-04, - 3.16612330098731132e-01, 2.53289784792646122e+00, 2.02631320402621107e+01, - 1.62101808815417854e+02, 2.59273843912307711e+03, 2.06888306707860320e+04, - 8.20335711247183208e+04, 6.05859806423343602e+05, 1.88701706041535083e+06, - 4.19590551232236158e+06, 4.71603300000000000e+06, 4.71987300000000000e+06 }, + { + 9.62052468491602810e-65, 4.13198388920750452e-55, 7.62216493209018785e-36, + 3.27369491080454178e-26, 6.03890121950116545e-07, 3.09191742424983634e-04, + 3.16612330098731132e-01, 2.53289784792646122e+00, 2.02631320402621107e+01, + 1.62101808815417854e+02, 2.59273843912307711e+03, 2.06888306707860320e+04, + 8.20335711247183208e+04, 6.05859806423343602e+05, 1.88701706041535083e+06, + 4.19590551232236158e+06, 4.71603300000000000e+06, 4.71987300000000000e+06 + }, /* 4598479 */ - { 9.13102296289999889e-65, 3.92174450046805166e-55, 7.23434171226120578e-36, - 3.10712610622505210e-26, 5.73163600862704501e-07, 2.93459763629244023e-04, - 3.00502784877568652e-01, 2.40402154589327210e+00, 1.92321254470970260e+01, - 1.53854000743080690e+02, 2.46084059619524533e+03, 1.96376437319819379e+04, - 7.78830134114269749e+04, 5.76361321148565039e+05, 1.80542466236221301e+06, - 4.07427236013673665e+06, 4.59438300000000000e+06, 4.59822300000000000e+06 }, + { + 9.13102296289999889e-65, 3.92174450046805166e-55, 7.23434171226120578e-36, + 3.10712610622505210e-26, 
5.73163600862704501e-07, 2.93459763629244023e-04, + 3.00502784877568652e-01, 2.40402154589327210e+00, 1.92321254470970260e+01, + 1.53854000743080690e+02, 2.46084059619524533e+03, 1.96376437319819379e+04, + 7.78830134114269749e+04, 5.76361321148565039e+05, 1.80542466236221301e+06, + 4.07427236013673665e+06, 4.59438300000000000e+06, 4.59822300000000000e+06 + }, /* 4514873 */ - { 8.80201481185765059e-65, 3.78043657558362023e-55, 6.97367459966819779e-36, - 2.99517043385208020e-26, 5.52511424504064165e-07, 2.82885849334287552e-04, - 2.89675097340006849e-01, 2.31740008485763216e+00, 1.85391562717557470e+01, - 1.48310408165256945e+02, 2.37218721144947949e+03, 1.89310433056085276e+04, - 7.50922424384496408e+04, 5.56476519408195047e+05, 1.75003032936007436e+06, - 3.99068042602826888e+06, 4.51077700000000000e+06, 4.51461700000000000e+06 }, + { + 8.80201481185765059e-65, 3.78043657558362023e-55, 6.97367459966819779e-36, + 2.99517043385208020e-26, 5.52511424504064165e-07, 2.82885849334287552e-04, + 2.89675097340006849e-01, 2.31740008485763216e+00, 1.85391562717557470e+01, + 1.48310408165256945e+02, 2.37218721144947949e+03, 1.89310433056085276e+04, + 7.50922424384496408e+04, 5.56476519408195047e+05, 1.75003032936007436e+06, + 3.99068042602826888e+06, 4.51077700000000000e+06, 4.51461700000000000e+06 + }, /* 4216423 */ - { 7.67678466448147999e-65, 3.29715390723822894e-55, 6.08217542984550923e-36, - 2.61227445597212045e-26, 4.81879583396028819e-07, 2.46722346689160995e-04, - 2.52643672927461205e-01, 2.02114881826249349e+00, 1.61691543761076666e+01, - 1.29350920164308604e+02, 2.06897994841936315e+03, 1.65139961617354602e+04, - 6.55409147975342930e+04, 4.88100916845553555e+05, 1.55700132055291533e+06, - 3.69230361198300030e+06, 4.21232700000000000e+06, 4.21616700000000000e+06 }, + { + 7.67678466448147999e-65, 3.29715390723822894e-55, 6.08217542984550923e-36, + 2.61227445597212045e-26, 4.81879583396028819e-07, 2.46722346689160995e-04, + 2.52643672927461205e-01, 
2.02114881826249349e+00, 1.61691543761076666e+01, + 1.29350920164308604e+02, 2.06897994841936315e+03, 1.65139961617354602e+04, + 6.55409147975342930e+04, 4.88100916845553555e+05, 1.55700132055291533e+06, + 3.69230361198300030e+06, 4.21232700000000000e+06, 4.21616700000000000e+06 + }, /* 4194304 */ - { 7.59645238547202323e-65, 3.26265145612235253e-55, 6.01852964128048457e-36, - 2.58493879793062928e-26, 4.76837044516251121e-07, 2.44140566782865192e-04, - 2.49999930461255154e-01, 1.99999888738057052e+00, 1.59999554953052812e+01, - 1.27997365357353743e+02, 2.04733300825732044e+03, 1.63414126607763610e+04, - 6.48586183619030489e+04, 4.83196861208001501e+05, 1.54299802768340637e+06, - 3.67019187768841069e+06, 4.19020800000000000e+06, 4.19404800000000000e+06 }, + { + 7.59645238547202323e-65, 3.26265145612235253e-55, 6.01852964128048457e-36, + 2.58493879793062928e-26, 4.76837044516251121e-07, 2.44140566782865192e-04, + 2.49999930461255154e-01, 1.99999888738057052e+00, 1.59999554953052812e+01, + 1.27997365357353743e+02, 2.04733300825732044e+03, 1.63414126607763610e+04, + 6.48586183619030489e+04, 4.83196861208001501e+05, 1.54299802768340637e+06, + 3.67019187768841069e+06, 4.19020800000000000e+06, 4.19404800000000000e+06 + }, /* 4000000 */ - { 6.90893311684184468e-65, 2.96736417870870697e-55, 5.47382075781328512e-36, - 2.35098811389739960e-26, 4.33680760573953185e-07, 2.22044549405662773e-04, - 2.27373609983355179e-01, 1.81898839734530293e+00, 1.45518762974392430e+01, - 1.16413034003141178e+02, 1.86206657745167763e+03, 1.48642188911844787e+04, - 5.90168968299262124e+04, 4.41096638730170089e+05, 1.42185603096995712e+06, - 3.47596677852119505e+06, 3.99590400000000000e+06, 3.99974400000000000e+06 }, + { + 6.90893311684184468e-65, 2.96736417870870697e-55, 5.47382075781328512e-36, + 2.35098811389739960e-26, 4.33680760573953185e-07, 2.22044549405662773e-04, + 2.27373609983355179e-01, 1.81898839734530293e+00, 1.45518762974392430e+01, + 1.16413034003141178e+02, 
1.86206657745167763e+03, 1.48642188911844787e+04, + 5.90168968299262124e+04, 4.41096638730170089e+05, 1.42185603096995712e+06, + 3.47596677852119505e+06, 3.99590400000000000e+06, 3.99974400000000000e+06 + }, /* 3981553 */ - { 6.84535550514410596e-65, 2.94005780240874949e-55, 5.42344938429471275e-36, - 2.32935377370571273e-26, 4.29689929206757446e-07, 2.20001243745771501e-04, - 2.25281265106172607e-01, 1.80224964497290951e+00, 1.44179667037433958e+01, - 1.15341784471186955e+02, 1.84493404804906459e+03, 1.47276033582964737e+04, - 5.84764753082058160e+04, 4.37191522377733258e+05, 1.41053273133602901e+06, - 3.45752890244734008e+06, 3.97745700000000000e+06, 3.98129700000000000e+06 }, + { + 6.84535550514410596e-65, 2.94005780240874949e-55, 5.42344938429471275e-36, + 2.32935377370571273e-26, 4.29689929206757446e-07, 2.20001243745771501e-04, + 2.25281265106172607e-01, 1.80224964497290951e+00, 1.44179667037433958e+01, + 1.15341784471186955e+02, 1.84493404804906459e+03, 1.47276033582964737e+04, + 5.84764753082058160e+04, 4.37191522377733258e+05, 1.41053273133602901e+06, + 3.45752890244734008e+06, 3.97745700000000000e+06, 3.98129700000000000e+06 + }, /* 3469497 */ - { 5.19785334334943400e-65, 2.23246101190900781e-55, 4.11816369412201186e-36, - 1.76873783858285884e-26, 3.26274542418221493e-07, 1.67052565712777593e-04, - 1.71061821672631342e-01, 1.36849425850745465e+00, 1.09479339161807978e+01, - 8.75821816252248908e+01, 1.40096122943031264e+03, 1.11865973776804603e+04, - 4.44589238065494865e+04, 3.35240937339222815e+05, 1.10925791919939918e+06, - 2.94590981907640956e+06, 3.46540100000000000e+06, 3.46924100000000000e+06 }, + { + 5.19785334334943400e-65, 2.23246101190900781e-55, 4.11816369412201186e-36, + 1.76873783858285884e-26, 3.26274542418221493e-07, 1.67052565712777593e-04, + 1.71061821672631342e-01, 1.36849425850745465e+00, 1.09479339161807978e+01, + 8.75821816252248908e+01, 1.40096122943031264e+03, 1.11865973776804603e+04, + 4.44589238065494865e+04, 
3.35240937339222815e+05, 1.10925791919939918e+06, + 2.94590981907640956e+06, 3.46540100000000000e+06, 3.46924100000000000e+06 + }, /* 2796417 */ - { 3.37671825984804601e-65, 1.45028944938533875e-55, 2.67531183056124863e-36, - 1.14903768188624562e-26, 2.11960040488029904e-07, 1.08523540727069068e-04, - 1.11128102763280903e-01, 8.89024657235948701e-01, 7.11218670620169569e+00, - 5.68968183484790444e+01, 9.10163898031904523e+02, 7.27026311537105084e+03, - 2.89302976804814243e+04, 2.20626239906953182e+05, 7.55430265292525059e+05, - 2.27465918879699614e+06, 2.79232100000000000e+06, 2.79616100000000000e+06 }, + { + 3.37671825984804601e-65, 1.45028944938533875e-55, 2.67531183056124863e-36, + 1.14903768188624562e-26, 2.11960040488029904e-07, 1.08523540727069068e-04, + 1.11128102763280903e-01, 8.89024657235948701e-01, 7.11218670620169569e+00, + 5.68968183484790444e+01, 9.10163898031904523e+02, 7.27026311537105084e+03, + 2.89302976804814243e+04, 2.20626239906953182e+05, 7.55430265292525059e+05, + 2.27465918879699614e+06, 2.79232100000000000e+06, 2.79616100000000000e+06 + }, /* 2396744 */ - { 2.48047143920984062e-65, 1.06535437100683176e-55, 1.96523194297708407e-36, - 8.44060692414111294e-27, 1.55701715756405132e-07, 7.97192784655151597e-05, - 8.16325392969082797e-02, 6.53060210574274436e-01, 5.22447504133784690e+00, - 4.17953751659456785e+01, 6.68609402176202252e+02, 5.34191798810462478e+03, - 2.12726697966660395e+04, 1.63326698532949667e+05, 5.71039962053837837e+05, - 1.87787878976813331e+06, 2.39264800000000000e+06, 2.39648800000000000e+06 }, + { + 2.48047143920984062e-65, 1.06535437100683176e-55, 1.96523194297708407e-36, + 8.44060692414111294e-27, 1.55701715756405132e-07, 7.97192784655151597e-05, + 8.16325392969082797e-02, 6.53060210574274436e-01, 5.22447504133784690e+00, + 4.17953751659456785e+01, 6.68609402176202252e+02, 5.34191798810462478e+03, + 2.12726697966660395e+04, 1.63326698532949667e+05, 5.71039962053837837e+05, + 1.87787878976813331e+06, 
2.39264800000000000e+06, 2.39648800000000000e+06 + }, /* 2098177 */ - { 1.90096951102133711e-65, 8.16460188052975446e-56, 1.50610321353860109e-36, - 6.46866404654879610e-27, 1.19325790165487525e-07, 6.10948045635459623e-05, - 6.25610786307022049e-02, 5.00488559404961619e-01, 4.00390401824189190e+00, - 3.20309469002191776e+01, 5.12416921058289972e+02, 4.09466699542457309e+03, - 1.63148862712246882e+04, 1.25897567119276093e+05, 4.47225202517700847e+05, - 1.58347287791373348e+06, 2.09408100000000000e+06, 2.09792100000000000e+06 }, + { + 1.90096951102133711e-65, 8.16460188052975446e-56, 1.50610321353860109e-36, + 6.46866404654879610e-27, 1.19325790165487525e-07, 6.10948045635459623e-05, + 6.25610786307022049e-02, 5.00488559404961619e-01, 4.00390401824189190e+00, + 3.20309469002191776e+01, 5.12416921058289972e+02, 4.09466699542457309e+03, + 1.63148862712246882e+04, 1.25897567119276093e+05, 4.47225202517700847e+05, + 1.58347287791373348e+06, 2.09408100000000000e+06, 2.09792100000000000e+06 + }, /* 2097152 */ - { 1.89911264358405187e-65, 8.15662669561360700e-56, 1.50463205158771428e-36, - 6.46234545408261769e-27, 1.19209232707357876e-07, 6.10351271449853099e-05, - 6.24999689559159743e-02, 4.99999682108684340e-01, 3.99999300640047295e+00, - 3.19996592233267698e+01, 5.11916432816754536e+02, 4.09066992542314756e+03, - 1.62989912696615120e+04, 1.25777098836656849e+05, 4.46821820522652706e+05, - 1.58246663305044221e+06, 2.09305600000000000e+06, 2.09689600000000000e+06 }, + { + 1.89911264358405187e-65, 8.15662669561360700e-56, 1.50463205158771428e-36, + 6.46234545408261769e-27, 1.19209232707357876e-07, 6.10351271449853099e-05, + 6.24999689559159743e-02, 4.99999682108684340e-01, 3.99999300640047295e+00, + 3.19996592233267698e+01, 5.11916432816754536e+02, 4.09066992542314756e+03, + 1.62989912696615120e+04, 1.25777098836656849e+05, 4.46821820522652706e+05, + 1.58246663305044221e+06, 2.09305600000000000e+06, 2.09689600000000000e+06 + }, /* 1271626 */ - { 6.98247791753670586e-66, 
2.99895143008623366e-56, 5.53208895202860154e-37, - 2.37601411275257565e-27, 4.38297242534678273e-08, 2.24408188175120292e-05, - 2.29793981925642821e-02, 1.83835170037565776e-01, 1.47068036811238745e+00, - 1.17653794451473974e+01, 1.88228655326640251e+02, 1.50478955000098654e+03, - 6.00493221828217247e+03, 4.69964688955476740e+04, 1.74675738335436967e+05, - 7.93705775574441534e+05, 1.26753000000000000e+06, 1.27137000000000000e+06 }, + { + 6.98247791753670586e-66, 2.99895143008623366e-56, 5.53208895202860154e-37, + 2.37601411275257565e-27, 4.38297242534678273e-08, 2.24408188175120292e-05, + 2.29793981925642821e-02, 1.83835170037565776e-01, 1.47068036811238745e+00, + 1.17653794451473974e+01, 1.88228655326640251e+02, 1.50478955000098654e+03, + 6.00493221828217247e+03, 4.69964688955476740e+04, 1.74675738335436967e+05, + 7.93705775574441534e+05, 1.26753000000000000e+06, 1.27137000000000000e+06 + }, /* 1180417 */ - { 6.01674571488324041e-66, 2.58417260737716580e-56, 4.76695707305772932e-37, - 2.04739247302188301e-27, 3.77677249682731562e-08, 1.93370751835450871e-05, - 1.98011647667272750e-02, 1.58409305733226813e-01, 1.26727365222838628e+00, - 1.01381384252463871e+01, 1.62196284074367895e+02, 1.29673859731428774e+03, - 5.17557281139463612e+03, 4.05690452754900689e+04, 1.51559237337625702e+05, - 7.11307437578365323e+05, 1.17632100000000000e+06, 1.18016100000000000e+06 }, + { + 6.01674571488324041e-66, 2.58417260737716580e-56, 4.76695707305772932e-37, + 2.04739247302188301e-27, 3.77677249682731562e-08, 1.93370751835450871e-05, + 1.98011647667272750e-02, 1.58409305733226813e-01, 1.26727365222838628e+00, + 1.01381384252463871e+01, 1.62196284074367895e+02, 1.29673859731428774e+03, + 5.17557281139463612e+03, 4.05690452754900689e+04, 1.51559237337625702e+05, + 7.11307437578365323e+05, 1.17632100000000000e+06, 1.18016100000000000e+06 + }, /* 1048576 */ - { 4.74777934504035996e-66, 2.03915570155726458e-56, 3.76157833530725135e-37, - 1.61558559314867667e-27, 
2.98022939659853163e-08, 1.52587745104367425e-05, - 1.56249849436188217e-02, 1.24999870856632000e-01, 9.99998410544928107e-01, - 7.99995168077293606e+00, 1.27989461928571330e+02, 1.02333268407003743e+03, - 4.08535025830558106e+03, 3.20958386865916218e+04, 1.20799142289413823e+05, - 5.95242529642230948e+05, 1.04448000000000000e+06, 1.04832000000000000e+06 }, + { + 4.74777934504035996e-66, 2.03915570155726458e-56, 3.76157833530725135e-37, + 1.61558559314867667e-27, 2.98022939659853163e-08, 1.52587745104367425e-05, + 1.56249849436188217e-02, 1.24999870856632000e-01, 9.99998410544928107e-01, + 7.99995168077293606e+00, 1.27989461928571330e+02, 1.02333268407003743e+03, + 4.08535025830558106e+03, 3.20958386865916218e+04, 1.20799142289413823e+05, + 5.95242529642230948e+05, 1.04448000000000000e+06, 1.04832000000000000e+06 + }, /* 1000000 */ - { 4.31807995946294477e-66, 1.85460122074063535e-56, 3.42113540777918151e-37, - 1.46936646915992086e-27, 2.71050272070828090e-08, 1.38777739298982540e-05, - 1.42108403697154325e-02, 1.13686715418339940e-01, 9.09493240826389937e-01, - 7.27591504542061607e+00, 1.16406170946493603e+02, 9.30743673031597268e+02, - 3.71605194956770447e+03, 2.92188944778244804e+04, 1.10274089241209091e+05, - 5.53554744840516942e+05, 9.95904000000000000e+05, 9.99744000000000000e+05 }, + { + 4.31807995946294477e-66, 1.85460122074063535e-56, 3.42113540777918151e-37, + 1.46936646915992086e-27, 2.71050272070828090e-08, 1.38777739298982540e-05, + 1.42108403697154325e-02, 1.13686715418339940e-01, 9.09493240826389937e-01, + 7.27591504542061607e+00, 1.16406170946493603e+02, 9.30743673031597268e+02, + 3.71605194956770447e+03, 2.92188944778244804e+04, 1.10274089241209091e+05, + 5.53554744840516942e+05, 9.95904000000000000e+05, 9.99744000000000000e+05 + }, /* 819841 */ - { 2.90235045358949550e-66, 1.24655002796976490e-56, 2.29947893410337365e-37, - 9.87618681981492889e-28, 1.82183490689266710e-08, 9.32779472321984348e-06, - 9.55166172246109217e-03, 
7.64132896251342869e-02, 6.11306051109687054e-01, - 4.89043139187867482e+00, 7.82422349713714453e+01, 6.25659192034058037e+02, - 2.49882041253832767e+03, 1.97089496950203138e+04, 7.51500479695295217e+04, - 4.05315292462697893e+05, 8.15745000000000000e+05, 8.19585000000000000e+05 }, + { + 2.90235045358949550e-66, 1.24655002796976490e-56, 2.29947893410337365e-37, + 9.87618681981492889e-28, 1.82183490689266710e-08, 9.32779472321984348e-06, + 9.55166172246109217e-03, 7.64132896251342869e-02, 6.11306051109687054e-01, + 4.89043139187867482e+00, 7.82422349713714453e+01, 6.25659192034058037e+02, + 2.49882041253832767e+03, 1.97089496950203138e+04, 7.51500479695295217e+04, + 4.05315292462697893e+05, 8.15745000000000000e+05, 8.19585000000000000e+05 + }, /* 652545 */ - { 1.83870213969147930e-66, 7.89716555706012712e-57, 1.45676991938802090e-37, - 6.25677916156810610e-28, 1.15417203919164522e-08, 5.90936084062561679e-06, - 6.05118546342795372e-03, 4.84094816125079305e-02, 3.87275718825497939e-01, - 3.09819716985184357e+00, 4.95688012285943671e+01, 3.96409870457700265e+02, - 1.58371435214666167e+03, 1.25273157680588301e+04, 4.82278663969549234e+04, - 2.79276527717245917e+05, 6.48449000000000000e+05, 6.52289000000000000e+05 }, + { + 1.83870213969147930e-66, 7.89716555706012712e-57, 1.45676991938802090e-37, + 6.25677916156810610e-28, 1.15417203919164522e-08, 5.90936084062561679e-06, + 6.05118546342795372e-03, 4.84094816125079305e-02, 3.87275718825497939e-01, + 3.09819716985184357e+00, 4.95688012285943671e+01, 3.96409870457700265e+02, + 1.58371435214666167e+03, 1.25273157680588301e+04, 4.82278663969549234e+04, + 2.79276527717245917e+05, 6.48449000000000000e+05, 6.52289000000000000e+05 + }, /* 524801 */ - { 1.18926762015466819e-66, 5.10786553475605035e-57, 9.42234882825664415e-38, - 4.04686800688662073e-28, 7.46515384231198445e-09, 3.82215876724521482e-06, - 3.91389055821865072e-03, 3.13111233760198990e-02, 2.50488917265499822e-01, - 2.00390687460218908e+00, 3.20612857504726705e+01, 
2.56417175606829119e+02, - 1.02466699434609745e+03, 8.12310498202530835e+03, 3.15045400686032553e+04, - 1.93198962659155397e+05, 5.20705000000000000e+05, 5.24545000000000000e+05 }, + { + 1.18926762015466819e-66, 5.10786553475605035e-57, 9.42234882825664415e-38, + 4.04686800688662073e-28, 7.46515384231198445e-09, 3.82215876724521482e-06, + 3.91389055821865072e-03, 3.13111233760198990e-02, 2.50488917265499822e-01, + 2.00390687460218908e+00, 3.20612857504726705e+01, 2.56417175606829119e+02, + 1.02466699434609745e+03, 8.12310498202530835e+03, 3.15045400686032553e+04, + 1.93198962659155397e+05, 5.20705000000000000e+05, 5.24545000000000000e+05 + }, /* 401857 */ - { 6.97321585851295025e-67, 2.99497340602616845e-57, 5.52475079285309336e-38, - 2.37286239738541065e-28, 4.37715853666972486e-09, 2.24110517076658296e-06, - 2.29489168613654978e-03, 1.83591329998224681e-02, 1.46873032685309740e-01, - 1.17498225743608220e+00, 1.87991664504370917e+01, 1.50360504164546711e+02, - 6.00992138052254290e+02, 4.77454013471333201e+03, 1.86505860938960723e+04, - 1.21176669942356806e+05, 3.97761000000000000e+05, 4.01601000000000000e+05 }, + { + 6.97321585851295025e-67, 2.99497340602616845e-57, 5.52475079285309336e-38, + 2.37286239738541065e-28, 4.37715853666972486e-09, 2.24110517076658296e-06, + 2.29489168613654978e-03, 1.83591329998224681e-02, 1.46873032685309740e-01, + 1.17498225743608220e+00, 1.87991664504370917e+01, 1.50360504164546711e+02, + 6.00992138052254290e+02, 4.77454013471333201e+03, 1.86505860938960723e+04, + 1.21176669942356806e+05, 3.97761000000000000e+05, 4.01601000000000000e+05 + }, /* 264097 */ - { 3.01173257048041585e-67, 1.29352928945114011e-57, 2.38614037543525460e-38, - 1.02483948761595803e-28, 1.89049517446831162e-09, 9.67933529325415291e-07, - 9.91163931551744364e-04, 7.92931131353941630e-03, 6.34344816203459005e-02, - 5.07475284133261262e-01, 8.11944852670449713e+00, 6.49462697901977464e+01, - 2.59657344516898661e+02, 2.06775748649864772e+03, 8.14269081216647010e+03, 
- 5.66232434728111548e+04, 2.60001000000000000e+05, 2.63841000000000000e+05 }, + { + 3.01173257048041585e-67, 1.29352928945114011e-57, 2.38614037543525460e-38, + 1.02483948761595803e-28, 1.89049517446831162e-09, 9.67933529325415291e-07, + 9.91163931551744364e-04, 7.92931131353941630e-03, 6.34344816203459005e-02, + 5.07475284133261262e-01, 8.11944852670449713e+00, 6.49462697901977464e+01, + 2.59657344516898661e+02, 2.06775748649864772e+03, 8.14269081216647010e+03, + 5.66232434728111548e+04, 2.60001000000000000e+05, 2.63841000000000000e+05 + }, /* 204800 */ - { 1.81112697232874206e-67, 7.77873111505544409e-58, 1.43492262097629106e-38, - 6.16294572938377368e-29, 1.13686282610103304e-09, 5.82073766962628089e-07, - 5.96043536214395245e-04, 4.76834822495310166e-03, 3.81467816548533359e-02, - 3.05173987973646754e-01, 4.88271104998955341e+00, 3.90573427578099199e+01, - 1.56169795671348624e+02, 1.24492319174046884e+03, 4.91958032892884057e+03, - 3.52628737812490363e+04, 2.00704000000000000e+05, 2.04544000000000000e+05 }, + { + 1.81112697232874206e-67, 7.77873111505544409e-58, 1.43492262097629106e-38, + 6.16294572938377368e-29, 1.13686282610103304e-09, 5.82073766962628089e-07, + 5.96043536214395245e-04, 4.76834822495310166e-03, 3.81467816548533359e-02, + 3.05173987973646754e-01, 4.88271104998955341e+00, 3.90573427578099199e+01, + 1.56169795671348624e+02, 1.24492319174046884e+03, 4.91958032892884057e+03, + 3.52628737812490363e+04, 2.00704000000000000e+05, 2.04544000000000000e+05 + }, /* 200000 */ - { 1.72722507485033383e-67, 7.41837520931333590e-58, 1.36844868928954633e-38, - 5.87744236675266698e-29, 1.08419675147463808e-09, 5.55108736753989574e-07, - 5.68431345360095224e-04, 4.54745070256641366e-03, 3.63796017604134173e-02, - 2.91036567035938998e-01, 4.65651731179381212e+00, 3.72480912910018702e+01, - 1.48936880685972909e+02, 1.18736413772828405e+03, 4.69345257857060551e+03, - 3.37256310720094916e+04, 1.95904000000000000e+05, 1.99744000000000000e+05 }, + { + 
1.72722507485033383e-67, 7.41837520931333590e-58, 1.36844868928954633e-38, + 5.87744236675266698e-29, 1.08419675147463808e-09, 5.55108736753989574e-07, + 5.68431345360095224e-04, 4.54745070256641366e-03, 3.63796017604134173e-02, + 2.91036567035938998e-01, 4.65651731179381212e+00, 3.72480912910018702e+01, + 1.48936880685972909e+02, 1.18736413772828405e+03, 4.69345257857060551e+03, + 3.37256310720094916e+04, 1.95904000000000000e+05, 1.99744000000000000e+05 + }, /* 102774 */ - { 4.56093001325520124e-68, 1.95890452462759358e-58, 3.61354104306368883e-39, - 1.55200406027122712e-29, 2.86294217011813689e-10, 1.46582639109909510e-07, - 1.50100622302544283e-04, 1.20080497023619128e-03, 9.60643923810312883e-03, - 7.68514803825075948e-02, 1.22961449148191515e+00, 9.83636673154418517e+00, - 3.93379364327392551e+01, 3.14142047803753769e+02, 1.24891387725365462e+03, - 9.44593016329059901e+03, 9.86780000000517612e+04, 1.02518000000000000e+05 }, + { + 4.56093001325520124e-68, 1.95890452462759358e-58, 3.61354104306368883e-39, + 1.55200406027122712e-29, 2.86294217011813689e-10, 1.46582639109909510e-07, + 1.50100622302544283e-04, 1.20080497023619128e-03, 9.60643923810312883e-03, + 7.68514803825075948e-02, 1.22961449148191515e+00, 9.83636673154418517e+00, + 3.93379364327392551e+01, 3.14142047803753769e+02, 1.24891387725365462e+03, + 9.44593016329059901e+03, 9.86780000000517612e+04, 1.02518000000000000e+05 + }, /* 100000 */ - { 4.31804109670444684e-68, 1.85458452931295726e-58, 3.42110461752972125e-39, - 1.46935324484847411e-29, 2.71047832615944429e-10, 1.38776490299235408e-07, - 1.42107125931920287e-04, 1.13685699991618186e-03, 9.09485551682193138e-03, - 7.27588132541049926e-02, 1.16413254204269756e+00, 9.31255441700961661e+00, - 3.72432805975374706e+01, 2.97429023684080164e+02, 1.18266355295424069e+03, - 8.95817783366734693e+03, 9.59040000001018925e+04, 9.97440000000000000e+04 }, + { + 4.31804109670444684e-68, 1.85458452931295726e-58, 3.42110461752972125e-39, + 1.46935324484847411e-29, 
2.71047832615944429e-10, 1.38776490299235408e-07, + 1.42107125931920287e-04, 1.13685699991618186e-03, 9.09485551682193138e-03, + 7.27588132541049926e-02, 1.16413254204269756e+00, 9.31255441700961661e+00, + 3.72432805975374706e+01, 2.97429023684080164e+02, 1.18266355295424069e+03, + 8.95817783366734693e+03, 9.59040000001018925e+04, 9.97440000000000000e+04 + }, /* 77163 */ - { 2.57100957639565332e-68, 1.10424020483221446e-58, 2.03696364544404734e-39, - 8.74869224032312274e-30, 1.61384886736889072e-10, 8.26290620092283361e-08, - 8.46121594356573939e-05, 6.76897272021500683e-04, 5.41517795449147563e-03, - 4.33214094483818091e-02, 6.93138659749164665e-01, 5.54487683849644686e+00, - 2.21763200560975164e+01, 1.77172840383531820e+02, 7.05445676326827083e+02, - 5.40962011023344166e+03, 7.30670000269061129e+04, 7.69070000000000000e+04 }, + { + 2.57100957639565332e-68, 1.10424020483221446e-58, 2.03696364544404734e-39, + 8.74869224032312274e-30, 1.61384886736889072e-10, 8.26290620092283361e-08, + 8.46121594356573939e-05, 6.76897272021500683e-04, 5.41517795449147563e-03, + 4.33214094483818091e-02, 6.93138659749164665e-01, 5.54487683849644686e+00, + 2.21763200560975164e+01, 1.77172840383531820e+02, 7.05445676326827083e+02, + 5.40962011023344166e+03, 7.30670000269061129e+04, 7.69070000000000000e+04 + }, /* 50643 */ - { 1.10744301397987420e-68, 4.75643152722723048e-59, 8.77406750868841857e-40, - 3.76843330027129536e-30, 6.95153246489491803e-11, 3.55918462202453374e-08, - 3.64460505120626550e-05, 2.91568403117305078e-04, 2.33254716226988625e-03, - 1.86603732873723421e-02, 2.98564872499666734e-01, 2.38845326899687205e+00, - 9.55291197889362387e+00, 7.63560630702938568e+01, 3.04504893070908849e+02, - 2.36897008846858444e+03, 4.65470174614963616e+04, 5.03870000000000000e+04 }, + { + 1.10744301397987420e-68, 4.75643152722723048e-59, 8.77406750868841857e-40, + 3.76843330027129536e-30, 6.95153246489491803e-11, 3.55918462202453374e-08, + 3.64460505120626550e-05, 2.91568403117305078e-04, 
2.33254716226988625e-03, + 1.86603732873723421e-02, 2.98564872499666734e-01, 2.38845326899687205e+00, + 9.55291197889362387e+00, 7.63560630702938568e+01, 3.04504893070908849e+02, + 2.36897008846858444e+03, 4.65470174614963616e+04, 5.03870000000000000e+04 + }, /* 6 */ - { 1.29542528326416669e-76, 5.56380922603113208e-67, 1.02634164867540313e-47, - 4.40810381558357815e-38, 8.13151629364128326e-19, 4.16333634234433703e-16, - 4.26325641456043956e-13, 3.41060513164744692e-12, 2.72848410531216727e-11, - 2.18278728421267612e-10, 3.49245965372384226e-09, 2.79396771690754164e-08, - 1.11758707843634397e-07, 8.94069600576588975e-07, 3.57627754965526357e-06, - 2.86101567327154416e-05, 3.66091750036190520e-03, 5.82894668923472636e-02 }, + { + 1.29542528326416669e-76, 5.56380922603113208e-67, 1.02634164867540313e-47, + 4.40810381558357815e-38, 8.13151629364128326e-19, 4.16333634234433703e-16, + 4.26325641456043956e-13, 3.41060513164744692e-12, 2.72848410531216727e-11, + 2.18278728421267612e-10, 3.49245965372384226e-09, 2.79396771690754164e-08, + 1.11758707843634397e-07, 8.94069600576588975e-07, 3.57627754965526357e-06, + 2.86101567327154416e-05, 3.66091750036190520e-03, 5.82894668923472636e-02 + }, }; -static void printdouble( const int width, const double value ) -{ - if (width < 10) +static void printdouble( const int width, const double value ) { + if (width < 10) { printf("%.*s|", width - 1, "----------"); - else if (value == 0.0) - printf (" %*.3f |", width - 2, value); - else if (value < 1.0e-100) - printf (" %.*e |", width - 9, value); - else if (value < 1.0e-6) - printf (" %.*e |", width - 9, value); - else if (value < 1.0) - printf (" %*.*f |", width - 3, width - 5, value); - else if (value < 1.0e6) - printf (" %*.3f |", width - 2, value); - else - printf (" %*.1f |", width - 4, value); + } else if (value == 0.0) { + printf(" %*.3f |", width - 2, value); + } else if (value < 1.0e-100) { + printf(" %.*e |", width - 9, value); + } else if (value < 1.0e-6) { + printf(" 
%.*e |", width - 9, value); + } else if (value < 1.0) { + printf(" %*.*f |", width - 3, width - 5, value); + } else if (value < 1.0e6) { + printf(" %*.3f |", width - 2, value); + } else { + printf(" %*.1f |", width - 4, value); + } } -void ReportCollisionEstimates( void ) -{ +void ReportCollisionEstimates( void ) { const int keys[] = { - 149633745, 86536545, 75498113, 56050289, 49925029, 44251425, - 43691201, 33558529, 33554432, 26977161, 22370049, 18877441, - 18616785, 17676661, 16777216, 16777214, 15082603, 14986273, - 14776336, 14196869, 12204240, 11017633, 9437505, 8390657, - 8388608, 8303633, 6445069, 5471025, 5461601, 5000000, - 4720129, 4598479, 4514873, 4216423, 4194304, 4000000, - 3981553, 3469497, 2796417, 2396744, 2098177, 2097152, - 1271626, 1180417, 1048576, 1000000, 819841, 652545, - 524801, 401857, 264097, 204800, 200000, 102774, - 100000, 77163, 50643, 6 + 149633745, 86536545, 75498113, 56050289, 49925029, 44251425, + 43691201, 33558529, 33554432, 26977161, 22370049, 18877441, + 18616785, 17676661, 16777216, 16777214, 15082603, 14986273, + 14776336, 14196869, 12204240, 11017633, 9437505, 8390657, + 8388608, 8303633, 6445069, 5471025, 5461601, 5000000, + 4720129, 4598479, 4514873, 4216423, 4194304, 4000000, + 3981553, 3469497, 2796417, 2396744, 2098177, 2097152, + 1271626, 1180417, 1048576, 1000000, 819841, 652545, + 524801, 401857, 264097, 204800, 200000, 102774, + 100000, 77163, 50643, 6 }; const int bits[] = { 256, 224, 160, 128, 64, 55, 45, 42, 39, 36, 32, 29, 27, 24, 22, 19, 12, 8 }; - printf ("EstimateNbCollisions:\n"); - printf (" # keys : bits| True answer | A: _fwojcik() | B: _previmpl() | C: _Demerphq() | Error A | Error B | Error C |\n"); - printf ("---------------------------------------------------------------------------------------------------------------------------------------------------\n"); - for (int i = 0; i < sizeof(keys)/sizeof(keys[0]); i++) { - const int key = keys[i]; - for (int j = 0; j < sizeof(bits)/sizeof(bits[0]); j++) 
{ - const int bit = bits[j]; - printf (" %9d : %3d |", key, bit); - printdouble(20, realcoll[i][j]); - for (int k = 0; k < COLLISION_ESTIMATORS; k++) { - printdouble(20, EstimateNbCollisionsCand(key, bit, k)); - } - for (int k = 0; k < COLLISION_ESTIMATORS; k++) { - double delta = EstimateNbCollisionsCand(key, bit, k) - realcoll[i][j]; - double deltapct = delta/realcoll[i][j]*100.0; - if (deltapct > 9999.999) - deltapct = 9999.999; - printf(" %+11.5f%% |", deltapct); + + printf("EstimateNbCollisions:\n"); + printf( + " # keys : bits| True answer | A: _fwojcik() | B: _previmpl() | C: _Demerphq() | Error A | Error B | Error C |\n"); + printf( + "---------------------------------------------------------------------------------------------------------------------------------------------------\n"); + for (int i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) { + const int key = keys[i]; + for (int j = 0; j < sizeof(bits) / sizeof(bits[0]); j++) { + const int bit = bits[j]; + printf(" %9d : %3d |", key, bit); + printdouble(20, realcoll[i][j]); + for (int k = 0; k < COLLISION_ESTIMATORS; k++) { + printdouble(20, EstimateNbCollisionsCand(key, bit, k)); + } + for (int k = 0; k < COLLISION_ESTIMATORS; k++) { + double delta = EstimateNbCollisionsCand(key, bit, k) - realcoll[i][j]; + double deltapct = delta / realcoll[i][j] * 100.0; + if (deltapct > 9999.999) { + deltapct = 9999.999; + } + printf(" %+11.5f%% |", deltapct); + } + printf("\n"); } - printf("\n"); - } } } //----------------------------------------------------------------------------- + /* * Compute the lowest number of hash bits (n) such that there are * fewer than (2**n)*log(2**n) hashes, for a given hash count. @@ -824,13 +927,15 @@ void ReportCollisionEstimates( void ) * This may validly return a value exceeding the number of hash bits * that exist for the hash being tested! 
*/ -int GetNLogNBound ( unsigned nbH ) -{ - int nbHBits; - for (nbHBits = 1; nbHBits <= 255; nbHBits++) - if (nbH < (log(2.0) * nbHBits * exp2(nbHBits))) - break; - return nbHBits - 1; +int GetNLogNBound( unsigned nbH ) { + int nbHBits; + + for (nbHBits = 1; nbHBits <= 255; nbHBits++) { + if (nbH < (log(2.0) * nbHBits * exp2(nbHBits))) { + break; + } + } + return nbHBits - 1; } /* @@ -849,17 +954,15 @@ int GetNLogNBound ( unsigned nbH ) * use pow(), but this alternate formulation does the same thing for * values in 1-p space. */ -double ScalePValue ( double p_value, unsigned testcount ) -{ - return -expm1(log1p(-p_value) * testcount); +double ScalePValue( double p_value, unsigned testcount ) { + return -expm1(log1p(-p_value) * testcount); } /* * This is exactly the same as ScalePValue, but for 2**N tests. */ -double ScalePValue2N ( double p_value, unsigned testbits ) -{ - return -expm1(log1p(-p_value) * exp2(testbits)); +double ScalePValue2N( double p_value, unsigned testbits ) { + return -expm1(log1p(-p_value) * exp2(testbits)); } /* @@ -875,8 +978,7 @@ double ScalePValue2N ( double p_value, unsigned testbits ) * the caret (^) to display these values, as that can indicate * exponentiation, and the p-value is no less than 1/(2**logp_value). */ -int GetLog2PValue ( double p_value ) -{ +int GetLog2PValue( double p_value ) { return (log2(p_value) <= -99.0) ? 99 : -ceil(log2(p_value)); } @@ -884,10 +986,9 @@ int GetLog2PValue ( double p_value ) * Given a mean and standard deviation, return (1.0 - p) for the given * random normal variable. 
*/ -double GetNormalPValue(const double mu, const double sd, const double variable) -{ - double stdvar = (variable - mu) / sd; - double p_value = erfc(stdvar/sqrt(2.0))/2.0; +double GetNormalPValue( const double mu, const double sd, const double variable ) { + double stdvar = (variable - mu) / sd; + double p_value = erfc(stdvar / sqrt(2.0)) / 2.0; return p_value; } @@ -895,16 +996,20 @@ double GetNormalPValue(const double mu, const double sd, const double variable) /* * A helper function for the Peizer and Pratt approximation below. */ -static double GFunc_PeizerPratt(const double x) { - if (x < 0.0) +static double GFunc_PeizerPratt( const double x ) { + if (x < 0.0) { return NAN; - if (x == 0.0) + } + if (x == 0.0) { return 1.0; - if (x == 1.0) + } + if (x == 1.0) { return 0.0; - if (x > 1.0) - return -GFunc_PeizerPratt(1.0/x); - return (1.0 - x*x + 2*x*log(x))/((1.0 - x)*(1.0 - x)); + } + if (x > 1.0) { + return -GFunc_PeizerPratt(1.0 / x); + } + return (1.0 - x * x + 2 * x * log(x)) / ((1.0 - x) * (1.0 - x)); } /* @@ -929,28 +1034,29 @@ static double GFunc_PeizerPratt(const double x) { * "APPROXIMATIONS TO THE BINOMIAL", by MYRTLE ANNA BRUCE * https://core.ac.uk/download/pdf/33362622.pdf */ -double EstimatedBinomialPValue(const unsigned long nbH, const int nbBits, const int maxColl) -{ - const double s = maxColl + 1; - const double n = nbH; - const double t = nbH - maxColl; - const double p = exp2(-nbBits); - const double q = 1.0 - p; +double EstimatedBinomialPValue( const unsigned long nbH, const int nbBits, const int maxColl ) { + const double s = maxColl + 1; + const double n = nbH; + const double t = nbH - maxColl; + const double p = exp2(-nbBits); + const double q = 1.0 - p; - const double d1 = s + 1.0/6.0 - p * (n + 1.0/3.0); - const double d2 = d1 + 0.02 * (q/(s+0.5) - p/(t+0.5) + (q-0.5)/(n+1)); + const double d1 = s + 1.0 / 6.0 - p * (n + 1.0 / 3.0); + const double d2 = d1 + 0.02 * (q / (s + 0.5) - p / (t + 0.5) + (q - 0.5) / (n + 1)); - const double 
num = 1.0 + q*GFunc_PeizerPratt(s/(n*p)) + p*GFunc_PeizerPratt(t/(n*q)); - const double denom = (n + 1.0/6.0) * p * q; - const double z2 = d2 * sqrt(num/denom); + const double num = 1.0 + q * GFunc_PeizerPratt(s / (n * p)) + p * GFunc_PeizerPratt(t / (n * q)); + const double denom = (n + 1.0 / 6.0) * p * q; + const double z2 = d2 * sqrt(num / denom); // (1.0 - p) for one hash bin double p_value = GetNormalPValue(0.0, 1.0, z2); - //fprintf(stderr, "Pr(Xi > %ld; %d, %d) ~= 1.0 - N(%f)\n", nbH, nbBits, maxColl, z2); + // fprintf(stderr, "Pr(Xi > %ld; %d, %d) ~= 1.0 - N(%f)\n", nbH, nbBits, maxColl, z2); // (1.0 - p) across all 2**nbBits hash bins double pm_value = ScalePValue2N(p_value, nbBits); - //fprintf(stderr,"Pr(Xm > %ld; %d, %d) ~= 1.0-((1.0-%e)**(2**n)) == %.12f\n", nbH, nbBits, maxColl, p_value, pm_value, pm_value); + + // fprintf(stderr,"Pr(Xm > %ld; %d, %d) ~= 1.0-((1.0-%e)**(2**n)) == %.12f\n", nbH, nbBits, maxColl, p_value, + // pm_value, pm_value); return pm_value; } @@ -972,14 +1078,13 @@ double EstimatedBinomialPValue(const unsigned long nbH, const int nbBits, const * 50th-percentile for a given nbBits were computed via linear * regression from Monte Carlo experiments by fwojcik [N ~= 80,000,000]. */ -double EstimateMaxCollisions(const unsigned long nbH, const int nbBits) -{ +double EstimateMaxCollisions( const unsigned long nbH, const int nbBits ) { double alpha = -expm1(-0.128775055 * nbBits - 0.759110989); double m = (double)nbH - 16; double n = exp2(nbBits); double logn = nbBits * log(2); - return (m/n) + alpha * sqrt(2.0 * (m/n) * logn); + return (m / n) + alpha * sqrt(2.0 * (m / n) * logn); } /* @@ -990,7 +1095,7 @@ double EstimateMaxCollisions(const unsigned long nbH, const int nbBits) * p-value using a single calculation. 
This is taken from: * * "Sharp Bounds on Tail Probabilities for Poisson Random Variables", by - * Peter Harremoës + * Peter Harremoës * https://helda.helsinki.fi/bitstream/handle/10138/229679/witmse_proc_17.pdf * * Similar to other places in SMHasher3, this returns 1.0-p, so the @@ -998,13 +1103,13 @@ double EstimateMaxCollisions(const unsigned long nbH, const int nbBits) * computing real p-values for lower-than-expected collision counts, * since that is never a failure condition. */ -double BoundedPoissonPValue(const double expected, const uint64_t collisions) -{ - if (collisions < expected) +double BoundedPoissonPValue( const double expected, const uint64_t collisions ) { + if (collisions < expected) { return 1.0; - double x = (double)collisions - 0.5; + } + double x = (double)collisions - 0.5; double g_over_root2 = sqrt(x * log(x / expected) + expected - x); - double p_lbound = erfc(g_over_root2)/2.0; + double p_lbound = erfc(g_over_root2) / 2.0; return p_lbound; } @@ -1071,32 +1176,32 @@ double BoundedPoissonPValue(const double expected, const uint64_t collisions) // sumN{(Bi**2)} - M * lambda // // NB: bincount must be a non-zero multiple of 8! 
-double calcScore ( const unsigned * bins, const int bincount, const int keycount ) -{ - const double n = bincount; - const double k = keycount; - const double lambda = k/n; +double calcScore( const unsigned * bins, const int bincount, const int keycount ) { + const double n = bincount; + const double k = keycount; + const double lambda = k / n; - uint64_t sumsq = 0; + uint64_t sumsq = 0; - assume(bincount >= 8); - for(int i = 0; i < (bincount>>3)<<3; i++) - sumsq += (uint64_t)bins[i] * (uint64_t)bins[i]; + assume(bincount >= 8); + for (int i = 0; i < (bincount >> 3) << 3; i++) { + sumsq += (uint64_t)bins[i] * (uint64_t)bins[i]; + } - double sumsqe = (double)sumsq - lambda * k; - double rmse = sqrt(sumsqe/n); - double rmse_ratio_m1 = (rmse - sqrt(lambda))/sqrt(lambda); // == rmse/sqrt(lambda) - 1.0 - double score = (rmse_ratio_m1) * sqrt(2.0 * n); + double sumsqe = (double)sumsq - lambda * k; + double rmse = sqrt(sumsqe / n); + double rmse_ratio_m1 = (rmse - sqrt(lambda)) / sqrt(lambda); // == rmse/sqrt(lambda) - 1.0 + double score = (rmse_ratio_m1) * sqrt(2.0 * n); - return score; + return score; } // Convert the score from calcScore back into (rmse/sqrt(lambda) - // 1.0), to show the user something like the previous report. -double normalizeScore ( double score, int scorewidth, int tests ) -{ - if (score <= 0) +double normalizeScore( double score, int scorewidth, int tests ) { + if (score <= 0) { return 0.0; + } // Never return a result higher than this, as a precise value // would be visually cluttered and not really meaningful. 
@@ -1104,8 +1209,9 @@ double normalizeScore ( double score, int scorewidth, int tests ) double result = score / sqrt(2.0 * scorewidth); - if (result > maxresult) + if (result > maxresult) { return maxresult; + } return result; } diff --git a/util/Stats.h b/util/Stats.h index 2dce94dc..35d3acef 100644 --- a/util/Stats.h +++ b/util/Stats.h @@ -50,27 +50,27 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ -double CalcMean ( std::vector & v ); -double CalcMean ( std::vector & v, int a, int b ); -double CalcStdv ( std::vector & v ); -double CalcStdv ( std::vector & v, int a, int b ); -bool ContainsOutlier ( std::vector & v, size_t len ); -void FilterOutliers ( std::vector & v ); +double CalcMean( std::vector & v ); +double CalcMean( std::vector & v, int a, int b ); +double CalcStdv( std::vector & v ); +double CalcStdv( std::vector & v, int a, int b ); +bool ContainsOutlier( std::vector & v, size_t len ); +void FilterOutliers( std::vector & v ); -double chooseK ( int b, int k ); -double chooseUpToK ( int n, int k ); +double chooseK( int b, int k ); +double chooseUpToK( int n, int k ); -double EstimateNbCollisions(const unsigned long nbH, const int nbBits); +double EstimateNbCollisions( const unsigned long nbH, const int nbBits ); void ReportCollisionEstimates( void ); -int GetNLogNBound ( unsigned nbH ); -double ScalePValue ( double p_value, unsigned testcount ); -double ScalePValue2N ( double p_value, unsigned testbits ); -int GetLog2PValue ( double p_value ); -double GetNormalPValue(const double mu, const double sd, const double variable); -double EstimatedBinomialPValue(const unsigned long nbH, const int nbBits, const int maxColl); -double EstimateMaxCollisions(const unsigned long nbH, const int nbBits); -double BoundedPoissonPValue(const double expected, const uint64_t collisions); +int GetNLogNBound( unsigned nbH ); +double ScalePValue( double p_value, unsigned testcount ); +double ScalePValue2N( double p_value, 
unsigned testbits ); +int GetLog2PValue( double p_value ); +double GetNormalPValue( const double mu, const double sd, const double variable ); +double EstimatedBinomialPValue( const unsigned long nbH, const int nbBits, const int maxColl ); +double EstimateMaxCollisions( const unsigned long nbH, const int nbBits ); +double BoundedPoissonPValue( const double expected, const uint64_t collisions ); -double calcScore ( const unsigned * bins, const int bincount, const int ballcount ); -double normalizeScore ( double score, int scorewidth, int tests ); +double calcScore( const unsigned * bins, const int bincount, const int ballcount ); +double normalizeScore( double score, int scorewidth, int tests ); diff --git a/util/TestGlobals.h b/util/TestGlobals.h index 1dc66418..58f92b64 100644 --- a/util/TestGlobals.h +++ b/util/TestGlobals.h @@ -47,42 +47,42 @@ extern HashInfo::endianness g_hashEndian; // Recording test results for final summary printout #define COUNT_MAX_PVALUE 18 -extern uint32_t g_log2pValueCounts[COUNT_MAX_PVALUE+2]; +extern uint32_t g_log2pValueCounts[COUNT_MAX_PVALUE + 2]; -static inline void recordLog2PValue(uint32_t log_pvalue) { - if (log_pvalue <= COUNT_MAX_PVALUE) { - g_log2pValueCounts[log_pvalue]++; - } else { - g_log2pValueCounts[COUNT_MAX_PVALUE+1]++; - } +static inline void recordLog2PValue( uint32_t log_pvalue ) { + if (log_pvalue <= COUNT_MAX_PVALUE) { + g_log2pValueCounts[log_pvalue]++; + } else { + g_log2pValueCounts[COUNT_MAX_PVALUE + 1]++; + } } extern uint32_t g_testPass, g_testFail; -extern std::vector< std::pair > g_testFailures; +extern std::vector> g_testFailures; -static inline void recordTestResult(bool pass, const char * suitename, const char * testname) { - if (pass) { - g_testPass++; - return; - } - g_testFail++; +static inline void recordTestResult( bool pass, const char * suitename, const char * testname ) { + if (pass) { + g_testPass++; + return; + } + g_testFail++; - char * ntestname = NULL; - if (testname != NULL) { - testname 
+= strspn(testname, " "); - ntestname = strdup(testname); - if (!ntestname) { - printf("OOM\n"); - exit(1); + char * ntestname = NULL; + if (testname != NULL) { + testname += strspn(testname, " "); + ntestname = strdup(testname); + if (!ntestname) { + printf("OOM\n"); + exit(1); + } } - } - g_testFailures.push_back(std::pair(suitename, ntestname)); + g_testFailures.push_back(std::pair(suitename, ntestname)); } -static inline void recordTestResult(bool pass, const char * suitename, uint64_t testnum) { - const uint64_t maxlen = sizeof("18446744073709551615"); // UINT64_MAX - char testname[maxlen]; - snprintf(testname, maxlen, "%" PRIu64, testnum); - recordTestResult(pass, suitename, testname); -} +static inline void recordTestResult( bool pass, const char * suitename, uint64_t testnum ) { + const uint64_t maxlen = sizeof("18446744073709551615"); // UINT64_MAX + char testname[maxlen]; + snprintf(testname, maxlen, "%" PRIu64, testnum); + recordTestResult(pass, suitename, testname); +} diff --git a/util/VCode.cpp b/util/VCode.cpp index 25804a1e..4f89d828 100644 --- a/util/VCode.cpp +++ b/util/VCode.cpp @@ -47,9 +47,9 @@ //----------------------------------------------------------------------------- // Full CRC32c implementation // This is based on Mark Adler's implementation. 
-static inline void crc32c_sw_update(uint32_t * const crcptr, const void * const ptr, size_t len) { +static inline void crc32c_sw_update( uint32_t * const crcptr, const void * const ptr, size_t len ) { const uint8_t * next = (const uint8_t *)ptr; - uint64_t crc; + uint64_t crc; crc = *crcptr; while (len && ((uintptr_t)next & 7) != 0) { @@ -66,40 +66,40 @@ static inline void crc32c_sw_update(uint32_t * const crcptr, const void * const crc ^= wd1; if (isBE()) { crc = - crc32c_sw_table[15][ crc & 0xff] ^ - crc32c_sw_table[14][(crc >> 8) & 0xff] ^ - crc32c_sw_table[13][(crc >> 16) & 0xff] ^ - crc32c_sw_table[12][(crc >> 24) & 0xff] ^ - crc32c_sw_table[11][(crc >> 32) & 0xff] ^ - crc32c_sw_table[10][(crc >> 40) & 0xff] ^ - crc32c_sw_table[ 9][(crc >> 48) & 0xff] ^ - crc32c_sw_table[ 8][ crc >> 56] ^ - crc32c_sw_table[ 0][ wd2 & 0xff] ^ - crc32c_sw_table[ 1][(wd2 >> 8) & 0xff] ^ - crc32c_sw_table[ 2][(wd2 >> 16) & 0xff] ^ - crc32c_sw_table[ 3][(wd2 >> 24) & 0xff] ^ - crc32c_sw_table[ 4][(wd2 >> 32) & 0xff] ^ - crc32c_sw_table[ 5][(wd2 >> 40) & 0xff] ^ - crc32c_sw_table[ 6][(wd2 >> 48) & 0xff] ^ - crc32c_sw_table[ 7][ wd2 >> 56] ; + crc32c_sw_table[15][crc & 0xff] ^ + crc32c_sw_table[14][(crc >> 8) & 0xff] ^ + crc32c_sw_table[13][(crc >> 16) & 0xff] ^ + crc32c_sw_table[12][(crc >> 24) & 0xff] ^ + crc32c_sw_table[11][(crc >> 32) & 0xff] ^ + crc32c_sw_table[10][(crc >> 40) & 0xff] ^ + crc32c_sw_table[ 9][(crc >> 48) & 0xff] ^ + crc32c_sw_table[ 8][crc >> 56] ^ + crc32c_sw_table[ 0][wd2 & 0xff] ^ + crc32c_sw_table[ 1][(wd2 >> 8) & 0xff] ^ + crc32c_sw_table[ 2][(wd2 >> 16) & 0xff] ^ + crc32c_sw_table[ 3][(wd2 >> 24) & 0xff] ^ + crc32c_sw_table[ 4][(wd2 >> 32) & 0xff] ^ + crc32c_sw_table[ 5][(wd2 >> 40) & 0xff] ^ + crc32c_sw_table[ 6][(wd2 >> 48) & 0xff] ^ + crc32c_sw_table[ 7][wd2 >> 56]; } else { crc = - crc32c_sw_table[15][ crc & 0xff] ^ - crc32c_sw_table[14][(crc >> 8) & 0xff] ^ - crc32c_sw_table[13][(crc >> 16) & 0xff] ^ - crc32c_sw_table[12][(crc >> 24) & 0xff] ^ - 
crc32c_sw_table[11][(crc >> 32) & 0xff] ^ - crc32c_sw_table[10][(crc >> 40) & 0xff] ^ - crc32c_sw_table[ 9][(crc >> 48) & 0xff] ^ - crc32c_sw_table[ 8][ crc >> 56] ^ - crc32c_sw_table[ 7][ wd2 & 0xff] ^ - crc32c_sw_table[ 6][(wd2 >> 8) & 0xff] ^ - crc32c_sw_table[ 5][(wd2 >> 16) & 0xff] ^ - crc32c_sw_table[ 4][(wd2 >> 24) & 0xff] ^ - crc32c_sw_table[ 3][(wd2 >> 32) & 0xff] ^ - crc32c_sw_table[ 2][(wd2 >> 40) & 0xff] ^ - crc32c_sw_table[ 1][(wd2 >> 48) & 0xff] ^ - crc32c_sw_table[ 0][ wd2 >> 56] ; + crc32c_sw_table[15][crc & 0xff] ^ + crc32c_sw_table[14][(crc >> 8) & 0xff] ^ + crc32c_sw_table[13][(crc >> 16) & 0xff] ^ + crc32c_sw_table[12][(crc >> 24) & 0xff] ^ + crc32c_sw_table[11][(crc >> 32) & 0xff] ^ + crc32c_sw_table[10][(crc >> 40) & 0xff] ^ + crc32c_sw_table[ 9][(crc >> 48) & 0xff] ^ + crc32c_sw_table[ 8][crc >> 56] ^ + crc32c_sw_table[ 7][wd2 & 0xff] ^ + crc32c_sw_table[ 6][(wd2 >> 8) & 0xff] ^ + crc32c_sw_table[ 5][(wd2 >> 16) & 0xff] ^ + crc32c_sw_table[ 4][(wd2 >> 24) & 0xff] ^ + crc32c_sw_table[ 3][(wd2 >> 32) & 0xff] ^ + crc32c_sw_table[ 2][(wd2 >> 40) & 0xff] ^ + crc32c_sw_table[ 1][(wd2 >> 48) & 0xff] ^ + crc32c_sw_table[ 0][wd2 >> 56]; } next += 16; len -= 16; @@ -119,19 +119,19 @@ extern const uint32_t crc32_short[4][256]; // HW_LONGBLOCK_LEN and HW_SHORTBLOCK_LEN must both be powers of // two. Altering these means the crc32_long and crc32_short tables // need to be rebuilt. -const uint32_t HW_LONGBLOCK_LEN = 8192; +const uint32_t HW_LONGBLOCK_LEN = 8192; const uint32_t HW_SHORTBLOCK_LEN = 256; /* Apply the zeros operator table to crc. 
*/ -static inline uint32_t crc32_shift(const uint32_t zeros[][256], uint32_t crc) { +static inline uint32_t crc32_shift( const uint32_t zeros[][256], uint32_t crc ) { return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^ zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24]; } -static inline void crc32c_hw_update(uint32_t * crcptr, const void * ptr, size_t len) { +static inline void crc32c_hw_update( uint32_t * crcptr, const void * ptr, size_t len ) { const uint8_t * next = (const uint8_t *)ptr; const uint8_t * end; - uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */ + uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */ /* Assume CRC is already pre-processed! */ crc0 = *crcptr; @@ -152,48 +152,48 @@ static inline void crc32c_hw_update(uint32_t * crcptr, const void * ptr, size_t * Bridge, and Ivy Bridge architectures, which have a throughput * of one crc per cycle, but a latency of three cycles. */ - while (len >= HW_LONGBLOCK_LEN*3) { + while (len >= HW_LONGBLOCK_LEN * 3) { crc1 = 0; crc2 = 0; - end = next + HW_LONGBLOCK_LEN; + end = next + HW_LONGBLOCK_LEN; do { uint64_t d1, d2, d3; memcpy(&d1, next, 8); memcpy(&d2, next + HW_LONGBLOCK_LEN, 8); memcpy(&d3, next + HW_LONGBLOCK_LEN + HW_LONGBLOCK_LEN, 8); - crc0 = HWCRC_U64(crc0, d1); - crc1 = HWCRC_U64(crc1, d2); - crc2 = HWCRC_U64(crc2, d3); + crc0 = HWCRC_U64(crc0, d1); + crc1 = HWCRC_U64(crc1, d2); + crc2 = HWCRC_U64(crc2, d3); next += 8; } while (next < end); - crc0 = crc32_shift(crc32_long, crc0) ^ crc1; - crc0 = crc32_shift(crc32_long, crc0) ^ crc2; - next += HW_LONGBLOCK_LEN*2; - len -= HW_LONGBLOCK_LEN*3; + crc0 = crc32_shift(crc32_long, crc0) ^ crc1; + crc0 = crc32_shift(crc32_long, crc0) ^ crc2; + next += HW_LONGBLOCK_LEN * 2; + len -= HW_LONGBLOCK_LEN * 3; } /* * Do the same thing, but now on HW_SHORTBLOCK_LEN*3 blocks for * the remaining data less than a HW_LONGBLOCK_LEN*3 block. 
*/ - while (len >= HW_SHORTBLOCK_LEN*3) { + while (len >= HW_SHORTBLOCK_LEN * 3) { crc1 = 0; crc2 = 0; - end = next + HW_SHORTBLOCK_LEN; + end = next + HW_SHORTBLOCK_LEN; do { uint64_t d1, d2, d3; memcpy(&d1, next, 8); memcpy(&d2, next + HW_SHORTBLOCK_LEN, 8); memcpy(&d3, next + HW_SHORTBLOCK_LEN + HW_SHORTBLOCK_LEN, 8); - crc0 = HWCRC_U64(crc0, d1); - crc1 = HWCRC_U64(crc1, d2); - crc2 = HWCRC_U64(crc2, d3); + crc0 = HWCRC_U64(crc0, d1); + crc1 = HWCRC_U64(crc1, d2); + crc2 = HWCRC_U64(crc2, d3); next += 8; } while (next < end); - crc0 = crc32_shift(crc32_short, crc0) ^ crc1; - crc0 = crc32_shift(crc32_short, crc0) ^ crc2; - next += HW_SHORTBLOCK_LEN*2; - len -= HW_SHORTBLOCK_LEN*3; + crc0 = crc32_shift(crc32_short, crc0) ^ crc1; + crc0 = crc32_shift(crc32_short, crc0) ^ crc2; + next += HW_SHORTBLOCK_LEN * 2; + len -= HW_SHORTBLOCK_LEN * 3; } /* @@ -204,7 +204,7 @@ static inline void crc32c_hw_update(uint32_t * crcptr, const void * ptr, size_t while (next < end) { uint64_t data; memcpy(&data, next, 8); - crc0 = HWCRC_U64(crc0, data); + crc0 = HWCRC_U64(crc0, data); next += 8; } len &= 7; @@ -218,9 +218,10 @@ static inline void crc32c_hw_update(uint32_t * crcptr, const void * ptr, size_t /* DON'T post-process the CRC! 
*/ *crcptr = (uint32_t)crc0; } + #endif -static inline void crc32c_update(uint32_t * crc, const void * ptr, size_t len) { +static inline void crc32c_update( uint32_t * crc, const void * ptr, size_t len ) { #if defined(HWCRC_U64) crc32c_hw_update(crc, ptr, len); #else @@ -230,9 +231,10 @@ static inline void crc32c_update(uint32_t * crc, const void * ptr, size_t len) { //----------------------------------------------------------------------------- // CRC implementation self-tests -template < bool use_hw, bool oneshot > -static uint32_t vcode_crc_selftest_40(uint8_t offset) { +template +static uint32_t vcode_crc_selftest_40( uint8_t offset ) { uint8_t buf[40]; + for (int i = 0; i < 40; i++) { buf[i] = offset + i; } @@ -240,61 +242,62 @@ static uint32_t vcode_crc_selftest_40(uint8_t offset) { uint32_t crc = ~0; if (oneshot) { #if defined(HWCRC_U64) - if (use_hw) + if (use_hw) { crc32c_hw_update(&crc, buf, 40); - else + } else #endif - crc32c_sw_update(&crc, buf, 40); + crc32c_sw_update(&crc, buf, 40); } else { #if defined(HWCRC_U64) if (use_hw) { - crc32c_hw_update(&crc, &buf[0], 1); - crc32c_hw_update(&crc, &buf[1], 1); - crc32c_hw_update(&crc, &buf[2], 2); - crc32c_hw_update(&crc, &buf[4], 4); - crc32c_hw_update(&crc, &buf[8], 8); + crc32c_hw_update(&crc, &buf[0] , 1); + crc32c_hw_update(&crc, &buf[1] , 1); + crc32c_hw_update(&crc, &buf[2] , 2); + crc32c_hw_update(&crc, &buf[4] , 4); + crc32c_hw_update(&crc, &buf[8] , 8); crc32c_hw_update(&crc, &buf[16], 16); - crc32c_hw_update(&crc, &buf[32], 1); - crc32c_hw_update(&crc, &buf[33], 1); - crc32c_hw_update(&crc, &buf[34], 2); - crc32c_hw_update(&crc, &buf[36], 4); + crc32c_hw_update(&crc, &buf[32], 1); + crc32c_hw_update(&crc, &buf[33], 1); + crc32c_hw_update(&crc, &buf[34], 2); + crc32c_hw_update(&crc, &buf[36], 4); } else { #endif - crc32c_sw_update(&crc, &buf[0], 1); - crc32c_sw_update(&crc, &buf[1], 1); - crc32c_sw_update(&crc, &buf[2], 2); - crc32c_sw_update(&crc, &buf[4], 4); - crc32c_sw_update(&crc, &buf[8], 8); 
- crc32c_sw_update(&crc, &buf[16], 16); - crc32c_sw_update(&crc, &buf[32], 1); - crc32c_sw_update(&crc, &buf[33], 1); - crc32c_sw_update(&crc, &buf[34], 2); - crc32c_sw_update(&crc, &buf[36], 4); + crc32c_sw_update(&crc, &buf[0] , 1); + crc32c_sw_update(&crc, &buf[1] , 1); + crc32c_sw_update(&crc, &buf[2] , 2); + crc32c_sw_update(&crc, &buf[4] , 4); + crc32c_sw_update(&crc, &buf[8] , 8); + crc32c_sw_update(&crc, &buf[16], 16); + crc32c_sw_update(&crc, &buf[32], 1); + crc32c_sw_update(&crc, &buf[33], 1); + crc32c_sw_update(&crc, &buf[34], 2); + crc32c_sw_update(&crc, &buf[36], 4); #if defined(HWCRC_U64) - } + } #endif - } return ~crc; } -template < bool use_hw > -static bool vcode_crc_selftest(void) { +template +static bool vcode_crc_selftest( void ) { #if !defined(HWCRC_U64) if (use_hw) { return true; } #endif constexpr uint32_t testcnt = 6; - uint8_t offsets[testcnt] = { 0x01, 0x29, 0x51, 0x79, 0xa1, 0xc9 }; - uint32_t crcs[testcnt] = { 0x0e2c157f, 0xe980ebf6, 0xde74bded, - 0xd579c862, 0xba979ad0, 0x2b29d913 }; + uint8_t offsets[testcnt] = { 0x01, 0x29, 0x51, 0x79, 0xa1, 0xc9 }; + uint32_t crcs[testcnt] = { + 0x0e2c157f, 0xe980ebf6, 0xde74bded, + 0xd579c862, 0xba979ad0, 0x2b29d913 + }; uint32_t crc; for (int i = 0; i < testcnt; i++) { - crc = vcode_crc_selftest_40(offsets[i]); + crc = vcode_crc_selftest_40( offsets[i]); if (crc != crcs[i]) { return false; } - crc = vcode_crc_selftest_40(offsets[i]); + crc = vcode_crc_selftest_40(offsets[i]); if (crc != crcs[i]) { return false; } } @@ -304,37 +307,39 @@ static bool vcode_crc_selftest(void) { //----------------------------------------------------------------------------- // VCode internal implementation vcode_state_t vcode_states[VCODE_COUNT]; -uint32_t g_doVCode = 0; -uint32_t g_inputVCode = 1; -uint32_t g_outputVCode = 1; -uint32_t g_resultVCode = 1; +uint32_t g_doVCode = 0; +uint32_t g_inputVCode = 1; +uint32_t g_outputVCode = 1; +uint32_t g_resultVCode = 1; static const uint64_t K1 = 
UINT64_C(0x6A09E667F3BCC909); // sqrt(2)-1 static const uint64_t K2 = UINT64_C(0xBB67AE8584CAA73B); // sqrt(3)-1 -static void resetWithSeed(vcode_state_t * state, uint64_t seed) { +static void resetWithSeed( vcode_state_t * state, uint64_t seed ) { // Arbitrarily mix seed into 2 starting 32-bit "CRC" values, // and then pre-process them. uint64_t v1 = (seed + 1) * K1; uint64_t v2 = (seed + 2) * K2; + state->data_hash = 0xffffffff ^ (v1 - (v1 >> 32)); state->lens_hash = 0xffffffff ^ (v2 - (v2 >> 32)); } -static void update(vcode_state_t * state, const void * ptr, size_t len) { +static void update( vcode_state_t * state, const void * ptr, size_t len ) { crc32c_update(&state->data_hash, ptr, len); crc32c_update_u64(&state->lens_hash, (uint64_t)len); } -static void update_u32(vcode_state_t * state, uint32_t data) { +static void update_u32( vcode_state_t * state, uint32_t data ) { crc32c_update_u64(&state->data_hash, (uint64_t)data); crc32c_update_u64(&state->lens_hash, 4); } -static uint32_t getDigest(vcode_state_t * state) { +static uint32_t getDigest( vcode_state_t * state ) { // Post-process the 2 final 32-bit CRCs, and then arbitrarily mix // them into a 32-bit signature. 
uint64_t combined; + combined = (uint64_t)(0xffffffff ^ state->data_hash); combined *= (uint64_t)(0xffffffff ^ state->lens_hash); combined ^= K1 ^ K2 ^ state->data_hash ^ state->lens_hash; @@ -346,7 +351,7 @@ static uint32_t getDigest(vcode_state_t * state) { // VCode external interface implementation static uint32_t VCODE_MASK = 0x0; -void VCODE_INIT(void) { +void VCODE_INIT( void ) { if (!vcode_crc_selftest()) { printf("VCode CRC32c SW self-test failed!\n"); exit(1); @@ -365,8 +370,8 @@ void VCODE_INIT(void) { VCODE_MASK = VCODE_FINALIZE() ^ 0x1; } -uint32_t VCODE_FINALIZE(void) { - if (!g_doVCode) return 1; +uint32_t VCODE_FINALIZE( void ) { + if (!g_doVCode) { return 1; } g_inputVCode = getDigest(&vcode_states[0]); g_outputVCode = getDigest(&vcode_states[1]); @@ -375,16 +380,17 @@ uint32_t VCODE_FINALIZE(void) { vcode_state_t finalvcode; resetWithSeed(&finalvcode, VCODE_COUNT); - update_u32(&finalvcode, g_inputVCode); + update_u32(&finalvcode, g_inputVCode ); update_u32(&finalvcode, g_outputVCode); update_u32(&finalvcode, g_resultVCode); return VCODE_MASK ^ getDigest(&finalvcode); } -void VCODE_HASH(const void * input, size_t len, unsigned idx) { - if (idx >= VCODE_COUNT) +void VCODE_HASH( const void * input, size_t len, unsigned idx ) { + if (idx >= VCODE_COUNT) { return; + } update(&vcode_states[idx], input, len); } @@ -392,828 +398,828 @@ void VCODE_HASH(const void * input, size_t len, unsigned idx) { // Pre-computed tables for CRC32c #if defined(HWCRC_U64) const uint32_t crc32_long[4][256] = { - { - 0x00000000, 0xe040e0ac, 0xc56db7a9, 0x252d5705, 0x8f3719a3, 0x6f77f90f, 0x4a5aae0a, 0xaa1a4ea6, - 0x1b8245b7, 0xfbc2a51b, 0xdeeff21e, 0x3eaf12b2, 0x94b55c14, 0x74f5bcb8, 0x51d8ebbd, 0xb1980b11, - 0x37048b6e, 0xd7446bc2, 0xf2693cc7, 0x1229dc6b, 0xb83392cd, 0x58737261, 0x7d5e2564, 0x9d1ec5c8, - 0x2c86ced9, 0xccc62e75, 0xe9eb7970, 0x09ab99dc, 0xa3b1d77a, 0x43f137d6, 0x66dc60d3, 0x869c807f, - 0x6e0916dc, 0x8e49f670, 0xab64a175, 0x4b2441d9, 0xe13e0f7f, 0x017eefd3, 
0x2453b8d6, 0xc413587a, - 0x758b536b, 0x95cbb3c7, 0xb0e6e4c2, 0x50a6046e, 0xfabc4ac8, 0x1afcaa64, 0x3fd1fd61, 0xdf911dcd, - 0x590d9db2, 0xb94d7d1e, 0x9c602a1b, 0x7c20cab7, 0xd63a8411, 0x367a64bd, 0x135733b8, 0xf317d314, - 0x428fd805, 0xa2cf38a9, 0x87e26fac, 0x67a28f00, 0xcdb8c1a6, 0x2df8210a, 0x08d5760f, 0xe89596a3, - 0xdc122db8, 0x3c52cd14, 0x197f9a11, 0xf93f7abd, 0x5325341b, 0xb365d4b7, 0x964883b2, 0x7608631e, - 0xc790680f, 0x27d088a3, 0x02fddfa6, 0xe2bd3f0a, 0x48a771ac, 0xa8e79100, 0x8dcac605, 0x6d8a26a9, - 0xeb16a6d6, 0x0b56467a, 0x2e7b117f, 0xce3bf1d3, 0x6421bf75, 0x84615fd9, 0xa14c08dc, 0x410ce870, - 0xf094e361, 0x10d403cd, 0x35f954c8, 0xd5b9b464, 0x7fa3fac2, 0x9fe31a6e, 0xbace4d6b, 0x5a8eadc7, - 0xb21b3b64, 0x525bdbc8, 0x77768ccd, 0x97366c61, 0x3d2c22c7, 0xdd6cc26b, 0xf841956e, 0x180175c2, - 0xa9997ed3, 0x49d99e7f, 0x6cf4c97a, 0x8cb429d6, 0x26ae6770, 0xc6ee87dc, 0xe3c3d0d9, 0x03833075, - 0x851fb00a, 0x655f50a6, 0x407207a3, 0xa032e70f, 0x0a28a9a9, 0xea684905, 0xcf451e00, 0x2f05feac, - 0x9e9df5bd, 0x7edd1511, 0x5bf04214, 0xbbb0a2b8, 0x11aaec1e, 0xf1ea0cb2, 0xd4c75bb7, 0x3487bb1b, - 0xbdc82d81, 0x5d88cd2d, 0x78a59a28, 0x98e57a84, 0x32ff3422, 0xd2bfd48e, 0xf792838b, 0x17d26327, - 0xa64a6836, 0x460a889a, 0x6327df9f, 0x83673f33, 0x297d7195, 0xc93d9139, 0xec10c63c, 0x0c502690, - 0x8acca6ef, 0x6a8c4643, 0x4fa11146, 0xafe1f1ea, 0x05fbbf4c, 0xe5bb5fe0, 0xc09608e5, 0x20d6e849, - 0x914ee358, 0x710e03f4, 0x542354f1, 0xb463b45d, 0x1e79fafb, 0xfe391a57, 0xdb144d52, 0x3b54adfe, - 0xd3c13b5d, 0x3381dbf1, 0x16ac8cf4, 0xf6ec6c58, 0x5cf622fe, 0xbcb6c252, 0x999b9557, 0x79db75fb, - 0xc8437eea, 0x28039e46, 0x0d2ec943, 0xed6e29ef, 0x47746749, 0xa73487e5, 0x8219d0e0, 0x6259304c, - 0xe4c5b033, 0x0485509f, 0x21a8079a, 0xc1e8e736, 0x6bf2a990, 0x8bb2493c, 0xae9f1e39, 0x4edffe95, - 0xff47f584, 0x1f071528, 0x3a2a422d, 0xda6aa281, 0x7070ec27, 0x90300c8b, 0xb51d5b8e, 0x555dbb22, - 0x61da0039, 0x819ae095, 0xa4b7b790, 0x44f7573c, 0xeeed199a, 0x0eadf936, 0x2b80ae33, 0xcbc04e9f, - 0x7a58458e, 
0x9a18a522, 0xbf35f227, 0x5f75128b, 0xf56f5c2d, 0x152fbc81, 0x3002eb84, 0xd0420b28, - 0x56de8b57, 0xb69e6bfb, 0x93b33cfe, 0x73f3dc52, 0xd9e992f4, 0x39a97258, 0x1c84255d, 0xfcc4c5f1, - 0x4d5ccee0, 0xad1c2e4c, 0x88317949, 0x687199e5, 0xc26bd743, 0x222b37ef, 0x070660ea, 0xe7468046, - 0x0fd316e5, 0xef93f649, 0xcabea14c, 0x2afe41e0, 0x80e40f46, 0x60a4efea, 0x4589b8ef, 0xa5c95843, - 0x14515352, 0xf411b3fe, 0xd13ce4fb, 0x317c0457, 0x9b664af1, 0x7b26aa5d, 0x5e0bfd58, 0xbe4b1df4, - 0x38d79d8b, 0xd8977d27, 0xfdba2a22, 0x1dfaca8e, 0xb7e08428, 0x57a06484, 0x728d3381, 0x92cdd32d, - 0x2355d83c, 0xc3153890, 0xe6386f95, 0x06788f39, 0xac62c19f, 0x4c222133, 0x690f7636, 0x894f969a, - }, - { - 0x00000000, 0x7e7c2df3, 0xfcf85be6, 0x82847615, 0xfc1cc13d, 0x8260ecce, 0x00e49adb, 0x7e98b728, - 0xfdd5f48b, 0x83a9d978, 0x012daf6d, 0x7f51829e, 0x01c935b6, 0x7fb51845, 0xfd316e50, 0x834d43a3, - 0xfe479fe7, 0x803bb214, 0x02bfc401, 0x7cc3e9f2, 0x025b5eda, 0x7c277329, 0xfea3053c, 0x80df28cf, - 0x03926b6c, 0x7dee469f, 0xff6a308a, 0x81161d79, 0xff8eaa51, 0x81f287a2, 0x0376f1b7, 0x7d0adc44, - 0xf963493f, 0x871f64cc, 0x059b12d9, 0x7be73f2a, 0x057f8802, 0x7b03a5f1, 0xf987d3e4, 0x87fbfe17, - 0x04b6bdb4, 0x7aca9047, 0xf84ee652, 0x8632cba1, 0xf8aa7c89, 0x86d6517a, 0x0452276f, 0x7a2e0a9c, - 0x0724d6d8, 0x7958fb2b, 0xfbdc8d3e, 0x85a0a0cd, 0xfb3817e5, 0x85443a16, 0x07c04c03, 0x79bc61f0, - 0xfaf12253, 0x848d0fa0, 0x060979b5, 0x78755446, 0x06ede36e, 0x7891ce9d, 0xfa15b888, 0x8469957b, - 0xf72ae48f, 0x8956c97c, 0x0bd2bf69, 0x75ae929a, 0x0b3625b2, 0x754a0841, 0xf7ce7e54, 0x89b253a7, - 0x0aff1004, 0x74833df7, 0xf6074be2, 0x887b6611, 0xf6e3d139, 0x889ffcca, 0x0a1b8adf, 0x7467a72c, - 0x096d7b68, 0x7711569b, 0xf595208e, 0x8be90d7d, 0xf571ba55, 0x8b0d97a6, 0x0989e1b3, 0x77f5cc40, - 0xf4b88fe3, 0x8ac4a210, 0x0840d405, 0x763cf9f6, 0x08a44ede, 0x76d8632d, 0xf45c1538, 0x8a2038cb, - 0x0e49adb0, 0x70358043, 0xf2b1f656, 0x8ccddba5, 0xf2556c8d, 0x8c29417e, 0x0ead376b, 0x70d11a98, - 0xf39c593b, 0x8de074c8, 0x0f6402dd, 
0x71182f2e, 0x0f809806, 0x71fcb5f5, 0xf378c3e0, 0x8d04ee13, - 0xf00e3257, 0x8e721fa4, 0x0cf669b1, 0x728a4442, 0x0c12f36a, 0x726ede99, 0xf0eaa88c, 0x8e96857f, - 0x0ddbc6dc, 0x73a7eb2f, 0xf1239d3a, 0x8f5fb0c9, 0xf1c707e1, 0x8fbb2a12, 0x0d3f5c07, 0x734371f4, - 0xebb9bfef, 0x95c5921c, 0x1741e409, 0x693dc9fa, 0x17a57ed2, 0x69d95321, 0xeb5d2534, 0x952108c7, - 0x166c4b64, 0x68106697, 0xea941082, 0x94e83d71, 0xea708a59, 0x940ca7aa, 0x1688d1bf, 0x68f4fc4c, - 0x15fe2008, 0x6b820dfb, 0xe9067bee, 0x977a561d, 0xe9e2e135, 0x979eccc6, 0x151abad3, 0x6b669720, - 0xe82bd483, 0x9657f970, 0x14d38f65, 0x6aafa296, 0x143715be, 0x6a4b384d, 0xe8cf4e58, 0x96b363ab, - 0x12daf6d0, 0x6ca6db23, 0xee22ad36, 0x905e80c5, 0xeec637ed, 0x90ba1a1e, 0x123e6c0b, 0x6c4241f8, - 0xef0f025b, 0x91732fa8, 0x13f759bd, 0x6d8b744e, 0x1313c366, 0x6d6fee95, 0xefeb9880, 0x9197b573, - 0xec9d6937, 0x92e144c4, 0x106532d1, 0x6e191f22, 0x1081a80a, 0x6efd85f9, 0xec79f3ec, 0x9205de1f, - 0x11489dbc, 0x6f34b04f, 0xedb0c65a, 0x93cceba9, 0xed545c81, 0x93287172, 0x11ac0767, 0x6fd02a94, - 0x1c935b60, 0x62ef7693, 0xe06b0086, 0x9e172d75, 0xe08f9a5d, 0x9ef3b7ae, 0x1c77c1bb, 0x620bec48, - 0xe146afeb, 0x9f3a8218, 0x1dbef40d, 0x63c2d9fe, 0x1d5a6ed6, 0x63264325, 0xe1a23530, 0x9fde18c3, - 0xe2d4c487, 0x9ca8e974, 0x1e2c9f61, 0x6050b292, 0x1ec805ba, 0x60b42849, 0xe2305e5c, 0x9c4c73af, - 0x1f01300c, 0x617d1dff, 0xe3f96bea, 0x9d854619, 0xe31df131, 0x9d61dcc2, 0x1fe5aad7, 0x61998724, - 0xe5f0125f, 0x9b8c3fac, 0x190849b9, 0x6774644a, 0x19ecd362, 0x6790fe91, 0xe5148884, 0x9b68a577, - 0x1825e6d4, 0x6659cb27, 0xe4ddbd32, 0x9aa190c1, 0xe43927e9, 0x9a450a1a, 0x18c17c0f, 0x66bd51fc, - 0x1bb78db8, 0x65cba04b, 0xe74fd65e, 0x9933fbad, 0xe7ab4c85, 0x99d76176, 0x1b531763, 0x652f3a90, - 0xe6627933, 0x981e54c0, 0x1a9a22d5, 0x64e60f26, 0x1a7eb80e, 0x640295fd, 0xe686e3e8, 0x98face1b, - }, - { - 0x00000000, 0xd29f092f, 0xa0d264af, 0x724d6d80, 0x4448bfaf, 0x96d7b680, 0xe49adb00, 0x3605d22f, - 0x88917f5e, 0x5a0e7671, 0x28431bf1, 0xfadc12de, 0xccd9c0f1, 
0x1e46c9de, 0x6c0ba45e, 0xbe94ad71, - 0x14ce884d, 0xc6518162, 0xb41cece2, 0x6683e5cd, 0x508637e2, 0x82193ecd, 0xf054534d, 0x22cb5a62, - 0x9c5ff713, 0x4ec0fe3c, 0x3c8d93bc, 0xee129a93, 0xd81748bc, 0x0a884193, 0x78c52c13, 0xaa5a253c, - 0x299d109a, 0xfb0219b5, 0x894f7435, 0x5bd07d1a, 0x6dd5af35, 0xbf4aa61a, 0xcd07cb9a, 0x1f98c2b5, - 0xa10c6fc4, 0x739366eb, 0x01de0b6b, 0xd3410244, 0xe544d06b, 0x37dbd944, 0x4596b4c4, 0x9709bdeb, - 0x3d5398d7, 0xefcc91f8, 0x9d81fc78, 0x4f1ef557, 0x791b2778, 0xab842e57, 0xd9c943d7, 0x0b564af8, - 0xb5c2e789, 0x675deea6, 0x15108326, 0xc78f8a09, 0xf18a5826, 0x23155109, 0x51583c89, 0x83c735a6, - 0x533a2134, 0x81a5281b, 0xf3e8459b, 0x21774cb4, 0x17729e9b, 0xc5ed97b4, 0xb7a0fa34, 0x653ff31b, - 0xdbab5e6a, 0x09345745, 0x7b793ac5, 0xa9e633ea, 0x9fe3e1c5, 0x4d7ce8ea, 0x3f31856a, 0xedae8c45, - 0x47f4a979, 0x956ba056, 0xe726cdd6, 0x35b9c4f9, 0x03bc16d6, 0xd1231ff9, 0xa36e7279, 0x71f17b56, - 0xcf65d627, 0x1dfadf08, 0x6fb7b288, 0xbd28bba7, 0x8b2d6988, 0x59b260a7, 0x2bff0d27, 0xf9600408, - 0x7aa731ae, 0xa8383881, 0xda755501, 0x08ea5c2e, 0x3eef8e01, 0xec70872e, 0x9e3deaae, 0x4ca2e381, - 0xf2364ef0, 0x20a947df, 0x52e42a5f, 0x807b2370, 0xb67ef15f, 0x64e1f870, 0x16ac95f0, 0xc4339cdf, - 0x6e69b9e3, 0xbcf6b0cc, 0xcebbdd4c, 0x1c24d463, 0x2a21064c, 0xf8be0f63, 0x8af362e3, 0x586c6bcc, - 0xe6f8c6bd, 0x3467cf92, 0x462aa212, 0x94b5ab3d, 0xa2b07912, 0x702f703d, 0x02621dbd, 0xd0fd1492, - 0xa6744268, 0x74eb4b47, 0x06a626c7, 0xd4392fe8, 0xe23cfdc7, 0x30a3f4e8, 0x42ee9968, 0x90719047, - 0x2ee53d36, 0xfc7a3419, 0x8e375999, 0x5ca850b6, 0x6aad8299, 0xb8328bb6, 0xca7fe636, 0x18e0ef19, - 0xb2baca25, 0x6025c30a, 0x1268ae8a, 0xc0f7a7a5, 0xf6f2758a, 0x246d7ca5, 0x56201125, 0x84bf180a, - 0x3a2bb57b, 0xe8b4bc54, 0x9af9d1d4, 0x4866d8fb, 0x7e630ad4, 0xacfc03fb, 0xdeb16e7b, 0x0c2e6754, - 0x8fe952f2, 0x5d765bdd, 0x2f3b365d, 0xfda43f72, 0xcba1ed5d, 0x193ee472, 0x6b7389f2, 0xb9ec80dd, - 0x07782dac, 0xd5e72483, 0xa7aa4903, 0x7535402c, 0x43309203, 0x91af9b2c, 0xe3e2f6ac, 0x317dff83, - 
0x9b27dabf, 0x49b8d390, 0x3bf5be10, 0xe96ab73f, 0xdf6f6510, 0x0df06c3f, 0x7fbd01bf, 0xad220890, - 0x13b6a5e1, 0xc129acce, 0xb364c14e, 0x61fbc861, 0x57fe1a4e, 0x85611361, 0xf72c7ee1, 0x25b377ce, - 0xf54e635c, 0x27d16a73, 0x559c07f3, 0x87030edc, 0xb106dcf3, 0x6399d5dc, 0x11d4b85c, 0xc34bb173, - 0x7ddf1c02, 0xaf40152d, 0xdd0d78ad, 0x0f927182, 0x3997a3ad, 0xeb08aa82, 0x9945c702, 0x4bdace2d, - 0xe180eb11, 0x331fe23e, 0x41528fbe, 0x93cd8691, 0xa5c854be, 0x77575d91, 0x051a3011, 0xd785393e, - 0x6911944f, 0xbb8e9d60, 0xc9c3f0e0, 0x1b5cf9cf, 0x2d592be0, 0xffc622cf, 0x8d8b4f4f, 0x5f144660, - 0xdcd373c6, 0x0e4c7ae9, 0x7c011769, 0xae9e1e46, 0x989bcc69, 0x4a04c546, 0x3849a8c6, 0xead6a1e9, - 0x54420c98, 0x86dd05b7, 0xf4906837, 0x260f6118, 0x100ab337, 0xc295ba18, 0xb0d8d798, 0x6247deb7, - 0xc81dfb8b, 0x1a82f2a4, 0x68cf9f24, 0xba50960b, 0x8c554424, 0x5eca4d0b, 0x2c87208b, 0xfe1829a4, - 0x408c84d5, 0x92138dfa, 0xe05ee07a, 0x32c1e955, 0x04c43b7a, 0xd65b3255, 0xa4165fd5, 0x768956fa, - }, - { - 0x00000000, 0x4904f221, 0x9209e442, 0xdb0d1663, 0x21ffbe75, 0x68fb4c54, 0xb3f65a37, 0xfaf2a816, - 0x43ff7cea, 0x0afb8ecb, 0xd1f698a8, 0x98f26a89, 0x6200c29f, 0x2b0430be, 0xf00926dd, 0xb90dd4fc, - 0x87fef9d4, 0xcefa0bf5, 0x15f71d96, 0x5cf3efb7, 0xa60147a1, 0xef05b580, 0x3408a3e3, 0x7d0c51c2, - 0xc401853e, 0x8d05771f, 0x5608617c, 0x1f0c935d, 0xe5fe3b4b, 0xacfac96a, 0x77f7df09, 0x3ef32d28, - 0x0a118559, 0x43157778, 0x9818611b, 0xd11c933a, 0x2bee3b2c, 0x62eac90d, 0xb9e7df6e, 0xf0e32d4f, - 0x49eef9b3, 0x00ea0b92, 0xdbe71df1, 0x92e3efd0, 0x681147c6, 0x2115b5e7, 0xfa18a384, 0xb31c51a5, - 0x8def7c8d, 0xc4eb8eac, 0x1fe698cf, 0x56e26aee, 0xac10c2f8, 0xe51430d9, 0x3e1926ba, 0x771dd49b, - 0xce100067, 0x8714f246, 0x5c19e425, 0x151d1604, 0xefefbe12, 0xa6eb4c33, 0x7de65a50, 0x34e2a871, - 0x14230ab2, 0x5d27f893, 0x862aeef0, 0xcf2e1cd1, 0x35dcb4c7, 0x7cd846e6, 0xa7d55085, 0xeed1a2a4, - 0x57dc7658, 0x1ed88479, 0xc5d5921a, 0x8cd1603b, 0x7623c82d, 0x3f273a0c, 0xe42a2c6f, 0xad2ede4e, - 0x93ddf366, 0xdad90147, 
0x01d41724, 0x48d0e505, 0xb2224d13, 0xfb26bf32, 0x202ba951, 0x692f5b70, - 0xd0228f8c, 0x99267dad, 0x422b6bce, 0x0b2f99ef, 0xf1dd31f9, 0xb8d9c3d8, 0x63d4d5bb, 0x2ad0279a, - 0x1e328feb, 0x57367dca, 0x8c3b6ba9, 0xc53f9988, 0x3fcd319e, 0x76c9c3bf, 0xadc4d5dc, 0xe4c027fd, - 0x5dcdf301, 0x14c90120, 0xcfc41743, 0x86c0e562, 0x7c324d74, 0x3536bf55, 0xee3ba936, 0xa73f5b17, - 0x99cc763f, 0xd0c8841e, 0x0bc5927d, 0x42c1605c, 0xb833c84a, 0xf1373a6b, 0x2a3a2c08, 0x633ede29, - 0xda330ad5, 0x9337f8f4, 0x483aee97, 0x013e1cb6, 0xfbccb4a0, 0xb2c84681, 0x69c550e2, 0x20c1a2c3, - 0x28461564, 0x6142e745, 0xba4ff126, 0xf34b0307, 0x09b9ab11, 0x40bd5930, 0x9bb04f53, 0xd2b4bd72, - 0x6bb9698e, 0x22bd9baf, 0xf9b08dcc, 0xb0b47fed, 0x4a46d7fb, 0x034225da, 0xd84f33b9, 0x914bc198, - 0xafb8ecb0, 0xe6bc1e91, 0x3db108f2, 0x74b5fad3, 0x8e4752c5, 0xc743a0e4, 0x1c4eb687, 0x554a44a6, - 0xec47905a, 0xa543627b, 0x7e4e7418, 0x374a8639, 0xcdb82e2f, 0x84bcdc0e, 0x5fb1ca6d, 0x16b5384c, - 0x2257903d, 0x6b53621c, 0xb05e747f, 0xf95a865e, 0x03a82e48, 0x4aacdc69, 0x91a1ca0a, 0xd8a5382b, - 0x61a8ecd7, 0x28ac1ef6, 0xf3a10895, 0xbaa5fab4, 0x405752a2, 0x0953a083, 0xd25eb6e0, 0x9b5a44c1, - 0xa5a969e9, 0xecad9bc8, 0x37a08dab, 0x7ea47f8a, 0x8456d79c, 0xcd5225bd, 0x165f33de, 0x5f5bc1ff, - 0xe6561503, 0xaf52e722, 0x745ff141, 0x3d5b0360, 0xc7a9ab76, 0x8ead5957, 0x55a04f34, 0x1ca4bd15, - 0x3c651fd6, 0x7561edf7, 0xae6cfb94, 0xe76809b5, 0x1d9aa1a3, 0x549e5382, 0x8f9345e1, 0xc697b7c0, - 0x7f9a633c, 0x369e911d, 0xed93877e, 0xa497755f, 0x5e65dd49, 0x17612f68, 0xcc6c390b, 0x8568cb2a, - 0xbb9be602, 0xf29f1423, 0x29920240, 0x6096f061, 0x9a645877, 0xd360aa56, 0x086dbc35, 0x41694e14, - 0xf8649ae8, 0xb16068c9, 0x6a6d7eaa, 0x23698c8b, 0xd99b249d, 0x909fd6bc, 0x4b92c0df, 0x029632fe, - 0x36749a8f, 0x7f7068ae, 0xa47d7ecd, 0xed798cec, 0x178b24fa, 0x5e8fd6db, 0x8582c0b8, 0xcc863299, - 0x758be665, 0x3c8f1444, 0xe7820227, 0xae86f006, 0x54745810, 0x1d70aa31, 0xc67dbc52, 0x8f794e73, - 0xb18a635b, 0xf88e917a, 0x23838719, 0x6a877538, 0x9075dd2e, 
0xd9712f0f, 0x027c396c, 0x4b78cb4d, - 0xf2751fb1, 0xbb71ed90, 0x607cfbf3, 0x297809d2, 0xd38aa1c4, 0x9a8e53e5, 0x41834586, 0x0887b7a7, - }, + { + 0x00000000, 0xe040e0ac, 0xc56db7a9, 0x252d5705, 0x8f3719a3, 0x6f77f90f, 0x4a5aae0a, 0xaa1a4ea6, + 0x1b8245b7, 0xfbc2a51b, 0xdeeff21e, 0x3eaf12b2, 0x94b55c14, 0x74f5bcb8, 0x51d8ebbd, 0xb1980b11, + 0x37048b6e, 0xd7446bc2, 0xf2693cc7, 0x1229dc6b, 0xb83392cd, 0x58737261, 0x7d5e2564, 0x9d1ec5c8, + 0x2c86ced9, 0xccc62e75, 0xe9eb7970, 0x09ab99dc, 0xa3b1d77a, 0x43f137d6, 0x66dc60d3, 0x869c807f, + 0x6e0916dc, 0x8e49f670, 0xab64a175, 0x4b2441d9, 0xe13e0f7f, 0x017eefd3, 0x2453b8d6, 0xc413587a, + 0x758b536b, 0x95cbb3c7, 0xb0e6e4c2, 0x50a6046e, 0xfabc4ac8, 0x1afcaa64, 0x3fd1fd61, 0xdf911dcd, + 0x590d9db2, 0xb94d7d1e, 0x9c602a1b, 0x7c20cab7, 0xd63a8411, 0x367a64bd, 0x135733b8, 0xf317d314, + 0x428fd805, 0xa2cf38a9, 0x87e26fac, 0x67a28f00, 0xcdb8c1a6, 0x2df8210a, 0x08d5760f, 0xe89596a3, + 0xdc122db8, 0x3c52cd14, 0x197f9a11, 0xf93f7abd, 0x5325341b, 0xb365d4b7, 0x964883b2, 0x7608631e, + 0xc790680f, 0x27d088a3, 0x02fddfa6, 0xe2bd3f0a, 0x48a771ac, 0xa8e79100, 0x8dcac605, 0x6d8a26a9, + 0xeb16a6d6, 0x0b56467a, 0x2e7b117f, 0xce3bf1d3, 0x6421bf75, 0x84615fd9, 0xa14c08dc, 0x410ce870, + 0xf094e361, 0x10d403cd, 0x35f954c8, 0xd5b9b464, 0x7fa3fac2, 0x9fe31a6e, 0xbace4d6b, 0x5a8eadc7, + 0xb21b3b64, 0x525bdbc8, 0x77768ccd, 0x97366c61, 0x3d2c22c7, 0xdd6cc26b, 0xf841956e, 0x180175c2, + 0xa9997ed3, 0x49d99e7f, 0x6cf4c97a, 0x8cb429d6, 0x26ae6770, 0xc6ee87dc, 0xe3c3d0d9, 0x03833075, + 0x851fb00a, 0x655f50a6, 0x407207a3, 0xa032e70f, 0x0a28a9a9, 0xea684905, 0xcf451e00, 0x2f05feac, + 0x9e9df5bd, 0x7edd1511, 0x5bf04214, 0xbbb0a2b8, 0x11aaec1e, 0xf1ea0cb2, 0xd4c75bb7, 0x3487bb1b, + 0xbdc82d81, 0x5d88cd2d, 0x78a59a28, 0x98e57a84, 0x32ff3422, 0xd2bfd48e, 0xf792838b, 0x17d26327, + 0xa64a6836, 0x460a889a, 0x6327df9f, 0x83673f33, 0x297d7195, 0xc93d9139, 0xec10c63c, 0x0c502690, + 0x8acca6ef, 0x6a8c4643, 0x4fa11146, 0xafe1f1ea, 0x05fbbf4c, 0xe5bb5fe0, 0xc09608e5, 
0x20d6e849, + 0x914ee358, 0x710e03f4, 0x542354f1, 0xb463b45d, 0x1e79fafb, 0xfe391a57, 0xdb144d52, 0x3b54adfe, + 0xd3c13b5d, 0x3381dbf1, 0x16ac8cf4, 0xf6ec6c58, 0x5cf622fe, 0xbcb6c252, 0x999b9557, 0x79db75fb, + 0xc8437eea, 0x28039e46, 0x0d2ec943, 0xed6e29ef, 0x47746749, 0xa73487e5, 0x8219d0e0, 0x6259304c, + 0xe4c5b033, 0x0485509f, 0x21a8079a, 0xc1e8e736, 0x6bf2a990, 0x8bb2493c, 0xae9f1e39, 0x4edffe95, + 0xff47f584, 0x1f071528, 0x3a2a422d, 0xda6aa281, 0x7070ec27, 0x90300c8b, 0xb51d5b8e, 0x555dbb22, + 0x61da0039, 0x819ae095, 0xa4b7b790, 0x44f7573c, 0xeeed199a, 0x0eadf936, 0x2b80ae33, 0xcbc04e9f, + 0x7a58458e, 0x9a18a522, 0xbf35f227, 0x5f75128b, 0xf56f5c2d, 0x152fbc81, 0x3002eb84, 0xd0420b28, + 0x56de8b57, 0xb69e6bfb, 0x93b33cfe, 0x73f3dc52, 0xd9e992f4, 0x39a97258, 0x1c84255d, 0xfcc4c5f1, + 0x4d5ccee0, 0xad1c2e4c, 0x88317949, 0x687199e5, 0xc26bd743, 0x222b37ef, 0x070660ea, 0xe7468046, + 0x0fd316e5, 0xef93f649, 0xcabea14c, 0x2afe41e0, 0x80e40f46, 0x60a4efea, 0x4589b8ef, 0xa5c95843, + 0x14515352, 0xf411b3fe, 0xd13ce4fb, 0x317c0457, 0x9b664af1, 0x7b26aa5d, 0x5e0bfd58, 0xbe4b1df4, + 0x38d79d8b, 0xd8977d27, 0xfdba2a22, 0x1dfaca8e, 0xb7e08428, 0x57a06484, 0x728d3381, 0x92cdd32d, + 0x2355d83c, 0xc3153890, 0xe6386f95, 0x06788f39, 0xac62c19f, 0x4c222133, 0x690f7636, 0x894f969a, + }, + { + 0x00000000, 0x7e7c2df3, 0xfcf85be6, 0x82847615, 0xfc1cc13d, 0x8260ecce, 0x00e49adb, 0x7e98b728, + 0xfdd5f48b, 0x83a9d978, 0x012daf6d, 0x7f51829e, 0x01c935b6, 0x7fb51845, 0xfd316e50, 0x834d43a3, + 0xfe479fe7, 0x803bb214, 0x02bfc401, 0x7cc3e9f2, 0x025b5eda, 0x7c277329, 0xfea3053c, 0x80df28cf, + 0x03926b6c, 0x7dee469f, 0xff6a308a, 0x81161d79, 0xff8eaa51, 0x81f287a2, 0x0376f1b7, 0x7d0adc44, + 0xf963493f, 0x871f64cc, 0x059b12d9, 0x7be73f2a, 0x057f8802, 0x7b03a5f1, 0xf987d3e4, 0x87fbfe17, + 0x04b6bdb4, 0x7aca9047, 0xf84ee652, 0x8632cba1, 0xf8aa7c89, 0x86d6517a, 0x0452276f, 0x7a2e0a9c, + 0x0724d6d8, 0x7958fb2b, 0xfbdc8d3e, 0x85a0a0cd, 0xfb3817e5, 0x85443a16, 0x07c04c03, 0x79bc61f0, + 0xfaf12253, 
0x848d0fa0, 0x060979b5, 0x78755446, 0x06ede36e, 0x7891ce9d, 0xfa15b888, 0x8469957b, + 0xf72ae48f, 0x8956c97c, 0x0bd2bf69, 0x75ae929a, 0x0b3625b2, 0x754a0841, 0xf7ce7e54, 0x89b253a7, + 0x0aff1004, 0x74833df7, 0xf6074be2, 0x887b6611, 0xf6e3d139, 0x889ffcca, 0x0a1b8adf, 0x7467a72c, + 0x096d7b68, 0x7711569b, 0xf595208e, 0x8be90d7d, 0xf571ba55, 0x8b0d97a6, 0x0989e1b3, 0x77f5cc40, + 0xf4b88fe3, 0x8ac4a210, 0x0840d405, 0x763cf9f6, 0x08a44ede, 0x76d8632d, 0xf45c1538, 0x8a2038cb, + 0x0e49adb0, 0x70358043, 0xf2b1f656, 0x8ccddba5, 0xf2556c8d, 0x8c29417e, 0x0ead376b, 0x70d11a98, + 0xf39c593b, 0x8de074c8, 0x0f6402dd, 0x71182f2e, 0x0f809806, 0x71fcb5f5, 0xf378c3e0, 0x8d04ee13, + 0xf00e3257, 0x8e721fa4, 0x0cf669b1, 0x728a4442, 0x0c12f36a, 0x726ede99, 0xf0eaa88c, 0x8e96857f, + 0x0ddbc6dc, 0x73a7eb2f, 0xf1239d3a, 0x8f5fb0c9, 0xf1c707e1, 0x8fbb2a12, 0x0d3f5c07, 0x734371f4, + 0xebb9bfef, 0x95c5921c, 0x1741e409, 0x693dc9fa, 0x17a57ed2, 0x69d95321, 0xeb5d2534, 0x952108c7, + 0x166c4b64, 0x68106697, 0xea941082, 0x94e83d71, 0xea708a59, 0x940ca7aa, 0x1688d1bf, 0x68f4fc4c, + 0x15fe2008, 0x6b820dfb, 0xe9067bee, 0x977a561d, 0xe9e2e135, 0x979eccc6, 0x151abad3, 0x6b669720, + 0xe82bd483, 0x9657f970, 0x14d38f65, 0x6aafa296, 0x143715be, 0x6a4b384d, 0xe8cf4e58, 0x96b363ab, + 0x12daf6d0, 0x6ca6db23, 0xee22ad36, 0x905e80c5, 0xeec637ed, 0x90ba1a1e, 0x123e6c0b, 0x6c4241f8, + 0xef0f025b, 0x91732fa8, 0x13f759bd, 0x6d8b744e, 0x1313c366, 0x6d6fee95, 0xefeb9880, 0x9197b573, + 0xec9d6937, 0x92e144c4, 0x106532d1, 0x6e191f22, 0x1081a80a, 0x6efd85f9, 0xec79f3ec, 0x9205de1f, + 0x11489dbc, 0x6f34b04f, 0xedb0c65a, 0x93cceba9, 0xed545c81, 0x93287172, 0x11ac0767, 0x6fd02a94, + 0x1c935b60, 0x62ef7693, 0xe06b0086, 0x9e172d75, 0xe08f9a5d, 0x9ef3b7ae, 0x1c77c1bb, 0x620bec48, + 0xe146afeb, 0x9f3a8218, 0x1dbef40d, 0x63c2d9fe, 0x1d5a6ed6, 0x63264325, 0xe1a23530, 0x9fde18c3, + 0xe2d4c487, 0x9ca8e974, 0x1e2c9f61, 0x6050b292, 0x1ec805ba, 0x60b42849, 0xe2305e5c, 0x9c4c73af, + 0x1f01300c, 0x617d1dff, 0xe3f96bea, 0x9d854619, 
0xe31df131, 0x9d61dcc2, 0x1fe5aad7, 0x61998724, + 0xe5f0125f, 0x9b8c3fac, 0x190849b9, 0x6774644a, 0x19ecd362, 0x6790fe91, 0xe5148884, 0x9b68a577, + 0x1825e6d4, 0x6659cb27, 0xe4ddbd32, 0x9aa190c1, 0xe43927e9, 0x9a450a1a, 0x18c17c0f, 0x66bd51fc, + 0x1bb78db8, 0x65cba04b, 0xe74fd65e, 0x9933fbad, 0xe7ab4c85, 0x99d76176, 0x1b531763, 0x652f3a90, + 0xe6627933, 0x981e54c0, 0x1a9a22d5, 0x64e60f26, 0x1a7eb80e, 0x640295fd, 0xe686e3e8, 0x98face1b, + }, + { + 0x00000000, 0xd29f092f, 0xa0d264af, 0x724d6d80, 0x4448bfaf, 0x96d7b680, 0xe49adb00, 0x3605d22f, + 0x88917f5e, 0x5a0e7671, 0x28431bf1, 0xfadc12de, 0xccd9c0f1, 0x1e46c9de, 0x6c0ba45e, 0xbe94ad71, + 0x14ce884d, 0xc6518162, 0xb41cece2, 0x6683e5cd, 0x508637e2, 0x82193ecd, 0xf054534d, 0x22cb5a62, + 0x9c5ff713, 0x4ec0fe3c, 0x3c8d93bc, 0xee129a93, 0xd81748bc, 0x0a884193, 0x78c52c13, 0xaa5a253c, + 0x299d109a, 0xfb0219b5, 0x894f7435, 0x5bd07d1a, 0x6dd5af35, 0xbf4aa61a, 0xcd07cb9a, 0x1f98c2b5, + 0xa10c6fc4, 0x739366eb, 0x01de0b6b, 0xd3410244, 0xe544d06b, 0x37dbd944, 0x4596b4c4, 0x9709bdeb, + 0x3d5398d7, 0xefcc91f8, 0x9d81fc78, 0x4f1ef557, 0x791b2778, 0xab842e57, 0xd9c943d7, 0x0b564af8, + 0xb5c2e789, 0x675deea6, 0x15108326, 0xc78f8a09, 0xf18a5826, 0x23155109, 0x51583c89, 0x83c735a6, + 0x533a2134, 0x81a5281b, 0xf3e8459b, 0x21774cb4, 0x17729e9b, 0xc5ed97b4, 0xb7a0fa34, 0x653ff31b, + 0xdbab5e6a, 0x09345745, 0x7b793ac5, 0xa9e633ea, 0x9fe3e1c5, 0x4d7ce8ea, 0x3f31856a, 0xedae8c45, + 0x47f4a979, 0x956ba056, 0xe726cdd6, 0x35b9c4f9, 0x03bc16d6, 0xd1231ff9, 0xa36e7279, 0x71f17b56, + 0xcf65d627, 0x1dfadf08, 0x6fb7b288, 0xbd28bba7, 0x8b2d6988, 0x59b260a7, 0x2bff0d27, 0xf9600408, + 0x7aa731ae, 0xa8383881, 0xda755501, 0x08ea5c2e, 0x3eef8e01, 0xec70872e, 0x9e3deaae, 0x4ca2e381, + 0xf2364ef0, 0x20a947df, 0x52e42a5f, 0x807b2370, 0xb67ef15f, 0x64e1f870, 0x16ac95f0, 0xc4339cdf, + 0x6e69b9e3, 0xbcf6b0cc, 0xcebbdd4c, 0x1c24d463, 0x2a21064c, 0xf8be0f63, 0x8af362e3, 0x586c6bcc, + 0xe6f8c6bd, 0x3467cf92, 0x462aa212, 0x94b5ab3d, 0xa2b07912, 0x702f703d, 
0x02621dbd, 0xd0fd1492, + 0xa6744268, 0x74eb4b47, 0x06a626c7, 0xd4392fe8, 0xe23cfdc7, 0x30a3f4e8, 0x42ee9968, 0x90719047, + 0x2ee53d36, 0xfc7a3419, 0x8e375999, 0x5ca850b6, 0x6aad8299, 0xb8328bb6, 0xca7fe636, 0x18e0ef19, + 0xb2baca25, 0x6025c30a, 0x1268ae8a, 0xc0f7a7a5, 0xf6f2758a, 0x246d7ca5, 0x56201125, 0x84bf180a, + 0x3a2bb57b, 0xe8b4bc54, 0x9af9d1d4, 0x4866d8fb, 0x7e630ad4, 0xacfc03fb, 0xdeb16e7b, 0x0c2e6754, + 0x8fe952f2, 0x5d765bdd, 0x2f3b365d, 0xfda43f72, 0xcba1ed5d, 0x193ee472, 0x6b7389f2, 0xb9ec80dd, + 0x07782dac, 0xd5e72483, 0xa7aa4903, 0x7535402c, 0x43309203, 0x91af9b2c, 0xe3e2f6ac, 0x317dff83, + 0x9b27dabf, 0x49b8d390, 0x3bf5be10, 0xe96ab73f, 0xdf6f6510, 0x0df06c3f, 0x7fbd01bf, 0xad220890, + 0x13b6a5e1, 0xc129acce, 0xb364c14e, 0x61fbc861, 0x57fe1a4e, 0x85611361, 0xf72c7ee1, 0x25b377ce, + 0xf54e635c, 0x27d16a73, 0x559c07f3, 0x87030edc, 0xb106dcf3, 0x6399d5dc, 0x11d4b85c, 0xc34bb173, + 0x7ddf1c02, 0xaf40152d, 0xdd0d78ad, 0x0f927182, 0x3997a3ad, 0xeb08aa82, 0x9945c702, 0x4bdace2d, + 0xe180eb11, 0x331fe23e, 0x41528fbe, 0x93cd8691, 0xa5c854be, 0x77575d91, 0x051a3011, 0xd785393e, + 0x6911944f, 0xbb8e9d60, 0xc9c3f0e0, 0x1b5cf9cf, 0x2d592be0, 0xffc622cf, 0x8d8b4f4f, 0x5f144660, + 0xdcd373c6, 0x0e4c7ae9, 0x7c011769, 0xae9e1e46, 0x989bcc69, 0x4a04c546, 0x3849a8c6, 0xead6a1e9, + 0x54420c98, 0x86dd05b7, 0xf4906837, 0x260f6118, 0x100ab337, 0xc295ba18, 0xb0d8d798, 0x6247deb7, + 0xc81dfb8b, 0x1a82f2a4, 0x68cf9f24, 0xba50960b, 0x8c554424, 0x5eca4d0b, 0x2c87208b, 0xfe1829a4, + 0x408c84d5, 0x92138dfa, 0xe05ee07a, 0x32c1e955, 0x04c43b7a, 0xd65b3255, 0xa4165fd5, 0x768956fa, + }, + { + 0x00000000, 0x4904f221, 0x9209e442, 0xdb0d1663, 0x21ffbe75, 0x68fb4c54, 0xb3f65a37, 0xfaf2a816, + 0x43ff7cea, 0x0afb8ecb, 0xd1f698a8, 0x98f26a89, 0x6200c29f, 0x2b0430be, 0xf00926dd, 0xb90dd4fc, + 0x87fef9d4, 0xcefa0bf5, 0x15f71d96, 0x5cf3efb7, 0xa60147a1, 0xef05b580, 0x3408a3e3, 0x7d0c51c2, + 0xc401853e, 0x8d05771f, 0x5608617c, 0x1f0c935d, 0xe5fe3b4b, 0xacfac96a, 0x77f7df09, 0x3ef32d28, + 
0x0a118559, 0x43157778, 0x9818611b, 0xd11c933a, 0x2bee3b2c, 0x62eac90d, 0xb9e7df6e, 0xf0e32d4f, + 0x49eef9b3, 0x00ea0b92, 0xdbe71df1, 0x92e3efd0, 0x681147c6, 0x2115b5e7, 0xfa18a384, 0xb31c51a5, + 0x8def7c8d, 0xc4eb8eac, 0x1fe698cf, 0x56e26aee, 0xac10c2f8, 0xe51430d9, 0x3e1926ba, 0x771dd49b, + 0xce100067, 0x8714f246, 0x5c19e425, 0x151d1604, 0xefefbe12, 0xa6eb4c33, 0x7de65a50, 0x34e2a871, + 0x14230ab2, 0x5d27f893, 0x862aeef0, 0xcf2e1cd1, 0x35dcb4c7, 0x7cd846e6, 0xa7d55085, 0xeed1a2a4, + 0x57dc7658, 0x1ed88479, 0xc5d5921a, 0x8cd1603b, 0x7623c82d, 0x3f273a0c, 0xe42a2c6f, 0xad2ede4e, + 0x93ddf366, 0xdad90147, 0x01d41724, 0x48d0e505, 0xb2224d13, 0xfb26bf32, 0x202ba951, 0x692f5b70, + 0xd0228f8c, 0x99267dad, 0x422b6bce, 0x0b2f99ef, 0xf1dd31f9, 0xb8d9c3d8, 0x63d4d5bb, 0x2ad0279a, + 0x1e328feb, 0x57367dca, 0x8c3b6ba9, 0xc53f9988, 0x3fcd319e, 0x76c9c3bf, 0xadc4d5dc, 0xe4c027fd, + 0x5dcdf301, 0x14c90120, 0xcfc41743, 0x86c0e562, 0x7c324d74, 0x3536bf55, 0xee3ba936, 0xa73f5b17, + 0x99cc763f, 0xd0c8841e, 0x0bc5927d, 0x42c1605c, 0xb833c84a, 0xf1373a6b, 0x2a3a2c08, 0x633ede29, + 0xda330ad5, 0x9337f8f4, 0x483aee97, 0x013e1cb6, 0xfbccb4a0, 0xb2c84681, 0x69c550e2, 0x20c1a2c3, + 0x28461564, 0x6142e745, 0xba4ff126, 0xf34b0307, 0x09b9ab11, 0x40bd5930, 0x9bb04f53, 0xd2b4bd72, + 0x6bb9698e, 0x22bd9baf, 0xf9b08dcc, 0xb0b47fed, 0x4a46d7fb, 0x034225da, 0xd84f33b9, 0x914bc198, + 0xafb8ecb0, 0xe6bc1e91, 0x3db108f2, 0x74b5fad3, 0x8e4752c5, 0xc743a0e4, 0x1c4eb687, 0x554a44a6, + 0xec47905a, 0xa543627b, 0x7e4e7418, 0x374a8639, 0xcdb82e2f, 0x84bcdc0e, 0x5fb1ca6d, 0x16b5384c, + 0x2257903d, 0x6b53621c, 0xb05e747f, 0xf95a865e, 0x03a82e48, 0x4aacdc69, 0x91a1ca0a, 0xd8a5382b, + 0x61a8ecd7, 0x28ac1ef6, 0xf3a10895, 0xbaa5fab4, 0x405752a2, 0x0953a083, 0xd25eb6e0, 0x9b5a44c1, + 0xa5a969e9, 0xecad9bc8, 0x37a08dab, 0x7ea47f8a, 0x8456d79c, 0xcd5225bd, 0x165f33de, 0x5f5bc1ff, + 0xe6561503, 0xaf52e722, 0x745ff141, 0x3d5b0360, 0xc7a9ab76, 0x8ead5957, 0x55a04f34, 0x1ca4bd15, + 0x3c651fd6, 0x7561edf7, 0xae6cfb94, 
0xe76809b5, 0x1d9aa1a3, 0x549e5382, 0x8f9345e1, 0xc697b7c0, + 0x7f9a633c, 0x369e911d, 0xed93877e, 0xa497755f, 0x5e65dd49, 0x17612f68, 0xcc6c390b, 0x8568cb2a, + 0xbb9be602, 0xf29f1423, 0x29920240, 0x6096f061, 0x9a645877, 0xd360aa56, 0x086dbc35, 0x41694e14, + 0xf8649ae8, 0xb16068c9, 0x6a6d7eaa, 0x23698c8b, 0xd99b249d, 0x909fd6bc, 0x4b92c0df, 0x029632fe, + 0x36749a8f, 0x7f7068ae, 0xa47d7ecd, 0xed798cec, 0x178b24fa, 0x5e8fd6db, 0x8582c0b8, 0xcc863299, + 0x758be665, 0x3c8f1444, 0xe7820227, 0xae86f006, 0x54745810, 0x1d70aa31, 0xc67dbc52, 0x8f794e73, + 0xb18a635b, 0xf88e917a, 0x23838719, 0x6a877538, 0x9075dd2e, 0xd9712f0f, 0x027c396c, 0x4b78cb4d, + 0xf2751fb1, 0xbb71ed90, 0x607cfbf3, 0x297809d2, 0xd38aa1c4, 0x9a8e53e5, 0x41834586, 0x0887b7a7, + }, }; const uint32_t crc32_short[4][256] = { - { - 0x00000000, 0xdcb17aa4, 0xbc8e83b9, 0x603ff91d, 0x7cf17183, 0xa0400b27, 0xc07ff23a, 0x1cce889e, - 0xf9e2e306, 0x255399a2, 0x456c60bf, 0x99dd1a1b, 0x85139285, 0x59a2e821, 0x399d113c, 0xe52c6b98, - 0xf629b0fd, 0x2a98ca59, 0x4aa73344, 0x961649e0, 0x8ad8c17e, 0x5669bbda, 0x365642c7, 0xeae73863, - 0x0fcb53fb, 0xd37a295f, 0xb345d042, 0x6ff4aae6, 0x733a2278, 0xaf8b58dc, 0xcfb4a1c1, 0x1305db65, - 0xe9bf170b, 0x350e6daf, 0x553194b2, 0x8980ee16, 0x954e6688, 0x49ff1c2c, 0x29c0e531, 0xf5719f95, - 0x105df40d, 0xccec8ea9, 0xacd377b4, 0x70620d10, 0x6cac858e, 0xb01dff2a, 0xd0220637, 0x0c937c93, - 0x1f96a7f6, 0xc327dd52, 0xa318244f, 0x7fa95eeb, 0x6367d675, 0xbfd6acd1, 0xdfe955cc, 0x03582f68, - 0xe67444f0, 0x3ac53e54, 0x5afac749, 0x864bbded, 0x9a853573, 0x46344fd7, 0x260bb6ca, 0xfabacc6e, - 0xd69258e7, 0x0a232243, 0x6a1cdb5e, 0xb6ada1fa, 0xaa632964, 0x76d253c0, 0x16edaadd, 0xca5cd079, - 0x2f70bbe1, 0xf3c1c145, 0x93fe3858, 0x4f4f42fc, 0x5381ca62, 0x8f30b0c6, 0xef0f49db, 0x33be337f, - 0x20bbe81a, 0xfc0a92be, 0x9c356ba3, 0x40841107, 0x5c4a9999, 0x80fbe33d, 0xe0c41a20, 0x3c756084, - 0xd9590b1c, 0x05e871b8, 0x65d788a5, 0xb966f201, 0xa5a87a9f, 0x7919003b, 0x1926f926, 0xc5978382, - 0x3f2d4fec, 0xe39c3548, 
0x83a3cc55, 0x5f12b6f1, 0x43dc3e6f, 0x9f6d44cb, 0xff52bdd6, 0x23e3c772, - 0xc6cfacea, 0x1a7ed64e, 0x7a412f53, 0xa6f055f7, 0xba3edd69, 0x668fa7cd, 0x06b05ed0, 0xda012474, - 0xc904ff11, 0x15b585b5, 0x758a7ca8, 0xa93b060c, 0xb5f58e92, 0x6944f436, 0x097b0d2b, 0xd5ca778f, - 0x30e61c17, 0xec5766b3, 0x8c689fae, 0x50d9e50a, 0x4c176d94, 0x90a61730, 0xf099ee2d, 0x2c289489, - 0xa8c8c73f, 0x7479bd9b, 0x14464486, 0xc8f73e22, 0xd439b6bc, 0x0888cc18, 0x68b73505, 0xb4064fa1, - 0x512a2439, 0x8d9b5e9d, 0xeda4a780, 0x3115dd24, 0x2ddb55ba, 0xf16a2f1e, 0x9155d603, 0x4de4aca7, - 0x5ee177c2, 0x82500d66, 0xe26ff47b, 0x3ede8edf, 0x22100641, 0xfea17ce5, 0x9e9e85f8, 0x422fff5c, - 0xa70394c4, 0x7bb2ee60, 0x1b8d177d, 0xc73c6dd9, 0xdbf2e547, 0x07439fe3, 0x677c66fe, 0xbbcd1c5a, - 0x4177d034, 0x9dc6aa90, 0xfdf9538d, 0x21482929, 0x3d86a1b7, 0xe137db13, 0x8108220e, 0x5db958aa, - 0xb8953332, 0x64244996, 0x041bb08b, 0xd8aaca2f, 0xc46442b1, 0x18d53815, 0x78eac108, 0xa45bbbac, - 0xb75e60c9, 0x6bef1a6d, 0x0bd0e370, 0xd76199d4, 0xcbaf114a, 0x171e6bee, 0x772192f3, 0xab90e857, - 0x4ebc83cf, 0x920df96b, 0xf2320076, 0x2e837ad2, 0x324df24c, 0xeefc88e8, 0x8ec371f5, 0x52720b51, - 0x7e5a9fd8, 0xa2ebe57c, 0xc2d41c61, 0x1e6566c5, 0x02abee5b, 0xde1a94ff, 0xbe256de2, 0x62941746, - 0x87b87cde, 0x5b09067a, 0x3b36ff67, 0xe78785c3, 0xfb490d5d, 0x27f877f9, 0x47c78ee4, 0x9b76f440, - 0x88732f25, 0x54c25581, 0x34fdac9c, 0xe84cd638, 0xf4825ea6, 0x28332402, 0x480cdd1f, 0x94bda7bb, - 0x7191cc23, 0xad20b687, 0xcd1f4f9a, 0x11ae353e, 0x0d60bda0, 0xd1d1c704, 0xb1ee3e19, 0x6d5f44bd, - 0x97e588d3, 0x4b54f277, 0x2b6b0b6a, 0xf7da71ce, 0xeb14f950, 0x37a583f4, 0x579a7ae9, 0x8b2b004d, - 0x6e076bd5, 0xb2b61171, 0xd289e86c, 0x0e3892c8, 0x12f61a56, 0xce4760f2, 0xae7899ef, 0x72c9e34b, - 0x61cc382e, 0xbd7d428a, 0xdd42bb97, 0x01f3c133, 0x1d3d49ad, 0xc18c3309, 0xa1b3ca14, 0x7d02b0b0, - 0x982edb28, 0x449fa18c, 0x24a05891, 0xf8112235, 0xe4dfaaab, 0x386ed00f, 0x58512912, 0x84e053b6, - }, - { - 0x00000000, 0x547df88f, 0xa8fbf11e, 0xfc860991, 
0x541b94cd, 0x00666c42, 0xfce065d3, 0xa89d9d5c, - 0xa837299a, 0xfc4ad115, 0x00ccd884, 0x54b1200b, 0xfc2cbd57, 0xa85145d8, 0x54d74c49, 0x00aab4c6, - 0x558225c5, 0x01ffdd4a, 0xfd79d4db, 0xa9042c54, 0x0199b108, 0x55e44987, 0xa9624016, 0xfd1fb899, - 0xfdb50c5f, 0xa9c8f4d0, 0x554efd41, 0x013305ce, 0xa9ae9892, 0xfdd3601d, 0x0155698c, 0x55289103, - 0xab044b8a, 0xff79b305, 0x03ffba94, 0x5782421b, 0xff1fdf47, 0xab6227c8, 0x57e42e59, 0x0399d6d6, - 0x03336210, 0x574e9a9f, 0xabc8930e, 0xffb56b81, 0x5728f6dd, 0x03550e52, 0xffd307c3, 0xabaeff4c, - 0xfe866e4f, 0xaafb96c0, 0x567d9f51, 0x020067de, 0xaa9dfa82, 0xfee0020d, 0x02660b9c, 0x561bf313, - 0x56b147d5, 0x02ccbf5a, 0xfe4ab6cb, 0xaa374e44, 0x02aad318, 0x56d72b97, 0xaa512206, 0xfe2cda89, - 0x53e4e1e5, 0x0799196a, 0xfb1f10fb, 0xaf62e874, 0x07ff7528, 0x53828da7, 0xaf048436, 0xfb797cb9, - 0xfbd3c87f, 0xafae30f0, 0x53283961, 0x0755c1ee, 0xafc85cb2, 0xfbb5a43d, 0x0733adac, 0x534e5523, - 0x0666c420, 0x521b3caf, 0xae9d353e, 0xfae0cdb1, 0x527d50ed, 0x0600a862, 0xfa86a1f3, 0xaefb597c, - 0xae51edba, 0xfa2c1535, 0x06aa1ca4, 0x52d7e42b, 0xfa4a7977, 0xae3781f8, 0x52b18869, 0x06cc70e6, - 0xf8e0aa6f, 0xac9d52e0, 0x501b5b71, 0x0466a3fe, 0xacfb3ea2, 0xf886c62d, 0x0400cfbc, 0x507d3733, - 0x50d783f5, 0x04aa7b7a, 0xf82c72eb, 0xac518a64, 0x04cc1738, 0x50b1efb7, 0xac37e626, 0xf84a1ea9, - 0xad628faa, 0xf91f7725, 0x05997eb4, 0x51e4863b, 0xf9791b67, 0xad04e3e8, 0x5182ea79, 0x05ff12f6, - 0x0555a630, 0x51285ebf, 0xadae572e, 0xf9d3afa1, 0x514e32fd, 0x0533ca72, 0xf9b5c3e3, 0xadc83b6c, - 0xa7c9c3ca, 0xf3b43b45, 0x0f3232d4, 0x5b4fca5b, 0xf3d25707, 0xa7afaf88, 0x5b29a619, 0x0f545e96, - 0x0ffeea50, 0x5b8312df, 0xa7051b4e, 0xf378e3c1, 0x5be57e9d, 0x0f988612, 0xf31e8f83, 0xa763770c, - 0xf24be60f, 0xa6361e80, 0x5ab01711, 0x0ecdef9e, 0xa65072c2, 0xf22d8a4d, 0x0eab83dc, 0x5ad67b53, - 0x5a7ccf95, 0x0e01371a, 0xf2873e8b, 0xa6fac604, 0x0e675b58, 0x5a1aa3d7, 0xa69caa46, 0xf2e152c9, - 0x0ccd8840, 0x58b070cf, 0xa436795e, 0xf04b81d1, 0x58d61c8d, 0x0cabe402, 0xf02ded93, 
0xa450151c, - 0xa4faa1da, 0xf0875955, 0x0c0150c4, 0x587ca84b, 0xf0e13517, 0xa49ccd98, 0x581ac409, 0x0c673c86, - 0x594fad85, 0x0d32550a, 0xf1b45c9b, 0xa5c9a414, 0x0d543948, 0x5929c1c7, 0xa5afc856, 0xf1d230d9, - 0xf178841f, 0xa5057c90, 0x59837501, 0x0dfe8d8e, 0xa56310d2, 0xf11ee85d, 0x0d98e1cc, 0x59e51943, - 0xf42d222f, 0xa050daa0, 0x5cd6d331, 0x08ab2bbe, 0xa036b6e2, 0xf44b4e6d, 0x08cd47fc, 0x5cb0bf73, - 0x5c1a0bb5, 0x0867f33a, 0xf4e1faab, 0xa09c0224, 0x08019f78, 0x5c7c67f7, 0xa0fa6e66, 0xf48796e9, - 0xa1af07ea, 0xf5d2ff65, 0x0954f6f4, 0x5d290e7b, 0xf5b49327, 0xa1c96ba8, 0x5d4f6239, 0x09329ab6, - 0x09982e70, 0x5de5d6ff, 0xa163df6e, 0xf51e27e1, 0x5d83babd, 0x09fe4232, 0xf5784ba3, 0xa105b32c, - 0x5f2969a5, 0x0b54912a, 0xf7d298bb, 0xa3af6034, 0x0b32fd68, 0x5f4f05e7, 0xa3c90c76, 0xf7b4f4f9, - 0xf71e403f, 0xa363b8b0, 0x5fe5b121, 0x0b9849ae, 0xa305d4f2, 0xf7782c7d, 0x0bfe25ec, 0x5f83dd63, - 0x0aab4c60, 0x5ed6b4ef, 0xa250bd7e, 0xf62d45f1, 0x5eb0d8ad, 0x0acd2022, 0xf64b29b3, 0xa236d13c, - 0xa29c65fa, 0xf6e19d75, 0x0a6794e4, 0x5e1a6c6b, 0xf687f137, 0xa2fa09b8, 0x5e7c0029, 0x0a01f8a6, - }, - { - 0x00000000, 0x4a7ff165, 0x94ffe2ca, 0xde8013af, 0x2c13b365, 0x666c4200, 0xb8ec51af, 0xf293a0ca, - 0x582766ca, 0x125897af, 0xccd88400, 0x86a77565, 0x7434d5af, 0x3e4b24ca, 0xe0cb3765, 0xaab4c600, - 0xb04ecd94, 0xfa313cf1, 0x24b12f5e, 0x6ecede3b, 0x9c5d7ef1, 0xd6228f94, 0x08a29c3b, 0x42dd6d5e, - 0xe869ab5e, 0xa2165a3b, 0x7c964994, 0x36e9b8f1, 0xc47a183b, 0x8e05e95e, 0x5085faf1, 0x1afa0b94, - 0x6571edd9, 0x2f0e1cbc, 0xf18e0f13, 0xbbf1fe76, 0x49625ebc, 0x031dafd9, 0xdd9dbc76, 0x97e24d13, - 0x3d568b13, 0x77297a76, 0xa9a969d9, 0xe3d698bc, 0x11453876, 0x5b3ac913, 0x85badabc, 0xcfc52bd9, - 0xd53f204d, 0x9f40d128, 0x41c0c287, 0x0bbf33e2, 0xf92c9328, 0xb353624d, 0x6dd371e2, 0x27ac8087, - 0x8d184687, 0xc767b7e2, 0x19e7a44d, 0x53985528, 0xa10bf5e2, 0xeb740487, 0x35f41728, 0x7f8be64d, - 0xcae3dbb2, 0x809c2ad7, 0x5e1c3978, 0x1463c81d, 0xe6f068d7, 0xac8f99b2, 0x720f8a1d, 0x38707b78, - 0x92c4bd78, 
0xd8bb4c1d, 0x063b5fb2, 0x4c44aed7, 0xbed70e1d, 0xf4a8ff78, 0x2a28ecd7, 0x60571db2, - 0x7aad1626, 0x30d2e743, 0xee52f4ec, 0xa42d0589, 0x56bea543, 0x1cc15426, 0xc2414789, 0x883eb6ec, - 0x228a70ec, 0x68f58189, 0xb6759226, 0xfc0a6343, 0x0e99c389, 0x44e632ec, 0x9a662143, 0xd019d026, - 0xaf92366b, 0xe5edc70e, 0x3b6dd4a1, 0x711225c4, 0x8381850e, 0xc9fe746b, 0x177e67c4, 0x5d0196a1, - 0xf7b550a1, 0xbdcaa1c4, 0x634ab26b, 0x2935430e, 0xdba6e3c4, 0x91d912a1, 0x4f59010e, 0x0526f06b, - 0x1fdcfbff, 0x55a30a9a, 0x8b231935, 0xc15ce850, 0x33cf489a, 0x79b0b9ff, 0xa730aa50, 0xed4f5b35, - 0x47fb9d35, 0x0d846c50, 0xd3047fff, 0x997b8e9a, 0x6be82e50, 0x2197df35, 0xff17cc9a, 0xb5683dff, - 0x902bc195, 0xda5430f0, 0x04d4235f, 0x4eabd23a, 0xbc3872f0, 0xf6478395, 0x28c7903a, 0x62b8615f, - 0xc80ca75f, 0x8273563a, 0x5cf34595, 0x168cb4f0, 0xe41f143a, 0xae60e55f, 0x70e0f6f0, 0x3a9f0795, - 0x20650c01, 0x6a1afd64, 0xb49aeecb, 0xfee51fae, 0x0c76bf64, 0x46094e01, 0x98895dae, 0xd2f6accb, - 0x78426acb, 0x323d9bae, 0xecbd8801, 0xa6c27964, 0x5451d9ae, 0x1e2e28cb, 0xc0ae3b64, 0x8ad1ca01, - 0xf55a2c4c, 0xbf25dd29, 0x61a5ce86, 0x2bda3fe3, 0xd9499f29, 0x93366e4c, 0x4db67de3, 0x07c98c86, - 0xad7d4a86, 0xe702bbe3, 0x3982a84c, 0x73fd5929, 0x816ef9e3, 0xcb110886, 0x15911b29, 0x5feeea4c, - 0x4514e1d8, 0x0f6b10bd, 0xd1eb0312, 0x9b94f277, 0x690752bd, 0x2378a3d8, 0xfdf8b077, 0xb7874112, - 0x1d338712, 0x574c7677, 0x89cc65d8, 0xc3b394bd, 0x31203477, 0x7b5fc512, 0xa5dfd6bd, 0xefa027d8, - 0x5ac81a27, 0x10b7eb42, 0xce37f8ed, 0x84480988, 0x76dba942, 0x3ca45827, 0xe2244b88, 0xa85bbaed, - 0x02ef7ced, 0x48908d88, 0x96109e27, 0xdc6f6f42, 0x2efccf88, 0x64833eed, 0xba032d42, 0xf07cdc27, - 0xea86d7b3, 0xa0f926d6, 0x7e793579, 0x3406c41c, 0xc69564d6, 0x8cea95b3, 0x526a861c, 0x18157779, - 0xb2a1b179, 0xf8de401c, 0x265e53b3, 0x6c21a2d6, 0x9eb2021c, 0xd4cdf379, 0x0a4de0d6, 0x403211b3, - 0x3fb9f7fe, 0x75c6069b, 0xab461534, 0xe139e451, 0x13aa449b, 0x59d5b5fe, 0x8755a651, 0xcd2a5734, - 0x679e9134, 0x2de16051, 0xf36173fe, 0xb91e829b, 
0x4b8d2251, 0x01f2d334, 0xdf72c09b, 0x950d31fe, - 0x8ff73a6a, 0xc588cb0f, 0x1b08d8a0, 0x517729c5, 0xa3e4890f, 0xe99b786a, 0x371b6bc5, 0x7d649aa0, - 0xd7d05ca0, 0x9dafadc5, 0x432fbe6a, 0x09504f0f, 0xfbc3efc5, 0xb1bc1ea0, 0x6f3c0d0f, 0x2543fc6a, - }, - { - 0x00000000, 0x25bbf5db, 0x4b77ebb6, 0x6ecc1e6d, 0x96efd76c, 0xb35422b7, 0xdd983cda, 0xf823c901, - 0x2833d829, 0x0d882df2, 0x6344339f, 0x46ffc644, 0xbedc0f45, 0x9b67fa9e, 0xf5abe4f3, 0xd0101128, - 0x5067b052, 0x75dc4589, 0x1b105be4, 0x3eabae3f, 0xc688673e, 0xe33392e5, 0x8dff8c88, 0xa8447953, - 0x7854687b, 0x5def9da0, 0x332383cd, 0x16987616, 0xeebbbf17, 0xcb004acc, 0xa5cc54a1, 0x8077a17a, - 0xa0cf60a4, 0x8574957f, 0xebb88b12, 0xce037ec9, 0x3620b7c8, 0x139b4213, 0x7d575c7e, 0x58eca9a5, - 0x88fcb88d, 0xad474d56, 0xc38b533b, 0xe630a6e0, 0x1e136fe1, 0x3ba89a3a, 0x55648457, 0x70df718c, - 0xf0a8d0f6, 0xd513252d, 0xbbdf3b40, 0x9e64ce9b, 0x6647079a, 0x43fcf241, 0x2d30ec2c, 0x088b19f7, - 0xd89b08df, 0xfd20fd04, 0x93ece369, 0xb65716b2, 0x4e74dfb3, 0x6bcf2a68, 0x05033405, 0x20b8c1de, - 0x4472b7b9, 0x61c94262, 0x0f055c0f, 0x2abea9d4, 0xd29d60d5, 0xf726950e, 0x99ea8b63, 0xbc517eb8, - 0x6c416f90, 0x49fa9a4b, 0x27368426, 0x028d71fd, 0xfaaeb8fc, 0xdf154d27, 0xb1d9534a, 0x9462a691, - 0x141507eb, 0x31aef230, 0x5f62ec5d, 0x7ad91986, 0x82fad087, 0xa741255c, 0xc98d3b31, 0xec36ceea, - 0x3c26dfc2, 0x199d2a19, 0x77513474, 0x52eac1af, 0xaac908ae, 0x8f72fd75, 0xe1bee318, 0xc40516c3, - 0xe4bdd71d, 0xc10622c6, 0xafca3cab, 0x8a71c970, 0x72520071, 0x57e9f5aa, 0x3925ebc7, 0x1c9e1e1c, - 0xcc8e0f34, 0xe935faef, 0x87f9e482, 0xa2421159, 0x5a61d858, 0x7fda2d83, 0x111633ee, 0x34adc635, - 0xb4da674f, 0x91619294, 0xffad8cf9, 0xda167922, 0x2235b023, 0x078e45f8, 0x69425b95, 0x4cf9ae4e, - 0x9ce9bf66, 0xb9524abd, 0xd79e54d0, 0xf225a10b, 0x0a06680a, 0x2fbd9dd1, 0x417183bc, 0x64ca7667, - 0x88e56f72, 0xad5e9aa9, 0xc39284c4, 0xe629711f, 0x1e0ab81e, 0x3bb14dc5, 0x557d53a8, 0x70c6a673, - 0xa0d6b75b, 0x856d4280, 0xeba15ced, 0xce1aa936, 0x36396037, 0x138295ec, 
0x7d4e8b81, 0x58f57e5a, - 0xd882df20, 0xfd392afb, 0x93f53496, 0xb64ec14d, 0x4e6d084c, 0x6bd6fd97, 0x051ae3fa, 0x20a11621, - 0xf0b10709, 0xd50af2d2, 0xbbc6ecbf, 0x9e7d1964, 0x665ed065, 0x43e525be, 0x2d293bd3, 0x0892ce08, - 0x282a0fd6, 0x0d91fa0d, 0x635de460, 0x46e611bb, 0xbec5d8ba, 0x9b7e2d61, 0xf5b2330c, 0xd009c6d7, - 0x0019d7ff, 0x25a22224, 0x4b6e3c49, 0x6ed5c992, 0x96f60093, 0xb34df548, 0xdd81eb25, 0xf83a1efe, - 0x784dbf84, 0x5df64a5f, 0x333a5432, 0x1681a1e9, 0xeea268e8, 0xcb199d33, 0xa5d5835e, 0x806e7685, - 0x507e67ad, 0x75c59276, 0x1b098c1b, 0x3eb279c0, 0xc691b0c1, 0xe32a451a, 0x8de65b77, 0xa85daeac, - 0xcc97d8cb, 0xe92c2d10, 0x87e0337d, 0xa25bc6a6, 0x5a780fa7, 0x7fc3fa7c, 0x110fe411, 0x34b411ca, - 0xe4a400e2, 0xc11ff539, 0xafd3eb54, 0x8a681e8f, 0x724bd78e, 0x57f02255, 0x393c3c38, 0x1c87c9e3, - 0x9cf06899, 0xb94b9d42, 0xd787832f, 0xf23c76f4, 0x0a1fbff5, 0x2fa44a2e, 0x41685443, 0x64d3a198, - 0xb4c3b0b0, 0x9178456b, 0xffb45b06, 0xda0faedd, 0x222c67dc, 0x07979207, 0x695b8c6a, 0x4ce079b1, - 0x6c58b86f, 0x49e34db4, 0x272f53d9, 0x0294a602, 0xfab76f03, 0xdf0c9ad8, 0xb1c084b5, 0x947b716e, - 0x446b6046, 0x61d0959d, 0x0f1c8bf0, 0x2aa77e2b, 0xd284b72a, 0xf73f42f1, 0x99f35c9c, 0xbc48a947, - 0x3c3f083d, 0x1984fde6, 0x7748e38b, 0x52f31650, 0xaad0df51, 0x8f6b2a8a, 0xe1a734e7, 0xc41cc13c, - 0x140cd014, 0x31b725cf, 0x5f7b3ba2, 0x7ac0ce79, 0x82e30778, 0xa758f2a3, 0xc994ecce, 0xec2f1915, - }, + { + 0x00000000, 0xdcb17aa4, 0xbc8e83b9, 0x603ff91d, 0x7cf17183, 0xa0400b27, 0xc07ff23a, 0x1cce889e, + 0xf9e2e306, 0x255399a2, 0x456c60bf, 0x99dd1a1b, 0x85139285, 0x59a2e821, 0x399d113c, 0xe52c6b98, + 0xf629b0fd, 0x2a98ca59, 0x4aa73344, 0x961649e0, 0x8ad8c17e, 0x5669bbda, 0x365642c7, 0xeae73863, + 0x0fcb53fb, 0xd37a295f, 0xb345d042, 0x6ff4aae6, 0x733a2278, 0xaf8b58dc, 0xcfb4a1c1, 0x1305db65, + 0xe9bf170b, 0x350e6daf, 0x553194b2, 0x8980ee16, 0x954e6688, 0x49ff1c2c, 0x29c0e531, 0xf5719f95, + 0x105df40d, 0xccec8ea9, 0xacd377b4, 0x70620d10, 0x6cac858e, 0xb01dff2a, 0xd0220637, 0x0c937c93, + 
0x1f96a7f6, 0xc327dd52, 0xa318244f, 0x7fa95eeb, 0x6367d675, 0xbfd6acd1, 0xdfe955cc, 0x03582f68, + 0xe67444f0, 0x3ac53e54, 0x5afac749, 0x864bbded, 0x9a853573, 0x46344fd7, 0x260bb6ca, 0xfabacc6e, + 0xd69258e7, 0x0a232243, 0x6a1cdb5e, 0xb6ada1fa, 0xaa632964, 0x76d253c0, 0x16edaadd, 0xca5cd079, + 0x2f70bbe1, 0xf3c1c145, 0x93fe3858, 0x4f4f42fc, 0x5381ca62, 0x8f30b0c6, 0xef0f49db, 0x33be337f, + 0x20bbe81a, 0xfc0a92be, 0x9c356ba3, 0x40841107, 0x5c4a9999, 0x80fbe33d, 0xe0c41a20, 0x3c756084, + 0xd9590b1c, 0x05e871b8, 0x65d788a5, 0xb966f201, 0xa5a87a9f, 0x7919003b, 0x1926f926, 0xc5978382, + 0x3f2d4fec, 0xe39c3548, 0x83a3cc55, 0x5f12b6f1, 0x43dc3e6f, 0x9f6d44cb, 0xff52bdd6, 0x23e3c772, + 0xc6cfacea, 0x1a7ed64e, 0x7a412f53, 0xa6f055f7, 0xba3edd69, 0x668fa7cd, 0x06b05ed0, 0xda012474, + 0xc904ff11, 0x15b585b5, 0x758a7ca8, 0xa93b060c, 0xb5f58e92, 0x6944f436, 0x097b0d2b, 0xd5ca778f, + 0x30e61c17, 0xec5766b3, 0x8c689fae, 0x50d9e50a, 0x4c176d94, 0x90a61730, 0xf099ee2d, 0x2c289489, + 0xa8c8c73f, 0x7479bd9b, 0x14464486, 0xc8f73e22, 0xd439b6bc, 0x0888cc18, 0x68b73505, 0xb4064fa1, + 0x512a2439, 0x8d9b5e9d, 0xeda4a780, 0x3115dd24, 0x2ddb55ba, 0xf16a2f1e, 0x9155d603, 0x4de4aca7, + 0x5ee177c2, 0x82500d66, 0xe26ff47b, 0x3ede8edf, 0x22100641, 0xfea17ce5, 0x9e9e85f8, 0x422fff5c, + 0xa70394c4, 0x7bb2ee60, 0x1b8d177d, 0xc73c6dd9, 0xdbf2e547, 0x07439fe3, 0x677c66fe, 0xbbcd1c5a, + 0x4177d034, 0x9dc6aa90, 0xfdf9538d, 0x21482929, 0x3d86a1b7, 0xe137db13, 0x8108220e, 0x5db958aa, + 0xb8953332, 0x64244996, 0x041bb08b, 0xd8aaca2f, 0xc46442b1, 0x18d53815, 0x78eac108, 0xa45bbbac, + 0xb75e60c9, 0x6bef1a6d, 0x0bd0e370, 0xd76199d4, 0xcbaf114a, 0x171e6bee, 0x772192f3, 0xab90e857, + 0x4ebc83cf, 0x920df96b, 0xf2320076, 0x2e837ad2, 0x324df24c, 0xeefc88e8, 0x8ec371f5, 0x52720b51, + 0x7e5a9fd8, 0xa2ebe57c, 0xc2d41c61, 0x1e6566c5, 0x02abee5b, 0xde1a94ff, 0xbe256de2, 0x62941746, + 0x87b87cde, 0x5b09067a, 0x3b36ff67, 0xe78785c3, 0xfb490d5d, 0x27f877f9, 0x47c78ee4, 0x9b76f440, + 0x88732f25, 0x54c25581, 0x34fdac9c, 
0xe84cd638, 0xf4825ea6, 0x28332402, 0x480cdd1f, 0x94bda7bb, + 0x7191cc23, 0xad20b687, 0xcd1f4f9a, 0x11ae353e, 0x0d60bda0, 0xd1d1c704, 0xb1ee3e19, 0x6d5f44bd, + 0x97e588d3, 0x4b54f277, 0x2b6b0b6a, 0xf7da71ce, 0xeb14f950, 0x37a583f4, 0x579a7ae9, 0x8b2b004d, + 0x6e076bd5, 0xb2b61171, 0xd289e86c, 0x0e3892c8, 0x12f61a56, 0xce4760f2, 0xae7899ef, 0x72c9e34b, + 0x61cc382e, 0xbd7d428a, 0xdd42bb97, 0x01f3c133, 0x1d3d49ad, 0xc18c3309, 0xa1b3ca14, 0x7d02b0b0, + 0x982edb28, 0x449fa18c, 0x24a05891, 0xf8112235, 0xe4dfaaab, 0x386ed00f, 0x58512912, 0x84e053b6, + }, + { + 0x00000000, 0x547df88f, 0xa8fbf11e, 0xfc860991, 0x541b94cd, 0x00666c42, 0xfce065d3, 0xa89d9d5c, + 0xa837299a, 0xfc4ad115, 0x00ccd884, 0x54b1200b, 0xfc2cbd57, 0xa85145d8, 0x54d74c49, 0x00aab4c6, + 0x558225c5, 0x01ffdd4a, 0xfd79d4db, 0xa9042c54, 0x0199b108, 0x55e44987, 0xa9624016, 0xfd1fb899, + 0xfdb50c5f, 0xa9c8f4d0, 0x554efd41, 0x013305ce, 0xa9ae9892, 0xfdd3601d, 0x0155698c, 0x55289103, + 0xab044b8a, 0xff79b305, 0x03ffba94, 0x5782421b, 0xff1fdf47, 0xab6227c8, 0x57e42e59, 0x0399d6d6, + 0x03336210, 0x574e9a9f, 0xabc8930e, 0xffb56b81, 0x5728f6dd, 0x03550e52, 0xffd307c3, 0xabaeff4c, + 0xfe866e4f, 0xaafb96c0, 0x567d9f51, 0x020067de, 0xaa9dfa82, 0xfee0020d, 0x02660b9c, 0x561bf313, + 0x56b147d5, 0x02ccbf5a, 0xfe4ab6cb, 0xaa374e44, 0x02aad318, 0x56d72b97, 0xaa512206, 0xfe2cda89, + 0x53e4e1e5, 0x0799196a, 0xfb1f10fb, 0xaf62e874, 0x07ff7528, 0x53828da7, 0xaf048436, 0xfb797cb9, + 0xfbd3c87f, 0xafae30f0, 0x53283961, 0x0755c1ee, 0xafc85cb2, 0xfbb5a43d, 0x0733adac, 0x534e5523, + 0x0666c420, 0x521b3caf, 0xae9d353e, 0xfae0cdb1, 0x527d50ed, 0x0600a862, 0xfa86a1f3, 0xaefb597c, + 0xae51edba, 0xfa2c1535, 0x06aa1ca4, 0x52d7e42b, 0xfa4a7977, 0xae3781f8, 0x52b18869, 0x06cc70e6, + 0xf8e0aa6f, 0xac9d52e0, 0x501b5b71, 0x0466a3fe, 0xacfb3ea2, 0xf886c62d, 0x0400cfbc, 0x507d3733, + 0x50d783f5, 0x04aa7b7a, 0xf82c72eb, 0xac518a64, 0x04cc1738, 0x50b1efb7, 0xac37e626, 0xf84a1ea9, + 0xad628faa, 0xf91f7725, 0x05997eb4, 0x51e4863b, 0xf9791b67, 
0xad04e3e8, 0x5182ea79, 0x05ff12f6, + 0x0555a630, 0x51285ebf, 0xadae572e, 0xf9d3afa1, 0x514e32fd, 0x0533ca72, 0xf9b5c3e3, 0xadc83b6c, + 0xa7c9c3ca, 0xf3b43b45, 0x0f3232d4, 0x5b4fca5b, 0xf3d25707, 0xa7afaf88, 0x5b29a619, 0x0f545e96, + 0x0ffeea50, 0x5b8312df, 0xa7051b4e, 0xf378e3c1, 0x5be57e9d, 0x0f988612, 0xf31e8f83, 0xa763770c, + 0xf24be60f, 0xa6361e80, 0x5ab01711, 0x0ecdef9e, 0xa65072c2, 0xf22d8a4d, 0x0eab83dc, 0x5ad67b53, + 0x5a7ccf95, 0x0e01371a, 0xf2873e8b, 0xa6fac604, 0x0e675b58, 0x5a1aa3d7, 0xa69caa46, 0xf2e152c9, + 0x0ccd8840, 0x58b070cf, 0xa436795e, 0xf04b81d1, 0x58d61c8d, 0x0cabe402, 0xf02ded93, 0xa450151c, + 0xa4faa1da, 0xf0875955, 0x0c0150c4, 0x587ca84b, 0xf0e13517, 0xa49ccd98, 0x581ac409, 0x0c673c86, + 0x594fad85, 0x0d32550a, 0xf1b45c9b, 0xa5c9a414, 0x0d543948, 0x5929c1c7, 0xa5afc856, 0xf1d230d9, + 0xf178841f, 0xa5057c90, 0x59837501, 0x0dfe8d8e, 0xa56310d2, 0xf11ee85d, 0x0d98e1cc, 0x59e51943, + 0xf42d222f, 0xa050daa0, 0x5cd6d331, 0x08ab2bbe, 0xa036b6e2, 0xf44b4e6d, 0x08cd47fc, 0x5cb0bf73, + 0x5c1a0bb5, 0x0867f33a, 0xf4e1faab, 0xa09c0224, 0x08019f78, 0x5c7c67f7, 0xa0fa6e66, 0xf48796e9, + 0xa1af07ea, 0xf5d2ff65, 0x0954f6f4, 0x5d290e7b, 0xf5b49327, 0xa1c96ba8, 0x5d4f6239, 0x09329ab6, + 0x09982e70, 0x5de5d6ff, 0xa163df6e, 0xf51e27e1, 0x5d83babd, 0x09fe4232, 0xf5784ba3, 0xa105b32c, + 0x5f2969a5, 0x0b54912a, 0xf7d298bb, 0xa3af6034, 0x0b32fd68, 0x5f4f05e7, 0xa3c90c76, 0xf7b4f4f9, + 0xf71e403f, 0xa363b8b0, 0x5fe5b121, 0x0b9849ae, 0xa305d4f2, 0xf7782c7d, 0x0bfe25ec, 0x5f83dd63, + 0x0aab4c60, 0x5ed6b4ef, 0xa250bd7e, 0xf62d45f1, 0x5eb0d8ad, 0x0acd2022, 0xf64b29b3, 0xa236d13c, + 0xa29c65fa, 0xf6e19d75, 0x0a6794e4, 0x5e1a6c6b, 0xf687f137, 0xa2fa09b8, 0x5e7c0029, 0x0a01f8a6, + }, + { + 0x00000000, 0x4a7ff165, 0x94ffe2ca, 0xde8013af, 0x2c13b365, 0x666c4200, 0xb8ec51af, 0xf293a0ca, + 0x582766ca, 0x125897af, 0xccd88400, 0x86a77565, 0x7434d5af, 0x3e4b24ca, 0xe0cb3765, 0xaab4c600, + 0xb04ecd94, 0xfa313cf1, 0x24b12f5e, 0x6ecede3b, 0x9c5d7ef1, 0xd6228f94, 0x08a29c3b, 
0x42dd6d5e, + 0xe869ab5e, 0xa2165a3b, 0x7c964994, 0x36e9b8f1, 0xc47a183b, 0x8e05e95e, 0x5085faf1, 0x1afa0b94, + 0x6571edd9, 0x2f0e1cbc, 0xf18e0f13, 0xbbf1fe76, 0x49625ebc, 0x031dafd9, 0xdd9dbc76, 0x97e24d13, + 0x3d568b13, 0x77297a76, 0xa9a969d9, 0xe3d698bc, 0x11453876, 0x5b3ac913, 0x85badabc, 0xcfc52bd9, + 0xd53f204d, 0x9f40d128, 0x41c0c287, 0x0bbf33e2, 0xf92c9328, 0xb353624d, 0x6dd371e2, 0x27ac8087, + 0x8d184687, 0xc767b7e2, 0x19e7a44d, 0x53985528, 0xa10bf5e2, 0xeb740487, 0x35f41728, 0x7f8be64d, + 0xcae3dbb2, 0x809c2ad7, 0x5e1c3978, 0x1463c81d, 0xe6f068d7, 0xac8f99b2, 0x720f8a1d, 0x38707b78, + 0x92c4bd78, 0xd8bb4c1d, 0x063b5fb2, 0x4c44aed7, 0xbed70e1d, 0xf4a8ff78, 0x2a28ecd7, 0x60571db2, + 0x7aad1626, 0x30d2e743, 0xee52f4ec, 0xa42d0589, 0x56bea543, 0x1cc15426, 0xc2414789, 0x883eb6ec, + 0x228a70ec, 0x68f58189, 0xb6759226, 0xfc0a6343, 0x0e99c389, 0x44e632ec, 0x9a662143, 0xd019d026, + 0xaf92366b, 0xe5edc70e, 0x3b6dd4a1, 0x711225c4, 0x8381850e, 0xc9fe746b, 0x177e67c4, 0x5d0196a1, + 0xf7b550a1, 0xbdcaa1c4, 0x634ab26b, 0x2935430e, 0xdba6e3c4, 0x91d912a1, 0x4f59010e, 0x0526f06b, + 0x1fdcfbff, 0x55a30a9a, 0x8b231935, 0xc15ce850, 0x33cf489a, 0x79b0b9ff, 0xa730aa50, 0xed4f5b35, + 0x47fb9d35, 0x0d846c50, 0xd3047fff, 0x997b8e9a, 0x6be82e50, 0x2197df35, 0xff17cc9a, 0xb5683dff, + 0x902bc195, 0xda5430f0, 0x04d4235f, 0x4eabd23a, 0xbc3872f0, 0xf6478395, 0x28c7903a, 0x62b8615f, + 0xc80ca75f, 0x8273563a, 0x5cf34595, 0x168cb4f0, 0xe41f143a, 0xae60e55f, 0x70e0f6f0, 0x3a9f0795, + 0x20650c01, 0x6a1afd64, 0xb49aeecb, 0xfee51fae, 0x0c76bf64, 0x46094e01, 0x98895dae, 0xd2f6accb, + 0x78426acb, 0x323d9bae, 0xecbd8801, 0xa6c27964, 0x5451d9ae, 0x1e2e28cb, 0xc0ae3b64, 0x8ad1ca01, + 0xf55a2c4c, 0xbf25dd29, 0x61a5ce86, 0x2bda3fe3, 0xd9499f29, 0x93366e4c, 0x4db67de3, 0x07c98c86, + 0xad7d4a86, 0xe702bbe3, 0x3982a84c, 0x73fd5929, 0x816ef9e3, 0xcb110886, 0x15911b29, 0x5feeea4c, + 0x4514e1d8, 0x0f6b10bd, 0xd1eb0312, 0x9b94f277, 0x690752bd, 0x2378a3d8, 0xfdf8b077, 0xb7874112, + 0x1d338712, 0x574c7677, 
0x89cc65d8, 0xc3b394bd, 0x31203477, 0x7b5fc512, 0xa5dfd6bd, 0xefa027d8, + 0x5ac81a27, 0x10b7eb42, 0xce37f8ed, 0x84480988, 0x76dba942, 0x3ca45827, 0xe2244b88, 0xa85bbaed, + 0x02ef7ced, 0x48908d88, 0x96109e27, 0xdc6f6f42, 0x2efccf88, 0x64833eed, 0xba032d42, 0xf07cdc27, + 0xea86d7b3, 0xa0f926d6, 0x7e793579, 0x3406c41c, 0xc69564d6, 0x8cea95b3, 0x526a861c, 0x18157779, + 0xb2a1b179, 0xf8de401c, 0x265e53b3, 0x6c21a2d6, 0x9eb2021c, 0xd4cdf379, 0x0a4de0d6, 0x403211b3, + 0x3fb9f7fe, 0x75c6069b, 0xab461534, 0xe139e451, 0x13aa449b, 0x59d5b5fe, 0x8755a651, 0xcd2a5734, + 0x679e9134, 0x2de16051, 0xf36173fe, 0xb91e829b, 0x4b8d2251, 0x01f2d334, 0xdf72c09b, 0x950d31fe, + 0x8ff73a6a, 0xc588cb0f, 0x1b08d8a0, 0x517729c5, 0xa3e4890f, 0xe99b786a, 0x371b6bc5, 0x7d649aa0, + 0xd7d05ca0, 0x9dafadc5, 0x432fbe6a, 0x09504f0f, 0xfbc3efc5, 0xb1bc1ea0, 0x6f3c0d0f, 0x2543fc6a, + }, + { + 0x00000000, 0x25bbf5db, 0x4b77ebb6, 0x6ecc1e6d, 0x96efd76c, 0xb35422b7, 0xdd983cda, 0xf823c901, + 0x2833d829, 0x0d882df2, 0x6344339f, 0x46ffc644, 0xbedc0f45, 0x9b67fa9e, 0xf5abe4f3, 0xd0101128, + 0x5067b052, 0x75dc4589, 0x1b105be4, 0x3eabae3f, 0xc688673e, 0xe33392e5, 0x8dff8c88, 0xa8447953, + 0x7854687b, 0x5def9da0, 0x332383cd, 0x16987616, 0xeebbbf17, 0xcb004acc, 0xa5cc54a1, 0x8077a17a, + 0xa0cf60a4, 0x8574957f, 0xebb88b12, 0xce037ec9, 0x3620b7c8, 0x139b4213, 0x7d575c7e, 0x58eca9a5, + 0x88fcb88d, 0xad474d56, 0xc38b533b, 0xe630a6e0, 0x1e136fe1, 0x3ba89a3a, 0x55648457, 0x70df718c, + 0xf0a8d0f6, 0xd513252d, 0xbbdf3b40, 0x9e64ce9b, 0x6647079a, 0x43fcf241, 0x2d30ec2c, 0x088b19f7, + 0xd89b08df, 0xfd20fd04, 0x93ece369, 0xb65716b2, 0x4e74dfb3, 0x6bcf2a68, 0x05033405, 0x20b8c1de, + 0x4472b7b9, 0x61c94262, 0x0f055c0f, 0x2abea9d4, 0xd29d60d5, 0xf726950e, 0x99ea8b63, 0xbc517eb8, + 0x6c416f90, 0x49fa9a4b, 0x27368426, 0x028d71fd, 0xfaaeb8fc, 0xdf154d27, 0xb1d9534a, 0x9462a691, + 0x141507eb, 0x31aef230, 0x5f62ec5d, 0x7ad91986, 0x82fad087, 0xa741255c, 0xc98d3b31, 0xec36ceea, + 0x3c26dfc2, 0x199d2a19, 0x77513474, 0x52eac1af, 
0xaac908ae, 0x8f72fd75, 0xe1bee318, 0xc40516c3, + 0xe4bdd71d, 0xc10622c6, 0xafca3cab, 0x8a71c970, 0x72520071, 0x57e9f5aa, 0x3925ebc7, 0x1c9e1e1c, + 0xcc8e0f34, 0xe935faef, 0x87f9e482, 0xa2421159, 0x5a61d858, 0x7fda2d83, 0x111633ee, 0x34adc635, + 0xb4da674f, 0x91619294, 0xffad8cf9, 0xda167922, 0x2235b023, 0x078e45f8, 0x69425b95, 0x4cf9ae4e, + 0x9ce9bf66, 0xb9524abd, 0xd79e54d0, 0xf225a10b, 0x0a06680a, 0x2fbd9dd1, 0x417183bc, 0x64ca7667, + 0x88e56f72, 0xad5e9aa9, 0xc39284c4, 0xe629711f, 0x1e0ab81e, 0x3bb14dc5, 0x557d53a8, 0x70c6a673, + 0xa0d6b75b, 0x856d4280, 0xeba15ced, 0xce1aa936, 0x36396037, 0x138295ec, 0x7d4e8b81, 0x58f57e5a, + 0xd882df20, 0xfd392afb, 0x93f53496, 0xb64ec14d, 0x4e6d084c, 0x6bd6fd97, 0x051ae3fa, 0x20a11621, + 0xf0b10709, 0xd50af2d2, 0xbbc6ecbf, 0x9e7d1964, 0x665ed065, 0x43e525be, 0x2d293bd3, 0x0892ce08, + 0x282a0fd6, 0x0d91fa0d, 0x635de460, 0x46e611bb, 0xbec5d8ba, 0x9b7e2d61, 0xf5b2330c, 0xd009c6d7, + 0x0019d7ff, 0x25a22224, 0x4b6e3c49, 0x6ed5c992, 0x96f60093, 0xb34df548, 0xdd81eb25, 0xf83a1efe, + 0x784dbf84, 0x5df64a5f, 0x333a5432, 0x1681a1e9, 0xeea268e8, 0xcb199d33, 0xa5d5835e, 0x806e7685, + 0x507e67ad, 0x75c59276, 0x1b098c1b, 0x3eb279c0, 0xc691b0c1, 0xe32a451a, 0x8de65b77, 0xa85daeac, + 0xcc97d8cb, 0xe92c2d10, 0x87e0337d, 0xa25bc6a6, 0x5a780fa7, 0x7fc3fa7c, 0x110fe411, 0x34b411ca, + 0xe4a400e2, 0xc11ff539, 0xafd3eb54, 0x8a681e8f, 0x724bd78e, 0x57f02255, 0x393c3c38, 0x1c87c9e3, + 0x9cf06899, 0xb94b9d42, 0xd787832f, 0xf23c76f4, 0x0a1fbff5, 0x2fa44a2e, 0x41685443, 0x64d3a198, + 0xb4c3b0b0, 0x9178456b, 0xffb45b06, 0xda0faedd, 0x222c67dc, 0x07979207, 0x695b8c6a, 0x4ce079b1, + 0x6c58b86f, 0x49e34db4, 0x272f53d9, 0x0294a602, 0xfab76f03, 0xdf0c9ad8, 0xb1c084b5, 0x947b716e, + 0x446b6046, 0x61d0959d, 0x0f1c8bf0, 0x2aa77e2b, 0xd284b72a, 0xf73f42f1, 0x99f35c9c, 0xbc48a947, + 0x3c3f083d, 0x1984fde6, 0x7748e38b, 0x52f31650, 0xaad0df51, 0x8f6b2a8a, 0xe1a734e7, 0xc41cc13c, + 0x140cd014, 0x31b725cf, 0x5f7b3ba2, 0x7ac0ce79, 0x82e30778, 0xa758f2a3, 0xc994ecce, 
0xec2f1915, + }, }; #endif const uint32_t crc32c_sw_table[16][256] = { - { - 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, - 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, - 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, - 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, - 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, - 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, - 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, - 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, - 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, - 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, - 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, - 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, - 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, - 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, - 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, - 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, - 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, - 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, - 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, - 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, 0xdfeb33c7, 
0x2d80b0c4, 0x3ed04330, 0xccbbc033, - 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, - 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, - 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, - 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, - 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, - 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, - 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, - 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, - 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, - 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, - 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, - 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351, - }, - { - 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, - 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, - 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, - 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, - 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, - 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, - 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, - 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, 0x93d0b0e7, 0x80722890, 0xb4958009, 
0xa737187e, - 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, - 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, - 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, - 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, - 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, - 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, - 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, - 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, - 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, - 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, - 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, - 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, - 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, - 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, - 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, - 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, - 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, - 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, - 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, - 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, - 0x7b5fdfff, 0x68fd4788, 
0x5c1aef11, 0x4fb87766, 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, - 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, - 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, - 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483, - }, - { - 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, - 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, - 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, - 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, - 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, - 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, - 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, - 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, - 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, - 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, - 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, - 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, - 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, - 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, - 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, - 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, - 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, 
0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, - 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, - 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, - 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, - 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, - 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, - 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, - 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, - 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, - 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, - 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, - 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, - 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, - 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, - 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, - 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8, - }, - { - 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, - 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, - 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, - 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, - 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, 0xac072578, 0x71428fc0, 
0x136006f9, 0xce25ac41, - 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, - 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, - 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, - 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, - 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, - 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, - 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, - 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, - 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, - 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, - 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, - 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, - 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, - 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, - 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, - 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, - 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, - 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, - 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, - 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, - 0x0f42f53e, 
0xd2075f86, 0xb025d6bf, 0x6d607c07, 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, - 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, - 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, - 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, - 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, - 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, - 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842, - }, - { - 0x00000000, 0x38116fac, 0x7022df58, 0x4833b0f4, 0xe045beb0, 0xd854d11c, 0x906761e8, 0xa8760e44, - 0xc5670b91, 0xfd76643d, 0xb545d4c9, 0x8d54bb65, 0x2522b521, 0x1d33da8d, 0x55006a79, 0x6d1105d5, - 0x8f2261d3, 0xb7330e7f, 0xff00be8b, 0xc711d127, 0x6f67df63, 0x5776b0cf, 0x1f45003b, 0x27546f97, - 0x4a456a42, 0x725405ee, 0x3a67b51a, 0x0276dab6, 0xaa00d4f2, 0x9211bb5e, 0xda220baa, 0xe2336406, - 0x1ba8b557, 0x23b9dafb, 0x6b8a6a0f, 0x539b05a3, 0xfbed0be7, 0xc3fc644b, 0x8bcfd4bf, 0xb3debb13, - 0xdecfbec6, 0xe6ded16a, 0xaeed619e, 0x96fc0e32, 0x3e8a0076, 0x069b6fda, 0x4ea8df2e, 0x76b9b082, - 0x948ad484, 0xac9bbb28, 0xe4a80bdc, 0xdcb96470, 0x74cf6a34, 0x4cde0598, 0x04edb56c, 0x3cfcdac0, - 0x51eddf15, 0x69fcb0b9, 0x21cf004d, 0x19de6fe1, 0xb1a861a5, 0x89b90e09, 0xc18abefd, 0xf99bd151, - 0x37516aae, 0x0f400502, 0x4773b5f6, 0x7f62da5a, 0xd714d41e, 0xef05bbb2, 0xa7360b46, 0x9f2764ea, - 0xf236613f, 0xca270e93, 0x8214be67, 0xba05d1cb, 0x1273df8f, 0x2a62b023, 0x625100d7, 0x5a406f7b, - 0xb8730b7d, 0x806264d1, 0xc851d425, 0xf040bb89, 0x5836b5cd, 0x6027da61, 0x28146a95, 0x10050539, - 0x7d1400ec, 0x45056f40, 0x0d36dfb4, 0x3527b018, 0x9d51be5c, 0xa540d1f0, 0xed736104, 0xd5620ea8, - 0x2cf9dff9, 0x14e8b055, 0x5cdb00a1, 0x64ca6f0d, 0xccbc6149, 0xf4ad0ee5, 0xbc9ebe11, 0x848fd1bd, - 0xe99ed468, 0xd18fbbc4, 0x99bc0b30, 
0xa1ad649c, 0x09db6ad8, 0x31ca0574, 0x79f9b580, 0x41e8da2c, - 0xa3dbbe2a, 0x9bcad186, 0xd3f96172, 0xebe80ede, 0x439e009a, 0x7b8f6f36, 0x33bcdfc2, 0x0badb06e, - 0x66bcb5bb, 0x5eadda17, 0x169e6ae3, 0x2e8f054f, 0x86f90b0b, 0xbee864a7, 0xf6dbd453, 0xcecabbff, - 0x6ea2d55c, 0x56b3baf0, 0x1e800a04, 0x269165a8, 0x8ee76bec, 0xb6f60440, 0xfec5b4b4, 0xc6d4db18, - 0xabc5decd, 0x93d4b161, 0xdbe70195, 0xe3f66e39, 0x4b80607d, 0x73910fd1, 0x3ba2bf25, 0x03b3d089, - 0xe180b48f, 0xd991db23, 0x91a26bd7, 0xa9b3047b, 0x01c50a3f, 0x39d46593, 0x71e7d567, 0x49f6bacb, - 0x24e7bf1e, 0x1cf6d0b2, 0x54c56046, 0x6cd40fea, 0xc4a201ae, 0xfcb36e02, 0xb480def6, 0x8c91b15a, - 0x750a600b, 0x4d1b0fa7, 0x0528bf53, 0x3d39d0ff, 0x954fdebb, 0xad5eb117, 0xe56d01e3, 0xdd7c6e4f, - 0xb06d6b9a, 0x887c0436, 0xc04fb4c2, 0xf85edb6e, 0x5028d52a, 0x6839ba86, 0x200a0a72, 0x181b65de, - 0xfa2801d8, 0xc2396e74, 0x8a0ade80, 0xb21bb12c, 0x1a6dbf68, 0x227cd0c4, 0x6a4f6030, 0x525e0f9c, - 0x3f4f0a49, 0x075e65e5, 0x4f6dd511, 0x777cbabd, 0xdf0ab4f9, 0xe71bdb55, 0xaf286ba1, 0x9739040d, - 0x59f3bff2, 0x61e2d05e, 0x29d160aa, 0x11c00f06, 0xb9b60142, 0x81a76eee, 0xc994de1a, 0xf185b1b6, - 0x9c94b463, 0xa485dbcf, 0xecb66b3b, 0xd4a70497, 0x7cd10ad3, 0x44c0657f, 0x0cf3d58b, 0x34e2ba27, - 0xd6d1de21, 0xeec0b18d, 0xa6f30179, 0x9ee26ed5, 0x36946091, 0x0e850f3d, 0x46b6bfc9, 0x7ea7d065, - 0x13b6d5b0, 0x2ba7ba1c, 0x63940ae8, 0x5b856544, 0xf3f36b00, 0xcbe204ac, 0x83d1b458, 0xbbc0dbf4, - 0x425b0aa5, 0x7a4a6509, 0x3279d5fd, 0x0a68ba51, 0xa21eb415, 0x9a0fdbb9, 0xd23c6b4d, 0xea2d04e1, - 0x873c0134, 0xbf2d6e98, 0xf71ede6c, 0xcf0fb1c0, 0x6779bf84, 0x5f68d028, 0x175b60dc, 0x2f4a0f70, - 0xcd796b76, 0xf56804da, 0xbd5bb42e, 0x854adb82, 0x2d3cd5c6, 0x152dba6a, 0x5d1e0a9e, 0x650f6532, - 0x081e60e7, 0x300f0f4b, 0x783cbfbf, 0x402dd013, 0xe85bde57, 0xd04ab1fb, 0x9879010f, 0xa0686ea3, - }, - { - 0x00000000, 0xef306b19, 0xdb8ca0c3, 0x34bccbda, 0xb2f53777, 0x5dc55c6e, 0x697997b4, 0x8649fcad, - 0x6006181f, 0x8f367306, 0xbb8ab8dc, 0x54bad3c5, 0xd2f32f68, 
0x3dc34471, 0x097f8fab, 0xe64fe4b2, - 0xc00c303e, 0x2f3c5b27, 0x1b8090fd, 0xf4b0fbe4, 0x72f90749, 0x9dc96c50, 0xa975a78a, 0x4645cc93, - 0xa00a2821, 0x4f3a4338, 0x7b8688e2, 0x94b6e3fb, 0x12ff1f56, 0xfdcf744f, 0xc973bf95, 0x2643d48c, - 0x85f4168d, 0x6ac47d94, 0x5e78b64e, 0xb148dd57, 0x370121fa, 0xd8314ae3, 0xec8d8139, 0x03bdea20, - 0xe5f20e92, 0x0ac2658b, 0x3e7eae51, 0xd14ec548, 0x570739e5, 0xb83752fc, 0x8c8b9926, 0x63bbf23f, - 0x45f826b3, 0xaac84daa, 0x9e748670, 0x7144ed69, 0xf70d11c4, 0x183d7add, 0x2c81b107, 0xc3b1da1e, - 0x25fe3eac, 0xcace55b5, 0xfe729e6f, 0x1142f576, 0x970b09db, 0x783b62c2, 0x4c87a918, 0xa3b7c201, - 0x0e045beb, 0xe13430f2, 0xd588fb28, 0x3ab89031, 0xbcf16c9c, 0x53c10785, 0x677dcc5f, 0x884da746, - 0x6e0243f4, 0x813228ed, 0xb58ee337, 0x5abe882e, 0xdcf77483, 0x33c71f9a, 0x077bd440, 0xe84bbf59, - 0xce086bd5, 0x213800cc, 0x1584cb16, 0xfab4a00f, 0x7cfd5ca2, 0x93cd37bb, 0xa771fc61, 0x48419778, - 0xae0e73ca, 0x413e18d3, 0x7582d309, 0x9ab2b810, 0x1cfb44bd, 0xf3cb2fa4, 0xc777e47e, 0x28478f67, - 0x8bf04d66, 0x64c0267f, 0x507ceda5, 0xbf4c86bc, 0x39057a11, 0xd6351108, 0xe289dad2, 0x0db9b1cb, - 0xebf65579, 0x04c63e60, 0x307af5ba, 0xdf4a9ea3, 0x5903620e, 0xb6330917, 0x828fc2cd, 0x6dbfa9d4, - 0x4bfc7d58, 0xa4cc1641, 0x9070dd9b, 0x7f40b682, 0xf9094a2f, 0x16392136, 0x2285eaec, 0xcdb581f5, - 0x2bfa6547, 0xc4ca0e5e, 0xf076c584, 0x1f46ae9d, 0x990f5230, 0x763f3929, 0x4283f2f3, 0xadb399ea, - 0x1c08b7d6, 0xf338dccf, 0xc7841715, 0x28b47c0c, 0xaefd80a1, 0x41cdebb8, 0x75712062, 0x9a414b7b, - 0x7c0eafc9, 0x933ec4d0, 0xa7820f0a, 0x48b26413, 0xcefb98be, 0x21cbf3a7, 0x1577387d, 0xfa475364, - 0xdc0487e8, 0x3334ecf1, 0x0788272b, 0xe8b84c32, 0x6ef1b09f, 0x81c1db86, 0xb57d105c, 0x5a4d7b45, - 0xbc029ff7, 0x5332f4ee, 0x678e3f34, 0x88be542d, 0x0ef7a880, 0xe1c7c399, 0xd57b0843, 0x3a4b635a, - 0x99fca15b, 0x76ccca42, 0x42700198, 0xad406a81, 0x2b09962c, 0xc439fd35, 0xf08536ef, 0x1fb55df6, - 0xf9fab944, 0x16cad25d, 0x22761987, 0xcd46729e, 0x4b0f8e33, 0xa43fe52a, 0x90832ef0, 0x7fb345e9, - 
0x59f09165, 0xb6c0fa7c, 0x827c31a6, 0x6d4c5abf, 0xeb05a612, 0x0435cd0b, 0x308906d1, 0xdfb96dc8, - 0x39f6897a, 0xd6c6e263, 0xe27a29b9, 0x0d4a42a0, 0x8b03be0d, 0x6433d514, 0x508f1ece, 0xbfbf75d7, - 0x120cec3d, 0xfd3c8724, 0xc9804cfe, 0x26b027e7, 0xa0f9db4a, 0x4fc9b053, 0x7b757b89, 0x94451090, - 0x720af422, 0x9d3a9f3b, 0xa98654e1, 0x46b63ff8, 0xc0ffc355, 0x2fcfa84c, 0x1b736396, 0xf443088f, - 0xd200dc03, 0x3d30b71a, 0x098c7cc0, 0xe6bc17d9, 0x60f5eb74, 0x8fc5806d, 0xbb794bb7, 0x544920ae, - 0xb206c41c, 0x5d36af05, 0x698a64df, 0x86ba0fc6, 0x00f3f36b, 0xefc39872, 0xdb7f53a8, 0x344f38b1, - 0x97f8fab0, 0x78c891a9, 0x4c745a73, 0xa344316a, 0x250dcdc7, 0xca3da6de, 0xfe816d04, 0x11b1061d, - 0xf7fee2af, 0x18ce89b6, 0x2c72426c, 0xc3422975, 0x450bd5d8, 0xaa3bbec1, 0x9e87751b, 0x71b71e02, - 0x57f4ca8e, 0xb8c4a197, 0x8c786a4d, 0x63480154, 0xe501fdf9, 0x0a3196e0, 0x3e8d5d3a, 0xd1bd3623, - 0x37f2d291, 0xd8c2b988, 0xec7e7252, 0x034e194b, 0x8507e5e6, 0x6a378eff, 0x5e8b4525, 0xb1bb2e3c, - }, - { - 0x00000000, 0x68032cc8, 0xd0065990, 0xb8057558, 0xa5e0c5d1, 0xcde3e919, 0x75e69c41, 0x1de5b089, - 0x4e2dfd53, 0x262ed19b, 0x9e2ba4c3, 0xf628880b, 0xebcd3882, 0x83ce144a, 0x3bcb6112, 0x53c84dda, - 0x9c5bfaa6, 0xf458d66e, 0x4c5da336, 0x245e8ffe, 0x39bb3f77, 0x51b813bf, 0xe9bd66e7, 0x81be4a2f, - 0xd27607f5, 0xba752b3d, 0x02705e65, 0x6a7372ad, 0x7796c224, 0x1f95eeec, 0xa7909bb4, 0xcf93b77c, - 0x3d5b83bd, 0x5558af75, 0xed5dda2d, 0x855ef6e5, 0x98bb466c, 0xf0b86aa4, 0x48bd1ffc, 0x20be3334, - 0x73767eee, 0x1b755226, 0xa370277e, 0xcb730bb6, 0xd696bb3f, 0xbe9597f7, 0x0690e2af, 0x6e93ce67, - 0xa100791b, 0xc90355d3, 0x7106208b, 0x19050c43, 0x04e0bcca, 0x6ce39002, 0xd4e6e55a, 0xbce5c992, - 0xef2d8448, 0x872ea880, 0x3f2bddd8, 0x5728f110, 0x4acd4199, 0x22ce6d51, 0x9acb1809, 0xf2c834c1, - 0x7ab7077a, 0x12b42bb2, 0xaab15eea, 0xc2b27222, 0xdf57c2ab, 0xb754ee63, 0x0f519b3b, 0x6752b7f3, - 0x349afa29, 0x5c99d6e1, 0xe49ca3b9, 0x8c9f8f71, 0x917a3ff8, 0xf9791330, 0x417c6668, 0x297f4aa0, - 0xe6ecfddc, 0x8eefd114, 
0x36eaa44c, 0x5ee98884, 0x430c380d, 0x2b0f14c5, 0x930a619d, 0xfb094d55, - 0xa8c1008f, 0xc0c22c47, 0x78c7591f, 0x10c475d7, 0x0d21c55e, 0x6522e996, 0xdd279cce, 0xb524b006, - 0x47ec84c7, 0x2fefa80f, 0x97eadd57, 0xffe9f19f, 0xe20c4116, 0x8a0f6dde, 0x320a1886, 0x5a09344e, - 0x09c17994, 0x61c2555c, 0xd9c72004, 0xb1c40ccc, 0xac21bc45, 0xc422908d, 0x7c27e5d5, 0x1424c91d, - 0xdbb77e61, 0xb3b452a9, 0x0bb127f1, 0x63b20b39, 0x7e57bbb0, 0x16549778, 0xae51e220, 0xc652cee8, - 0x959a8332, 0xfd99affa, 0x459cdaa2, 0x2d9ff66a, 0x307a46e3, 0x58796a2b, 0xe07c1f73, 0x887f33bb, - 0xf56e0ef4, 0x9d6d223c, 0x25685764, 0x4d6b7bac, 0x508ecb25, 0x388de7ed, 0x808892b5, 0xe88bbe7d, - 0xbb43f3a7, 0xd340df6f, 0x6b45aa37, 0x034686ff, 0x1ea33676, 0x76a01abe, 0xcea56fe6, 0xa6a6432e, - 0x6935f452, 0x0136d89a, 0xb933adc2, 0xd130810a, 0xccd53183, 0xa4d61d4b, 0x1cd36813, 0x74d044db, - 0x27180901, 0x4f1b25c9, 0xf71e5091, 0x9f1d7c59, 0x82f8ccd0, 0xeafbe018, 0x52fe9540, 0x3afdb988, - 0xc8358d49, 0xa036a181, 0x1833d4d9, 0x7030f811, 0x6dd54898, 0x05d66450, 0xbdd31108, 0xd5d03dc0, - 0x8618701a, 0xee1b5cd2, 0x561e298a, 0x3e1d0542, 0x23f8b5cb, 0x4bfb9903, 0xf3feec5b, 0x9bfdc093, - 0x546e77ef, 0x3c6d5b27, 0x84682e7f, 0xec6b02b7, 0xf18eb23e, 0x998d9ef6, 0x2188ebae, 0x498bc766, - 0x1a438abc, 0x7240a674, 0xca45d32c, 0xa246ffe4, 0xbfa34f6d, 0xd7a063a5, 0x6fa516fd, 0x07a63a35, - 0x8fd9098e, 0xe7da2546, 0x5fdf501e, 0x37dc7cd6, 0x2a39cc5f, 0x423ae097, 0xfa3f95cf, 0x923cb907, - 0xc1f4f4dd, 0xa9f7d815, 0x11f2ad4d, 0x79f18185, 0x6414310c, 0x0c171dc4, 0xb412689c, 0xdc114454, - 0x1382f328, 0x7b81dfe0, 0xc384aab8, 0xab878670, 0xb66236f9, 0xde611a31, 0x66646f69, 0x0e6743a1, - 0x5daf0e7b, 0x35ac22b3, 0x8da957eb, 0xe5aa7b23, 0xf84fcbaa, 0x904ce762, 0x2849923a, 0x404abef2, - 0xb2828a33, 0xda81a6fb, 0x6284d3a3, 0x0a87ff6b, 0x17624fe2, 0x7f61632a, 0xc7641672, 0xaf673aba, - 0xfcaf7760, 0x94ac5ba8, 0x2ca92ef0, 0x44aa0238, 0x594fb2b1, 0x314c9e79, 0x8949eb21, 0xe14ac7e9, - 0x2ed97095, 0x46da5c5d, 0xfedf2905, 0x96dc05cd, 0x8b39b544, 
0xe33a998c, 0x5b3fecd4, 0x333cc01c, - 0x60f48dc6, 0x08f7a10e, 0xb0f2d456, 0xd8f1f89e, 0xc5144817, 0xad1764df, 0x15121187, 0x7d113d4f, - }, - { - 0x00000000, 0x493c7d27, 0x9278fa4e, 0xdb448769, 0x211d826d, 0x6821ff4a, 0xb3657823, 0xfa590504, - 0x423b04da, 0x0b0779fd, 0xd043fe94, 0x997f83b3, 0x632686b7, 0x2a1afb90, 0xf15e7cf9, 0xb86201de, - 0x847609b4, 0xcd4a7493, 0x160ef3fa, 0x5f328edd, 0xa56b8bd9, 0xec57f6fe, 0x37137197, 0x7e2f0cb0, - 0xc64d0d6e, 0x8f717049, 0x5435f720, 0x1d098a07, 0xe7508f03, 0xae6cf224, 0x7528754d, 0x3c14086a, - 0x0d006599, 0x443c18be, 0x9f789fd7, 0xd644e2f0, 0x2c1de7f4, 0x65219ad3, 0xbe651dba, 0xf759609d, - 0x4f3b6143, 0x06071c64, 0xdd439b0d, 0x947fe62a, 0x6e26e32e, 0x271a9e09, 0xfc5e1960, 0xb5626447, - 0x89766c2d, 0xc04a110a, 0x1b0e9663, 0x5232eb44, 0xa86bee40, 0xe1579367, 0x3a13140e, 0x732f6929, - 0xcb4d68f7, 0x827115d0, 0x593592b9, 0x1009ef9e, 0xea50ea9a, 0xa36c97bd, 0x782810d4, 0x31146df3, - 0x1a00cb32, 0x533cb615, 0x8878317c, 0xc1444c5b, 0x3b1d495f, 0x72213478, 0xa965b311, 0xe059ce36, - 0x583bcfe8, 0x1107b2cf, 0xca4335a6, 0x837f4881, 0x79264d85, 0x301a30a2, 0xeb5eb7cb, 0xa262caec, - 0x9e76c286, 0xd74abfa1, 0x0c0e38c8, 0x453245ef, 0xbf6b40eb, 0xf6573dcc, 0x2d13baa5, 0x642fc782, - 0xdc4dc65c, 0x9571bb7b, 0x4e353c12, 0x07094135, 0xfd504431, 0xb46c3916, 0x6f28be7f, 0x2614c358, - 0x1700aeab, 0x5e3cd38c, 0x857854e5, 0xcc4429c2, 0x361d2cc6, 0x7f2151e1, 0xa465d688, 0xed59abaf, - 0x553baa71, 0x1c07d756, 0xc743503f, 0x8e7f2d18, 0x7426281c, 0x3d1a553b, 0xe65ed252, 0xaf62af75, - 0x9376a71f, 0xda4ada38, 0x010e5d51, 0x48322076, 0xb26b2572, 0xfb575855, 0x2013df3c, 0x692fa21b, - 0xd14da3c5, 0x9871dee2, 0x4335598b, 0x0a0924ac, 0xf05021a8, 0xb96c5c8f, 0x6228dbe6, 0x2b14a6c1, - 0x34019664, 0x7d3deb43, 0xa6796c2a, 0xef45110d, 0x151c1409, 0x5c20692e, 0x8764ee47, 0xce589360, - 0x763a92be, 0x3f06ef99, 0xe44268f0, 0xad7e15d7, 0x572710d3, 0x1e1b6df4, 0xc55fea9d, 0x8c6397ba, - 0xb0779fd0, 0xf94be2f7, 0x220f659e, 0x6b3318b9, 0x916a1dbd, 0xd856609a, 0x0312e7f3, 
0x4a2e9ad4, - 0xf24c9b0a, 0xbb70e62d, 0x60346144, 0x29081c63, 0xd3511967, 0x9a6d6440, 0x4129e329, 0x08159e0e, - 0x3901f3fd, 0x703d8eda, 0xab7909b3, 0xe2457494, 0x181c7190, 0x51200cb7, 0x8a648bde, 0xc358f6f9, - 0x7b3af727, 0x32068a00, 0xe9420d69, 0xa07e704e, 0x5a27754a, 0x131b086d, 0xc85f8f04, 0x8163f223, - 0xbd77fa49, 0xf44b876e, 0x2f0f0007, 0x66337d20, 0x9c6a7824, 0xd5560503, 0x0e12826a, 0x472eff4d, - 0xff4cfe93, 0xb67083b4, 0x6d3404dd, 0x240879fa, 0xde517cfe, 0x976d01d9, 0x4c2986b0, 0x0515fb97, - 0x2e015d56, 0x673d2071, 0xbc79a718, 0xf545da3f, 0x0f1cdf3b, 0x4620a21c, 0x9d642575, 0xd4585852, - 0x6c3a598c, 0x250624ab, 0xfe42a3c2, 0xb77edee5, 0x4d27dbe1, 0x041ba6c6, 0xdf5f21af, 0x96635c88, - 0xaa7754e2, 0xe34b29c5, 0x380faeac, 0x7133d38b, 0x8b6ad68f, 0xc256aba8, 0x19122cc1, 0x502e51e6, - 0xe84c5038, 0xa1702d1f, 0x7a34aa76, 0x3308d751, 0xc951d255, 0x806daf72, 0x5b29281b, 0x1215553c, - 0x230138cf, 0x6a3d45e8, 0xb179c281, 0xf845bfa6, 0x021cbaa2, 0x4b20c785, 0x906440ec, 0xd9583dcb, - 0x613a3c15, 0x28064132, 0xf342c65b, 0xba7ebb7c, 0x4027be78, 0x091bc35f, 0xd25f4436, 0x9b633911, - 0xa777317b, 0xee4b4c5c, 0x350fcb35, 0x7c33b612, 0x866ab316, 0xcf56ce31, 0x14124958, 0x5d2e347f, - 0xe54c35a1, 0xac704886, 0x7734cfef, 0x3e08b2c8, 0xc451b7cc, 0x8d6dcaeb, 0x56294d82, 0x1f1530a5, - }, - { - 0x00000000, 0xf43ed648, 0xed91da61, 0x19af0c29, 0xdecfc233, 0x2af1147b, 0x335e1852, 0xc760ce1a, - 0xb873f297, 0x4c4d24df, 0x55e228f6, 0xa1dcfebe, 0x66bc30a4, 0x9282e6ec, 0x8b2deac5, 0x7f133c8d, - 0x750b93df, 0x81354597, 0x989a49be, 0x6ca49ff6, 0xabc451ec, 0x5ffa87a4, 0x46558b8d, 0xb26b5dc5, - 0xcd786148, 0x3946b700, 0x20e9bb29, 0xd4d76d61, 0x13b7a37b, 0xe7897533, 0xfe26791a, 0x0a18af52, - 0xea1727be, 0x1e29f1f6, 0x0786fddf, 0xf3b82b97, 0x34d8e58d, 0xc0e633c5, 0xd9493fec, 0x2d77e9a4, - 0x5264d529, 0xa65a0361, 0xbff50f48, 0x4bcbd900, 0x8cab171a, 0x7895c152, 0x613acd7b, 0x95041b33, - 0x9f1cb461, 0x6b226229, 0x728d6e00, 0x86b3b848, 0x41d37652, 0xb5eda01a, 0xac42ac33, 0x587c7a7b, - 0x276f46f6, 
0xd35190be, 0xcafe9c97, 0x3ec04adf, 0xf9a084c5, 0x0d9e528d, 0x14315ea4, 0xe00f88ec, - 0xd1c2398d, 0x25fcefc5, 0x3c53e3ec, 0xc86d35a4, 0x0f0dfbbe, 0xfb332df6, 0xe29c21df, 0x16a2f797, - 0x69b1cb1a, 0x9d8f1d52, 0x8420117b, 0x701ec733, 0xb77e0929, 0x4340df61, 0x5aefd348, 0xaed10500, - 0xa4c9aa52, 0x50f77c1a, 0x49587033, 0xbd66a67b, 0x7a066861, 0x8e38be29, 0x9797b200, 0x63a96448, - 0x1cba58c5, 0xe8848e8d, 0xf12b82a4, 0x051554ec, 0xc2759af6, 0x364b4cbe, 0x2fe44097, 0xdbda96df, - 0x3bd51e33, 0xcfebc87b, 0xd644c452, 0x227a121a, 0xe51adc00, 0x11240a48, 0x088b0661, 0xfcb5d029, - 0x83a6eca4, 0x77983aec, 0x6e3736c5, 0x9a09e08d, 0x5d692e97, 0xa957f8df, 0xb0f8f4f6, 0x44c622be, - 0x4ede8dec, 0xbae05ba4, 0xa34f578d, 0x577181c5, 0x90114fdf, 0x642f9997, 0x7d8095be, 0x89be43f6, - 0xf6ad7f7b, 0x0293a933, 0x1b3ca51a, 0xef027352, 0x2862bd48, 0xdc5c6b00, 0xc5f36729, 0x31cdb161, - 0xa66805eb, 0x5256d3a3, 0x4bf9df8a, 0xbfc709c2, 0x78a7c7d8, 0x8c991190, 0x95361db9, 0x6108cbf1, - 0x1e1bf77c, 0xea252134, 0xf38a2d1d, 0x07b4fb55, 0xc0d4354f, 0x34eae307, 0x2d45ef2e, 0xd97b3966, - 0xd3639634, 0x275d407c, 0x3ef24c55, 0xcacc9a1d, 0x0dac5407, 0xf992824f, 0xe03d8e66, 0x1403582e, - 0x6b1064a3, 0x9f2eb2eb, 0x8681bec2, 0x72bf688a, 0xb5dfa690, 0x41e170d8, 0x584e7cf1, 0xac70aab9, - 0x4c7f2255, 0xb841f41d, 0xa1eef834, 0x55d02e7c, 0x92b0e066, 0x668e362e, 0x7f213a07, 0x8b1fec4f, - 0xf40cd0c2, 0x0032068a, 0x199d0aa3, 0xeda3dceb, 0x2ac312f1, 0xdefdc4b9, 0xc752c890, 0x336c1ed8, - 0x3974b18a, 0xcd4a67c2, 0xd4e56beb, 0x20dbbda3, 0xe7bb73b9, 0x1385a5f1, 0x0a2aa9d8, 0xfe147f90, - 0x8107431d, 0x75399555, 0x6c96997c, 0x98a84f34, 0x5fc8812e, 0xabf65766, 0xb2595b4f, 0x46678d07, - 0x77aa3c66, 0x8394ea2e, 0x9a3be607, 0x6e05304f, 0xa965fe55, 0x5d5b281d, 0x44f42434, 0xb0caf27c, - 0xcfd9cef1, 0x3be718b9, 0x22481490, 0xd676c2d8, 0x11160cc2, 0xe528da8a, 0xfc87d6a3, 0x08b900eb, - 0x02a1afb9, 0xf69f79f1, 0xef3075d8, 0x1b0ea390, 0xdc6e6d8a, 0x2850bbc2, 0x31ffb7eb, 0xc5c161a3, - 0xbad25d2e, 0x4eec8b66, 0x5743874f, 0xa37d5107, 
0x641d9f1d, 0x90234955, 0x898c457c, 0x7db29334, - 0x9dbd1bd8, 0x6983cd90, 0x702cc1b9, 0x841217f1, 0x4372d9eb, 0xb74c0fa3, 0xaee3038a, 0x5addd5c2, - 0x25cee94f, 0xd1f03f07, 0xc85f332e, 0x3c61e566, 0xfb012b7c, 0x0f3ffd34, 0x1690f11d, 0xe2ae2755, - 0xe8b68807, 0x1c885e4f, 0x05275266, 0xf119842e, 0x36794a34, 0xc2479c7c, 0xdbe89055, 0x2fd6461d, - 0x50c57a90, 0xa4fbacd8, 0xbd54a0f1, 0x496a76b9, 0x8e0ab8a3, 0x7a346eeb, 0x639b62c2, 0x97a5b48a, - }, - { - 0x00000000, 0xcb567ba5, 0x934081bb, 0x5816fa1e, 0x236d7587, 0xe83b0e22, 0xb02df43c, 0x7b7b8f99, - 0x46daeb0e, 0x8d8c90ab, 0xd59a6ab5, 0x1ecc1110, 0x65b79e89, 0xaee1e52c, 0xf6f71f32, 0x3da16497, - 0x8db5d61c, 0x46e3adb9, 0x1ef557a7, 0xd5a32c02, 0xaed8a39b, 0x658ed83e, 0x3d982220, 0xf6ce5985, - 0xcb6f3d12, 0x003946b7, 0x582fbca9, 0x9379c70c, 0xe8024895, 0x23543330, 0x7b42c92e, 0xb014b28b, - 0x1e87dac9, 0xd5d1a16c, 0x8dc75b72, 0x469120d7, 0x3deaaf4e, 0xf6bcd4eb, 0xaeaa2ef5, 0x65fc5550, - 0x585d31c7, 0x930b4a62, 0xcb1db07c, 0x004bcbd9, 0x7b304440, 0xb0663fe5, 0xe870c5fb, 0x2326be5e, - 0x93320cd5, 0x58647770, 0x00728d6e, 0xcb24f6cb, 0xb05f7952, 0x7b0902f7, 0x231ff8e9, 0xe849834c, - 0xd5e8e7db, 0x1ebe9c7e, 0x46a86660, 0x8dfe1dc5, 0xf685925c, 0x3dd3e9f9, 0x65c513e7, 0xae936842, - 0x3d0fb592, 0xf659ce37, 0xae4f3429, 0x65194f8c, 0x1e62c015, 0xd534bbb0, 0x8d2241ae, 0x46743a0b, - 0x7bd55e9c, 0xb0832539, 0xe895df27, 0x23c3a482, 0x58b82b1b, 0x93ee50be, 0xcbf8aaa0, 0x00aed105, - 0xb0ba638e, 0x7bec182b, 0x23fae235, 0xe8ac9990, 0x93d71609, 0x58816dac, 0x009797b2, 0xcbc1ec17, - 0xf6608880, 0x3d36f325, 0x6520093b, 0xae76729e, 0xd50dfd07, 0x1e5b86a2, 0x464d7cbc, 0x8d1b0719, - 0x23886f5b, 0xe8de14fe, 0xb0c8eee0, 0x7b9e9545, 0x00e51adc, 0xcbb36179, 0x93a59b67, 0x58f3e0c2, - 0x65528455, 0xae04fff0, 0xf61205ee, 0x3d447e4b, 0x463ff1d2, 0x8d698a77, 0xd57f7069, 0x1e290bcc, - 0xae3db947, 0x656bc2e2, 0x3d7d38fc, 0xf62b4359, 0x8d50ccc0, 0x4606b765, 0x1e104d7b, 0xd54636de, - 0xe8e75249, 0x23b129ec, 0x7ba7d3f2, 0xb0f1a857, 0xcb8a27ce, 0x00dc5c6b, 
0x58caa675, 0x939cddd0, - 0x7a1f6b24, 0xb1491081, 0xe95fea9f, 0x2209913a, 0x59721ea3, 0x92246506, 0xca329f18, 0x0164e4bd, - 0x3cc5802a, 0xf793fb8f, 0xaf850191, 0x64d37a34, 0x1fa8f5ad, 0xd4fe8e08, 0x8ce87416, 0x47be0fb3, - 0xf7aabd38, 0x3cfcc69d, 0x64ea3c83, 0xafbc4726, 0xd4c7c8bf, 0x1f91b31a, 0x47874904, 0x8cd132a1, - 0xb1705636, 0x7a262d93, 0x2230d78d, 0xe966ac28, 0x921d23b1, 0x594b5814, 0x015da20a, 0xca0bd9af, - 0x6498b1ed, 0xafceca48, 0xf7d83056, 0x3c8e4bf3, 0x47f5c46a, 0x8ca3bfcf, 0xd4b545d1, 0x1fe33e74, - 0x22425ae3, 0xe9142146, 0xb102db58, 0x7a54a0fd, 0x012f2f64, 0xca7954c1, 0x926faedf, 0x5939d57a, - 0xe92d67f1, 0x227b1c54, 0x7a6de64a, 0xb13b9def, 0xca401276, 0x011669d3, 0x590093cd, 0x9256e868, - 0xaff78cff, 0x64a1f75a, 0x3cb70d44, 0xf7e176e1, 0x8c9af978, 0x47cc82dd, 0x1fda78c3, 0xd48c0366, - 0x4710deb6, 0x8c46a513, 0xd4505f0d, 0x1f0624a8, 0x647dab31, 0xaf2bd094, 0xf73d2a8a, 0x3c6b512f, - 0x01ca35b8, 0xca9c4e1d, 0x928ab403, 0x59dccfa6, 0x22a7403f, 0xe9f13b9a, 0xb1e7c184, 0x7ab1ba21, - 0xcaa508aa, 0x01f3730f, 0x59e58911, 0x92b3f2b4, 0xe9c87d2d, 0x229e0688, 0x7a88fc96, 0xb1de8733, - 0x8c7fe3a4, 0x47299801, 0x1f3f621f, 0xd46919ba, 0xaf129623, 0x6444ed86, 0x3c521798, 0xf7046c3d, - 0x5997047f, 0x92c17fda, 0xcad785c4, 0x0181fe61, 0x7afa71f8, 0xb1ac0a5d, 0xe9baf043, 0x22ec8be6, - 0x1f4def71, 0xd41b94d4, 0x8c0d6eca, 0x475b156f, 0x3c209af6, 0xf776e153, 0xaf601b4d, 0x643660e8, - 0xd422d263, 0x1f74a9c6, 0x476253d8, 0x8c34287d, 0xf74fa7e4, 0x3c19dc41, 0x640f265f, 0xaf595dfa, - 0x92f8396d, 0x59ae42c8, 0x01b8b8d6, 0xcaeec373, 0xb1954cea, 0x7ac3374f, 0x22d5cd51, 0xe983b6f4, - }, - { - 0x00000000, 0x9771f7c1, 0x2b0f9973, 0xbc7e6eb2, 0x561f32e6, 0xc16ec527, 0x7d10ab95, 0xea615c54, - 0xac3e65cc, 0x3b4f920d, 0x8731fcbf, 0x10400b7e, 0xfa21572a, 0x6d50a0eb, 0xd12ece59, 0x465f3998, - 0x5d90bd69, 0xcae14aa8, 0x769f241a, 0xe1eed3db, 0x0b8f8f8f, 0x9cfe784e, 0x208016fc, 0xb7f1e13d, - 0xf1aed8a5, 0x66df2f64, 0xdaa141d6, 0x4dd0b617, 0xa7b1ea43, 0x30c01d82, 0x8cbe7330, 0x1bcf84f1, - 
0xbb217ad2, 0x2c508d13, 0x902ee3a1, 0x075f1460, 0xed3e4834, 0x7a4fbff5, 0xc631d147, 0x51402686, - 0x171f1f1e, 0x806ee8df, 0x3c10866d, 0xab6171ac, 0x41002df8, 0xd671da39, 0x6a0fb48b, 0xfd7e434a, - 0xe6b1c7bb, 0x71c0307a, 0xcdbe5ec8, 0x5acfa909, 0xb0aef55d, 0x27df029c, 0x9ba16c2e, 0x0cd09bef, - 0x4a8fa277, 0xddfe55b6, 0x61803b04, 0xf6f1ccc5, 0x1c909091, 0x8be16750, 0x379f09e2, 0xa0eefe23, - 0x73ae8355, 0xe4df7494, 0x58a11a26, 0xcfd0ede7, 0x25b1b1b3, 0xb2c04672, 0x0ebe28c0, 0x99cfdf01, - 0xdf90e699, 0x48e11158, 0xf49f7fea, 0x63ee882b, 0x898fd47f, 0x1efe23be, 0xa2804d0c, 0x35f1bacd, - 0x2e3e3e3c, 0xb94fc9fd, 0x0531a74f, 0x9240508e, 0x78210cda, 0xef50fb1b, 0x532e95a9, 0xc45f6268, - 0x82005bf0, 0x1571ac31, 0xa90fc283, 0x3e7e3542, 0xd41f6916, 0x436e9ed7, 0xff10f065, 0x686107a4, - 0xc88ff987, 0x5ffe0e46, 0xe38060f4, 0x74f19735, 0x9e90cb61, 0x09e13ca0, 0xb59f5212, 0x22eea5d3, - 0x64b19c4b, 0xf3c06b8a, 0x4fbe0538, 0xd8cff2f9, 0x32aeaead, 0xa5df596c, 0x19a137de, 0x8ed0c01f, - 0x951f44ee, 0x026eb32f, 0xbe10dd9d, 0x29612a5c, 0xc3007608, 0x547181c9, 0xe80fef7b, 0x7f7e18ba, - 0x39212122, 0xae50d6e3, 0x122eb851, 0x855f4f90, 0x6f3e13c4, 0xf84fe405, 0x44318ab7, 0xd3407d76, - 0xe75d06aa, 0x702cf16b, 0xcc529fd9, 0x5b236818, 0xb142344c, 0x2633c38d, 0x9a4dad3f, 0x0d3c5afe, - 0x4b636366, 0xdc1294a7, 0x606cfa15, 0xf71d0dd4, 0x1d7c5180, 0x8a0da641, 0x3673c8f3, 0xa1023f32, - 0xbacdbbc3, 0x2dbc4c02, 0x91c222b0, 0x06b3d571, 0xecd28925, 0x7ba37ee4, 0xc7dd1056, 0x50ace797, - 0x16f3de0f, 0x818229ce, 0x3dfc477c, 0xaa8db0bd, 0x40ecece9, 0xd79d1b28, 0x6be3759a, 0xfc92825b, - 0x5c7c7c78, 0xcb0d8bb9, 0x7773e50b, 0xe00212ca, 0x0a634e9e, 0x9d12b95f, 0x216cd7ed, 0xb61d202c, - 0xf04219b4, 0x6733ee75, 0xdb4d80c7, 0x4c3c7706, 0xa65d2b52, 0x312cdc93, 0x8d52b221, 0x1a2345e0, - 0x01ecc111, 0x969d36d0, 0x2ae35862, 0xbd92afa3, 0x57f3f3f7, 0xc0820436, 0x7cfc6a84, 0xeb8d9d45, - 0xadd2a4dd, 0x3aa3531c, 0x86dd3dae, 0x11acca6f, 0xfbcd963b, 0x6cbc61fa, 0xd0c20f48, 0x47b3f889, - 0x94f385ff, 0x0382723e, 0xbffc1c8c, 
0x288deb4d, 0xc2ecb719, 0x559d40d8, 0xe9e32e6a, 0x7e92d9ab, - 0x38cde033, 0xafbc17f2, 0x13c27940, 0x84b38e81, 0x6ed2d2d5, 0xf9a32514, 0x45dd4ba6, 0xd2acbc67, - 0xc9633896, 0x5e12cf57, 0xe26ca1e5, 0x751d5624, 0x9f7c0a70, 0x080dfdb1, 0xb4739303, 0x230264c2, - 0x655d5d5a, 0xf22caa9b, 0x4e52c429, 0xd92333e8, 0x33426fbc, 0xa433987d, 0x184df6cf, 0x8f3c010e, - 0x2fd2ff2d, 0xb8a308ec, 0x04dd665e, 0x93ac919f, 0x79cdcdcb, 0xeebc3a0a, 0x52c254b8, 0xc5b3a379, - 0x83ec9ae1, 0x149d6d20, 0xa8e30392, 0x3f92f453, 0xd5f3a807, 0x42825fc6, 0xfefc3174, 0x698dc6b5, - 0x72424244, 0xe533b585, 0x594ddb37, 0xce3c2cf6, 0x245d70a2, 0xb32c8763, 0x0f52e9d1, 0x98231e10, - 0xde7c2788, 0x490dd049, 0xf573befb, 0x6202493a, 0x8863156e, 0x1f12e2af, 0xa36c8c1d, 0x341d7bdc, - }, - { - 0x00000000, 0x3171d430, 0x62e3a860, 0x53927c50, 0xc5c750c0, 0xf4b684f0, 0xa724f8a0, 0x96552c90, - 0x8e62d771, 0xbf130341, 0xec817f11, 0xddf0ab21, 0x4ba587b1, 0x7ad45381, 0x29462fd1, 0x1837fbe1, - 0x1929d813, 0x28580c23, 0x7bca7073, 0x4abba443, 0xdcee88d3, 0xed9f5ce3, 0xbe0d20b3, 0x8f7cf483, - 0x974b0f62, 0xa63adb52, 0xf5a8a702, 0xc4d97332, 0x528c5fa2, 0x63fd8b92, 0x306ff7c2, 0x011e23f2, - 0x3253b026, 0x03226416, 0x50b01846, 0x61c1cc76, 0xf794e0e6, 0xc6e534d6, 0x95774886, 0xa4069cb6, - 0xbc316757, 0x8d40b367, 0xded2cf37, 0xefa31b07, 0x79f63797, 0x4887e3a7, 0x1b159ff7, 0x2a644bc7, - 0x2b7a6835, 0x1a0bbc05, 0x4999c055, 0x78e81465, 0xeebd38f5, 0xdfccecc5, 0x8c5e9095, 0xbd2f44a5, - 0xa518bf44, 0x94696b74, 0xc7fb1724, 0xf68ac314, 0x60dfef84, 0x51ae3bb4, 0x023c47e4, 0x334d93d4, - 0x64a7604c, 0x55d6b47c, 0x0644c82c, 0x37351c1c, 0xa160308c, 0x9011e4bc, 0xc38398ec, 0xf2f24cdc, - 0xeac5b73d, 0xdbb4630d, 0x88261f5d, 0xb957cb6d, 0x2f02e7fd, 0x1e7333cd, 0x4de14f9d, 0x7c909bad, - 0x7d8eb85f, 0x4cff6c6f, 0x1f6d103f, 0x2e1cc40f, 0xb849e89f, 0x89383caf, 0xdaaa40ff, 0xebdb94cf, - 0xf3ec6f2e, 0xc29dbb1e, 0x910fc74e, 0xa07e137e, 0x362b3fee, 0x075aebde, 0x54c8978e, 0x65b943be, - 0x56f4d06a, 0x6785045a, 0x3417780a, 0x0566ac3a, 0x933380aa, 
0xa242549a, 0xf1d028ca, 0xc0a1fcfa, - 0xd896071b, 0xe9e7d32b, 0xba75af7b, 0x8b047b4b, 0x1d5157db, 0x2c2083eb, 0x7fb2ffbb, 0x4ec32b8b, - 0x4fdd0879, 0x7eacdc49, 0x2d3ea019, 0x1c4f7429, 0x8a1a58b9, 0xbb6b8c89, 0xe8f9f0d9, 0xd98824e9, - 0xc1bfdf08, 0xf0ce0b38, 0xa35c7768, 0x922da358, 0x04788fc8, 0x35095bf8, 0x669b27a8, 0x57eaf398, - 0xc94ec098, 0xf83f14a8, 0xabad68f8, 0x9adcbcc8, 0x0c899058, 0x3df84468, 0x6e6a3838, 0x5f1bec08, - 0x472c17e9, 0x765dc3d9, 0x25cfbf89, 0x14be6bb9, 0x82eb4729, 0xb39a9319, 0xe008ef49, 0xd1793b79, - 0xd067188b, 0xe116ccbb, 0xb284b0eb, 0x83f564db, 0x15a0484b, 0x24d19c7b, 0x7743e02b, 0x4632341b, - 0x5e05cffa, 0x6f741bca, 0x3ce6679a, 0x0d97b3aa, 0x9bc29f3a, 0xaab34b0a, 0xf921375a, 0xc850e36a, - 0xfb1d70be, 0xca6ca48e, 0x99fed8de, 0xa88f0cee, 0x3eda207e, 0x0fabf44e, 0x5c39881e, 0x6d485c2e, - 0x757fa7cf, 0x440e73ff, 0x179c0faf, 0x26eddb9f, 0xb0b8f70f, 0x81c9233f, 0xd25b5f6f, 0xe32a8b5f, - 0xe234a8ad, 0xd3457c9d, 0x80d700cd, 0xb1a6d4fd, 0x27f3f86d, 0x16822c5d, 0x4510500d, 0x7461843d, - 0x6c567fdc, 0x5d27abec, 0x0eb5d7bc, 0x3fc4038c, 0xa9912f1c, 0x98e0fb2c, 0xcb72877c, 0xfa03534c, - 0xade9a0d4, 0x9c9874e4, 0xcf0a08b4, 0xfe7bdc84, 0x682ef014, 0x595f2424, 0x0acd5874, 0x3bbc8c44, - 0x238b77a5, 0x12faa395, 0x4168dfc5, 0x70190bf5, 0xe64c2765, 0xd73df355, 0x84af8f05, 0xb5de5b35, - 0xb4c078c7, 0x85b1acf7, 0xd623d0a7, 0xe7520497, 0x71072807, 0x4076fc37, 0x13e48067, 0x22955457, - 0x3aa2afb6, 0x0bd37b86, 0x584107d6, 0x6930d3e6, 0xff65ff76, 0xce142b46, 0x9d865716, 0xacf78326, - 0x9fba10f2, 0xaecbc4c2, 0xfd59b892, 0xcc286ca2, 0x5a7d4032, 0x6b0c9402, 0x389ee852, 0x09ef3c62, - 0x11d8c783, 0x20a913b3, 0x733b6fe3, 0x424abbd3, 0xd41f9743, 0xe56e4373, 0xb6fc3f23, 0x878deb13, - 0x8693c8e1, 0xb7e21cd1, 0xe4706081, 0xd501b4b1, 0x43549821, 0x72254c11, 0x21b73041, 0x10c6e471, - 0x08f11f90, 0x3980cba0, 0x6a12b7f0, 0x5b6363c0, 0xcd364f50, 0xfc479b60, 0xafd5e730, 0x9ea43300, - }, - { - 0x00000000, 0x30d23865, 0x61a470ca, 0x517648af, 0xc348e194, 0xf39ad9f1, 0xa2ec915e, 
0x923ea93b, - 0x837db5d9, 0xb3af8dbc, 0xe2d9c513, 0xd20bfd76, 0x4035544d, 0x70e76c28, 0x21912487, 0x11431ce2, - 0x03171d43, 0x33c52526, 0x62b36d89, 0x526155ec, 0xc05ffcd7, 0xf08dc4b2, 0xa1fb8c1d, 0x9129b478, - 0x806aa89a, 0xb0b890ff, 0xe1ced850, 0xd11ce035, 0x4322490e, 0x73f0716b, 0x228639c4, 0x125401a1, - 0x062e3a86, 0x36fc02e3, 0x678a4a4c, 0x57587229, 0xc566db12, 0xf5b4e377, 0xa4c2abd8, 0x941093bd, - 0x85538f5f, 0xb581b73a, 0xe4f7ff95, 0xd425c7f0, 0x461b6ecb, 0x76c956ae, 0x27bf1e01, 0x176d2664, - 0x053927c5, 0x35eb1fa0, 0x649d570f, 0x544f6f6a, 0xc671c651, 0xf6a3fe34, 0xa7d5b69b, 0x97078efe, - 0x8644921c, 0xb696aa79, 0xe7e0e2d6, 0xd732dab3, 0x450c7388, 0x75de4bed, 0x24a80342, 0x147a3b27, - 0x0c5c750c, 0x3c8e4d69, 0x6df805c6, 0x5d2a3da3, 0xcf149498, 0xffc6acfd, 0xaeb0e452, 0x9e62dc37, - 0x8f21c0d5, 0xbff3f8b0, 0xee85b01f, 0xde57887a, 0x4c692141, 0x7cbb1924, 0x2dcd518b, 0x1d1f69ee, - 0x0f4b684f, 0x3f99502a, 0x6eef1885, 0x5e3d20e0, 0xcc0389db, 0xfcd1b1be, 0xada7f911, 0x9d75c174, - 0x8c36dd96, 0xbce4e5f3, 0xed92ad5c, 0xdd409539, 0x4f7e3c02, 0x7fac0467, 0x2eda4cc8, 0x1e0874ad, - 0x0a724f8a, 0x3aa077ef, 0x6bd63f40, 0x5b040725, 0xc93aae1e, 0xf9e8967b, 0xa89eded4, 0x984ce6b1, - 0x890ffa53, 0xb9ddc236, 0xe8ab8a99, 0xd879b2fc, 0x4a471bc7, 0x7a9523a2, 0x2be36b0d, 0x1b315368, - 0x096552c9, 0x39b76aac, 0x68c12203, 0x58131a66, 0xca2db35d, 0xfaff8b38, 0xab89c397, 0x9b5bfbf2, - 0x8a18e710, 0xbacadf75, 0xebbc97da, 0xdb6eafbf, 0x49500684, 0x79823ee1, 0x28f4764e, 0x18264e2b, - 0x18b8ea18, 0x286ad27d, 0x791c9ad2, 0x49cea2b7, 0xdbf00b8c, 0xeb2233e9, 0xba547b46, 0x8a864323, - 0x9bc55fc1, 0xab1767a4, 0xfa612f0b, 0xcab3176e, 0x588dbe55, 0x685f8630, 0x3929ce9f, 0x09fbf6fa, - 0x1baff75b, 0x2b7dcf3e, 0x7a0b8791, 0x4ad9bff4, 0xd8e716cf, 0xe8352eaa, 0xb9436605, 0x89915e60, - 0x98d24282, 0xa8007ae7, 0xf9763248, 0xc9a40a2d, 0x5b9aa316, 0x6b489b73, 0x3a3ed3dc, 0x0aecebb9, - 0x1e96d09e, 0x2e44e8fb, 0x7f32a054, 0x4fe09831, 0xddde310a, 0xed0c096f, 0xbc7a41c0, 0x8ca879a5, - 0x9deb6547, 0xad395d22, 
0xfc4f158d, 0xcc9d2de8, 0x5ea384d3, 0x6e71bcb6, 0x3f07f419, 0x0fd5cc7c, - 0x1d81cddd, 0x2d53f5b8, 0x7c25bd17, 0x4cf78572, 0xdec92c49, 0xee1b142c, 0xbf6d5c83, 0x8fbf64e6, - 0x9efc7804, 0xae2e4061, 0xff5808ce, 0xcf8a30ab, 0x5db49990, 0x6d66a1f5, 0x3c10e95a, 0x0cc2d13f, - 0x14e49f14, 0x2436a771, 0x7540efde, 0x4592d7bb, 0xd7ac7e80, 0xe77e46e5, 0xb6080e4a, 0x86da362f, - 0x97992acd, 0xa74b12a8, 0xf63d5a07, 0xc6ef6262, 0x54d1cb59, 0x6403f33c, 0x3575bb93, 0x05a783f6, - 0x17f38257, 0x2721ba32, 0x7657f29d, 0x4685caf8, 0xd4bb63c3, 0xe4695ba6, 0xb51f1309, 0x85cd2b6c, - 0x948e378e, 0xa45c0feb, 0xf52a4744, 0xc5f87f21, 0x57c6d61a, 0x6714ee7f, 0x3662a6d0, 0x06b09eb5, - 0x12caa592, 0x22189df7, 0x736ed558, 0x43bced3d, 0xd1824406, 0xe1507c63, 0xb02634cc, 0x80f40ca9, - 0x91b7104b, 0xa165282e, 0xf0136081, 0xc0c158e4, 0x52fff1df, 0x622dc9ba, 0x335b8115, 0x0389b970, - 0x11ddb8d1, 0x210f80b4, 0x7079c81b, 0x40abf07e, 0xd2955945, 0xe2476120, 0xb331298f, 0x83e311ea, - 0x92a00d08, 0xa272356d, 0xf3047dc2, 0xc3d645a7, 0x51e8ec9c, 0x613ad4f9, 0x304c9c56, 0x009ea433, - }, - { - 0x00000000, 0x54075546, 0xa80eaa8c, 0xfc09ffca, 0x55f123e9, 0x01f676af, 0xfdff8965, 0xa9f8dc23, - 0xabe247d2, 0xffe51294, 0x03eced5e, 0x57ebb818, 0xfe13643b, 0xaa14317d, 0x561dceb7, 0x021a9bf1, - 0x5228f955, 0x062fac13, 0xfa2653d9, 0xae21069f, 0x07d9dabc, 0x53de8ffa, 0xafd77030, 0xfbd02576, - 0xf9cabe87, 0xadcdebc1, 0x51c4140b, 0x05c3414d, 0xac3b9d6e, 0xf83cc828, 0x043537e2, 0x503262a4, - 0xa451f2aa, 0xf056a7ec, 0x0c5f5826, 0x58580d60, 0xf1a0d143, 0xa5a78405, 0x59ae7bcf, 0x0da92e89, - 0x0fb3b578, 0x5bb4e03e, 0xa7bd1ff4, 0xf3ba4ab2, 0x5a429691, 0x0e45c3d7, 0xf24c3c1d, 0xa64b695b, - 0xf6790bff, 0xa27e5eb9, 0x5e77a173, 0x0a70f435, 0xa3882816, 0xf78f7d50, 0x0b86829a, 0x5f81d7dc, - 0x5d9b4c2d, 0x099c196b, 0xf595e6a1, 0xa192b3e7, 0x086a6fc4, 0x5c6d3a82, 0xa064c548, 0xf463900e, - 0x4d4f93a5, 0x1948c6e3, 0xe5413929, 0xb1466c6f, 0x18beb04c, 0x4cb9e50a, 0xb0b01ac0, 0xe4b74f86, - 0xe6add477, 0xb2aa8131, 0x4ea37efb, 0x1aa42bbd, 
0xb35cf79e, 0xe75ba2d8, 0x1b525d12, 0x4f550854, - 0x1f676af0, 0x4b603fb6, 0xb769c07c, 0xe36e953a, 0x4a964919, 0x1e911c5f, 0xe298e395, 0xb69fb6d3, - 0xb4852d22, 0xe0827864, 0x1c8b87ae, 0x488cd2e8, 0xe1740ecb, 0xb5735b8d, 0x497aa447, 0x1d7df101, - 0xe91e610f, 0xbd193449, 0x4110cb83, 0x15179ec5, 0xbcef42e6, 0xe8e817a0, 0x14e1e86a, 0x40e6bd2c, - 0x42fc26dd, 0x16fb739b, 0xeaf28c51, 0xbef5d917, 0x170d0534, 0x430a5072, 0xbf03afb8, 0xeb04fafe, - 0xbb36985a, 0xef31cd1c, 0x133832d6, 0x473f6790, 0xeec7bbb3, 0xbac0eef5, 0x46c9113f, 0x12ce4479, - 0x10d4df88, 0x44d38ace, 0xb8da7504, 0xecdd2042, 0x4525fc61, 0x1122a927, 0xed2b56ed, 0xb92c03ab, - 0x9a9f274a, 0xce98720c, 0x32918dc6, 0x6696d880, 0xcf6e04a3, 0x9b6951e5, 0x6760ae2f, 0x3367fb69, - 0x317d6098, 0x657a35de, 0x9973ca14, 0xcd749f52, 0x648c4371, 0x308b1637, 0xcc82e9fd, 0x9885bcbb, - 0xc8b7de1f, 0x9cb08b59, 0x60b97493, 0x34be21d5, 0x9d46fdf6, 0xc941a8b0, 0x3548577a, 0x614f023c, - 0x635599cd, 0x3752cc8b, 0xcb5b3341, 0x9f5c6607, 0x36a4ba24, 0x62a3ef62, 0x9eaa10a8, 0xcaad45ee, - 0x3eced5e0, 0x6ac980a6, 0x96c07f6c, 0xc2c72a2a, 0x6b3ff609, 0x3f38a34f, 0xc3315c85, 0x973609c3, - 0x952c9232, 0xc12bc774, 0x3d2238be, 0x69256df8, 0xc0ddb1db, 0x94dae49d, 0x68d31b57, 0x3cd44e11, - 0x6ce62cb5, 0x38e179f3, 0xc4e88639, 0x90efd37f, 0x39170f5c, 0x6d105a1a, 0x9119a5d0, 0xc51ef096, - 0xc7046b67, 0x93033e21, 0x6f0ac1eb, 0x3b0d94ad, 0x92f5488e, 0xc6f21dc8, 0x3afbe202, 0x6efcb744, - 0xd7d0b4ef, 0x83d7e1a9, 0x7fde1e63, 0x2bd94b25, 0x82219706, 0xd626c240, 0x2a2f3d8a, 0x7e2868cc, - 0x7c32f33d, 0x2835a67b, 0xd43c59b1, 0x803b0cf7, 0x29c3d0d4, 0x7dc48592, 0x81cd7a58, 0xd5ca2f1e, - 0x85f84dba, 0xd1ff18fc, 0x2df6e736, 0x79f1b270, 0xd0096e53, 0x840e3b15, 0x7807c4df, 0x2c009199, - 0x2e1a0a68, 0x7a1d5f2e, 0x8614a0e4, 0xd213f5a2, 0x7beb2981, 0x2fec7cc7, 0xd3e5830d, 0x87e2d64b, - 0x73814645, 0x27861303, 0xdb8fecc9, 0x8f88b98f, 0x267065ac, 0x727730ea, 0x8e7ecf20, 0xda799a66, - 0xd8630197, 0x8c6454d1, 0x706dab1b, 0x246afe5d, 0x8d92227e, 0xd9957738, 0x259c88f2, 
0x719bddb4, - 0x21a9bf10, 0x75aeea56, 0x89a7159c, 0xdda040da, 0x74589cf9, 0x205fc9bf, 0xdc563675, 0x88516333, - 0x8a4bf8c2, 0xde4cad84, 0x2245524e, 0x76420708, 0xdfbadb2b, 0x8bbd8e6d, 0x77b471a7, 0x23b324e1, - }, - { - 0x00000000, 0x678efd01, 0xcf1dfa02, 0xa8930703, 0x9bd782f5, 0xfc597ff4, 0x54ca78f7, 0x334485f6, - 0x3243731b, 0x55cd8e1a, 0xfd5e8919, 0x9ad07418, 0xa994f1ee, 0xce1a0cef, 0x66890bec, 0x0107f6ed, - 0x6486e636, 0x03081b37, 0xab9b1c34, 0xcc15e135, 0xff5164c3, 0x98df99c2, 0x304c9ec1, 0x57c263c0, - 0x56c5952d, 0x314b682c, 0x99d86f2f, 0xfe56922e, 0xcd1217d8, 0xaa9cead9, 0x020fedda, 0x658110db, - 0xc90dcc6c, 0xae83316d, 0x0610366e, 0x619ecb6f, 0x52da4e99, 0x3554b398, 0x9dc7b49b, 0xfa49499a, - 0xfb4ebf77, 0x9cc04276, 0x34534575, 0x53ddb874, 0x60993d82, 0x0717c083, 0xaf84c780, 0xc80a3a81, - 0xad8b2a5a, 0xca05d75b, 0x6296d058, 0x05182d59, 0x365ca8af, 0x51d255ae, 0xf94152ad, 0x9ecfafac, - 0x9fc85941, 0xf846a440, 0x50d5a343, 0x375b5e42, 0x041fdbb4, 0x639126b5, 0xcb0221b6, 0xac8cdcb7, - 0x97f7ee29, 0xf0791328, 0x58ea142b, 0x3f64e92a, 0x0c206cdc, 0x6bae91dd, 0xc33d96de, 0xa4b36bdf, - 0xa5b49d32, 0xc23a6033, 0x6aa96730, 0x0d279a31, 0x3e631fc7, 0x59ede2c6, 0xf17ee5c5, 0x96f018c4, - 0xf371081f, 0x94fff51e, 0x3c6cf21d, 0x5be20f1c, 0x68a68aea, 0x0f2877eb, 0xa7bb70e8, 0xc0358de9, - 0xc1327b04, 0xa6bc8605, 0x0e2f8106, 0x69a17c07, 0x5ae5f9f1, 0x3d6b04f0, 0x95f803f3, 0xf276fef2, - 0x5efa2245, 0x3974df44, 0x91e7d847, 0xf6692546, 0xc52da0b0, 0xa2a35db1, 0x0a305ab2, 0x6dbea7b3, - 0x6cb9515e, 0x0b37ac5f, 0xa3a4ab5c, 0xc42a565d, 0xf76ed3ab, 0x90e02eaa, 0x387329a9, 0x5ffdd4a8, - 0x3a7cc473, 0x5df23972, 0xf5613e71, 0x92efc370, 0xa1ab4686, 0xc625bb87, 0x6eb6bc84, 0x09384185, - 0x083fb768, 0x6fb14a69, 0xc7224d6a, 0xa0acb06b, 0x93e8359d, 0xf466c89c, 0x5cf5cf9f, 0x3b7b329e, - 0x2a03aaa3, 0x4d8d57a2, 0xe51e50a1, 0x8290ada0, 0xb1d42856, 0xd65ad557, 0x7ec9d254, 0x19472f55, - 0x1840d9b8, 0x7fce24b9, 0xd75d23ba, 0xb0d3debb, 0x83975b4d, 0xe419a64c, 0x4c8aa14f, 0x2b045c4e, - 0x4e854c95, 
0x290bb194, 0x8198b697, 0xe6164b96, 0xd552ce60, 0xb2dc3361, 0x1a4f3462, 0x7dc1c963, - 0x7cc63f8e, 0x1b48c28f, 0xb3dbc58c, 0xd455388d, 0xe711bd7b, 0x809f407a, 0x280c4779, 0x4f82ba78, - 0xe30e66cf, 0x84809bce, 0x2c139ccd, 0x4b9d61cc, 0x78d9e43a, 0x1f57193b, 0xb7c41e38, 0xd04ae339, - 0xd14d15d4, 0xb6c3e8d5, 0x1e50efd6, 0x79de12d7, 0x4a9a9721, 0x2d146a20, 0x85876d23, 0xe2099022, - 0x878880f9, 0xe0067df8, 0x48957afb, 0x2f1b87fa, 0x1c5f020c, 0x7bd1ff0d, 0xd342f80e, 0xb4cc050f, - 0xb5cbf3e2, 0xd2450ee3, 0x7ad609e0, 0x1d58f4e1, 0x2e1c7117, 0x49928c16, 0xe1018b15, 0x868f7614, - 0xbdf4448a, 0xda7ab98b, 0x72e9be88, 0x15674389, 0x2623c67f, 0x41ad3b7e, 0xe93e3c7d, 0x8eb0c17c, - 0x8fb73791, 0xe839ca90, 0x40aacd93, 0x27243092, 0x1460b564, 0x73ee4865, 0xdb7d4f66, 0xbcf3b267, - 0xd972a2bc, 0xbefc5fbd, 0x166f58be, 0x71e1a5bf, 0x42a52049, 0x252bdd48, 0x8db8da4b, 0xea36274a, - 0xeb31d1a7, 0x8cbf2ca6, 0x242c2ba5, 0x43a2d6a4, 0x70e65352, 0x1768ae53, 0xbffba950, 0xd8755451, - 0x74f988e6, 0x137775e7, 0xbbe472e4, 0xdc6a8fe5, 0xef2e0a13, 0x88a0f712, 0x2033f011, 0x47bd0d10, - 0x46bafbfd, 0x213406fc, 0x89a701ff, 0xee29fcfe, 0xdd6d7908, 0xbae38409, 0x1270830a, 0x75fe7e0b, - 0x107f6ed0, 0x77f193d1, 0xdf6294d2, 0xb8ec69d3, 0x8ba8ec25, 0xec261124, 0x44b51627, 0x233beb26, - 0x223c1dcb, 0x45b2e0ca, 0xed21e7c9, 0x8aaf1ac8, 0xb9eb9f3e, 0xde65623f, 0x76f6653c, 0x1178983d, - }, - { - 0x00000000, 0xf20c0dfe, 0xe1f46d0d, 0x13f860f3, 0xc604aceb, 0x3408a115, 0x27f0c1e6, 0xd5fccc18, - 0x89e52f27, 0x7be922d9, 0x6811422a, 0x9a1d4fd4, 0x4fe183cc, 0xbded8e32, 0xae15eec1, 0x5c19e33f, - 0x162628bf, 0xe42a2541, 0xf7d245b2, 0x05de484c, 0xd0228454, 0x222e89aa, 0x31d6e959, 0xc3dae4a7, - 0x9fc30798, 0x6dcf0a66, 0x7e376a95, 0x8c3b676b, 0x59c7ab73, 0xabcba68d, 0xb833c67e, 0x4a3fcb80, - 0x2c4c517e, 0xde405c80, 0xcdb83c73, 0x3fb4318d, 0xea48fd95, 0x1844f06b, 0x0bbc9098, 0xf9b09d66, - 0xa5a97e59, 0x57a573a7, 0x445d1354, 0xb6511eaa, 0x63add2b2, 0x91a1df4c, 0x8259bfbf, 0x7055b241, - 0x3a6a79c1, 0xc866743f, 0xdb9e14cc, 
0x29921932, 0xfc6ed52a, 0x0e62d8d4, 0x1d9ab827, 0xef96b5d9, - 0xb38f56e6, 0x41835b18, 0x527b3beb, 0xa0773615, 0x758bfa0d, 0x8787f7f3, 0x947f9700, 0x66739afe, - 0x5898a2fc, 0xaa94af02, 0xb96ccff1, 0x4b60c20f, 0x9e9c0e17, 0x6c9003e9, 0x7f68631a, 0x8d646ee4, - 0xd17d8ddb, 0x23718025, 0x3089e0d6, 0xc285ed28, 0x17792130, 0xe5752cce, 0xf68d4c3d, 0x048141c3, - 0x4ebe8a43, 0xbcb287bd, 0xaf4ae74e, 0x5d46eab0, 0x88ba26a8, 0x7ab62b56, 0x694e4ba5, 0x9b42465b, - 0xc75ba564, 0x3557a89a, 0x26afc869, 0xd4a3c597, 0x015f098f, 0xf3530471, 0xe0ab6482, 0x12a7697c, - 0x74d4f382, 0x86d8fe7c, 0x95209e8f, 0x672c9371, 0xb2d05f69, 0x40dc5297, 0x53243264, 0xa1283f9a, - 0xfd31dca5, 0x0f3dd15b, 0x1cc5b1a8, 0xeec9bc56, 0x3b35704e, 0xc9397db0, 0xdac11d43, 0x28cd10bd, - 0x62f2db3d, 0x90fed6c3, 0x8306b630, 0x710abbce, 0xa4f677d6, 0x56fa7a28, 0x45021adb, 0xb70e1725, - 0xeb17f41a, 0x191bf9e4, 0x0ae39917, 0xf8ef94e9, 0x2d1358f1, 0xdf1f550f, 0xcce735fc, 0x3eeb3802, - 0xb13145f8, 0x433d4806, 0x50c528f5, 0xa2c9250b, 0x7735e913, 0x8539e4ed, 0x96c1841e, 0x64cd89e0, - 0x38d46adf, 0xcad86721, 0xd92007d2, 0x2b2c0a2c, 0xfed0c634, 0x0cdccbca, 0x1f24ab39, 0xed28a6c7, - 0xa7176d47, 0x551b60b9, 0x46e3004a, 0xb4ef0db4, 0x6113c1ac, 0x931fcc52, 0x80e7aca1, 0x72eba15f, - 0x2ef24260, 0xdcfe4f9e, 0xcf062f6d, 0x3d0a2293, 0xe8f6ee8b, 0x1afae375, 0x09028386, 0xfb0e8e78, - 0x9d7d1486, 0x6f711978, 0x7c89798b, 0x8e857475, 0x5b79b86d, 0xa975b593, 0xba8dd560, 0x4881d89e, - 0x14983ba1, 0xe694365f, 0xf56c56ac, 0x07605b52, 0xd29c974a, 0x20909ab4, 0x3368fa47, 0xc164f7b9, - 0x8b5b3c39, 0x795731c7, 0x6aaf5134, 0x98a35cca, 0x4d5f90d2, 0xbf539d2c, 0xacabfddf, 0x5ea7f021, - 0x02be131e, 0xf0b21ee0, 0xe34a7e13, 0x114673ed, 0xc4babff5, 0x36b6b20b, 0x254ed2f8, 0xd742df06, - 0xe9a9e704, 0x1ba5eafa, 0x085d8a09, 0xfa5187f7, 0x2fad4bef, 0xdda14611, 0xce5926e2, 0x3c552b1c, - 0x604cc823, 0x9240c5dd, 0x81b8a52e, 0x73b4a8d0, 0xa64864c8, 0x54446936, 0x47bc09c5, 0xb5b0043b, - 0xff8fcfbb, 0x0d83c245, 0x1e7ba2b6, 0xec77af48, 0x398b6350, 0xcb876eae, 
0xd87f0e5d, 0x2a7303a3, - 0x766ae09c, 0x8466ed62, 0x979e8d91, 0x6592806f, 0xb06e4c77, 0x42624189, 0x519a217a, 0xa3962c84, - 0xc5e5b67a, 0x37e9bb84, 0x2411db77, 0xd61dd689, 0x03e11a91, 0xf1ed176f, 0xe215779c, 0x10197a62, - 0x4c00995d, 0xbe0c94a3, 0xadf4f450, 0x5ff8f9ae, 0x8a0435b6, 0x78083848, 0x6bf058bb, 0x99fc5545, - 0xd3c39ec5, 0x21cf933b, 0x3237f3c8, 0xc03bfe36, 0x15c7322e, 0xe7cb3fd0, 0xf4335f23, 0x063f52dd, - 0x5a26b1e2, 0xa82abc1c, 0xbbd2dcef, 0x49ded111, 0x9c221d09, 0x6e2e10f7, 0x7dd67004, 0x8fda7dfa, - }, + { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 
0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351, + }, + { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 
0x859402d7, 0x96369aa0, 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, 0x68134a1e, 
0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483, + }, + { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, 0x1148678c, 0xb409f5f2, 0x5e273581, 
0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8, + }, + { + 0x00000000, 
0xdd45aab8, 0xbf672381, 0x62228939, 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, 
0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842, + }, + { + 0x00000000, 0x38116fac, 0x7022df58, 0x4833b0f4, 0xe045beb0, 0xd854d11c, 0x906761e8, 0xa8760e44, + 0xc5670b91, 0xfd76643d, 0xb545d4c9, 0x8d54bb65, 0x2522b521, 0x1d33da8d, 0x55006a79, 0x6d1105d5, + 0x8f2261d3, 0xb7330e7f, 0xff00be8b, 0xc711d127, 0x6f67df63, 0x5776b0cf, 0x1f45003b, 0x27546f97, + 0x4a456a42, 0x725405ee, 0x3a67b51a, 0x0276dab6, 0xaa00d4f2, 0x9211bb5e, 0xda220baa, 0xe2336406, + 0x1ba8b557, 0x23b9dafb, 0x6b8a6a0f, 0x539b05a3, 0xfbed0be7, 0xc3fc644b, 0x8bcfd4bf, 0xb3debb13, + 0xdecfbec6, 0xe6ded16a, 0xaeed619e, 0x96fc0e32, 0x3e8a0076, 0x069b6fda, 0x4ea8df2e, 0x76b9b082, + 0x948ad484, 0xac9bbb28, 0xe4a80bdc, 0xdcb96470, 0x74cf6a34, 0x4cde0598, 0x04edb56c, 0x3cfcdac0, + 0x51eddf15, 0x69fcb0b9, 0x21cf004d, 0x19de6fe1, 0xb1a861a5, 0x89b90e09, 0xc18abefd, 0xf99bd151, + 0x37516aae, 0x0f400502, 0x4773b5f6, 0x7f62da5a, 0xd714d41e, 0xef05bbb2, 
0xa7360b46, 0x9f2764ea, + 0xf236613f, 0xca270e93, 0x8214be67, 0xba05d1cb, 0x1273df8f, 0x2a62b023, 0x625100d7, 0x5a406f7b, + 0xb8730b7d, 0x806264d1, 0xc851d425, 0xf040bb89, 0x5836b5cd, 0x6027da61, 0x28146a95, 0x10050539, + 0x7d1400ec, 0x45056f40, 0x0d36dfb4, 0x3527b018, 0x9d51be5c, 0xa540d1f0, 0xed736104, 0xd5620ea8, + 0x2cf9dff9, 0x14e8b055, 0x5cdb00a1, 0x64ca6f0d, 0xccbc6149, 0xf4ad0ee5, 0xbc9ebe11, 0x848fd1bd, + 0xe99ed468, 0xd18fbbc4, 0x99bc0b30, 0xa1ad649c, 0x09db6ad8, 0x31ca0574, 0x79f9b580, 0x41e8da2c, + 0xa3dbbe2a, 0x9bcad186, 0xd3f96172, 0xebe80ede, 0x439e009a, 0x7b8f6f36, 0x33bcdfc2, 0x0badb06e, + 0x66bcb5bb, 0x5eadda17, 0x169e6ae3, 0x2e8f054f, 0x86f90b0b, 0xbee864a7, 0xf6dbd453, 0xcecabbff, + 0x6ea2d55c, 0x56b3baf0, 0x1e800a04, 0x269165a8, 0x8ee76bec, 0xb6f60440, 0xfec5b4b4, 0xc6d4db18, + 0xabc5decd, 0x93d4b161, 0xdbe70195, 0xe3f66e39, 0x4b80607d, 0x73910fd1, 0x3ba2bf25, 0x03b3d089, + 0xe180b48f, 0xd991db23, 0x91a26bd7, 0xa9b3047b, 0x01c50a3f, 0x39d46593, 0x71e7d567, 0x49f6bacb, + 0x24e7bf1e, 0x1cf6d0b2, 0x54c56046, 0x6cd40fea, 0xc4a201ae, 0xfcb36e02, 0xb480def6, 0x8c91b15a, + 0x750a600b, 0x4d1b0fa7, 0x0528bf53, 0x3d39d0ff, 0x954fdebb, 0xad5eb117, 0xe56d01e3, 0xdd7c6e4f, + 0xb06d6b9a, 0x887c0436, 0xc04fb4c2, 0xf85edb6e, 0x5028d52a, 0x6839ba86, 0x200a0a72, 0x181b65de, + 0xfa2801d8, 0xc2396e74, 0x8a0ade80, 0xb21bb12c, 0x1a6dbf68, 0x227cd0c4, 0x6a4f6030, 0x525e0f9c, + 0x3f4f0a49, 0x075e65e5, 0x4f6dd511, 0x777cbabd, 0xdf0ab4f9, 0xe71bdb55, 0xaf286ba1, 0x9739040d, + 0x59f3bff2, 0x61e2d05e, 0x29d160aa, 0x11c00f06, 0xb9b60142, 0x81a76eee, 0xc994de1a, 0xf185b1b6, + 0x9c94b463, 0xa485dbcf, 0xecb66b3b, 0xd4a70497, 0x7cd10ad3, 0x44c0657f, 0x0cf3d58b, 0x34e2ba27, + 0xd6d1de21, 0xeec0b18d, 0xa6f30179, 0x9ee26ed5, 0x36946091, 0x0e850f3d, 0x46b6bfc9, 0x7ea7d065, + 0x13b6d5b0, 0x2ba7ba1c, 0x63940ae8, 0x5b856544, 0xf3f36b00, 0xcbe204ac, 0x83d1b458, 0xbbc0dbf4, + 0x425b0aa5, 0x7a4a6509, 0x3279d5fd, 0x0a68ba51, 0xa21eb415, 0x9a0fdbb9, 0xd23c6b4d, 0xea2d04e1, + 0x873c0134, 
0xbf2d6e98, 0xf71ede6c, 0xcf0fb1c0, 0x6779bf84, 0x5f68d028, 0x175b60dc, 0x2f4a0f70, + 0xcd796b76, 0xf56804da, 0xbd5bb42e, 0x854adb82, 0x2d3cd5c6, 0x152dba6a, 0x5d1e0a9e, 0x650f6532, + 0x081e60e7, 0x300f0f4b, 0x783cbfbf, 0x402dd013, 0xe85bde57, 0xd04ab1fb, 0x9879010f, 0xa0686ea3, + }, + { + 0x00000000, 0xef306b19, 0xdb8ca0c3, 0x34bccbda, 0xb2f53777, 0x5dc55c6e, 0x697997b4, 0x8649fcad, + 0x6006181f, 0x8f367306, 0xbb8ab8dc, 0x54bad3c5, 0xd2f32f68, 0x3dc34471, 0x097f8fab, 0xe64fe4b2, + 0xc00c303e, 0x2f3c5b27, 0x1b8090fd, 0xf4b0fbe4, 0x72f90749, 0x9dc96c50, 0xa975a78a, 0x4645cc93, + 0xa00a2821, 0x4f3a4338, 0x7b8688e2, 0x94b6e3fb, 0x12ff1f56, 0xfdcf744f, 0xc973bf95, 0x2643d48c, + 0x85f4168d, 0x6ac47d94, 0x5e78b64e, 0xb148dd57, 0x370121fa, 0xd8314ae3, 0xec8d8139, 0x03bdea20, + 0xe5f20e92, 0x0ac2658b, 0x3e7eae51, 0xd14ec548, 0x570739e5, 0xb83752fc, 0x8c8b9926, 0x63bbf23f, + 0x45f826b3, 0xaac84daa, 0x9e748670, 0x7144ed69, 0xf70d11c4, 0x183d7add, 0x2c81b107, 0xc3b1da1e, + 0x25fe3eac, 0xcace55b5, 0xfe729e6f, 0x1142f576, 0x970b09db, 0x783b62c2, 0x4c87a918, 0xa3b7c201, + 0x0e045beb, 0xe13430f2, 0xd588fb28, 0x3ab89031, 0xbcf16c9c, 0x53c10785, 0x677dcc5f, 0x884da746, + 0x6e0243f4, 0x813228ed, 0xb58ee337, 0x5abe882e, 0xdcf77483, 0x33c71f9a, 0x077bd440, 0xe84bbf59, + 0xce086bd5, 0x213800cc, 0x1584cb16, 0xfab4a00f, 0x7cfd5ca2, 0x93cd37bb, 0xa771fc61, 0x48419778, + 0xae0e73ca, 0x413e18d3, 0x7582d309, 0x9ab2b810, 0x1cfb44bd, 0xf3cb2fa4, 0xc777e47e, 0x28478f67, + 0x8bf04d66, 0x64c0267f, 0x507ceda5, 0xbf4c86bc, 0x39057a11, 0xd6351108, 0xe289dad2, 0x0db9b1cb, + 0xebf65579, 0x04c63e60, 0x307af5ba, 0xdf4a9ea3, 0x5903620e, 0xb6330917, 0x828fc2cd, 0x6dbfa9d4, + 0x4bfc7d58, 0xa4cc1641, 0x9070dd9b, 0x7f40b682, 0xf9094a2f, 0x16392136, 0x2285eaec, 0xcdb581f5, + 0x2bfa6547, 0xc4ca0e5e, 0xf076c584, 0x1f46ae9d, 0x990f5230, 0x763f3929, 0x4283f2f3, 0xadb399ea, + 0x1c08b7d6, 0xf338dccf, 0xc7841715, 0x28b47c0c, 0xaefd80a1, 0x41cdebb8, 0x75712062, 0x9a414b7b, + 0x7c0eafc9, 0x933ec4d0, 0xa7820f0a, 
0x48b26413, 0xcefb98be, 0x21cbf3a7, 0x1577387d, 0xfa475364, + 0xdc0487e8, 0x3334ecf1, 0x0788272b, 0xe8b84c32, 0x6ef1b09f, 0x81c1db86, 0xb57d105c, 0x5a4d7b45, + 0xbc029ff7, 0x5332f4ee, 0x678e3f34, 0x88be542d, 0x0ef7a880, 0xe1c7c399, 0xd57b0843, 0x3a4b635a, + 0x99fca15b, 0x76ccca42, 0x42700198, 0xad406a81, 0x2b09962c, 0xc439fd35, 0xf08536ef, 0x1fb55df6, + 0xf9fab944, 0x16cad25d, 0x22761987, 0xcd46729e, 0x4b0f8e33, 0xa43fe52a, 0x90832ef0, 0x7fb345e9, + 0x59f09165, 0xb6c0fa7c, 0x827c31a6, 0x6d4c5abf, 0xeb05a612, 0x0435cd0b, 0x308906d1, 0xdfb96dc8, + 0x39f6897a, 0xd6c6e263, 0xe27a29b9, 0x0d4a42a0, 0x8b03be0d, 0x6433d514, 0x508f1ece, 0xbfbf75d7, + 0x120cec3d, 0xfd3c8724, 0xc9804cfe, 0x26b027e7, 0xa0f9db4a, 0x4fc9b053, 0x7b757b89, 0x94451090, + 0x720af422, 0x9d3a9f3b, 0xa98654e1, 0x46b63ff8, 0xc0ffc355, 0x2fcfa84c, 0x1b736396, 0xf443088f, + 0xd200dc03, 0x3d30b71a, 0x098c7cc0, 0xe6bc17d9, 0x60f5eb74, 0x8fc5806d, 0xbb794bb7, 0x544920ae, + 0xb206c41c, 0x5d36af05, 0x698a64df, 0x86ba0fc6, 0x00f3f36b, 0xefc39872, 0xdb7f53a8, 0x344f38b1, + 0x97f8fab0, 0x78c891a9, 0x4c745a73, 0xa344316a, 0x250dcdc7, 0xca3da6de, 0xfe816d04, 0x11b1061d, + 0xf7fee2af, 0x18ce89b6, 0x2c72426c, 0xc3422975, 0x450bd5d8, 0xaa3bbec1, 0x9e87751b, 0x71b71e02, + 0x57f4ca8e, 0xb8c4a197, 0x8c786a4d, 0x63480154, 0xe501fdf9, 0x0a3196e0, 0x3e8d5d3a, 0xd1bd3623, + 0x37f2d291, 0xd8c2b988, 0xec7e7252, 0x034e194b, 0x8507e5e6, 0x6a378eff, 0x5e8b4525, 0xb1bb2e3c, + }, + { + 0x00000000, 0x68032cc8, 0xd0065990, 0xb8057558, 0xa5e0c5d1, 0xcde3e919, 0x75e69c41, 0x1de5b089, + 0x4e2dfd53, 0x262ed19b, 0x9e2ba4c3, 0xf628880b, 0xebcd3882, 0x83ce144a, 0x3bcb6112, 0x53c84dda, + 0x9c5bfaa6, 0xf458d66e, 0x4c5da336, 0x245e8ffe, 0x39bb3f77, 0x51b813bf, 0xe9bd66e7, 0x81be4a2f, + 0xd27607f5, 0xba752b3d, 0x02705e65, 0x6a7372ad, 0x7796c224, 0x1f95eeec, 0xa7909bb4, 0xcf93b77c, + 0x3d5b83bd, 0x5558af75, 0xed5dda2d, 0x855ef6e5, 0x98bb466c, 0xf0b86aa4, 0x48bd1ffc, 0x20be3334, + 0x73767eee, 0x1b755226, 0xa370277e, 0xcb730bb6, 0xd696bb3f, 
0xbe9597f7, 0x0690e2af, 0x6e93ce67, + 0xa100791b, 0xc90355d3, 0x7106208b, 0x19050c43, 0x04e0bcca, 0x6ce39002, 0xd4e6e55a, 0xbce5c992, + 0xef2d8448, 0x872ea880, 0x3f2bddd8, 0x5728f110, 0x4acd4199, 0x22ce6d51, 0x9acb1809, 0xf2c834c1, + 0x7ab7077a, 0x12b42bb2, 0xaab15eea, 0xc2b27222, 0xdf57c2ab, 0xb754ee63, 0x0f519b3b, 0x6752b7f3, + 0x349afa29, 0x5c99d6e1, 0xe49ca3b9, 0x8c9f8f71, 0x917a3ff8, 0xf9791330, 0x417c6668, 0x297f4aa0, + 0xe6ecfddc, 0x8eefd114, 0x36eaa44c, 0x5ee98884, 0x430c380d, 0x2b0f14c5, 0x930a619d, 0xfb094d55, + 0xa8c1008f, 0xc0c22c47, 0x78c7591f, 0x10c475d7, 0x0d21c55e, 0x6522e996, 0xdd279cce, 0xb524b006, + 0x47ec84c7, 0x2fefa80f, 0x97eadd57, 0xffe9f19f, 0xe20c4116, 0x8a0f6dde, 0x320a1886, 0x5a09344e, + 0x09c17994, 0x61c2555c, 0xd9c72004, 0xb1c40ccc, 0xac21bc45, 0xc422908d, 0x7c27e5d5, 0x1424c91d, + 0xdbb77e61, 0xb3b452a9, 0x0bb127f1, 0x63b20b39, 0x7e57bbb0, 0x16549778, 0xae51e220, 0xc652cee8, + 0x959a8332, 0xfd99affa, 0x459cdaa2, 0x2d9ff66a, 0x307a46e3, 0x58796a2b, 0xe07c1f73, 0x887f33bb, + 0xf56e0ef4, 0x9d6d223c, 0x25685764, 0x4d6b7bac, 0x508ecb25, 0x388de7ed, 0x808892b5, 0xe88bbe7d, + 0xbb43f3a7, 0xd340df6f, 0x6b45aa37, 0x034686ff, 0x1ea33676, 0x76a01abe, 0xcea56fe6, 0xa6a6432e, + 0x6935f452, 0x0136d89a, 0xb933adc2, 0xd130810a, 0xccd53183, 0xa4d61d4b, 0x1cd36813, 0x74d044db, + 0x27180901, 0x4f1b25c9, 0xf71e5091, 0x9f1d7c59, 0x82f8ccd0, 0xeafbe018, 0x52fe9540, 0x3afdb988, + 0xc8358d49, 0xa036a181, 0x1833d4d9, 0x7030f811, 0x6dd54898, 0x05d66450, 0xbdd31108, 0xd5d03dc0, + 0x8618701a, 0xee1b5cd2, 0x561e298a, 0x3e1d0542, 0x23f8b5cb, 0x4bfb9903, 0xf3feec5b, 0x9bfdc093, + 0x546e77ef, 0x3c6d5b27, 0x84682e7f, 0xec6b02b7, 0xf18eb23e, 0x998d9ef6, 0x2188ebae, 0x498bc766, + 0x1a438abc, 0x7240a674, 0xca45d32c, 0xa246ffe4, 0xbfa34f6d, 0xd7a063a5, 0x6fa516fd, 0x07a63a35, + 0x8fd9098e, 0xe7da2546, 0x5fdf501e, 0x37dc7cd6, 0x2a39cc5f, 0x423ae097, 0xfa3f95cf, 0x923cb907, + 0xc1f4f4dd, 0xa9f7d815, 0x11f2ad4d, 0x79f18185, 0x6414310c, 0x0c171dc4, 0xb412689c, 0xdc114454, + 
0x1382f328, 0x7b81dfe0, 0xc384aab8, 0xab878670, 0xb66236f9, 0xde611a31, 0x66646f69, 0x0e6743a1, + 0x5daf0e7b, 0x35ac22b3, 0x8da957eb, 0xe5aa7b23, 0xf84fcbaa, 0x904ce762, 0x2849923a, 0x404abef2, + 0xb2828a33, 0xda81a6fb, 0x6284d3a3, 0x0a87ff6b, 0x17624fe2, 0x7f61632a, 0xc7641672, 0xaf673aba, + 0xfcaf7760, 0x94ac5ba8, 0x2ca92ef0, 0x44aa0238, 0x594fb2b1, 0x314c9e79, 0x8949eb21, 0xe14ac7e9, + 0x2ed97095, 0x46da5c5d, 0xfedf2905, 0x96dc05cd, 0x8b39b544, 0xe33a998c, 0x5b3fecd4, 0x333cc01c, + 0x60f48dc6, 0x08f7a10e, 0xb0f2d456, 0xd8f1f89e, 0xc5144817, 0xad1764df, 0x15121187, 0x7d113d4f, + }, + { + 0x00000000, 0x493c7d27, 0x9278fa4e, 0xdb448769, 0x211d826d, 0x6821ff4a, 0xb3657823, 0xfa590504, + 0x423b04da, 0x0b0779fd, 0xd043fe94, 0x997f83b3, 0x632686b7, 0x2a1afb90, 0xf15e7cf9, 0xb86201de, + 0x847609b4, 0xcd4a7493, 0x160ef3fa, 0x5f328edd, 0xa56b8bd9, 0xec57f6fe, 0x37137197, 0x7e2f0cb0, + 0xc64d0d6e, 0x8f717049, 0x5435f720, 0x1d098a07, 0xe7508f03, 0xae6cf224, 0x7528754d, 0x3c14086a, + 0x0d006599, 0x443c18be, 0x9f789fd7, 0xd644e2f0, 0x2c1de7f4, 0x65219ad3, 0xbe651dba, 0xf759609d, + 0x4f3b6143, 0x06071c64, 0xdd439b0d, 0x947fe62a, 0x6e26e32e, 0x271a9e09, 0xfc5e1960, 0xb5626447, + 0x89766c2d, 0xc04a110a, 0x1b0e9663, 0x5232eb44, 0xa86bee40, 0xe1579367, 0x3a13140e, 0x732f6929, + 0xcb4d68f7, 0x827115d0, 0x593592b9, 0x1009ef9e, 0xea50ea9a, 0xa36c97bd, 0x782810d4, 0x31146df3, + 0x1a00cb32, 0x533cb615, 0x8878317c, 0xc1444c5b, 0x3b1d495f, 0x72213478, 0xa965b311, 0xe059ce36, + 0x583bcfe8, 0x1107b2cf, 0xca4335a6, 0x837f4881, 0x79264d85, 0x301a30a2, 0xeb5eb7cb, 0xa262caec, + 0x9e76c286, 0xd74abfa1, 0x0c0e38c8, 0x453245ef, 0xbf6b40eb, 0xf6573dcc, 0x2d13baa5, 0x642fc782, + 0xdc4dc65c, 0x9571bb7b, 0x4e353c12, 0x07094135, 0xfd504431, 0xb46c3916, 0x6f28be7f, 0x2614c358, + 0x1700aeab, 0x5e3cd38c, 0x857854e5, 0xcc4429c2, 0x361d2cc6, 0x7f2151e1, 0xa465d688, 0xed59abaf, + 0x553baa71, 0x1c07d756, 0xc743503f, 0x8e7f2d18, 0x7426281c, 0x3d1a553b, 0xe65ed252, 0xaf62af75, + 0x9376a71f, 0xda4ada38, 
0x010e5d51, 0x48322076, 0xb26b2572, 0xfb575855, 0x2013df3c, 0x692fa21b, + 0xd14da3c5, 0x9871dee2, 0x4335598b, 0x0a0924ac, 0xf05021a8, 0xb96c5c8f, 0x6228dbe6, 0x2b14a6c1, + 0x34019664, 0x7d3deb43, 0xa6796c2a, 0xef45110d, 0x151c1409, 0x5c20692e, 0x8764ee47, 0xce589360, + 0x763a92be, 0x3f06ef99, 0xe44268f0, 0xad7e15d7, 0x572710d3, 0x1e1b6df4, 0xc55fea9d, 0x8c6397ba, + 0xb0779fd0, 0xf94be2f7, 0x220f659e, 0x6b3318b9, 0x916a1dbd, 0xd856609a, 0x0312e7f3, 0x4a2e9ad4, + 0xf24c9b0a, 0xbb70e62d, 0x60346144, 0x29081c63, 0xd3511967, 0x9a6d6440, 0x4129e329, 0x08159e0e, + 0x3901f3fd, 0x703d8eda, 0xab7909b3, 0xe2457494, 0x181c7190, 0x51200cb7, 0x8a648bde, 0xc358f6f9, + 0x7b3af727, 0x32068a00, 0xe9420d69, 0xa07e704e, 0x5a27754a, 0x131b086d, 0xc85f8f04, 0x8163f223, + 0xbd77fa49, 0xf44b876e, 0x2f0f0007, 0x66337d20, 0x9c6a7824, 0xd5560503, 0x0e12826a, 0x472eff4d, + 0xff4cfe93, 0xb67083b4, 0x6d3404dd, 0x240879fa, 0xde517cfe, 0x976d01d9, 0x4c2986b0, 0x0515fb97, + 0x2e015d56, 0x673d2071, 0xbc79a718, 0xf545da3f, 0x0f1cdf3b, 0x4620a21c, 0x9d642575, 0xd4585852, + 0x6c3a598c, 0x250624ab, 0xfe42a3c2, 0xb77edee5, 0x4d27dbe1, 0x041ba6c6, 0xdf5f21af, 0x96635c88, + 0xaa7754e2, 0xe34b29c5, 0x380faeac, 0x7133d38b, 0x8b6ad68f, 0xc256aba8, 0x19122cc1, 0x502e51e6, + 0xe84c5038, 0xa1702d1f, 0x7a34aa76, 0x3308d751, 0xc951d255, 0x806daf72, 0x5b29281b, 0x1215553c, + 0x230138cf, 0x6a3d45e8, 0xb179c281, 0xf845bfa6, 0x021cbaa2, 0x4b20c785, 0x906440ec, 0xd9583dcb, + 0x613a3c15, 0x28064132, 0xf342c65b, 0xba7ebb7c, 0x4027be78, 0x091bc35f, 0xd25f4436, 0x9b633911, + 0xa777317b, 0xee4b4c5c, 0x350fcb35, 0x7c33b612, 0x866ab316, 0xcf56ce31, 0x14124958, 0x5d2e347f, + 0xe54c35a1, 0xac704886, 0x7734cfef, 0x3e08b2c8, 0xc451b7cc, 0x8d6dcaeb, 0x56294d82, 0x1f1530a5, + }, + { + 0x00000000, 0xf43ed648, 0xed91da61, 0x19af0c29, 0xdecfc233, 0x2af1147b, 0x335e1852, 0xc760ce1a, + 0xb873f297, 0x4c4d24df, 0x55e228f6, 0xa1dcfebe, 0x66bc30a4, 0x9282e6ec, 0x8b2deac5, 0x7f133c8d, + 0x750b93df, 0x81354597, 0x989a49be, 0x6ca49ff6, 
0xabc451ec, 0x5ffa87a4, 0x46558b8d, 0xb26b5dc5, + 0xcd786148, 0x3946b700, 0x20e9bb29, 0xd4d76d61, 0x13b7a37b, 0xe7897533, 0xfe26791a, 0x0a18af52, + 0xea1727be, 0x1e29f1f6, 0x0786fddf, 0xf3b82b97, 0x34d8e58d, 0xc0e633c5, 0xd9493fec, 0x2d77e9a4, + 0x5264d529, 0xa65a0361, 0xbff50f48, 0x4bcbd900, 0x8cab171a, 0x7895c152, 0x613acd7b, 0x95041b33, + 0x9f1cb461, 0x6b226229, 0x728d6e00, 0x86b3b848, 0x41d37652, 0xb5eda01a, 0xac42ac33, 0x587c7a7b, + 0x276f46f6, 0xd35190be, 0xcafe9c97, 0x3ec04adf, 0xf9a084c5, 0x0d9e528d, 0x14315ea4, 0xe00f88ec, + 0xd1c2398d, 0x25fcefc5, 0x3c53e3ec, 0xc86d35a4, 0x0f0dfbbe, 0xfb332df6, 0xe29c21df, 0x16a2f797, + 0x69b1cb1a, 0x9d8f1d52, 0x8420117b, 0x701ec733, 0xb77e0929, 0x4340df61, 0x5aefd348, 0xaed10500, + 0xa4c9aa52, 0x50f77c1a, 0x49587033, 0xbd66a67b, 0x7a066861, 0x8e38be29, 0x9797b200, 0x63a96448, + 0x1cba58c5, 0xe8848e8d, 0xf12b82a4, 0x051554ec, 0xc2759af6, 0x364b4cbe, 0x2fe44097, 0xdbda96df, + 0x3bd51e33, 0xcfebc87b, 0xd644c452, 0x227a121a, 0xe51adc00, 0x11240a48, 0x088b0661, 0xfcb5d029, + 0x83a6eca4, 0x77983aec, 0x6e3736c5, 0x9a09e08d, 0x5d692e97, 0xa957f8df, 0xb0f8f4f6, 0x44c622be, + 0x4ede8dec, 0xbae05ba4, 0xa34f578d, 0x577181c5, 0x90114fdf, 0x642f9997, 0x7d8095be, 0x89be43f6, + 0xf6ad7f7b, 0x0293a933, 0x1b3ca51a, 0xef027352, 0x2862bd48, 0xdc5c6b00, 0xc5f36729, 0x31cdb161, + 0xa66805eb, 0x5256d3a3, 0x4bf9df8a, 0xbfc709c2, 0x78a7c7d8, 0x8c991190, 0x95361db9, 0x6108cbf1, + 0x1e1bf77c, 0xea252134, 0xf38a2d1d, 0x07b4fb55, 0xc0d4354f, 0x34eae307, 0x2d45ef2e, 0xd97b3966, + 0xd3639634, 0x275d407c, 0x3ef24c55, 0xcacc9a1d, 0x0dac5407, 0xf992824f, 0xe03d8e66, 0x1403582e, + 0x6b1064a3, 0x9f2eb2eb, 0x8681bec2, 0x72bf688a, 0xb5dfa690, 0x41e170d8, 0x584e7cf1, 0xac70aab9, + 0x4c7f2255, 0xb841f41d, 0xa1eef834, 0x55d02e7c, 0x92b0e066, 0x668e362e, 0x7f213a07, 0x8b1fec4f, + 0xf40cd0c2, 0x0032068a, 0x199d0aa3, 0xeda3dceb, 0x2ac312f1, 0xdefdc4b9, 0xc752c890, 0x336c1ed8, + 0x3974b18a, 0xcd4a67c2, 0xd4e56beb, 0x20dbbda3, 0xe7bb73b9, 0x1385a5f1, 0x0a2aa9d8, 
0xfe147f90, + 0x8107431d, 0x75399555, 0x6c96997c, 0x98a84f34, 0x5fc8812e, 0xabf65766, 0xb2595b4f, 0x46678d07, + 0x77aa3c66, 0x8394ea2e, 0x9a3be607, 0x6e05304f, 0xa965fe55, 0x5d5b281d, 0x44f42434, 0xb0caf27c, + 0xcfd9cef1, 0x3be718b9, 0x22481490, 0xd676c2d8, 0x11160cc2, 0xe528da8a, 0xfc87d6a3, 0x08b900eb, + 0x02a1afb9, 0xf69f79f1, 0xef3075d8, 0x1b0ea390, 0xdc6e6d8a, 0x2850bbc2, 0x31ffb7eb, 0xc5c161a3, + 0xbad25d2e, 0x4eec8b66, 0x5743874f, 0xa37d5107, 0x641d9f1d, 0x90234955, 0x898c457c, 0x7db29334, + 0x9dbd1bd8, 0x6983cd90, 0x702cc1b9, 0x841217f1, 0x4372d9eb, 0xb74c0fa3, 0xaee3038a, 0x5addd5c2, + 0x25cee94f, 0xd1f03f07, 0xc85f332e, 0x3c61e566, 0xfb012b7c, 0x0f3ffd34, 0x1690f11d, 0xe2ae2755, + 0xe8b68807, 0x1c885e4f, 0x05275266, 0xf119842e, 0x36794a34, 0xc2479c7c, 0xdbe89055, 0x2fd6461d, + 0x50c57a90, 0xa4fbacd8, 0xbd54a0f1, 0x496a76b9, 0x8e0ab8a3, 0x7a346eeb, 0x639b62c2, 0x97a5b48a, + }, + { + 0x00000000, 0xcb567ba5, 0x934081bb, 0x5816fa1e, 0x236d7587, 0xe83b0e22, 0xb02df43c, 0x7b7b8f99, + 0x46daeb0e, 0x8d8c90ab, 0xd59a6ab5, 0x1ecc1110, 0x65b79e89, 0xaee1e52c, 0xf6f71f32, 0x3da16497, + 0x8db5d61c, 0x46e3adb9, 0x1ef557a7, 0xd5a32c02, 0xaed8a39b, 0x658ed83e, 0x3d982220, 0xf6ce5985, + 0xcb6f3d12, 0x003946b7, 0x582fbca9, 0x9379c70c, 0xe8024895, 0x23543330, 0x7b42c92e, 0xb014b28b, + 0x1e87dac9, 0xd5d1a16c, 0x8dc75b72, 0x469120d7, 0x3deaaf4e, 0xf6bcd4eb, 0xaeaa2ef5, 0x65fc5550, + 0x585d31c7, 0x930b4a62, 0xcb1db07c, 0x004bcbd9, 0x7b304440, 0xb0663fe5, 0xe870c5fb, 0x2326be5e, + 0x93320cd5, 0x58647770, 0x00728d6e, 0xcb24f6cb, 0xb05f7952, 0x7b0902f7, 0x231ff8e9, 0xe849834c, + 0xd5e8e7db, 0x1ebe9c7e, 0x46a86660, 0x8dfe1dc5, 0xf685925c, 0x3dd3e9f9, 0x65c513e7, 0xae936842, + 0x3d0fb592, 0xf659ce37, 0xae4f3429, 0x65194f8c, 0x1e62c015, 0xd534bbb0, 0x8d2241ae, 0x46743a0b, + 0x7bd55e9c, 0xb0832539, 0xe895df27, 0x23c3a482, 0x58b82b1b, 0x93ee50be, 0xcbf8aaa0, 0x00aed105, + 0xb0ba638e, 0x7bec182b, 0x23fae235, 0xe8ac9990, 0x93d71609, 0x58816dac, 0x009797b2, 0xcbc1ec17, + 0xf6608880, 
0x3d36f325, 0x6520093b, 0xae76729e, 0xd50dfd07, 0x1e5b86a2, 0x464d7cbc, 0x8d1b0719, + 0x23886f5b, 0xe8de14fe, 0xb0c8eee0, 0x7b9e9545, 0x00e51adc, 0xcbb36179, 0x93a59b67, 0x58f3e0c2, + 0x65528455, 0xae04fff0, 0xf61205ee, 0x3d447e4b, 0x463ff1d2, 0x8d698a77, 0xd57f7069, 0x1e290bcc, + 0xae3db947, 0x656bc2e2, 0x3d7d38fc, 0xf62b4359, 0x8d50ccc0, 0x4606b765, 0x1e104d7b, 0xd54636de, + 0xe8e75249, 0x23b129ec, 0x7ba7d3f2, 0xb0f1a857, 0xcb8a27ce, 0x00dc5c6b, 0x58caa675, 0x939cddd0, + 0x7a1f6b24, 0xb1491081, 0xe95fea9f, 0x2209913a, 0x59721ea3, 0x92246506, 0xca329f18, 0x0164e4bd, + 0x3cc5802a, 0xf793fb8f, 0xaf850191, 0x64d37a34, 0x1fa8f5ad, 0xd4fe8e08, 0x8ce87416, 0x47be0fb3, + 0xf7aabd38, 0x3cfcc69d, 0x64ea3c83, 0xafbc4726, 0xd4c7c8bf, 0x1f91b31a, 0x47874904, 0x8cd132a1, + 0xb1705636, 0x7a262d93, 0x2230d78d, 0xe966ac28, 0x921d23b1, 0x594b5814, 0x015da20a, 0xca0bd9af, + 0x6498b1ed, 0xafceca48, 0xf7d83056, 0x3c8e4bf3, 0x47f5c46a, 0x8ca3bfcf, 0xd4b545d1, 0x1fe33e74, + 0x22425ae3, 0xe9142146, 0xb102db58, 0x7a54a0fd, 0x012f2f64, 0xca7954c1, 0x926faedf, 0x5939d57a, + 0xe92d67f1, 0x227b1c54, 0x7a6de64a, 0xb13b9def, 0xca401276, 0x011669d3, 0x590093cd, 0x9256e868, + 0xaff78cff, 0x64a1f75a, 0x3cb70d44, 0xf7e176e1, 0x8c9af978, 0x47cc82dd, 0x1fda78c3, 0xd48c0366, + 0x4710deb6, 0x8c46a513, 0xd4505f0d, 0x1f0624a8, 0x647dab31, 0xaf2bd094, 0xf73d2a8a, 0x3c6b512f, + 0x01ca35b8, 0xca9c4e1d, 0x928ab403, 0x59dccfa6, 0x22a7403f, 0xe9f13b9a, 0xb1e7c184, 0x7ab1ba21, + 0xcaa508aa, 0x01f3730f, 0x59e58911, 0x92b3f2b4, 0xe9c87d2d, 0x229e0688, 0x7a88fc96, 0xb1de8733, + 0x8c7fe3a4, 0x47299801, 0x1f3f621f, 0xd46919ba, 0xaf129623, 0x6444ed86, 0x3c521798, 0xf7046c3d, + 0x5997047f, 0x92c17fda, 0xcad785c4, 0x0181fe61, 0x7afa71f8, 0xb1ac0a5d, 0xe9baf043, 0x22ec8be6, + 0x1f4def71, 0xd41b94d4, 0x8c0d6eca, 0x475b156f, 0x3c209af6, 0xf776e153, 0xaf601b4d, 0x643660e8, + 0xd422d263, 0x1f74a9c6, 0x476253d8, 0x8c34287d, 0xf74fa7e4, 0x3c19dc41, 0x640f265f, 0xaf595dfa, + 0x92f8396d, 0x59ae42c8, 0x01b8b8d6, 0xcaeec373, 
0xb1954cea, 0x7ac3374f, 0x22d5cd51, 0xe983b6f4, + }, + { + 0x00000000, 0x9771f7c1, 0x2b0f9973, 0xbc7e6eb2, 0x561f32e6, 0xc16ec527, 0x7d10ab95, 0xea615c54, + 0xac3e65cc, 0x3b4f920d, 0x8731fcbf, 0x10400b7e, 0xfa21572a, 0x6d50a0eb, 0xd12ece59, 0x465f3998, + 0x5d90bd69, 0xcae14aa8, 0x769f241a, 0xe1eed3db, 0x0b8f8f8f, 0x9cfe784e, 0x208016fc, 0xb7f1e13d, + 0xf1aed8a5, 0x66df2f64, 0xdaa141d6, 0x4dd0b617, 0xa7b1ea43, 0x30c01d82, 0x8cbe7330, 0x1bcf84f1, + 0xbb217ad2, 0x2c508d13, 0x902ee3a1, 0x075f1460, 0xed3e4834, 0x7a4fbff5, 0xc631d147, 0x51402686, + 0x171f1f1e, 0x806ee8df, 0x3c10866d, 0xab6171ac, 0x41002df8, 0xd671da39, 0x6a0fb48b, 0xfd7e434a, + 0xe6b1c7bb, 0x71c0307a, 0xcdbe5ec8, 0x5acfa909, 0xb0aef55d, 0x27df029c, 0x9ba16c2e, 0x0cd09bef, + 0x4a8fa277, 0xddfe55b6, 0x61803b04, 0xf6f1ccc5, 0x1c909091, 0x8be16750, 0x379f09e2, 0xa0eefe23, + 0x73ae8355, 0xe4df7494, 0x58a11a26, 0xcfd0ede7, 0x25b1b1b3, 0xb2c04672, 0x0ebe28c0, 0x99cfdf01, + 0xdf90e699, 0x48e11158, 0xf49f7fea, 0x63ee882b, 0x898fd47f, 0x1efe23be, 0xa2804d0c, 0x35f1bacd, + 0x2e3e3e3c, 0xb94fc9fd, 0x0531a74f, 0x9240508e, 0x78210cda, 0xef50fb1b, 0x532e95a9, 0xc45f6268, + 0x82005bf0, 0x1571ac31, 0xa90fc283, 0x3e7e3542, 0xd41f6916, 0x436e9ed7, 0xff10f065, 0x686107a4, + 0xc88ff987, 0x5ffe0e46, 0xe38060f4, 0x74f19735, 0x9e90cb61, 0x09e13ca0, 0xb59f5212, 0x22eea5d3, + 0x64b19c4b, 0xf3c06b8a, 0x4fbe0538, 0xd8cff2f9, 0x32aeaead, 0xa5df596c, 0x19a137de, 0x8ed0c01f, + 0x951f44ee, 0x026eb32f, 0xbe10dd9d, 0x29612a5c, 0xc3007608, 0x547181c9, 0xe80fef7b, 0x7f7e18ba, + 0x39212122, 0xae50d6e3, 0x122eb851, 0x855f4f90, 0x6f3e13c4, 0xf84fe405, 0x44318ab7, 0xd3407d76, + 0xe75d06aa, 0x702cf16b, 0xcc529fd9, 0x5b236818, 0xb142344c, 0x2633c38d, 0x9a4dad3f, 0x0d3c5afe, + 0x4b636366, 0xdc1294a7, 0x606cfa15, 0xf71d0dd4, 0x1d7c5180, 0x8a0da641, 0x3673c8f3, 0xa1023f32, + 0xbacdbbc3, 0x2dbc4c02, 0x91c222b0, 0x06b3d571, 0xecd28925, 0x7ba37ee4, 0xc7dd1056, 0x50ace797, + 0x16f3de0f, 0x818229ce, 0x3dfc477c, 0xaa8db0bd, 0x40ecece9, 0xd79d1b28, 
0x6be3759a, 0xfc92825b, + 0x5c7c7c78, 0xcb0d8bb9, 0x7773e50b, 0xe00212ca, 0x0a634e9e, 0x9d12b95f, 0x216cd7ed, 0xb61d202c, + 0xf04219b4, 0x6733ee75, 0xdb4d80c7, 0x4c3c7706, 0xa65d2b52, 0x312cdc93, 0x8d52b221, 0x1a2345e0, + 0x01ecc111, 0x969d36d0, 0x2ae35862, 0xbd92afa3, 0x57f3f3f7, 0xc0820436, 0x7cfc6a84, 0xeb8d9d45, + 0xadd2a4dd, 0x3aa3531c, 0x86dd3dae, 0x11acca6f, 0xfbcd963b, 0x6cbc61fa, 0xd0c20f48, 0x47b3f889, + 0x94f385ff, 0x0382723e, 0xbffc1c8c, 0x288deb4d, 0xc2ecb719, 0x559d40d8, 0xe9e32e6a, 0x7e92d9ab, + 0x38cde033, 0xafbc17f2, 0x13c27940, 0x84b38e81, 0x6ed2d2d5, 0xf9a32514, 0x45dd4ba6, 0xd2acbc67, + 0xc9633896, 0x5e12cf57, 0xe26ca1e5, 0x751d5624, 0x9f7c0a70, 0x080dfdb1, 0xb4739303, 0x230264c2, + 0x655d5d5a, 0xf22caa9b, 0x4e52c429, 0xd92333e8, 0x33426fbc, 0xa433987d, 0x184df6cf, 0x8f3c010e, + 0x2fd2ff2d, 0xb8a308ec, 0x04dd665e, 0x93ac919f, 0x79cdcdcb, 0xeebc3a0a, 0x52c254b8, 0xc5b3a379, + 0x83ec9ae1, 0x149d6d20, 0xa8e30392, 0x3f92f453, 0xd5f3a807, 0x42825fc6, 0xfefc3174, 0x698dc6b5, + 0x72424244, 0xe533b585, 0x594ddb37, 0xce3c2cf6, 0x245d70a2, 0xb32c8763, 0x0f52e9d1, 0x98231e10, + 0xde7c2788, 0x490dd049, 0xf573befb, 0x6202493a, 0x8863156e, 0x1f12e2af, 0xa36c8c1d, 0x341d7bdc, + }, + { + 0x00000000, 0x3171d430, 0x62e3a860, 0x53927c50, 0xc5c750c0, 0xf4b684f0, 0xa724f8a0, 0x96552c90, + 0x8e62d771, 0xbf130341, 0xec817f11, 0xddf0ab21, 0x4ba587b1, 0x7ad45381, 0x29462fd1, 0x1837fbe1, + 0x1929d813, 0x28580c23, 0x7bca7073, 0x4abba443, 0xdcee88d3, 0xed9f5ce3, 0xbe0d20b3, 0x8f7cf483, + 0x974b0f62, 0xa63adb52, 0xf5a8a702, 0xc4d97332, 0x528c5fa2, 0x63fd8b92, 0x306ff7c2, 0x011e23f2, + 0x3253b026, 0x03226416, 0x50b01846, 0x61c1cc76, 0xf794e0e6, 0xc6e534d6, 0x95774886, 0xa4069cb6, + 0xbc316757, 0x8d40b367, 0xded2cf37, 0xefa31b07, 0x79f63797, 0x4887e3a7, 0x1b159ff7, 0x2a644bc7, + 0x2b7a6835, 0x1a0bbc05, 0x4999c055, 0x78e81465, 0xeebd38f5, 0xdfccecc5, 0x8c5e9095, 0xbd2f44a5, + 0xa518bf44, 0x94696b74, 0xc7fb1724, 0xf68ac314, 0x60dfef84, 0x51ae3bb4, 0x023c47e4, 0x334d93d4, + 
0x64a7604c, 0x55d6b47c, 0x0644c82c, 0x37351c1c, 0xa160308c, 0x9011e4bc, 0xc38398ec, 0xf2f24cdc, + 0xeac5b73d, 0xdbb4630d, 0x88261f5d, 0xb957cb6d, 0x2f02e7fd, 0x1e7333cd, 0x4de14f9d, 0x7c909bad, + 0x7d8eb85f, 0x4cff6c6f, 0x1f6d103f, 0x2e1cc40f, 0xb849e89f, 0x89383caf, 0xdaaa40ff, 0xebdb94cf, + 0xf3ec6f2e, 0xc29dbb1e, 0x910fc74e, 0xa07e137e, 0x362b3fee, 0x075aebde, 0x54c8978e, 0x65b943be, + 0x56f4d06a, 0x6785045a, 0x3417780a, 0x0566ac3a, 0x933380aa, 0xa242549a, 0xf1d028ca, 0xc0a1fcfa, + 0xd896071b, 0xe9e7d32b, 0xba75af7b, 0x8b047b4b, 0x1d5157db, 0x2c2083eb, 0x7fb2ffbb, 0x4ec32b8b, + 0x4fdd0879, 0x7eacdc49, 0x2d3ea019, 0x1c4f7429, 0x8a1a58b9, 0xbb6b8c89, 0xe8f9f0d9, 0xd98824e9, + 0xc1bfdf08, 0xf0ce0b38, 0xa35c7768, 0x922da358, 0x04788fc8, 0x35095bf8, 0x669b27a8, 0x57eaf398, + 0xc94ec098, 0xf83f14a8, 0xabad68f8, 0x9adcbcc8, 0x0c899058, 0x3df84468, 0x6e6a3838, 0x5f1bec08, + 0x472c17e9, 0x765dc3d9, 0x25cfbf89, 0x14be6bb9, 0x82eb4729, 0xb39a9319, 0xe008ef49, 0xd1793b79, + 0xd067188b, 0xe116ccbb, 0xb284b0eb, 0x83f564db, 0x15a0484b, 0x24d19c7b, 0x7743e02b, 0x4632341b, + 0x5e05cffa, 0x6f741bca, 0x3ce6679a, 0x0d97b3aa, 0x9bc29f3a, 0xaab34b0a, 0xf921375a, 0xc850e36a, + 0xfb1d70be, 0xca6ca48e, 0x99fed8de, 0xa88f0cee, 0x3eda207e, 0x0fabf44e, 0x5c39881e, 0x6d485c2e, + 0x757fa7cf, 0x440e73ff, 0x179c0faf, 0x26eddb9f, 0xb0b8f70f, 0x81c9233f, 0xd25b5f6f, 0xe32a8b5f, + 0xe234a8ad, 0xd3457c9d, 0x80d700cd, 0xb1a6d4fd, 0x27f3f86d, 0x16822c5d, 0x4510500d, 0x7461843d, + 0x6c567fdc, 0x5d27abec, 0x0eb5d7bc, 0x3fc4038c, 0xa9912f1c, 0x98e0fb2c, 0xcb72877c, 0xfa03534c, + 0xade9a0d4, 0x9c9874e4, 0xcf0a08b4, 0xfe7bdc84, 0x682ef014, 0x595f2424, 0x0acd5874, 0x3bbc8c44, + 0x238b77a5, 0x12faa395, 0x4168dfc5, 0x70190bf5, 0xe64c2765, 0xd73df355, 0x84af8f05, 0xb5de5b35, + 0xb4c078c7, 0x85b1acf7, 0xd623d0a7, 0xe7520497, 0x71072807, 0x4076fc37, 0x13e48067, 0x22955457, + 0x3aa2afb6, 0x0bd37b86, 0x584107d6, 0x6930d3e6, 0xff65ff76, 0xce142b46, 0x9d865716, 0xacf78326, + 0x9fba10f2, 0xaecbc4c2, 0xfd59b892, 
0xcc286ca2, 0x5a7d4032, 0x6b0c9402, 0x389ee852, 0x09ef3c62, + 0x11d8c783, 0x20a913b3, 0x733b6fe3, 0x424abbd3, 0xd41f9743, 0xe56e4373, 0xb6fc3f23, 0x878deb13, + 0x8693c8e1, 0xb7e21cd1, 0xe4706081, 0xd501b4b1, 0x43549821, 0x72254c11, 0x21b73041, 0x10c6e471, + 0x08f11f90, 0x3980cba0, 0x6a12b7f0, 0x5b6363c0, 0xcd364f50, 0xfc479b60, 0xafd5e730, 0x9ea43300, + }, + { + 0x00000000, 0x30d23865, 0x61a470ca, 0x517648af, 0xc348e194, 0xf39ad9f1, 0xa2ec915e, 0x923ea93b, + 0x837db5d9, 0xb3af8dbc, 0xe2d9c513, 0xd20bfd76, 0x4035544d, 0x70e76c28, 0x21912487, 0x11431ce2, + 0x03171d43, 0x33c52526, 0x62b36d89, 0x526155ec, 0xc05ffcd7, 0xf08dc4b2, 0xa1fb8c1d, 0x9129b478, + 0x806aa89a, 0xb0b890ff, 0xe1ced850, 0xd11ce035, 0x4322490e, 0x73f0716b, 0x228639c4, 0x125401a1, + 0x062e3a86, 0x36fc02e3, 0x678a4a4c, 0x57587229, 0xc566db12, 0xf5b4e377, 0xa4c2abd8, 0x941093bd, + 0x85538f5f, 0xb581b73a, 0xe4f7ff95, 0xd425c7f0, 0x461b6ecb, 0x76c956ae, 0x27bf1e01, 0x176d2664, + 0x053927c5, 0x35eb1fa0, 0x649d570f, 0x544f6f6a, 0xc671c651, 0xf6a3fe34, 0xa7d5b69b, 0x97078efe, + 0x8644921c, 0xb696aa79, 0xe7e0e2d6, 0xd732dab3, 0x450c7388, 0x75de4bed, 0x24a80342, 0x147a3b27, + 0x0c5c750c, 0x3c8e4d69, 0x6df805c6, 0x5d2a3da3, 0xcf149498, 0xffc6acfd, 0xaeb0e452, 0x9e62dc37, + 0x8f21c0d5, 0xbff3f8b0, 0xee85b01f, 0xde57887a, 0x4c692141, 0x7cbb1924, 0x2dcd518b, 0x1d1f69ee, + 0x0f4b684f, 0x3f99502a, 0x6eef1885, 0x5e3d20e0, 0xcc0389db, 0xfcd1b1be, 0xada7f911, 0x9d75c174, + 0x8c36dd96, 0xbce4e5f3, 0xed92ad5c, 0xdd409539, 0x4f7e3c02, 0x7fac0467, 0x2eda4cc8, 0x1e0874ad, + 0x0a724f8a, 0x3aa077ef, 0x6bd63f40, 0x5b040725, 0xc93aae1e, 0xf9e8967b, 0xa89eded4, 0x984ce6b1, + 0x890ffa53, 0xb9ddc236, 0xe8ab8a99, 0xd879b2fc, 0x4a471bc7, 0x7a9523a2, 0x2be36b0d, 0x1b315368, + 0x096552c9, 0x39b76aac, 0x68c12203, 0x58131a66, 0xca2db35d, 0xfaff8b38, 0xab89c397, 0x9b5bfbf2, + 0x8a18e710, 0xbacadf75, 0xebbc97da, 0xdb6eafbf, 0x49500684, 0x79823ee1, 0x28f4764e, 0x18264e2b, + 0x18b8ea18, 0x286ad27d, 0x791c9ad2, 0x49cea2b7, 0xdbf00b8c, 
0xeb2233e9, 0xba547b46, 0x8a864323, + 0x9bc55fc1, 0xab1767a4, 0xfa612f0b, 0xcab3176e, 0x588dbe55, 0x685f8630, 0x3929ce9f, 0x09fbf6fa, + 0x1baff75b, 0x2b7dcf3e, 0x7a0b8791, 0x4ad9bff4, 0xd8e716cf, 0xe8352eaa, 0xb9436605, 0x89915e60, + 0x98d24282, 0xa8007ae7, 0xf9763248, 0xc9a40a2d, 0x5b9aa316, 0x6b489b73, 0x3a3ed3dc, 0x0aecebb9, + 0x1e96d09e, 0x2e44e8fb, 0x7f32a054, 0x4fe09831, 0xddde310a, 0xed0c096f, 0xbc7a41c0, 0x8ca879a5, + 0x9deb6547, 0xad395d22, 0xfc4f158d, 0xcc9d2de8, 0x5ea384d3, 0x6e71bcb6, 0x3f07f419, 0x0fd5cc7c, + 0x1d81cddd, 0x2d53f5b8, 0x7c25bd17, 0x4cf78572, 0xdec92c49, 0xee1b142c, 0xbf6d5c83, 0x8fbf64e6, + 0x9efc7804, 0xae2e4061, 0xff5808ce, 0xcf8a30ab, 0x5db49990, 0x6d66a1f5, 0x3c10e95a, 0x0cc2d13f, + 0x14e49f14, 0x2436a771, 0x7540efde, 0x4592d7bb, 0xd7ac7e80, 0xe77e46e5, 0xb6080e4a, 0x86da362f, + 0x97992acd, 0xa74b12a8, 0xf63d5a07, 0xc6ef6262, 0x54d1cb59, 0x6403f33c, 0x3575bb93, 0x05a783f6, + 0x17f38257, 0x2721ba32, 0x7657f29d, 0x4685caf8, 0xd4bb63c3, 0xe4695ba6, 0xb51f1309, 0x85cd2b6c, + 0x948e378e, 0xa45c0feb, 0xf52a4744, 0xc5f87f21, 0x57c6d61a, 0x6714ee7f, 0x3662a6d0, 0x06b09eb5, + 0x12caa592, 0x22189df7, 0x736ed558, 0x43bced3d, 0xd1824406, 0xe1507c63, 0xb02634cc, 0x80f40ca9, + 0x91b7104b, 0xa165282e, 0xf0136081, 0xc0c158e4, 0x52fff1df, 0x622dc9ba, 0x335b8115, 0x0389b970, + 0x11ddb8d1, 0x210f80b4, 0x7079c81b, 0x40abf07e, 0xd2955945, 0xe2476120, 0xb331298f, 0x83e311ea, + 0x92a00d08, 0xa272356d, 0xf3047dc2, 0xc3d645a7, 0x51e8ec9c, 0x613ad4f9, 0x304c9c56, 0x009ea433, + }, + { + 0x00000000, 0x54075546, 0xa80eaa8c, 0xfc09ffca, 0x55f123e9, 0x01f676af, 0xfdff8965, 0xa9f8dc23, + 0xabe247d2, 0xffe51294, 0x03eced5e, 0x57ebb818, 0xfe13643b, 0xaa14317d, 0x561dceb7, 0x021a9bf1, + 0x5228f955, 0x062fac13, 0xfa2653d9, 0xae21069f, 0x07d9dabc, 0x53de8ffa, 0xafd77030, 0xfbd02576, + 0xf9cabe87, 0xadcdebc1, 0x51c4140b, 0x05c3414d, 0xac3b9d6e, 0xf83cc828, 0x043537e2, 0x503262a4, + 0xa451f2aa, 0xf056a7ec, 0x0c5f5826, 0x58580d60, 0xf1a0d143, 0xa5a78405, 0x59ae7bcf, 
0x0da92e89, + 0x0fb3b578, 0x5bb4e03e, 0xa7bd1ff4, 0xf3ba4ab2, 0x5a429691, 0x0e45c3d7, 0xf24c3c1d, 0xa64b695b, + 0xf6790bff, 0xa27e5eb9, 0x5e77a173, 0x0a70f435, 0xa3882816, 0xf78f7d50, 0x0b86829a, 0x5f81d7dc, + 0x5d9b4c2d, 0x099c196b, 0xf595e6a1, 0xa192b3e7, 0x086a6fc4, 0x5c6d3a82, 0xa064c548, 0xf463900e, + 0x4d4f93a5, 0x1948c6e3, 0xe5413929, 0xb1466c6f, 0x18beb04c, 0x4cb9e50a, 0xb0b01ac0, 0xe4b74f86, + 0xe6add477, 0xb2aa8131, 0x4ea37efb, 0x1aa42bbd, 0xb35cf79e, 0xe75ba2d8, 0x1b525d12, 0x4f550854, + 0x1f676af0, 0x4b603fb6, 0xb769c07c, 0xe36e953a, 0x4a964919, 0x1e911c5f, 0xe298e395, 0xb69fb6d3, + 0xb4852d22, 0xe0827864, 0x1c8b87ae, 0x488cd2e8, 0xe1740ecb, 0xb5735b8d, 0x497aa447, 0x1d7df101, + 0xe91e610f, 0xbd193449, 0x4110cb83, 0x15179ec5, 0xbcef42e6, 0xe8e817a0, 0x14e1e86a, 0x40e6bd2c, + 0x42fc26dd, 0x16fb739b, 0xeaf28c51, 0xbef5d917, 0x170d0534, 0x430a5072, 0xbf03afb8, 0xeb04fafe, + 0xbb36985a, 0xef31cd1c, 0x133832d6, 0x473f6790, 0xeec7bbb3, 0xbac0eef5, 0x46c9113f, 0x12ce4479, + 0x10d4df88, 0x44d38ace, 0xb8da7504, 0xecdd2042, 0x4525fc61, 0x1122a927, 0xed2b56ed, 0xb92c03ab, + 0x9a9f274a, 0xce98720c, 0x32918dc6, 0x6696d880, 0xcf6e04a3, 0x9b6951e5, 0x6760ae2f, 0x3367fb69, + 0x317d6098, 0x657a35de, 0x9973ca14, 0xcd749f52, 0x648c4371, 0x308b1637, 0xcc82e9fd, 0x9885bcbb, + 0xc8b7de1f, 0x9cb08b59, 0x60b97493, 0x34be21d5, 0x9d46fdf6, 0xc941a8b0, 0x3548577a, 0x614f023c, + 0x635599cd, 0x3752cc8b, 0xcb5b3341, 0x9f5c6607, 0x36a4ba24, 0x62a3ef62, 0x9eaa10a8, 0xcaad45ee, + 0x3eced5e0, 0x6ac980a6, 0x96c07f6c, 0xc2c72a2a, 0x6b3ff609, 0x3f38a34f, 0xc3315c85, 0x973609c3, + 0x952c9232, 0xc12bc774, 0x3d2238be, 0x69256df8, 0xc0ddb1db, 0x94dae49d, 0x68d31b57, 0x3cd44e11, + 0x6ce62cb5, 0x38e179f3, 0xc4e88639, 0x90efd37f, 0x39170f5c, 0x6d105a1a, 0x9119a5d0, 0xc51ef096, + 0xc7046b67, 0x93033e21, 0x6f0ac1eb, 0x3b0d94ad, 0x92f5488e, 0xc6f21dc8, 0x3afbe202, 0x6efcb744, + 0xd7d0b4ef, 0x83d7e1a9, 0x7fde1e63, 0x2bd94b25, 0x82219706, 0xd626c240, 0x2a2f3d8a, 0x7e2868cc, + 0x7c32f33d, 0x2835a67b, 
0xd43c59b1, 0x803b0cf7, 0x29c3d0d4, 0x7dc48592, 0x81cd7a58, 0xd5ca2f1e, + 0x85f84dba, 0xd1ff18fc, 0x2df6e736, 0x79f1b270, 0xd0096e53, 0x840e3b15, 0x7807c4df, 0x2c009199, + 0x2e1a0a68, 0x7a1d5f2e, 0x8614a0e4, 0xd213f5a2, 0x7beb2981, 0x2fec7cc7, 0xd3e5830d, 0x87e2d64b, + 0x73814645, 0x27861303, 0xdb8fecc9, 0x8f88b98f, 0x267065ac, 0x727730ea, 0x8e7ecf20, 0xda799a66, + 0xd8630197, 0x8c6454d1, 0x706dab1b, 0x246afe5d, 0x8d92227e, 0xd9957738, 0x259c88f2, 0x719bddb4, + 0x21a9bf10, 0x75aeea56, 0x89a7159c, 0xdda040da, 0x74589cf9, 0x205fc9bf, 0xdc563675, 0x88516333, + 0x8a4bf8c2, 0xde4cad84, 0x2245524e, 0x76420708, 0xdfbadb2b, 0x8bbd8e6d, 0x77b471a7, 0x23b324e1, + }, + { + 0x00000000, 0x678efd01, 0xcf1dfa02, 0xa8930703, 0x9bd782f5, 0xfc597ff4, 0x54ca78f7, 0x334485f6, + 0x3243731b, 0x55cd8e1a, 0xfd5e8919, 0x9ad07418, 0xa994f1ee, 0xce1a0cef, 0x66890bec, 0x0107f6ed, + 0x6486e636, 0x03081b37, 0xab9b1c34, 0xcc15e135, 0xff5164c3, 0x98df99c2, 0x304c9ec1, 0x57c263c0, + 0x56c5952d, 0x314b682c, 0x99d86f2f, 0xfe56922e, 0xcd1217d8, 0xaa9cead9, 0x020fedda, 0x658110db, + 0xc90dcc6c, 0xae83316d, 0x0610366e, 0x619ecb6f, 0x52da4e99, 0x3554b398, 0x9dc7b49b, 0xfa49499a, + 0xfb4ebf77, 0x9cc04276, 0x34534575, 0x53ddb874, 0x60993d82, 0x0717c083, 0xaf84c780, 0xc80a3a81, + 0xad8b2a5a, 0xca05d75b, 0x6296d058, 0x05182d59, 0x365ca8af, 0x51d255ae, 0xf94152ad, 0x9ecfafac, + 0x9fc85941, 0xf846a440, 0x50d5a343, 0x375b5e42, 0x041fdbb4, 0x639126b5, 0xcb0221b6, 0xac8cdcb7, + 0x97f7ee29, 0xf0791328, 0x58ea142b, 0x3f64e92a, 0x0c206cdc, 0x6bae91dd, 0xc33d96de, 0xa4b36bdf, + 0xa5b49d32, 0xc23a6033, 0x6aa96730, 0x0d279a31, 0x3e631fc7, 0x59ede2c6, 0xf17ee5c5, 0x96f018c4, + 0xf371081f, 0x94fff51e, 0x3c6cf21d, 0x5be20f1c, 0x68a68aea, 0x0f2877eb, 0xa7bb70e8, 0xc0358de9, + 0xc1327b04, 0xa6bc8605, 0x0e2f8106, 0x69a17c07, 0x5ae5f9f1, 0x3d6b04f0, 0x95f803f3, 0xf276fef2, + 0x5efa2245, 0x3974df44, 0x91e7d847, 0xf6692546, 0xc52da0b0, 0xa2a35db1, 0x0a305ab2, 0x6dbea7b3, + 0x6cb9515e, 0x0b37ac5f, 0xa3a4ab5c, 0xc42a565d, 
0xf76ed3ab, 0x90e02eaa, 0x387329a9, 0x5ffdd4a8, + 0x3a7cc473, 0x5df23972, 0xf5613e71, 0x92efc370, 0xa1ab4686, 0xc625bb87, 0x6eb6bc84, 0x09384185, + 0x083fb768, 0x6fb14a69, 0xc7224d6a, 0xa0acb06b, 0x93e8359d, 0xf466c89c, 0x5cf5cf9f, 0x3b7b329e, + 0x2a03aaa3, 0x4d8d57a2, 0xe51e50a1, 0x8290ada0, 0xb1d42856, 0xd65ad557, 0x7ec9d254, 0x19472f55, + 0x1840d9b8, 0x7fce24b9, 0xd75d23ba, 0xb0d3debb, 0x83975b4d, 0xe419a64c, 0x4c8aa14f, 0x2b045c4e, + 0x4e854c95, 0x290bb194, 0x8198b697, 0xe6164b96, 0xd552ce60, 0xb2dc3361, 0x1a4f3462, 0x7dc1c963, + 0x7cc63f8e, 0x1b48c28f, 0xb3dbc58c, 0xd455388d, 0xe711bd7b, 0x809f407a, 0x280c4779, 0x4f82ba78, + 0xe30e66cf, 0x84809bce, 0x2c139ccd, 0x4b9d61cc, 0x78d9e43a, 0x1f57193b, 0xb7c41e38, 0xd04ae339, + 0xd14d15d4, 0xb6c3e8d5, 0x1e50efd6, 0x79de12d7, 0x4a9a9721, 0x2d146a20, 0x85876d23, 0xe2099022, + 0x878880f9, 0xe0067df8, 0x48957afb, 0x2f1b87fa, 0x1c5f020c, 0x7bd1ff0d, 0xd342f80e, 0xb4cc050f, + 0xb5cbf3e2, 0xd2450ee3, 0x7ad609e0, 0x1d58f4e1, 0x2e1c7117, 0x49928c16, 0xe1018b15, 0x868f7614, + 0xbdf4448a, 0xda7ab98b, 0x72e9be88, 0x15674389, 0x2623c67f, 0x41ad3b7e, 0xe93e3c7d, 0x8eb0c17c, + 0x8fb73791, 0xe839ca90, 0x40aacd93, 0x27243092, 0x1460b564, 0x73ee4865, 0xdb7d4f66, 0xbcf3b267, + 0xd972a2bc, 0xbefc5fbd, 0x166f58be, 0x71e1a5bf, 0x42a52049, 0x252bdd48, 0x8db8da4b, 0xea36274a, + 0xeb31d1a7, 0x8cbf2ca6, 0x242c2ba5, 0x43a2d6a4, 0x70e65352, 0x1768ae53, 0xbffba950, 0xd8755451, + 0x74f988e6, 0x137775e7, 0xbbe472e4, 0xdc6a8fe5, 0xef2e0a13, 0x88a0f712, 0x2033f011, 0x47bd0d10, + 0x46bafbfd, 0x213406fc, 0x89a701ff, 0xee29fcfe, 0xdd6d7908, 0xbae38409, 0x1270830a, 0x75fe7e0b, + 0x107f6ed0, 0x77f193d1, 0xdf6294d2, 0xb8ec69d3, 0x8ba8ec25, 0xec261124, 0x44b51627, 0x233beb26, + 0x223c1dcb, 0x45b2e0ca, 0xed21e7c9, 0x8aaf1ac8, 0xb9eb9f3e, 0xde65623f, 0x76f6653c, 0x1178983d, + }, + { + 0x00000000, 0xf20c0dfe, 0xe1f46d0d, 0x13f860f3, 0xc604aceb, 0x3408a115, 0x27f0c1e6, 0xd5fccc18, + 0x89e52f27, 0x7be922d9, 0x6811422a, 0x9a1d4fd4, 0x4fe183cc, 0xbded8e32, 
0xae15eec1, 0x5c19e33f, + 0x162628bf, 0xe42a2541, 0xf7d245b2, 0x05de484c, 0xd0228454, 0x222e89aa, 0x31d6e959, 0xc3dae4a7, + 0x9fc30798, 0x6dcf0a66, 0x7e376a95, 0x8c3b676b, 0x59c7ab73, 0xabcba68d, 0xb833c67e, 0x4a3fcb80, + 0x2c4c517e, 0xde405c80, 0xcdb83c73, 0x3fb4318d, 0xea48fd95, 0x1844f06b, 0x0bbc9098, 0xf9b09d66, + 0xa5a97e59, 0x57a573a7, 0x445d1354, 0xb6511eaa, 0x63add2b2, 0x91a1df4c, 0x8259bfbf, 0x7055b241, + 0x3a6a79c1, 0xc866743f, 0xdb9e14cc, 0x29921932, 0xfc6ed52a, 0x0e62d8d4, 0x1d9ab827, 0xef96b5d9, + 0xb38f56e6, 0x41835b18, 0x527b3beb, 0xa0773615, 0x758bfa0d, 0x8787f7f3, 0x947f9700, 0x66739afe, + 0x5898a2fc, 0xaa94af02, 0xb96ccff1, 0x4b60c20f, 0x9e9c0e17, 0x6c9003e9, 0x7f68631a, 0x8d646ee4, + 0xd17d8ddb, 0x23718025, 0x3089e0d6, 0xc285ed28, 0x17792130, 0xe5752cce, 0xf68d4c3d, 0x048141c3, + 0x4ebe8a43, 0xbcb287bd, 0xaf4ae74e, 0x5d46eab0, 0x88ba26a8, 0x7ab62b56, 0x694e4ba5, 0x9b42465b, + 0xc75ba564, 0x3557a89a, 0x26afc869, 0xd4a3c597, 0x015f098f, 0xf3530471, 0xe0ab6482, 0x12a7697c, + 0x74d4f382, 0x86d8fe7c, 0x95209e8f, 0x672c9371, 0xb2d05f69, 0x40dc5297, 0x53243264, 0xa1283f9a, + 0xfd31dca5, 0x0f3dd15b, 0x1cc5b1a8, 0xeec9bc56, 0x3b35704e, 0xc9397db0, 0xdac11d43, 0x28cd10bd, + 0x62f2db3d, 0x90fed6c3, 0x8306b630, 0x710abbce, 0xa4f677d6, 0x56fa7a28, 0x45021adb, 0xb70e1725, + 0xeb17f41a, 0x191bf9e4, 0x0ae39917, 0xf8ef94e9, 0x2d1358f1, 0xdf1f550f, 0xcce735fc, 0x3eeb3802, + 0xb13145f8, 0x433d4806, 0x50c528f5, 0xa2c9250b, 0x7735e913, 0x8539e4ed, 0x96c1841e, 0x64cd89e0, + 0x38d46adf, 0xcad86721, 0xd92007d2, 0x2b2c0a2c, 0xfed0c634, 0x0cdccbca, 0x1f24ab39, 0xed28a6c7, + 0xa7176d47, 0x551b60b9, 0x46e3004a, 0xb4ef0db4, 0x6113c1ac, 0x931fcc52, 0x80e7aca1, 0x72eba15f, + 0x2ef24260, 0xdcfe4f9e, 0xcf062f6d, 0x3d0a2293, 0xe8f6ee8b, 0x1afae375, 0x09028386, 0xfb0e8e78, + 0x9d7d1486, 0x6f711978, 0x7c89798b, 0x8e857475, 0x5b79b86d, 0xa975b593, 0xba8dd560, 0x4881d89e, + 0x14983ba1, 0xe694365f, 0xf56c56ac, 0x07605b52, 0xd29c974a, 0x20909ab4, 0x3368fa47, 0xc164f7b9, + 0x8b5b3c39, 
0x795731c7, 0x6aaf5134, 0x98a35cca, 0x4d5f90d2, 0xbf539d2c, 0xacabfddf, 0x5ea7f021, + 0x02be131e, 0xf0b21ee0, 0xe34a7e13, 0x114673ed, 0xc4babff5, 0x36b6b20b, 0x254ed2f8, 0xd742df06, + 0xe9a9e704, 0x1ba5eafa, 0x085d8a09, 0xfa5187f7, 0x2fad4bef, 0xdda14611, 0xce5926e2, 0x3c552b1c, + 0x604cc823, 0x9240c5dd, 0x81b8a52e, 0x73b4a8d0, 0xa64864c8, 0x54446936, 0x47bc09c5, 0xb5b0043b, + 0xff8fcfbb, 0x0d83c245, 0x1e7ba2b6, 0xec77af48, 0x398b6350, 0xcb876eae, 0xd87f0e5d, 0x2a7303a3, + 0x766ae09c, 0x8466ed62, 0x979e8d91, 0x6592806f, 0xb06e4c77, 0x42624189, 0x519a217a, 0xa3962c84, + 0xc5e5b67a, 0x37e9bb84, 0x2411db77, 0xd61dd689, 0x03e11a91, 0xf1ed176f, 0xe215779c, 0x10197a62, + 0x4c00995d, 0xbe0c94a3, 0xadf4f450, 0x5ff8f9ae, 0x8a0435b6, 0x78083848, 0x6bf058bb, 0x99fc5545, + 0xd3c39ec5, 0x21cf933b, 0x3237f3c8, 0xc03bfe36, 0x15c7322e, 0xe7cb3fd0, 0xf4335f23, 0x063f52dd, + 0x5a26b1e2, 0xa82abc1c, 0xbbd2dcef, 0x49ded111, 0x9c221d09, 0x6e2e10f7, 0x7dd67004, 0x8fda7dfa, + }, }; diff --git a/util/VCode.h b/util/VCode.h index 907c3367..b09eddbb 100644 --- a/util/VCode.h +++ b/util/VCode.h @@ -59,65 +59,71 @@ // fulfill this role quite well. CRC32c in particular has explicit // hardware support in many popular architectures, making it one of // the lowest-overhead options, both in terms of time and op count. -// ----------------------------------------------------------------------------- -void VCODE_INIT(void); -uint32_t VCODE_FINALIZE(void); +//----------------------------------------------------------------------------- +void VCODE_INIT( void ); +uint32_t VCODE_FINALIZE( void ); // VCodes have 64-bit state to lessen the probability of internal // state collisions. Since CRC HW support is commonly for 32-bits at // most, two separate CRCs are stored. 
typedef struct { - uint32_t data_hash; - uint32_t lens_hash; + uint32_t data_hash; + uint32_t lens_hash; } vcode_state_t; #define VCODE_COUNT 3 extern vcode_state_t vcode_states[VCODE_COUNT]; -extern uint32_t g_doVCode; -extern uint32_t g_inputVCode; -extern uint32_t g_outputVCode; -extern uint32_t g_resultVCode; +extern uint32_t g_doVCode; +extern uint32_t g_inputVCode; +extern uint32_t g_outputVCode; +extern uint32_t g_resultVCode; //----------------------------------------------------------------------------- // HW CRC32c wrappers/accessors #if defined(HAVE_ARM_ACLE) -# include "Intrinsics.h" -# define HWCRC_U64 __crc32cd -# define HWCRC_U8 __crc32cb + #include "Intrinsics.h" + #define HWCRC_U64 __crc32cd + #define HWCRC_U8 __crc32cb #elif defined(HAVE_ARM64_ASM) -static inline uint32_t _hwcrc_asm64(uint32_t crc, uint64_t data) { - __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n" - : [c] "+r"(crc) - : [v] "r"(data)); + +static inline uint32_t _hwcrc_asm64( uint32_t crc, uint64_t data ) { + __asm__ __volatile__ ("crc32cx %w[c], %w[c], %x[v]\n" + : [c] "+r"(crc) + : [v] "r"(data)); return crc; } -static inline uint32_t _hwcrc_asm8(uint32_t crc, uint8_t data) { - __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n" - : [c] "+r"(crc) - : [v] "r"(data)); + +static inline uint32_t _hwcrc_asm8( uint32_t crc, uint8_t data ) { + __asm__ __volatile__ ("crc32cb %w[c], %w[c], %w[v]\n" + : [c] "+r"(crc) + : [v] "r"(data)); return crc; } -# define HWCRC_U64 _hwcrc_asm64 -# define HWCRC_U8 _hwcrc_asm8 + + #define HWCRC_U64 _hwcrc_asm64 + #define HWCRC_U8 _hwcrc_asm8 #elif defined(HAVE_X86_64_CRC32C) -# include "Intrinsics.h" -# define HWCRC_U64 _mm_crc32_u64 -# define HWCRC_U8 _mm_crc32_u8 + #include "Intrinsics.h" + #define HWCRC_U64 _mm_crc32_u64 + #define HWCRC_U8 _mm_crc32_u8 #elif defined(HAVE_X86_64_ASM) -static inline uint32_t _hwcrc_asm64(uint64_t crc, uint64_t data) { - __asm__ __volatile__("crc32q %1, %0\n" - : "+r"(crc) - : "rm"(data)); + +static inline uint32_t 
_hwcrc_asm64( uint64_t crc, uint64_t data ) { + __asm__ __volatile__ ("crc32q %1, %0\n" + : "+r"(crc) + : "rm"(data)); return (uint32_t)crc; } -static inline uint32_t _hwcrc_asm8(uint32_t crc, uint8_t data) { - __asm__ __volatile__("crc32b %1, %0\n" - : "+r"(crc) - : "r"(data)); + +static inline uint32_t _hwcrc_asm8( uint32_t crc, uint8_t data ) { + __asm__ __volatile__ ("crc32b %1, %0\n" + : "+r"(crc) + : "r"(data)); return crc; } -# define HWCRC_U64 _hwcrc_asm64 -# define HWCRC_U8 _hwcrc_asm8 + + #define HWCRC_U64 _hwcrc_asm64 + #define HWCRC_U8 _hwcrc_asm8 #endif //----------------------------------------------------------------------------- @@ -125,69 +131,72 @@ static inline uint32_t _hwcrc_asm8(uint32_t crc, uint8_t data) { extern const uint32_t crc32c_sw_table[16][256]; // This is based on Mark Adler's implementation. -static inline uint32_t crc32c_update_sw_u64(uint32_t crc, uint64_t data) { +static inline uint32_t crc32c_update_sw_u64( uint32_t crc, uint64_t data ) { uint64_t crc64 = crc ^ data; + crc64 = - crc32c_sw_table[7][ crc64 & 0xff] ^ - crc32c_sw_table[6][(crc64 >> 8) & 0xff] ^ - crc32c_sw_table[5][(crc64 >> 16) & 0xff] ^ - crc32c_sw_table[4][(crc64 >> 24) & 0xff] ^ - crc32c_sw_table[3][(crc64 >> 32) & 0xff] ^ - crc32c_sw_table[2][(crc64 >> 40) & 0xff] ^ - crc32c_sw_table[1][(crc64 >> 48) & 0xff] ^ - crc32c_sw_table[0][ crc64 >> 56] ; + crc32c_sw_table[7][crc64 & 0xff] ^ + crc32c_sw_table[6][(crc64 >> 8) & 0xff] ^ + crc32c_sw_table[5][(crc64 >> 16) & 0xff] ^ + crc32c_sw_table[4][(crc64 >> 24) & 0xff] ^ + crc32c_sw_table[3][(crc64 >> 32) & 0xff] ^ + crc32c_sw_table[2][(crc64 >> 40) & 0xff] ^ + crc32c_sw_table[1][(crc64 >> 48) & 0xff] ^ + crc32c_sw_table[0][crc64 >> 56]; return (uint32_t)crc64; } -static inline void crc32c_update_u64(uint32_t * crcptr, uint64_t data) { +static inline void crc32c_update_u64( uint32_t * crcptr, uint64_t data ) { uint32_t crc = *crcptr; + #if defined(HWCRC_U64) - crc = HWCRC_U64(crc, data); + crc = HWCRC_U64(crc, data); 
#else - crc = crc32c_update_sw_u64(crc, data); + crc = crc32c_update_sw_u64(crc, data); #endif *crcptr = crc; } //----------------------------------------------------------------------------- // Special-case inline-able handling of 8-or-fewer byte integer VCode inputs -static inline void VCODE_HASH_SMALL(const uint64_t data, unsigned idx) { - if (idx >= VCODE_COUNT) +static inline void VCODE_HASH_SMALL( const uint64_t data, unsigned idx ) { + if (idx >= VCODE_COUNT) { return; + } crc32c_update_u64(&vcode_states[idx].data_hash, data); - crc32c_update_u64(&vcode_states[idx].lens_hash, 8); + crc32c_update_u64(&vcode_states[idx].lens_hash, 8); } -template < typename T > -static inline void addVCodeInput(const T data) { +template +static inline void addVCodeInput( const T data ) { static_assert(std::is_integral::value, "Non-integer data requires addVCode(const void *, size_t)"); - if (g_doVCode) VCODE_HASH_SMALL((uint64_t)data, 0); + if (g_doVCode) { VCODE_HASH_SMALL((uint64_t)data, 0); } } -template < typename T > -static inline void addVCodeOutput(const T data) { +template +static inline void addVCodeOutput( const T data ) { static_assert(std::is_integral::value, "Non-integer data requires addVCode(const void *, size_t)"); - if (g_doVCode) VCODE_HASH_SMALL((uint64_t)data, 1); + if (g_doVCode) { VCODE_HASH_SMALL((uint64_t)data, 1); } } -template < typename T > -static inline void addVCodeResult(const T data) { +template +static inline void addVCodeResult( const T data ) { static_assert(std::is_integral::value, "Non-integer data requires addVCode(const void *, size_t)"); - if (g_doVCode) VCODE_HASH_SMALL((uint64_t)data, 2); + if (g_doVCode) { VCODE_HASH_SMALL((uint64_t)data, 2); } } //----------------------------------------------------------------------------- // General-purpose VCode input handling -void VCODE_HASH(const void * input, size_t len, unsigned idx); +void VCODE_HASH( const void * input, size_t len, unsigned idx ); -static inline void addVCodeInput(const 
void * in, size_t len) { - if (g_doVCode) VCODE_HASH(in, len, 0); +static inline void addVCodeInput( const void * in, size_t len ) { + if (g_doVCode) { VCODE_HASH(in, len, 0); } } -static inline void addVCodeOutput(const void * in, size_t len) { - if (g_doVCode) VCODE_HASH(in, len, 1); +static inline void addVCodeOutput( const void * in, size_t len ) { + if (g_doVCode) { VCODE_HASH(in, len, 1); } } -static inline void addVCodeResult(const void * in, size_t len) { - if (g_doVCode) VCODE_HASH(in, len, 2); +static inline void addVCodeResult( const void * in, size_t len ) { + if (g_doVCode) { VCODE_HASH(in, len, 2); } }