diff --git a/.gitignore b/.gitignore index d15cb8096..75eb4b16a 100644 --- a/.gitignore +++ b/.gitignore @@ -57,7 +57,9 @@ history /config.status /configure /doc/asy-latex.i* +/doc/asy-latex.hd /doc/asy.1 +/doc/asymptote_html/ /glrender.d.54461 /gsl.symbols.h /keywords.h @@ -66,6 +68,10 @@ history /types.symbols.h *.dSYM .DS_Store +/errors.temp +/base/webgl/asygl.js +/v3dheadertypes.py +/v3dtypes.py ### TeX-related ## Core latex/pdflatex auxiliary files: @@ -95,6 +101,9 @@ history /doc/**/asymptote.* !/doc/asymptote.texi /doc/options +/doc/latexusage-?.asy +/doc/latexusage-?.tex +/doc/latexusage-*.pbsdat .asy_* ## Bibliography auxiliary files (bibtex/biblatex/biber): diff --git a/CMakeLists.txt b/CMakeLists.txt index 0cee80ae5..ddc3fd6d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,21 @@ cmake_minimum_required(VERSION 3.27) +if (NOT CMAKE_HOST_SYSTEM_PROCESSOR) + if (WIN32) + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64") + else() + set(CMAKE_HOST_SYSTEM_PROCESSOR "i386") + endif() + else() + execute_process(COMMAND uname -m OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_PROCESSOR OUTPUT_STRIP_TRAILING_WHITESPACE) + endif() +endif() +message(STATUS "Host processor: ${CMAKE_HOST_SYSTEM_PROCESSOR}") +if (NOT CMAKE_SYSTEM_PROCESSOR) + set(CMAKE_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}) +endif() + include(cmake-scripts/common.cmake) include(asy-pkg-version-suffix.cmake OPTIONAL RESULT_VARIABLE ASY_ADDR_VERSION_SUFFIX_FILE) include(cmake-scripts/options.cmake) diff --git a/Makefile.in b/Makefile.in index b572a0ec5..f1e56def5 100644 --- a/Makefile.in +++ b/Makefile.in @@ -11,8 +11,8 @@ GCOPTIONS = @GCOPTIONS@ GCLIB = @GCLIB@ GCPPLIB = @GCPPLIB@ GCLIBS = $(GCPPLIB) $(GCLIB) -LFLAGS = @LDFLAGS@ -LIBS = $(LFLAGS) @PTHREAD_LIBS@ @GLEW@ @LIBS@ $(GCLIBS) @LSPLIBS@ +LFLAGS = @LDFLAGS@ -Lhighwayhash/lib +LIBS = $(LFLAGS) @PTHREAD_LIBS@ @GLEW@ @LIBS@ $(GCLIBS) @LSPLIBS@ -lhighwayhash CXX_STANDARD = @CXX_STANDARD@ DOSLIBS = $(LIBS) -ltermcap -lwolfssl -lgdi32 -lwinmm -s -static LSP_BUILD_ROOT=@LSP_BUILD_ROOT@ @@ -29,7 +29,7 @@ PYUIC ?= pyuic5 # We have to remove OpenGL, threading, GC, etc from this. SHAREDLIBS = $(filter-out -lglut -GL -pthread $(GCLIBS), $(LIBS)) -vpath %.cc prc +vpath %.cc prc highwayhash vpath %.cc thirdparty_impl/tinyexr_impl/src vpath %.ui GUI/windows vpath %.py GUI/pyUIClass @@ -59,7 +59,8 @@ COREFILES = $(CAMP) $(SYMBOL_FILES) $(PRC) $(TINYEXR_FILES) \ Delaunay predicates glrender tr shaders jsfile v3dfile \ EXRFiles GLTextures lspserv symbolmaps win32helpers win32pipestream \ win32xdr xstream \ - lspdec lspexp lspfundec lspstm + lspdec lspexp lspfundec lspstm \ + hashing FILES = $(COREFILES) main @@ -93,7 +94,7 @@ DEFS = @DEFS@ @OPTIONS@ @PTHREAD_CFLAGS@ -DFFTWPP_SINGLE_THREAD -Wall -I. CFLAGS = @CFLAGS@ OPTS = $(DEFS) @CPPFLAGS@ @CXXFLAGS@ $(CFLAGS) \ -Ibackports/optional/include \ - -Iprc/include -I$(LSP_ROOT)/include + -Iprc/include -I$(LSP_ROOT)/include -Ihighwayhash GLEWOPTS = $(DEFS) @CPPFLAGS@ $(CFLAGS) -DGLEW_NO_GLU -DGLEW_BUILD -O1 -fPIC # Options for compiling the object files for the shared library. 
@@ -136,7 +137,7 @@ endif export prefix docdir exampledir mandir infodir INSTALL MAKE DESTDIR TEXI2DVI -asy: base/version.asy $(FILES:=.o) $(XNAME) revision.o @LSPLIB@ @GLEW@ +asy: base/version.asy $(FILES:=.o) $(XNAME) revision.o @LSPLIB@ @GLEW@ libhighwayhash.a $(CXX) $(OPTS) -o $(NAME) $(FILES:=.o) revision.o $(LIBS) $(XNAME): $(PYFILES) @@ -175,6 +176,9 @@ $(LSP_BUILD_ROOT)/liblspcpp.a: @LSP_CMAKE_OPTIONS@ $(MAKE) -C $(LSP_BUILD_ROOT) +libhighwayhash.a: + cd highwayhash && $(MAKE) + all: asy sty man faq asy-keywords.el $(GCLIB): @@ -372,6 +376,7 @@ clean: FORCE -cd LspCpp && $(MAKE) distclean -cd LspCpp && rm -rf liblspcpp.a Makefile CMakeFiles third_party/uri/src/CMakeFiles -cd tinyexr && $(MAKE) clean + -cd highwayhash && $(MAKE) clean gc-clean: FORCE clean -$(MAKE) -C $(GC) clean @@ -394,6 +399,8 @@ cleaner: FORCE clean distclean: FORCE cleaner cd doc && $(MAKE) distclean + cd highwayhash && $(MAKE) distclean + cleanest: FORCE maintainer-clean maintainer-clean: FORCE distclean diff --git a/base/collections/enumerate.asy b/base/collections/enumerate.asy new file mode 100644 index 000000000..d73f33465 --- /dev/null +++ b/base/collections/enumerate.asy @@ -0,0 +1,46 @@ +typedef import(T); + +from collections.iter(T=T) access Iter_T, Iterable_T; +from collections.genericpair(K=int, V=T) access + Pair_K_V as Pair_int_T, + makePair; +from collections.iter(T=Pair_int_T) access + Iter_T as Iter_Pair_int_T, + Iterable_T as Iterable_Pair_int_T, + Iterable; + +Iterable_Pair_int_T enumerate(Iterable_T iterable) { + Iter_Pair_int_T iter() { + int i = 0; + Iter_T it = iterable.operator iter(); + Iter_Pair_int_T result; + result.valid = it.valid; + result.get = new Pair_int_T() { + return makePair(i, it.get()); + }; + result.advance = new void() { + ++i; + it.advance(); + }; + return result; + } + return Iterable(iter); +} + +Iterable_Pair_int_T enumerate(T[] array) { + Iter_Pair_int_T iter() { + int i = 0; + Iter_Pair_int_T result; + result.valid = new bool() { + return i < array.length; + }; + result.get = new Pair_int_T() { + return makePair(i, array[i]); + }; + result.advance = new void() { + ++i; + }; + return result; + } + return Iterable(iter); +} \ No newline at end of file diff --git a/base/collections/genericpair.asy b/base/collections/genericpair.asy new file mode 100644 index 000000000..55e861d05 --- /dev/null +++ b/base/collections/genericpair.asy @@ -0,0 +1,20 @@ +typedef import(K, V); + +struct Pair_K_V { + restricted K k; + restricted V v; + void operator init(K k, V v) { + this.k = k; + this.v = v; + } + autounravel bool operator ==(Pair_K_V a, Pair_K_V b) { + // NOTE: This won't compile if K or V is an array type since == is + // vectorized for arrays. We could locally define a cast operator from + // bool[] to bool, but that would not behave as expected if comparing two + // arrays of different lengths. (We would get an error instead of false.) + return a.k == b.k && a.v == b.v; + } + int hash(); // To be overridden by the user. 
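+  // For example (as collections/hashmap.asy does), a client that keys on K
+  // can simply forward the key's own hash after construction:
+  //   Pair_K_V pair = makePair(k, v);
+  //   pair.hash = k.hash;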
+}
+
+Pair_K_V makePair(K k, V v) = Pair_K_V;
\ No newline at end of file
diff --git a/base/collections/hashmap.asy b/base/collections/hashmap.asy
new file mode 100644
index 000000000..989ca0a79
--- /dev/null
+++ b/base/collections/hashmap.asy
@@ -0,0 +1,90 @@
+typedef import(K, V);
+
+from collections.map(K=K, V=V) access Map_K_V, Iter_K, Iter_K_V, Iterable_K;
+from collections.genericpair(K=K, V=V) access Pair_K_V, makePair;
+from collections.hashrepset(T=Pair_K_V) access
+    HashRepSet_T as HashRepSet_K_V;
+
+private Pair_K_V operator tuple(K k, V v) {
+  Pair_K_V pair = makePair(k, v);
+  pair.hash = k.hash;
+  return pair;
+}
+
+struct HashMap_K_V {
+  struct _ { autounravel restricted Map_K_V map; }
+
+  private HashRepSet_K_V pairs = HashRepSet_K_V(
+      nullT=null,
+      equiv = new bool(Pair_K_V a, Pair_K_V b) {
+        // NOTE: This should never be called on a null pair.
+        return a.k == b.k;
+      },
+      isNullT = new bool(Pair_K_V kv) { return alias(kv, null); }
+  );
+
+  void operator init() {
+    using F = void();
+    ((F)map.operator init)();
+  }
+
+  // TODO: Unify the "isNullValue"/"nullValue" naming here with
+  // "isNullT"/"nullT" in repset, and similar.
+
+  void operator init(V nullValue, bool isNullValue(V) = null) {
+    using F = void(V, bool isNullValue(V)=null);  // The default value here is ignored.
+    if (isNullValue == null) {
+      ((F)map.operator init)(nullValue);  // Let operator init supply its own default.
+    } else {
+      ((F)map.operator init)(nullValue, isNullValue);
+    }
+  }
+
+  map.size = pairs.size;
+
+  map.contains = new bool(K key) {
+    return pairs.contains((key, map.nullValue));
+  };
+
+  map.operator[] = new V(K key) {
+    Pair_K_V pair = pairs.get((key, map.nullValue));
+    if (!alias(pair, null)) {
+      return pair.v;
+    }
+    assert(map.isNullValue != null, 'Key not found in map');
+    return map.nullValue;
+  };
+
+  map.operator [=] = new void(K key, V value) {
+    if (map.isNullValue != null && map.isNullValue(value)) {
+      pairs.delete((key, value));
+    } else {
+      pairs.update((key, value));
+    }
+  };
+
+  map.delete = new void(K key) {
+    Pair_K_V removed = pairs.delete((key, map.nullValue));
+    assert(!alias(removed, null), 'Nonexistent key cannot be deleted');
+  };
+
+  map.operator iter = new Iter_K() {
+    Iter_K_V it = pairs.operator iter();
+    Iter_K result;
+    result.valid = it.valid;
+    result.advance = it.advance;
+    result.get = new K() { return it.get().k; };
+    return result;
+  };
+
+  autounravel Iterable_K operator cast(HashMap_K_V map) {
+    return Iterable_K(map.map.operator iter);
+  }
+  autounravel K[] operator ecast(HashMap_K_V map) {
+    return (K[])(Iterable_K)map;
+  }
+  autounravel Map_K_V operator cast(HashMap_K_V map) {
+    return map.map;
+  }
+
+  unravel map;
+}
\ No newline at end of file
diff --git a/base/collections/hashrepset.asy b/base/collections/hashrepset.asy
new file mode 100644
index 000000000..949b8d53d
--- /dev/null
+++ b/base/collections/hashrepset.asy
@@ -0,0 +1,259 @@
+typedef import(T);
+
+from collections.repset(T=T) access Iter_T, Iterable_T, RepSet_T;
+
+private struct HashEntry {
+  T item;
+  int hash = -1;
+  HashEntry newer = null;
+  HashEntry older = null;
+}
+
+struct HashRepSet_T {
+  struct _ { autounravel restricted RepSet_T super; }
+  from super unravel nullT, equiv, isNullT;
+
+  // These fields are mutable.
+  private HashEntry[] buckets = array(16, (HashEntry)null);
+  buckets.cyclic = true;
+  private int size = 0;
+  private int zombies = 0;
+  private int numChanges = 0;  // Detect concurrent modification.
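+  // Entries are also threaded into a doubly linked list (newest/oldest below)
+  // in insertion order; operator iter walks this list from oldest to newest.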
+  HashEntry newest = null;
+  HashEntry oldest = null;
+
+  void operator init() {
+    using F = void();
+    ((F)super.operator init)();
+  }
+  void operator init(T nullT,
+                     bool equiv(T a, T b) = operator ==,
+                     bool isNullT(T) = new bool(T t) { return equiv(t, nullT); }) {
+    typedef void F(T, bool equiv(T, T), bool isNullT(T));
+    ((F)super.operator init)(nullT, equiv, isNullT);
+  }
+
+  RepSet_T newEmpty() {
+    return HashRepSet_T(nullT, equiv, isNullT).super;
+  }
+
+  super.size = new int() {
+    return size;
+  };
+
+  super.contains = new bool(T item) {
+    int bucket = item.hash();
+    for (int i = 0; i < buckets.length; ++i) {
+      HashEntry entry = buckets[bucket + i];
+      if (entry == null) {
+        return false;
+      }
+      if (entry.hash == bucket && equiv(entry.item, item)) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  super.get = new T(T item) {
+    int bucket = item.hash();
+    for (int i = 0; i < buckets.length; ++i) {
+      HashEntry entry = buckets[bucket + i];
+      if (entry == null) {
+        return super.nullT;
+      }
+      if (entry.hash == bucket && equiv(entry.item, item)) {
+        return entry.item;
+      }
+    }
+    assert(isNullT != null, 'Item is not present.');
+    return super.nullT;
+  };
+
+  super.operator iter = new Iter_T() {
+    Iter_T result = new Iter_T;
+    HashEntry current = oldest;
+    int expectedChanges = numChanges;
+    result.valid = new bool() {
+      assert(numChanges == expectedChanges, 'Concurrent modification');
+      return current != null;
+    };
+    result.get = new T() {
+      assert(numChanges == expectedChanges, 'Concurrent modification');
+      assert(result.valid());
+      return current.item;
+    };
+    result.advance = new void() {
+      assert(numChanges == expectedChanges, 'Concurrent modification');
+      assert(result.valid());
+      current = current.newer;
+    };
+    return result;
+  };
+
+  private void changeCapacity() {
+    ++numChanges;
+    int newCapacity = (zombies > size ? buckets.length : 2 * buckets.length);
+    zombies = 0;
+    buckets = array(newCapacity, (HashEntry)null);
+    buckets.cyclic = true;
+    for (HashEntry current = oldest; current != null; current = current.newer) {
+      int bucket = current.hash;
+      for (int i = 0; i < buckets.length; ++i) {
+        if (buckets[bucket + i] == null) {
+          buckets[bucket + i] = current;
+          break;
+        }
+        assert(i < buckets.length - 1, 'No space in hash table; ' +
+               'is the linked list circular?');
+      }
+    }
+  }
+
+  // Returns an int as follows (note: "index" is modulo buckets.length):
+  // * If an equivalent item is in the set, returns its index.
+  // * Otherwise, if at least one bucket is empty, returns the index of the
+  //   empty bucket in which the item should be placed if added.
+  // * Otherwise, returns -1.
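+  // For example, with buckets.length == 16 and hash == 13, the probe visits
+  // cyclic indices 13, 14, 15, 0, 1, ... and stops at the first equivalent
+  // item or empty bucket, returning -1 only if it wraps all the way around.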
+ private int find(T item, int hash) { + for (int i = 0; i < buckets.length; ++i) { + int index = hash + i; + HashEntry entry = buckets[index]; + if (entry == null) { + return index; + } + if (entry.hash == hash && equiv(entry.item, item)) { + return index; + } + } + return -1; + } + + super.add = new bool(T item) { + ++numChanges; + if (isNullT != null && isNullT(item)) { + return false; + } + if (2 * (size + zombies) >= buckets.length) { + changeCapacity(); + } + int bucket = item.hash(); + int index = find(item, bucket); + if (index == -1) { + changeCapacity(); + index = find(item, bucket); + assert(index != -1, 'No space in hash table'); + } + HashEntry entry = buckets[index]; + if (entry != null) { + return false; + } + + ++numChanges; + if (2 * (size + zombies) >= buckets.length) { + changeCapacity(); + index = find(item, bucket); + assert(index != -1); + assert(buckets[index] == null); + } + entry = buckets[index] = new HashEntry; + entry.item = item; + entry.hash = bucket; + entry.older = newest; + if (newest != null) { + newest.newer = entry; + } + newest = entry; + if (oldest == null) { + oldest = entry; + } + ++size; + return true; + }; + + super.update = new T(T item) { + if (isNullT != null && isNullT(item)) { + return nullT; + } + int bucket = item.hash(); + int index = find(item, bucket); + if (index == -1) { + changeCapacity(); + index = find(item, bucket); + assert(index != -1, 'No space in hash table'); + } + HashEntry entry = buckets[index]; + if (entry != null) { + T result = entry.item; + entry.item = item; + return result; + } + ++numChanges; + if (2 * (size + zombies) >= buckets.length) { + changeCapacity(); + index = find(item, bucket); + assert(index != -1); + assert(buckets[index] == null); + } + entry = buckets[index] = new HashEntry; + assert(isNullT != null, + 'Adding item via update() without defining nullT.'); + entry.item = item; + entry.hash = bucket; + entry.older = newest; + if (newest != null) { + newest.newer = entry; + } + newest = entry; + if (oldest == null) { + oldest = entry; + } + ++size; + return nullT; + }; + + super.delete = new T(T item) { + int bucket = item.hash(); + int index = find(item, bucket); + HashEntry entry = buckets[index]; + if (index == -1) { + assert(false, 'Overcrowded hash table; zombies: ' + string(zombies) + + '; size: ' + string(size) + + '; buckets.length: ' + string(buckets.length)); + return nullT; + } + if (entry == null) { + assert(isNullT != null, 'Item is not present.'); + return nullT; + } + ++numChanges; + T result = entry.item; + entry.hash = -1; + ++zombies; + if (entry.older != null) { + entry.older.newer = entry.newer; + } else { + oldest = entry.newer; + } + if (entry.newer != null) { + entry.newer.older = entry.older; + } else { + newest = entry.older; + } + --size; + if (2 * (size + zombies) > buckets.length) { + changeCapacity(); + } + return result; + }; + + autounravel RepSet_T operator cast(HashRepSet_T set) { + return set.super; + } + + autounravel Iterable_T operator cast(HashRepSet_T set) { + return Iterable_T(set.super.operator iter); + } + unravel super; +} + \ No newline at end of file diff --git a/base/collections/iter.asy b/base/collections/iter.asy new file mode 100644 index 000000000..75b9b0ff1 --- /dev/null +++ b/base/collections/iter.asy @@ -0,0 +1,49 @@ +typedef import(T); + +struct Iter_T { + // Returns the current item. Error if the iterator is not valid. + T get(); + // Advances the iterator to the next item. Error if the iterator is not valid. 
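+  // (A typical manual traversal, essentially the pattern a for-each statement
+  // expands to, given some Iterable_T iterable and a caller-supplied process:
+  //   for (Iter_T it = iterable.operator iter(); it.valid(); it.advance())
+  //     process(it.get());
+  // )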
+  void advance();
+  // Returns true if the iterator is valid. If the iterator is used without
+  // modifying the data structure, it will be valid as long as there is a next
+  // item.
+  //
+  // QUESTION: Do we want best-effort fail-fast iterators that set valid to
+  // false if the data structure is modified, or do we want to leave the
+  // behavior undefined in this case?
+  bool valid();
+}
+
+Iter_T Iter_T(T[] items) {
+  int index = 0;
+  Iter_T retv;
+  unravel retv;
+  advance = new void() { ++index; };
+  get = new T() { return items[index]; };
+  valid = new bool() { return index < items.length; };
+  return retv;
+}
+
+struct Iterable_T {
+  // Returns an iterator over the collection.
+  Iter_T operator iter();
+  void operator init(Iter_T iter()) {
+    this.operator iter = iter;
+  }
+  void operator init(T[] items) {
+    this.operator iter = new Iter_T() {
+      return Iter_T(items);
+    };
+  }
+  autounravel T[] operator ecast(Iterable_T iterable) {
+    T[] result;
+    for (T item : iterable) {
+      result.push(item);
+    }
+    return result;
+  }
+}
+
+Iterable_T Iterable(Iter_T iter()) = Iterable_T;
+Iterable_T Iterable(T[] items) = Iterable_T;
diff --git a/base/collections/map.asy b/base/collections/map.asy
new file mode 100644
index 000000000..49ce57cce
--- /dev/null
+++ b/base/collections/map.asy
@@ -0,0 +1,169 @@
+typedef import(K, V);
+
+from collections.genericpair(K=K, V=V) access Pair_K_V;
+from collections.iter(T=K) access Iter_T as Iter_K, Iterable_T as Iterable_K;
+from collections.iter(T=Pair_K_V) access
+    Iter_T as Iter_K_V,
+    Iterable_T as Iterable_K_V;
+
+struct Map_K_V {
+  restricted V nullValue;
+  restricted bool isNullValue(V) = null;
+  void operator init() {}
+  void operator init(V nullValue,
+                     bool isNullValue(V) = new bool(V v) { return v == nullValue; }
+  ) {
+    this.nullValue = nullValue;
+    this.isNullValue = isNullValue;
+    assert(isNullValue(nullValue), 'nullValue must satisfy isNullValue');
+  }
+  // Remaining methods are not implemented here.
+  int size();
+  bool empty() { return size() == 0; }
+  bool contains(K key);
+  // If the key is not present, returns nullValue, or throws an error
+  // if nullValue was never set.
+  V operator [] (K key);
+  // Adds the key-value pair, replacing both the key and value if the key was
+  // already present.
+  void operator [=] (K key, V value);
+  // Removes the entry with the given key, if it exists.
+  // QUESTION: Should we throw an error if the key was not present? (Current
+  // implementation: yes, unless there is a nullValue to return.)
+  void delete(K key);
+
+  Iter_K operator iter();
+
+  autounravel Iterable_K operator cast(Map_K_V map) {
+    return Iterable_K(map.operator iter);
+  }
+
+  // Makes the notation `for (K key: (K[])map)` work for now, albeit
+  // inefficiently.
+  autounravel K[] operator ecast(Map_K_V map) {
+    return (K[])(Iterable_K)map;
+  }
+
+  void addAll(Iterable_K_V other) {
+    for (Pair_K_V kv : other) {
+      this[kv.k] = kv.v;
+    }
+  }
+  void removeAll(Iterable_K other) {
+    for (K key : other) {
+      delete(key);
+    }
+  }
+}
+
+// Reference implementation for testing purposes.
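+// A minimal usage sketch (int/string chosen only for illustration):
+//   from collections.map(K=int, V=string) access Map_K_V;
+//   Map_K_V m = NaiveMap_K_V('');  // '' serves as the nullValue
+//   m[3] = 'three';
+//   assert(m.contains(3) && m[3] == 'three');
+//   m.delete(3);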
+struct NaiveMap_K_V { + private K[] keys; + private V[] values; + private int size; + private int numChanges = 0; + restricted Map_K_V map; + void operator init() { + keys = new K[0]; + values = new V[0]; + size = 0; + using F = void(); + ((F)map.operator init)(); + } + void operator init(V nullValue, bool isNullValue(V) = null) { + keys = new K[0]; + values = new V[0]; + size = 0; + if (isNullValue == null) { + map.operator init(nullValue); // Let operator init supply its own default. + } else { + map.operator init(nullValue, isNullValue); + } + } + map.size = new int() { return size; }; + map.contains = new bool(K key) { + for (int i = 0; i < size; ++i) { + if (keys[i] == key) { + return true; + } + } + return false; + }; + map.operator[] = new V(K key) { + for (int i = 0; i < size; ++i) { + if (keys[i] == key) { + return values[i]; + } + } + assert(map.isNullValue != null, 'Key not found in map'); + return map.nullValue; + }; + map.operator[=] = new void(K key, V value) { + bool delete = false; + if (map.isNullValue != null && map.isNullValue(value)) { + delete = true; + } + for (int i = 0; i < size; ++i) { + if (keys[i] == key) { + if (delete) { + keys.delete(i); + values.delete(i); + ++numChanges; + --size; + } else { + keys[i] = key; + values[i] = value; + } + return; + } + } + if (!delete) { + keys.push(key); + values.push(value); + ++numChanges; + ++size; + } + }; + map.delete = new void(K key) { + ++numChanges; + for (int i = 0; i < size; ++i) { + if (keys[i] == key) { + keys.delete(i); + values.delete(i); + --size; + return; + } + } + assert(false, 'Nonexistent key cannot be deleted'); + }; + map.operator iter = new Iter_K() { + int numChangesAtStart = numChanges; + int i = 0; + Iter_K result; + result.valid = new bool() { + assert(numChanges == numChangesAtStart, + 'Map keys changed during iteration'); + return i < size; + }; + result.advance = new void() { + assert(numChanges == numChangesAtStart, + 'Map keys changed during iteration'); + ++i; + }; + result.get = new K() { + assert(numChanges == numChangesAtStart, + 'Map keys changed during iteration'); + return keys[i]; + }; + return result; + }; + autounravel Iterable_K operator cast(NaiveMap_K_V map) { + return Iterable_K(map.map.operator iter); + } + autounravel K[] operator ecast(NaiveMap_K_V map) { + return copy(map.keys); + } + autounravel Map_K_V operator cast(NaiveMap_K_V map) { + return map.map; + } + from map unravel *; +} \ No newline at end of file diff --git a/base/collections/queue.asy b/base/collections/queue.asy new file mode 100644 index 000000000..17c9e5d82 --- /dev/null +++ b/base/collections/queue.asy @@ -0,0 +1,207 @@ +typedef import(T); + +from collections.iter(T=T) access Iter_T, Iterable_T; + +struct Queue_T { + void push(T value); + T peek(); + T pop(); + int size(); + Iter_T operator iter(); + autounravel Iterable_T operator cast(Queue_T queue) { + return Iterable_T(queue.operator iter); + } +} + +Queue_T makeNaiveQueue(T[] initialData) { + Queue_T queue = new Queue_T; + T[] data = new T[0]; + data.append(initialData); + queue.push = new void(T value) { + data.push(value); + }; + queue.peek = new T() { + return data[0]; + }; + queue.pop = new T() { + T retv = data[0]; + data.delete(0); + return retv; + }; + queue.size = new int() { + return data.length; + }; + queue.operator iter = new Iter_T() { + return Iter_T(data); + }; + return queue; +} + +struct ArrayQueue_T { + T[] data = new T[8]; + data.cyclic = true; + int start = 0; + int size = 0; + + private void resize() { + T[] newData = new 
T[data.length * 2];
+    newData.cyclic = true;
+    newData[:size] = data[start : start+size];
+    data = newData;
+    start = 0;
+  }
+
+  Iter_T operator iter() {
+    int i = 0;
+    Iter_T result;
+    result.advance = new void() {
+      ++i;
+    };
+    result.get = new T() {
+      return data[start+i];
+    };
+    result.valid = new bool() {
+      return i < size;
+    };
+    return result;
+  }
+
+  void operator init(T[] initialData) {
+    // Check for a null array before touching its length.
+    if (alias(initialData, null) || initialData.length == 0) {
+      return;
+    }
+    int desiredLength = data.length;
+    // TODO: Do this computation using CLZ.
+    while (desiredLength < initialData.length) {
+      desiredLength *= 2;
+    }
+    if (desiredLength != data.length) {
+      data = new T[desiredLength];
+      data.cyclic = true;
+    }
+    size = initialData.length;
+    data[:size] = initialData;
+  }
+
+  void push(T value) {
+    if (size == data.length) {
+      resize();
+    }
+    data[start+size] = value;
+    ++size;
+  }
+
+  T peek() {
+    return data[start];
+  }
+
+  T pop() {
+    T retv = data[start];
+    ++start;
+    --size;
+    return retv;
+  }
+
+  int size() {
+    return size;
+  }
+
+  autounravel Iterable_T operator cast(ArrayQueue_T queue) {
+    return Iterable_T(queue.operator iter);
+  }
+
+  autounravel Queue_T operator cast(ArrayQueue_T queue) {
+    Queue_T queue_ = new Queue_T;
+    queue_.push = queue.push;
+    queue_.peek = queue.peek;
+    queue_.pop = queue.pop;
+    queue_.size = queue.size;
+    queue_.operator iter = queue.operator iter;
+    return queue_;
+  }
+}
+
+Queue_T makeArrayQueue(T[] initialData /*specify type for overloading*/) {
+  return ArrayQueue_T(initialData);
+}
+
+struct LinkedQueue_T {
+  struct Node {
+    T value;
+    Node next;
+  }
+  Node head = null;
+  Node tail = null;
+  int size = 0;
+
+  Iter_T operator iter() {
+    Node node = head;
+    Iter_T result;
+    result.advance = new void() {
+      node = node.next;
+    };
+    result.get = new T() {
+      return node.value;
+    };
+    result.valid = new bool() {
+      return node != null;
+    };
+    return result;
+  }
+
+  void push(T value) {
+    Node node = new Node;
+    node.value = value;
+    if (size == 0) {
+      head = node;
+      tail = node;
+    } else {
+      tail.next = node;
+      tail = node;
+    }
+    ++size;
+  }
+
+  T peek() {
+    return head.value;
+  }
+
+  T pop() {
+    T retv = head.value;
+    head = head.next;
+    --size;
+    return retv;
+  }
+
+  int size() {
+    return size;
+  }
+
+  autounravel Queue_T operator cast(LinkedQueue_T queue) {
+    Queue_T queue_ = new Queue_T;
+    queue_.push = queue.push;
+    queue_.peek = queue.peek;
+    queue_.pop = queue.pop;
+    queue_.size = queue.size;
+    queue_.operator iter = queue.operator iter;
+    return queue_;
+  }
+
+  autounravel Iterable_T operator cast(LinkedQueue_T queue) {
+    return Iterable_T(queue.operator iter);
+  }
+}
+
+Queue_T makeLinkedQueue(T[] initialData) {
+  var queue = new LinkedQueue_T;
+  for (T value : initialData) {
+    queue.push(value);
+  }
+  return queue;
+}
+
+// Specify a "default" queue implementation.
+Queue_T makeQueue(T[]) = makeArrayQueue;
\ No newline at end of file
diff --git a/base/collections/repset.asy b/base/collections/repset.asy
new file mode 100644
index 000000000..b788061a0
--- /dev/null
+++ b/base/collections/repset.asy
@@ -0,0 +1,225 @@
+typedef import(T);
+from collections.iter(T=T) access Iter_T, Iterable_T;
+
+// RepSet: set of representatives of equivalence classes. Contains at most one
+// element from each equivalence class.
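+// A usage sketch (case-insensitive strings chosen only for illustration,
+// using the NaiveRepSet_T implementation defined below):
+//   var set = NaiveRepSet_T('', new bool(string a, string b) {
+//     return downcase(a) == downcase(b);
+//   });
+//   set.add('Foo');
+//   assert(set.contains('FOO'));      // same equivalence class
+//   assert(set.get('FOO') == 'Foo');  // the stored representative is returned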
+
+
+struct RepSet_T {
+  restricted T nullT;
+  restricted bool equiv(T, T) = operator ==;
+  restricted bool isNullT(T) = null;
+  restricted void operator init() {}
+  restricted void operator init(T nullT,
+      bool equiv(T a, T b) = operator ==,
+      bool isNullT(T) = new bool(T t) { return equiv(t, nullT); }) {
+    this.nullT = nullT;
+    this.equiv = equiv;
+    this.isNullT = isNullT;
+  }
+
+  // Creates a new, empty RepSet with the same implementation, nullT,
+  // isNullT, and equiv as this one.
+  RepSet_T newEmpty();
+
+  int size();
+  bool empty() {
+    return size() == 0;
+  }
+  bool contains(T item);
+  // Returns the equivalent item in the set, or nullT if the set
+  // contains no equivalent item. Throws an error if nullT was never set.
+  T get(T item);
+  // Returns an iterator over the items in the set.
+  Iter_T operator iter();
+  // If an equivalent item was already present, returns false. Otherwise, adds
+  // the item and returns true. No-op if isNullT is defined and isNullT(item)
+  // is true.
+  bool add(T item);
+  // Inserts item, and returns the item that was replaced, or nullT if
+  // no item was replaced. Throws an error if nullT was never set.
+  // No-op if isNullT is defined and isNullT(item) is true.
+  // QUESTION: Should we throw an error even if nullT was not needed,
+  // i.e., if there was already an equivalent item in the collection?
+  T update(T item);
+  // Removes the equivalent item from the set, and returns it. Returns
+  // nullT if there is no equivalent item. Throws an error if
+  // there is no equivalent item and nullT was never set.
+  T delete(T item);
+
+  autounravel Iterable_T operator cast(RepSet_T set) {
+    return Iterable_T(set.operator iter);
+  }
+
+  void addAll(Iterable_T other) {
+    for (T item : other) {
+      add(item);
+    }
+  }
+  void removeAll(Iterable_T other) {
+    for (T item : other) {
+      delete(item);
+    }
+  }
+
+  autounravel bool operator <=(RepSet_T a, RepSet_T b) {
+    for (var item : a) {
+      if (!b.contains(item)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  autounravel bool operator >=(RepSet_T a, RepSet_T b) {
+    return b <= a;
+  }
+
+  autounravel bool operator ==(RepSet_T a, RepSet_T b) {
+    return a <= b && a >= b;
+  }
+
+  autounravel bool operator !=(RepSet_T a, RepSet_T b) {
+    return !(a == b);
+  }
+
+  autounravel bool sameElementsInOrder(RepSet_T a, RepSet_T b) {
+    bool equiv(T ai, T bi) {
+      return a.equiv(ai, bi) && b.equiv(ai, bi);
+    }
+    var iterA = a.operator iter();
+    var iterB = b.operator iter();
+    while (iterA.valid() && iterB.valid()) {
+      if (!equiv(iterA.get(), iterB.get())) {
+        return false;
+      }
+      iterA.advance();
+      iterB.advance();
+    }
+    return iterA.valid() == iterB.valid();
+  }
+
+  autounravel RepSet_T operator +(RepSet_T a, Iterable_T b) {
+    RepSet_T result = a.newEmpty();
+    for (T item : a) {
+      result.add(item);
+    }
+    for (T item : b) {
+      result.add(item);
+    }
+    return result;
+  }
+
+  autounravel RepSet_T operator -(RepSet_T a, RepSet_T b) {
+    RepSet_T result = a.newEmpty();
+    for (T item : a) {
+      if (!b.contains(item)) {
+        result.add(item);
+      }
+    }
+    return result;
+  }
+
+}
+
+
+// A reference implementation, inefficient but suitable for testing.
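+// Every operation below is a linear scan of the backing array, so expect O(n)
+// per call; it exists to cross-check HashRepSet_T, not for production use.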
+struct NaiveRepSet_T { + RepSet_T super; + unravel super; + private T[] items; + restricted void operator init() { + typedef void F(); + ((F)super.operator init)(); + } + restricted void operator init(T nullT, + bool equiv(T a, T b) = operator ==, + bool isNullT(T) = new bool(T t) { return equiv(t, nullT); }) { + typedef void F(T, bool equiv(T, T), bool isNullT(T)); + ((F)super.operator init)(nullT, equiv, isNullT); + } + + super.size = new int() { + return items.length; + }; + + super.contains = new bool(T item) { + for (T i : items) { + if (equiv(i, item)) { + return true; + } + } + return false; + }; + + super.get = new T(T item) { + for (T i : items) { + if (equiv(i, item)) { + return i; + } + } + return nullT; + }; + + super.operator iter = new Iter_T() { + return Iter_T(items); + }; + + super.add = new bool(T item) { + if (isNullT != null && isNullT(item)) { + return false; + } + if (contains(item)) { + return false; + } + items.push(item); + return true; + }; + + super.update = new T(T item) { + if (isNullT != null && isNullT(item)) { + return nullT; + } + for (int i = 0; i < items.length; ++i) { + if (equiv(items[i], item)) { + T result = items[i]; + items[i] = item; + return result; + } + } + items.push(item); + assert(isNullT != null, 'item not found'); + return nullT; + }; + + super.delete = new T(T item) { + for (int i = 0; i < items.length; ++i) { + if (equiv(items[i], item)) { + T result = items[i]; + items.delete(i); + return result; + } + } + assert(isNullT != null, 'item not found'); + return nullT; + }; + + autounravel Iterable_T operator cast(NaiveRepSet_T set) { + return Iterable_T(set.operator iter); + } + + autounravel RepSet_T operator cast(NaiveRepSet_T set) { + return set.super; + } + + super.newEmpty = new RepSet_T() { + return NaiveRepSet_T(nullT, equiv, isNullT); + }; + + autounravel T[] operator ecast(NaiveRepSet_T set) { + T[] result; + for (T item : set.items) { + result.push(item); + } + return result; + } +} diff --git a/tests/template/imports/sortedset.asy b/base/collections/sortedset.asy similarity index 74% rename from tests/template/imports/sortedset.asy rename to base/collections/sortedset.asy index bcef6889e..70799c7d6 100644 --- a/tests/template/imports/sortedset.asy +++ b/base/collections/sortedset.asy @@ -1,6 +1,6 @@ typedef import(T); -from "template/imports/pureset"(T=T) access Set_T, operator cast, makeNaiveSet; +from pureset(T=T) access Set_T, makeNaiveSet; struct SortedSet_T { int size(); @@ -21,59 +21,66 @@ struct SortedSet_T { T popMin(); // Returns emptyresponse if collection is empty. T max(); // Returns emptyresponse if collection is empty. T popMax(); // Returns emptyresponse if collection is empty. - bool insert(T item); // Returns true iff the collection is modified. - T replace(T item); // Inserts item, and returns the item that was + bool add(T item); // Returns true iff the collection is modified. + T update(T item); // Inserts item, and returns the item that was // replaced, or emptyresponse if no item was replaced. - bool delete(T item); // Returns true iff the collection is modified. + T delete(T item); // Returns the removed item, or emptyresponse if no + // such item was found. // Calls process on each item in the collection, in ascending order, // until process returns false. 
void forEach(bool process(T item)); -} -T[] operator cast(SortedSet_T set) { - T[] result; - set.forEach(new bool(T item) { - result.push(item); - return true; - }); - return result; -} + autounravel T[] operator cast(SortedSet_T set) { + T[] result; + set.forEach(new bool(T item) { + result.push(item); + return true; + }); + return result; + } + + autounravel Set_T operator cast(SortedSet_T sorted_set) { + Set_T set = new Set_T; + set.size = sorted_set.size; + set.empty = sorted_set.empty; + set.contains = sorted_set.contains; + set.add = sorted_set.add; + set.update = sorted_set.update; + set.get = sorted_set.get; + set.delete = sorted_set.delete; + set.forEach = sorted_set.forEach; + return set; + } -Set_T unSort(SortedSet_T sorted_set) { - Set_T set = new Set_T; - set.size = sorted_set.size; - set.empty = sorted_set.empty; - set.contains = sorted_set.contains; - set.insert = sorted_set.insert; - set.replace = sorted_set.replace; - set.get = sorted_set.get; - set.delete = sorted_set.delete; - set.forEach = sorted_set.forEach; - return set; } -Set_T operator cast(SortedSet_T) = unSort; +Set_T unSort(SortedSet_T sorted_set) = new Set_T(SortedSet_T sorted_set) { return sorted_set; }; // For testing purposes, we provide a naive implementation of SortedSet_T. // This implementation is highly inefficient, but it is correct, and can be // used to test other implementations of SortedSet_T. struct NaiveSortedSet_T { - private bool lt(T a, T b); + private bool lt(T a, T b) = null; private T[] buffer = new T[0]; private T emptyresponse; - private bool leq(T a, T b) { + private bool leq(T, T), gt(T, T), geq(T, T), equiv(T, T); + + leq = new bool(T a, T b) { return !lt(b, a); - } - private bool gt(T a, T b) { + }; + + gt = new bool(T a, T b) { return lt(b, a); - } - private bool geq(T a, T b) { + }; + + geq = new bool(T a, T b) { return leq(b, a); - } - private bool equiv(T a, T b) { + }; + + equiv = new bool(T a, T b) { return leq(a, b) && leq(b, a); - } + }; void operator init(bool lessThan(T, T), T emptyresponse) { this.lt = lessThan; @@ -128,7 +135,7 @@ struct NaiveSortedSet_T { return buffer.pop(); } - bool insert(T item) { + bool add(T item) { for (int i = 0; i < buffer.length; ++i) { if (equiv(buffer[i], item)) return false; else if (gt(buffer[i], item)) { @@ -140,7 +147,7 @@ struct NaiveSortedSet_T { return true; } - T replace(T item) { + T update(T item) { for (int i = 0; i < buffer.length; ++i) { if (equiv(buffer[i], item)) { T toreturn = buffer[i]; @@ -163,14 +170,15 @@ struct NaiveSortedSet_T { return emptyresponse; } - bool delete(T item) { + T delete(T item) { for (int i = 0; i < buffer.length; ++i) { - if (equiv(buffer[i], item)) { + T candidate = buffer[i]; + if (equiv(candidate, item)) { buffer.delete(i); - return true; + return candidate; } } - return false; + return emptyresponse; } void forEach(bool process(T item)) { @@ -190,8 +198,8 @@ SortedSet_T operator cast(NaiveSortedSet_T naive) { toreturn.popMin = naive.popMin; toreturn.max = naive.max; toreturn.popMax = naive.popMax; - toreturn.insert = naive.insert; - toreturn.replace = naive.replace; + toreturn.add = naive.add; + toreturn.update = naive.update; toreturn.get = naive.get; toreturn.delete = naive.delete; toreturn.forEach = naive.forEach; diff --git a/tests/template/imports/splaytree.asy b/base/collections/splaytree.asy similarity index 91% rename from tests/template/imports/splaytree.asy rename to base/collections/splaytree.asy index d5850482f..cae7eb111 100644 --- a/tests/template/imports/splaytree.asy +++ 
b/base/collections/splaytree.asy @@ -1,9 +1,6 @@ typedef import(T); -from "template/imports/sortedset"(T=T) access - Set_T, - SortedSet_T, - operator cast; +from sortedset(T=T) access Set_T, SortedSet_T; private struct treenode { treenode leftchild; @@ -152,9 +149,10 @@ private treenode splay(treenode[] ancestors, bool lessthan(T a, T b)) { struct SplayTree_T { private treenode root = null; restricted int size = 0; - private bool operator < (T a, T b); + private T emptyresponse; + private bool operator < (T a, T b); void operator init(bool lessthan(T,T), T emptyresponse) { operator< = lessthan; this.emptyresponse = emptyresponse; @@ -355,7 +353,7 @@ struct SplayTree_T { /* * returns true iff the tree was modified */ - bool insert(T value) { + bool add(T value) { if (root == null) { root = treenode(value); ++size; @@ -392,9 +390,9 @@ struct SplayTree_T { return true; } - T replace(T item) { + T update(T item) { if (root == null) { - insert(item); + add(item); return emptyresponse; } treenode[] ancestors = new treenode[0]; @@ -456,9 +454,9 @@ struct SplayTree_T { } /* - * returns true iff the tree was modified + * returns the removed item, or emptyresponse if the item was not found */ - bool delete(T value) { + T delete(T value) { treenode[] ancestors = new treenode[0]; ancestors.cyclic = true; // Makes ancestors[-1] refer to the last entry. ancestors.push(root); @@ -468,7 +466,7 @@ struct SplayTree_T { if (current == null) { ancestors.pop(); root = splay(ancestors, operator<); - return false; + return emptyresponse; } if (value < current.value) ancestors.push(current.leftchild); @@ -478,6 +476,7 @@ struct SplayTree_T { } treenode toDelete = ancestors.pop(); + T retv = toDelete.value; treenode parent = null; if (ancestors.length > 0) parent = ancestors[-1]; @@ -510,40 +509,39 @@ struct SplayTree_T { if (parent != null) root = splay(ancestors, operator<); --size; - return true; + return retv; } void forEach(bool run(T)) { inOrderNonRecursive(root, run); } - -} -SortedSet_T operator cast(SplayTree_T splaytree) { - SortedSet_T result = new SortedSet_T; - result.size = splaytree.size; - result.empty = splaytree.empty; - result.contains = splaytree.contains; - result.after = splaytree.after; - result.before = splaytree.before; - result.firstGEQ = splaytree.firstGEQ; - result.firstLEQ = splaytree.firstLEQ; - result.min = splaytree.min; - result.popMin = splaytree.popMin; - result.max = splaytree.max; - result.popMax = splaytree.popMax; - result.insert = splaytree.insert; - result.replace = splaytree.replace; - result.get = splaytree.get; - result.delete = splaytree.delete; - result.forEach = splaytree.forEach; - return result; -} + autounravel SortedSet_T operator cast(SplayTree_T splaytree) { + SortedSet_T result = new SortedSet_T; + result.size = splaytree.size; + result.empty = splaytree.empty; + result.contains = splaytree.contains; + result.after = splaytree.after; + result.before = splaytree.before; + result.firstGEQ = splaytree.firstGEQ; + result.firstLEQ = splaytree.firstLEQ; + result.min = splaytree.min; + result.popMin = splaytree.popMin; + result.max = splaytree.max; + result.popMax = splaytree.popMax; + result.add = splaytree.add; + result.update = splaytree.update; + result.get = splaytree.get; + result.delete = splaytree.delete; + result.forEach = splaytree.forEach; + return result; + } -Set_T operator cast(SplayTree_T splaytree) { - return (SortedSet_T)splaytree; -} + autounravel Set_T operator cast(SplayTree_T splaytree) { + return (SortedSet_T)splaytree; + } -T[] operator 
cast(SplayTree_T splaytree) {
-  return (SortedSet_T)splaytree;
+  autounravel T[] operator cast(SplayTree_T splaytree) {
+    return (SortedSet_T)splaytree;
+  }
 }
\ No newline at end of file
diff --git a/base/collections/wrapper.asy b/base/collections/wrapper.asy
new file mode 100644
index 000000000..0ba93178d
--- /dev/null
+++ b/base/collections/wrapper.asy
@@ -0,0 +1,19 @@
+typedef import(T);
+
+struct Wrapped_T {
+  T t;
+  void operator init(T t) {
+    this.t = t;
+  }
+  autounravel bool operator ==(Wrapped_T a, Wrapped_T b) {
+    return a.t == b.t;
+  }
+  autounravel bool operator !=(Wrapped_T a, Wrapped_T b) {
+    // Let's not assume that != was overloaded.
+    return !(a.t == b.t);
+  }
+}
+
+Wrapped_T wrap(T t) {
+  return Wrapped_T(t);
+}
\ No newline at end of file
diff --git a/tests/template/imports/zip.asy b/base/collections/zip.asy
similarity index 100%
rename from tests/template/imports/zip.asy
rename to base/collections/zip.asy
diff --git a/base/set_smallpositiveint.asy b/base/set_smallpositiveint.asy
new file mode 100644
index 000000000..bbe17c470
--- /dev/null
+++ b/base/set_smallpositiveint.asy
@@ -0,0 +1,94 @@
+from pureset(T=int) access
+    Set_T as Set_int;
+
+struct Set_smallPositiveInt {
+  bool[] buffer = new bool[];
+
+  int size() {
+    return sum(buffer);
+  }
+
+  bool empty() {
+    return all(!buffer);
+  }
+
+  bool contains(int item) {
+    if (item < 0 || item >= buffer.length) {
+      return false;
+    }
+    return buffer[item];
+  }
+
+  bool insert(int item) {
+    if (item < 0) {
+      return false;
+    }
+    while (item >= buffer.length) {
+      buffer.push(false);
+    }
+    if (buffer[item]) {
+      return false;
+    }
+    buffer[item] = true;
+    return true;
+  }
+
+  int replace(int item) {
+    if (item < 0) {
+      return -1;
+    }
+    while (item >= buffer.length) {
+      buffer.push(false);
+    }
+    if (buffer[item]) {
+      return item;
+    }
+    buffer[item] = true;
+    return -1;
+  }
+
+  int get(int item) {
+    if (item < 0 || item >= buffer.length) {
+      return -1;
+    }
+    if (buffer[item]) {
+      return item;
+    }
+    return -1;
+  }
+
+  bool delete(int item) {
+    if (item < 0 || item >= buffer.length) {
+      return false;
+    }
+    if (buffer[item]) {
+      buffer[item] = false;
+      return true;
+    }
+    return false;
+  }
+
+  void foreach(bool process(int item)) {
+    for (int i = 0; i < buffer.length; ++i) {
+      if (buffer[i]) {
+        if (!process(i)) {
+          return;
+        }
+      }
+    }
+  }
+
+}
+
+Set_int operator cast(Set_smallPositiveInt set) {
+  Set_int result = new Set_int;
+  result.size = set.size;
+  result.empty = set.empty;
+  result.contains = set.contains;
+  result.insert = set.insert;
+  result.replace = set.replace;
+  result.get = set.get;
+  result.delete = set.delete;
+  result.foreach = set.foreach;
+  return result;
+}
\ No newline at end of file
diff --git a/camp.l b/camp.l
index d6b456606..e6469ad5a 100644
--- a/camp.l
+++ b/camp.l
@@ -420,7 +420,7 @@ operator  {adjust(); BEGIN opname; }
 BEGIN INITIAL;
 return ID; }
-[-+*/#%^!<>]|==|!=|<=|>=|&|\||\^\^|\.\.|::|--|---|\+\+|{EXTRAOPS} {
+[-+*/#%^!<>]|==|!=|<=|>=|&|\||\^\^|\.\.|::|--|---|\+\+|\[\]|\[=\]|{EXTRAOPS} {
 makeopsymbol();
 BEGIN INITIAL;
 return ID;}
diff --git a/camp.y b/camp.y
index b8831bc9f..0f1dd50fb 100644
--- a/camp.y
+++ b/camp.y
@@ -50,6 +50,20 @@ bool checkKeyword(position pos, symbol sym)
   return true;
 }
 
+// Checks whether the given symbol is "as". Returns true if it is; otherwise
+// reports an error and returns false.
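+// Presumably invoked by grammar rules that expect the "as" keyword, e.g. in
+// import clauses such as
+//   from pureset(T=int) access Set_T as Set_int;
+// so that a stray identifier in place of "as" is reported at the right spot.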
+bool checkAs(position pos, symbol sym) +{ + if (sym != symbol::trans("as")) { + em.error(pos); + em << "expected 'as' here"; + + return false; + } + return true; +} + + namespace absyntax { file *root; } using namespace absyntax; @@ -317,7 +331,8 @@ idpairlist: ; strid: - ID { $$ = $1; } + name { $$.pos = $1->getPos(); + $$.sym = $1->asPath(); } | STRING { $$.pos = $1->getPos(); $$.sym = symbol::literalTrans($1->getString()); } ; diff --git a/cmake-scripts/asy-base-files.cmake b/cmake-scripts/asy-base-files.cmake index 0622c21cb..21a927ec2 100644 --- a/cmake-scripts/asy-base-files.cmake +++ b/cmake-scripts/asy-base-files.cmake @@ -16,6 +16,11 @@ set(ASY_STATIC_SHADER_FILES sum3 vertex zero ) +set(ASY_STATIC_BASE_COLLECTIONS_FILES + enumerate genericpair hashmap hashrepset iter map queue repset sortedset splaytree + wrapper zip +) + set(OTHER_STATIC_BASE_FILES nopapersize.ps) # base dir @@ -49,6 +54,11 @@ foreach(ASY_STATIC_BASE_FILE ${ASY_STATIC_BASE_FILES}) copy_base_file(${ASY_STATIC_BASE_FILE}.asy) endforeach () +file(MAKE_DIRECTORY ${ASY_BUILD_BASE_DIR}/collections) +foreach (ASY_COLLECTION_BASE_FILE ${ASY_STATIC_BASE_COLLECTIONS_FILES}) + copy_base_file(collections/${ASY_COLLECTION_BASE_FILE}.asy) +endforeach() + foreach(OTHER_STATIC_BASE_FILE ${OTHER_STATIC_BASE_FILES}) copy_base_file(${OTHER_STATIC_BASE_FILE}) endforeach () diff --git a/cmake-scripts/asy-files.cmake b/cmake-scripts/asy-files.cmake index efd15e46d..66c341329 100644 --- a/cmake-scripts/asy-files.cmake +++ b/cmake-scripts/asy-files.cmake @@ -22,7 +22,7 @@ set(CORE_BUILD_FILES ${CAMP_BUILD_FILES} ${SYMBOL_STATIC_BUILD_FILES} env genv stm dec errormsg callable name symbol entry exp newexp stack exithandlers - access virtualfieldaccess absyn record interact fileio + access virtualfieldaccess absyn record interact fileio hashing fftw++asy parallel simpson coder coenv impdatum locate asyparser program application varinit fundec refaccess envcompleter asyprocess constructor array memory Delaunay predicates glrender tr shaders jsfile v3dfile diff --git a/cmake-scripts/subrepo-projects.cmake b/cmake-scripts/subrepo-projects.cmake index a93c0367f..27b77ddd7 100644 --- a/cmake-scripts/subrepo-projects.cmake +++ b/cmake-scripts/subrepo-projects.cmake @@ -4,6 +4,15 @@ set(LSP_REPO_ROOT ${ASY_SUBREPO_CLONE_ROOT}/LspCpp) set(TINYEXR_SUBREPO_ROOT ${ASY_SUBREPO_CLONE_ROOT}/tinyexr) set(BOEHM_GC_ROOT ${ASY_SUBREPO_CLONE_ROOT}/gc) set(LIBATOMIC_OPS_ROOT ${ASY_SUBREPO_CLONE_ROOT}/libatomic_ops) +set(HIGHWAYHASH_ROOT ${ASY_SUBREPO_CLONE_ROOT}/highwayhash) + +# highwayhash +set(OLD_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) +set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "highwayhash shared libs flag") +add_subdirectory(${HIGHWAYHASH_ROOT}) +unset(BUILD_SHARED_LIBS CACHE) +set(BUILD_SHARED_LIBS ${OLD_BUILD_SHARED_LIBS}) +list(APPEND ASY_STATIC_LIBARIES highwayhash) # boehm gc if (ENABLE_GC) diff --git a/cmake-scripts/tests-asy.cmake b/cmake-scripts/tests-asy.cmake index 7e990cfec..fe7876a94 100644 --- a/cmake-scripts/tests-asy.cmake +++ b/cmake-scripts/tests-asy.cmake @@ -84,7 +84,7 @@ add_asy_tests( if (ENABLE_GC) add_asy_tests( TEST_DIR gc - TESTS array funcall guide label path shipout string struct transform + TESTS array file funcall guide label path pen shipout string struct transform TEST_ARTIFACTS .eps TEST_NOT_PART_OF_CHECK_TEST true ) @@ -98,7 +98,7 @@ if (ENABLE_GSL) endif() add_asy_tests(TEST_DIR imp TESTS unravel) -add_asy_tests(TEST_DIR io TESTS csv) +add_asy_tests(TEST_DIR io TESTS csv read) add_asy_tests(TEST_DIR output TESTS circle 
line TEST_ARTIFACTS circle.eps line.eps)
 add_asy_tests(TEST_DIR pic TESTS trans)
 add_asy_tests(
@@ -109,14 +109,21 @@ add_asy_tests(
   TEST_DIR types
   TESTS
   autounravel builtinOps cast constructor ecast guide
-  init keyword order resolve shadow spec var
+  init keyword order overrideEquals resolve shadow spec var
 )
 
 add_asy_tests(
   TEST_DIR template
   TESTS
   initTest functionTest mapArrayTest multiImport nestedImport
-  singletype sortedsetTest splaytreeTest structTest
+  singletype structTest
+)
+
+add_asy_tests(
+  TEST_DIR datastructures
+  TESTS
+  bracketsTest changeWhileIterTest enumerateTest hashmapTest
+  hashrepsetTest hashTest iterTest queueTest
 )
 add_asy_tests(
diff --git a/common.h b/common.h
index d968d51be..1a4f427bd 100644
--- a/common.h
+++ b/common.h
@@ -53,6 +53,15 @@ using nonstd::optional;
 using nonstd::nullopt;
 using nonstd::make_optional;
+
+#if __cplusplus < 202002L
+# include "span.hpp"
+using nonstd::span;
+#else
+# include <span>
+using std::span;
+#endif
+
 using std::make_pair;
 
 #if !defined(FOR_SHARED) && \
diff --git a/dec.cc b/dec.cc
index f8673f9c8..70acc9f9a 100644
--- a/dec.cc
+++ b/dec.cc
@@ -71,9 +71,11 @@ bool usableInTemplate(ty *t) {
 }
 
-trans::tyEntry *astType::transAsTyEntry(coenv &e, record *where)
+trans::tyEntry* astType::transAsTyEntry(coenv& e, record* where)
 {
-  return new trans::tyEntry(trans(e, false), nullptr, where, getPos());
+  return new trans::tyEntry(
+      trans(e, ErrorMode::NORMAL), nullptr, where, getPos()
+  );
 }
 
 
@@ -113,17 +115,16 @@ void addNameOps(coenv &e, record *r, record *qt, varEntry *qv, position pos)
 {
 void nameTy::addOps(coenv &e, record *r, AutounravelOption opt)
 {
-  if (opt == AutounravelOption::Apply)
-  {
-    if (record* qt= dynamic_cast<record*>(id->getType(e, true)); qt)
-    {
+  if (opt == AutounravelOption::Apply) {
+    if (record* qt= dynamic_cast<record*>(id->getType(e, ErrorMode::SUPPRESS));
+        qt) {
       varEntry* qv= id->getVarEntry(e);
       addNameOps(e, r, qt, qv, getPos());
     }
   }
 }
 
-types::ty *nameTy::trans(coenv &e, bool tacit)
+types::ty *nameTy::trans(coenv &e, ErrorMode tacit)
 {
   return id->typeTrans(e, tacit);
 }
@@ -144,9 +145,9 @@ void dimensions::prettyprint(ostream &out, Int indent)
   out << "dimensions (" << depth << ")\n";
 }
 
-types::array *dimensions::truetype(types::ty *base, bool tacit)
+types::array *dimensions::truetype(types::ty *base, ErrorMode tacit)
 {
-  if (!tacit && base->kind == ty_void) {
+  if (tacit==ErrorMode::NORMAL && base->kind == ty_void) {
     em.error(getPos());
     em << "cannot declare array of type void";
   }
@@ -172,7 +173,7 @@ void arrayTy::prettyprint(ostream &out, Int indent)
 
 // NOTE: Can this be merged with trans somehow?
 void arrayTy::addOps(coenv &e, record *r, AutounravelOption)
 {
-  types::ty *t=trans(e, true);
+  types::ty *t=trans(e, ErrorMode::SUPPRESS);
 
   // Only add ops if it is an array (and not, say, an error)
   if (t->kind == types::ty_array) {
@@ -184,7 +185,7 @@
   }
 }
 
-types::ty *arrayTy::trans(coenv &e, bool tacit)
+types::ty *arrayTy::trans(coenv &e, ErrorMode tacit)
 {
   types::ty *ct = cell->trans(e, tacit);
   assert(ct);
@@ -220,7 +221,7 @@ void tyEntryTy::prettyprint(ostream &out, Int indent)
   out << "tyEntryTy: " << *(ent->t) << "\n";
 }
 
-types::ty *tyEntryTy::trans(coenv &, bool) {
+types::ty *tyEntryTy::trans(coenv &, ErrorMode) {
   return ent->t;
 }
@@ -566,7 +567,7 @@ void decidstart::prettyprint(ostream &out, Int indent)
     dims->prettyprint(out, indent+1);
 }
 
-types::ty *decidstart::getType(types::ty *base, coenv &, bool)
+types::ty *decidstart::getType(types::ty *base, coenv &, ErrorMode)
 {
   return dims ? dims->truetype(base) : base;
 }
@@ -575,7 +576,8 @@ trans::tyEntry *decidstart::getTyEntry(trans::tyEntry *base, coenv &e,
                                        record *where)
 {
   return dims ? new trans::tyEntry(
-                    getType(base->t, e, false), nullptr, where, getPos()
+                    getType(base->t, e, ErrorMode::NORMAL), nullptr, where,
+                    getPos()
                 ) : base;
 }
@@ -604,7 +606,7 @@ void decidstart::addOps(types::ty *base, coenv &e, record *r)
     params->prettyprint(out, indent+1);
 }
 
-types::ty *fundecidstart::getType(types::ty *base, coenv &e, bool tacit)
+types::ty *fundecidstart::getType(types::ty *base, coenv &e, ErrorMode tacit)
 {
   types::ty *result = decidstart::getType(base, e, tacit);
 
@@ -617,21 +619,23 @@
   }
 }
 
-trans::tyEntry *fundecidstart::getTyEntry(trans::tyEntry *base, coenv &e,
-                                          record *where)
+trans::tyEntry*
+fundecidstart::getTyEntry(trans::tyEntry* base, coenv& e, record* where)
 {
-  return new trans::tyEntry(getType(base->t,e,false), nullptr, where, getPos());
+  return new trans::tyEntry(
+      getType(base->t, e, ErrorMode::NORMAL), nullptr, where, getPos()
+  );
 }
 
-void fundecidstart::addOps(types::ty *base, coenv &e, record *r)
+void fundecidstart::addOps(types::ty* base, coenv& e, record* r)
 {
   decidstart::addOps(base, e, r);
   params->addOps(e, r);
 
-  types::function *ft=dynamic_cast<types::function *>(getType(base, e, true));
+  types::function* ft=
+      dynamic_cast<types::function*>(getType(base, e, ErrorMode::SUPPRESS));
   assert(ft);
-
 }
@@ -1113,7 +1117,7 @@ void recordInitializer(coenv &e, symbol id, record *r, position here)
   assert(r);
   {
     e.c.pushModifier(AUTOUNRAVEL);
-    function *ft = fun.transType(e, false);
+    function *ft = fun.transType(e, ErrorMode::NORMAL);
     assert(ft);
 
     symbol initSym=symbol::opTrans("init");
@@ -1200,7 +1204,9 @@ class PermissionSetter {
   coder &c;
   permission oldPerm;
 public:
-  PermissionSetter(coder &c, permission newPerm) : c(c), oldPerm(c.getPermission()) {
+  PermissionSetter(coder& c, permission newPerm)
+      : c(c), oldPerm(c.getPermission())
+  {
     c.setPermission(newPerm);
   }
   ~PermissionSetter() {
@@ -1295,7 +1301,7 @@ void unraveldec::prettyprint(ostream &out, Int indent)
 fromdec::qualifier unraveldec::getQualifier(coenv &e, record *)
 {
   // getType is where errors in the qualifier are reported.
-  record *qt=dynamic_cast<record *>(id->getType(e, false));
+  record *qt=dynamic_cast<record *>(id->getType(e, ErrorMode::NORMAL));
   if (!qt) {
     em.error(getPos());
     em << "qualifier is not a record";
@@ -1442,6 +1448,7 @@ void recorddec::transAsField(coenv &e, record *parent)
   // the default initializer first.
re.c.closeRecord(); + r->computeKVTypes(getPos()); // Add types and variables defined during the record that should be added to // the enclosing environment. These are the implicit constructors defined by diff --git a/dec.h b/dec.h index 95b1aef83..c34118617 100644 --- a/dec.h +++ b/dec.h @@ -66,7 +66,7 @@ class astType : public absyn { // Returns the internal representation of the type. This method can // be called by exp::getType which does not report errors, so tacit is // needed to silence errors in this case. - virtual types::ty *trans(coenv &e, bool tacit = false) = 0; + virtual types::ty *trans(coenv &e, ErrorMode tacit=ErrorMode::NORMAL) = 0; virtual trans::tyEntry *transAsTyEntry(coenv &e, record *where); @@ -93,7 +93,7 @@ class nameTy : public astType { void addOps(coenv& e, record* r, AutounravelOption opt= AutounravelOption::Apply) override; - types::ty *trans(coenv &e, bool tacit = false) override; + types::ty *trans(coenv &e, ErrorMode tacit=ErrorMode::NORMAL) override; trans::tyEntry *transAsTyEntry(coenv &e, record *where) override; virtual operator string() const override; @@ -114,7 +114,7 @@ class dimensions : public absyn { return depth; } - types::array *truetype(types::ty *base, bool tacit=false); + types::array *truetype(types::ty *base, ErrorMode tacit=ErrorMode::NORMAL); }; class arrayTy : public astType { @@ -134,7 +134,7 @@ class arrayTy : public astType { addOps(coenv& e, record* r, AutounravelOption opt= AutounravelOption::Apply) override; - types::ty *trans(coenv &e, bool tacit = false) override; + types::ty *trans(coenv &e, ErrorMode tacit=ErrorMode::NORMAL) override; operator string() const override; }; @@ -151,7 +151,7 @@ class tyEntryTy : public astType { void prettyprint(ostream &out, Int indent) override; - types::ty *trans(coenv &e, bool tacit = false) override; + types::ty *trans(coenv &e, ErrorMode tacit=ErrorMode::NORMAL) override; trans::tyEntry *transAsTyEntry(coenv &, record *) override { return ent; } @@ -362,7 +362,8 @@ class decidstart : public absyn { virtual void prettyprint(ostream &out, Int indent) override; - virtual types::ty *getType(types::ty *base, coenv &, bool = false); + virtual types::ty* + getType(types::ty* base, coenv&, ErrorMode tacit= ErrorMode::NORMAL); virtual trans::tyEntry *getTyEntry(trans::tyEntry *base, coenv &e, record *where); @@ -391,9 +392,10 @@ class fundecidstart : public decidstart { void prettyprint(ostream &out, Int indent); - types::ty *getType(types::ty *base, coenv &e, bool tacit = false); - trans::tyEntry *getTyEntry(trans::tyEntry *base, coenv &e, record *where); - void addOps(types::ty *base, coenv &e, record *r); + types::ty* + getType(types::ty* base, coenv& e, ErrorMode tacit= ErrorMode::NORMAL); + trans::tyEntry* getTyEntry(trans::tyEntry* base, coenv& e, record* where); + void addOps(types::ty* base, coenv& e, record* r); }; class decid : public absyn { diff --git a/errormsg.cc b/errormsg.cc index 61992d963..856635bf1 100644 --- a/errormsg.cc +++ b/errormsg.cc @@ -63,6 +63,8 @@ void errorstream::clear() void errorstream::message(position pos, const string& s) { + if (mode == ErrorMode::SUPPRESS) + return; if (floating) out << endl; out << pos << ": " << s; floating = true; @@ -70,6 +72,7 @@ void errorstream::message(position pos, const string& s) void errorstream::compiler(position pos) { + mode = ErrorMode::FORCE; message(pos,"Compiler bug; report to https://github.com/vectorgraphics/asymptote/issues:\n"); anyErrors = true; } @@ -81,36 +84,47 @@ void errorstream::compiler() void 
errorstream::runtime(position pos) { + if (mode == ErrorMode::SUPPRESS) + return; message(pos,"runtime: "); anyErrors = true; } void errorstream::error(position pos) { + if (mode == ErrorMode::SUPPRESS) + return; message(pos,""); anyErrors = true; } void errorstream::warning(position pos, string s) { + if (mode == ErrorMode::SUPPRESS) + return; message(pos,"warning ["+s+"]: "); anyWarnings = true; } void errorstream::warning(position pos) { + if (mode == ErrorMode::SUPPRESS) + return; message(pos,"warning: "); anyWarnings = true; } void errorstream::fatal(position pos) { + mode = ErrorMode::FORCE; message(pos,"abort: "); anyErrors = true; } void errorstream::trace(position pos) { + if (mode == ErrorMode::SUPPRESS) + return; static position lastpos; if(!pos || (pos.match(lastpos.filename()) && pos.match(lastpos.Line()))) return; diff --git a/errormsg.h b/errormsg.h index df6398a2c..b00a9c3f8 100644 --- a/errormsg.h +++ b/errormsg.h @@ -162,6 +162,13 @@ inline bool operator == (const position& a, const position& b) string warning(string s); +enum class ErrorMode +{ + SUPPRESS,// Suppress warnings and errors. + NORMAL, + FORCE,// Like normal mode, but ignores attempts to change the mode. +}; + class errorstream { ostream& out; bool anyErrors; @@ -171,6 +178,13 @@ class errorstream { // Is there an error that warrants the asy process to return 1 instead of 0? bool anyStatusErrors; + ErrorMode mode; + void setMode(ErrorMode newMode) + { + if (mode != ErrorMode::FORCE) + mode= newMode; + } + public: static bool interrupt; // Is there a pending interrupt? @@ -179,7 +193,7 @@ class errorstream { errorstream(ostream& out = cerr) : out(out), anyErrors(false), anyWarnings(false), floating(false), - anyStatusErrors(false) {} + anyStatusErrors(false), mode(ErrorMode::NORMAL) {} void clear(); @@ -218,8 +232,10 @@ class errorstream { // NOTE: May later make it do automatic line breaking for long messages. 
template<typename T> errorstream& operator << (const T& x) { - flush(out); - out << x; + if (mode != ErrorMode::SUPPRESS) { + flush(out); + out << x; + } return *this; } @@ -246,6 +262,21 @@ class errorstream { bool processStatus() const { return !anyStatusErrors; } + + class ModeGuard + { + errorstream& es; + ErrorMode oldMode; + + public: + ModeGuard(errorstream& es, ErrorMode newMode) : es(es), oldMode(es.mode) + { + es.setMode(newMode); + } + ~ModeGuard() { es.setMode(oldMode); } + }; + + ModeGuard modeGuard(ErrorMode newMode) { return ModeGuard(*this, newMode); } }; extern errorstream em; diff --git a/errors b/errors index a679ea016..33c2e9986 100644 --- a/errors +++ b/errors @@ -151,9 +151,8 @@ errortest.asy: 438.17: cannot cast 'int' to 'var' errortest.asy: 442.7: could not infer type of initializer errortest.asy: 446.7: could not infer type of initializer errortest.asy: 448.7: could not infer type of initializer -errortest.asy: 452.16: expression is not an array of inferable type -errortest.asy: 457.16: expression is not an array of inferable type -errortest.asy: 463.16: expression is not an array of inferable type +errortest.asy: 452.16: cannot iterate over expression of type 'int' +errortest.asy: 457.16: cannot resolve type for iteration errortest.asy: 470.7: array expression cannot be used as an address errortest.asy: 519.29: expected 'as' errortest.asy: 521.30: expected 'as' @@ -204,3 +203,12 @@ errortest.asy: 626.9: accessing private field outside of structure errortest.asy: 627.4: accessing private field outside of structure errortest.asy: 628.4: accessing private field outside of structure errortest.asy: 639.4: accessing private field outside of structure +errortest.asy: 643.3: multiple operator[] definitions in one struct +errortest.asy: 650.3: operator[=] defined without operator[] +errortest.asy: 656.3: operator[=] must return void +errortest.asy: 667.3: no matching variable '.valid' +errortest.asy: 667.3: no matching variable '.get' +errortest.asy: 667.3: no matching variable '.advance' +errortest.asy: 677.16: cannot iterate over expression of type 'int(int i)' +errortest.asy: 684.17: cannot call 'int f(int i)' with parameter 'string' +errortest.asy: 687.17: cannot call 'int f(int i)' with parameter 'string' diff --git a/errortest.asy b/errortest.asy index e5ffbb59b..50dcf3c7b 100644 --- a/errortest.asy +++ b/errortest.asy @@ -638,3 +638,52 @@ } T.x; // incorrectly accessing private field } +{ + // multiple signatures for operator[] + struct A { + int operator[](string); + int operator[](int); + } +} +{ + // operator[=] without operator[] + struct A { + void operator[=](int); + } +} +{ + // non-void operator[=] + struct A { + int operator[](string); + int operator[=](string, int); + } +} +{ + // operator iter returns a non-iterable type + struct A { + int operator iter() { return 0; } + } + A a; + for (var i : a) + ; +} +{ + // Implicitly cast a function to an array + using Function = int(int); + int[] operator cast(Function f) { + return sequence(f, 10); + } + int f(int i) { return i + 17; } + for (var i : f) // This would work if we used `int` rather than `var`.
+ ; +} +{ + // Iterate over an ill-formed expression + int f(int i) { return 7; } + // cannot call 'int f(int i)' with parameter 'string' + for (int i : f('asdf')) + ; + // cannot call 'int f(int i)' with parameter 'string' + for (var i : f('asdf')) + ; +} \ No newline at end of file diff --git a/exp.cc b/exp.cc index 28f16c477..1efcc748c 100644 --- a/exp.cc +++ b/exp.cc @@ -141,6 +141,15 @@ types::ty *tempExp::trans(coenv &e) { return t; } +exp *tempExp::evaluate(coenv &e, types::ty *target) { + if (equivalent(target, t)) { + // A tempExp, by design, has no side effects. + return this; + } + // Apply implicit cast. + return new tempExp(e, this, target); +} + varEntryExp::varEntryExp(position pos, types::ty *t, access *a) : exp(pos), v(new trans::varEntry(t, a, 0, nullPos)) {} @@ -186,6 +195,14 @@ void nameExp::prettyprint(ostream &out, Int indent) value->prettyprint(out, indent+1); } +exp *nameExp::evaluate(coenv &e, types::ty *target) { + // Names have no side effects unless an implicit cast is needed. + if (equivalent(target, cgetType(e))) { + // No side effects. + return this; + } + return new tempExp(e, this, target); +} void fieldExp::pseudoName::prettyprint(ostream &out, Int indent) { @@ -214,14 +231,30 @@ types::ty *fieldExp::getObject(coenv& e) return t; }
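The evaluate methods introduced in this file share one contract: run an expression's side effects at most once, and return an expression (often a tempExp) that can be re-translated cheaply and safely. A rough standalone analogy in plain C++ (not Asymptote internals):

```
#include <functional>
#include <iostream>

// "Evaluating" runs side effects once and caches the value, so code that
// must reference the expression twice (as transWrite does with `value`)
// stays correct.
template <typename T>
std::function<T()> evaluateOnce(const std::function<T()>& expr) {
  T cached = expr();  // side effects happen exactly once, here
  return [cached] { return cached; };
}

int main() {
  int calls = 0;
  auto v = evaluateOnce<int>([&] { ++calls; return 21; });
  std::cout << v() + v() << " calls=" << calls << "\n";  // prints "42 calls=1"
  return 0;
}
```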
+exp *fieldExp::evaluate(coenv &e, types::ty *t) { + if (equivalent(cgetType(e), t)) { + // Evaluate the object. + return new fieldExp(getPos(), + object->evaluate(e, getObject(e)), + field); + } + // Evaluate `this` and cast it to the correct type. + return new tempExp(e, this, t); +} -array *arrayExp::getArrayType(coenv &e) +types::ty *bracketsExp::getObjectType(coenv &e) { + types::ty *t = object->cgetType(e); + if (t->kind == ty_overloaded) { + t = ((overloaded *)t)->signatureless(); + } + return t; +} + +array *bracketsExp::getArrayType(coenv &e) { - types::ty *a = set->cgetType(e); - if (a->kind == ty_overloaded) { - a = ((overloaded *)a)->signatureless(); - if (!a) - return 0; + types::ty *a = getObjectType(e); + if (a == nullptr) { + return nullptr; } switch (a->kind) { @@ -234,19 +267,19 @@ array *arrayExp::getArrayType(coenv &e) } } -array *arrayExp::transArray(coenv &e) +array *bracketsExp::transArray(coenv &e) { - types::ty *a = set->cgetType(e); + types::ty *a = object->cgetType(e); if (a->kind == ty_overloaded) { a = ((overloaded *)a)->signatureless(); if (!a) { - em.error(set->getPos()); + em.error(object->getPos()); em << "expression is not an array"; return 0; } } - set->transAsType(e, a); + object->transAsType(e, a); switch (a->kind) { case ty_array: @@ -254,7 +287,7 @@ array *arrayExp::transArray(coenv &e) case ty_error: return 0; default: - em.error(set->getPos()); + em.error(object->getPos()); em << "expression is not an array"; return 0; } @@ -275,12 +308,38 @@ void subscriptExp::prettyprint(ostream &out, Int indent) prettyindent(out, indent); out << "subscriptExp\n"; - set->prettyprint(out, indent+1); + object->prettyprint(out, indent+1); index->prettyprint(out, indent+1); } +callExp *buildSubscriptReadCall(exp *object, exp *index) { + // Convert object[index] into + // object.operator[](index) + const static symbol SYM_BRACKETS = symbol::trans("[]"); + position pos = object->getPos(); + return new callExp( + pos, new fieldExp(pos, object, SYM_BRACKETS), index + ); +} + +callExp *buildSubscriptWriteCall(exp *object, exp *index, exp *value) { + // Convert object[index] = value into + // object.operator[=](index, value) + const static symbol SYM_BRACKETS_ASSIGN = symbol::trans("[=]"); + position pos = object->getPos(); + return new callExp( + pos, new fieldExp(pos, object, SYM_BRACKETS_ASSIGN), index, value + ); +} + types::ty *subscriptExp::trans(coenv &e) { + // EXPERIMENTAL + if (!isAnArray(e, object)) { + callExp *call = buildSubscriptReadCall(object, index); + return call->trans(e); + } + array *a = transArray(e); if (!a) return primError(); @@ -301,6 +360,20 @@ types::ty *subscriptExp::trans(coenv &e) types::ty *subscriptExp::getType(coenv &e) { + // EXPERIMENTAL + if (!isAnArray(e, object)) { + ty *t = object->cgetType(e); + if (t->kind == ty_overloaded) { + t = ((overloaded *)t)->signatureless(); + if (!t) + return primError(); + } + if (t->kind != ty_record) { + return primError(); + } + return static_cast<record *>(t)->valType(); + } + array *a = getArrayType(e); return a ? (isAnArray(e, index) ? a : a->celltype) : primError();
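buildSubscriptReadCall and buildSubscriptWriteCall desugar `object[index]` into `object.operator[](index)` and `object[index] = value` into `object.operator[=](index, value)`, with the assignment expression still yielding `value`. A standalone C++ analogy of the protocol (all names hypothetical):

```
#include <cassert>
#include <map>

// A type opts in by providing a read accessor (Asymptote: operator[]) and a
// void-returning write accessor (Asymptote: operator[=]).
struct Table {
  std::map<int, int> data;
  int read(int i) const { return data.at(i); }  // plays operator[]
  void write(int i, int v) { data[i] = v; }     // plays operator[=]
};

// What `t[i] = v` compiles to: call the write accessor, then produce v as
// the value of the whole assignment (so chaining works).
int assignThrough(Table& t, int i, int v) {
  t.write(i, v);
  return v;
}

int main() {
  Table t;
  assert(assignThrough(t, 3, 7) == 7);
  assert(t.read(3) == 7);
  return 0;
}
```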
@@ -308,6 +381,34 @@ void subscriptExp::transWrite(coenv &e, types::ty *t, exp *value) { + // EXPERIMENTAL + if (!isAnArray(e, object)) { + // Find the types of object and index. + types::ty *objectType = getObjectType(e); + assert(objectType); + types::ty *indexType = objectType->keyType(); + if (!indexType || indexType->kind == ty_error) { + em.error(object->getPos()); + em << "object does not have operator[]"; + return; + } + // Evaluate them to control the order in which side effects occur. + // We need value evaluated because we use it twice. We need the other two + // because any side effects from their translation should occur before the + // side effects from translating value. + exp *objectEvaluated = object->evaluate(e, objectType); + exp *indexEvaluated = index->evaluate(e, indexType); + exp *valueEvaluated = value->evaluate(e, t); + // Call object.operator[=](index, value). + callExp* call= buildSubscriptWriteCall( + objectEvaluated, indexEvaluated, valueEvaluated + ); + call->trans(e); + // Push the value back on the stack as the result of the assignment. + valueEvaluated->transAsType(e, t); + return; + } + // Put array, index, and value on the stack in that order, then call // arrayWrite. array *a = transArray(e); @@ -331,6 +432,31 @@ void subscriptExp::transWrite(coenv &e, types::ty *t, exp *value) e.c.encode(inst::builtin, run::arrayWrite); } +exp *subscriptExp::evaluate(coenv &e, types::ty *) +{ + types::ty *base = object->cgetType(e); + if (base->kind == ty_overloaded) { + base = ((overloaded *)base)->signatureless(); + } + if (!base) { + em.error(object->getPos()); + em << "object to index cannot be resolved"; + return nullptr; + } + types::ty *indexType = base->keyType(); + if (indexType->kind == ty_error) { + em.error(object->getPos()); + em << "object does not have operator[=] set up correctly"; + return nullptr; + } + // Force object and index to be evaluated in the correct order. + // (Note that in C++, the order of evaluation of function arguments is + // unspecified.) + exp *a = object->evaluate(e, base); + exp *b = index->evaluate(e, indexType); + return new subscriptExp(getPos(), a, b); +} + void slice::prettyprint(ostream &out, Int indent) { @@ -361,7 +487,7 @@ void slice::trans(coenv &e) void sliceExp::prettyprint(ostream &out, Int indent) { prettyname(out, "sliceExp", indent, getPos()); - set->prettyprint(out, indent+1); + object->prettyprint(out, indent+1); index->prettyprint(out, indent+1); } @@ -1157,7 +1283,7 @@ types::ty *castExp::trans(coenv &e) types::ty *castExp::getType(coenv &e) { - return target->trans(e, true); + return target->trans(e, ErrorMode::SUPPRESS); } diff --git a/exp.h b/exp.h index 152df76de..4a6e572de 100644 --- a/exp.h +++ b/exp.h @@ -193,13 +193,15 @@ class tempExp : public exp { public: tempExp(coenv &e, varinit *v, types::ty *t); - void prettyprint(ostream &out, Int indent); + void prettyprint(ostream &out, Int indent) override; - types::ty *trans(coenv &e); + types::ty *trans(coenv &e) override; - types::ty *getType(coenv &) { + types::ty *getType(coenv &) override { return t; } + + exp *evaluate(coenv &e, types::ty *target) override; }; // Wrap a varEntry so that it can be used as an expression. @@ -267,10 +269,8 @@ class nameExp : public exp { em << "use of variable \'" << *value << "\' is ambiguous"; return types::primError(); } - else { - transAsType(e, t); - return t; - } + transAsType(e, t); + return t; } types::ty *getType(coenv &e) override { @@ -298,10 +298,7 @@ class nameExp : public exp { ct=0; // See note in transAsType. } - exp *evaluate(coenv &, types::ty *) override { - // Names have no side-effects.
- return this; - } + exp *evaluate(coenv &, types::ty *) override; }; // Most fields accessed are handled as parts of qualified names, but in cases @@ -336,8 +333,8 @@ } // As a type: - types::ty *typeTrans(coenv &, bool tacit = false) { - if (!tacit) { + types::ty *typeTrans(coenv &, ErrorMode tacit = ErrorMode::NORMAL) { + if (tacit == ErrorMode::NORMAL) { em.error(getPos()); em << "expression is not a type"; } @@ -364,6 +361,10 @@ void print(ostream& out) const { out << ""; } + void printPath(ostream& out) const { + em.compiler(getPos()); + em << "expression cannot be used as a path"; + } symbol getName() const { return object->getName(); @@ -371,7 +372,8 @@ AsymptoteLsp::SymbolLit getLit() const { - return AsymptoteLsp::SymbolLit(static_cast<std::string>(object->getName())); + return AsymptoteLsp::SymbolLit(static_cast<std::string>(object->getName()) + ); } }; @@ -392,33 +394,30 @@ class fieldExp : public nameExp { return field; } - exp *evaluate(coenv &e, types::ty *) { - // Evaluate the object. - return new fieldExp(getPos(), - new tempExp(e, object, getObject(e)), - field); - } + exp *evaluate(coenv &e, types::ty *); }; -class arrayExp : public exp { +// Common functionality for subscriptExp and sliceExp. +class bracketsExp : public exp { protected: - exp *set; + exp *object; + types::ty *getObjectType(coenv &e); array *getArrayType(coenv &e); array *transArray(coenv &e); public: - arrayExp(position pos, exp *set) - : exp(pos), set(set) {} + bracketsExp(position pos, exp *set) + : exp(pos), object(set) {} }; -class subscriptExp : public arrayExp { +class subscriptExp : public bracketsExp { exp *index; public: subscriptExp(position pos, exp *set, exp *index) - : arrayExp(pos, set), index(index) {} + : bracketsExp(pos, set), index(index) {} void prettyprint(ostream &out, Int indent); @@ -426,11 +425,7 @@ class subscriptExp : public bracketsExp { types::ty *getType(coenv &e); void transWrite(coenv &e, types::ty *t, exp *value); - exp *evaluate(coenv &e, types::ty *) { - return new subscriptExp(getPos(), - new tempExp(e, set, getArrayType(e)), - new tempExp(e, index, types::primInt())); - } + exp *evaluate(coenv &e, types::ty *); }; class slice : public absyn { @@ -458,12 +453,12 @@ class slice : public absyn { } }; -class sliceExp : public arrayExp { +class sliceExp : public bracketsExp { slice *index; public: sliceExp(position pos, exp *set, slice *index) - : arrayExp(pos, set), index(index) {} + : bracketsExp(pos, set), index(index) {} void prettyprint(ostream &out, Int indent); @@ -473,7 +468,7 @@ class sliceExp : public bracketsExp { exp *evaluate(coenv &e, types::ty *) { return new sliceExp(getPos(), - new tempExp(e, set, getArrayType(e)), + new tempExp(e, object, getArrayType(e)), index->evaluate(e)); } }; @@ -826,8 +821,8 @@ class callExp : public exp { using colorInfo = std::tuple; /** - * @return nullopt if callExp is not a color, pair if color is RGB, - * and pair if color is RGBA. + * @return nullopt if callExp is not a color, pair if color is + * RGB, and pair if color is RGBA.
*/ optional<std::tuple<colorInfo, AsymptoteLsp::posInFile, AsymptoteLsp::posInFile>> getColorInformation(); diff --git a/fundec.cc b/fundec.cc index 99e8f436c..8b13cc4a1 100644 --- a/fundec.cc +++ b/fundec.cc @@ -31,17 +31,17 @@ void formal::prettyprint(ostream &out, Int indent) { if (defval) defval->prettyprint(out, indent+1); } -types::formal formal::trans(coenv &e, bool encodeDefVal, bool tacit) { +types::formal formal::trans(coenv &e, bool encodeDefVal, ErrorMode tacit) { return types::formal(getType(e,tacit), getName(), encodeDefVal ? (bool) getDefaultValue() : 0, getExplicit()); } -types::ty *formal::getType(coenv &e, bool tacit) { +types::ty *formal::getType(coenv &e, ErrorMode tacit) { types::ty *bt = base->trans(e, tacit); types::ty *t = start ? start->getType(bt, e, tacit) : bt; - if (t->kind == ty_void && !tacit) { + if (t->kind == ty_void && tacit != ErrorMode::SUPPRESS) { em.error(getPos()); em << "cannot declare parameters of type void"; return primError(); @@ -53,7 +53,7 @@ types::ty *formal::getType(coenv &e, bool tacit) { void formal::addOps(coenv &e, record *r) { base->addOps(e, r); if (start) - start->addOps(base->trans(e, true), e, r); + start->addOps(base->trans(e, ErrorMode::SUPPRESS), e, r); } void formals::prettyprint(ostream &out, Int indent) @@ -65,7 +65,7 @@ void formals::prettyprint(ostream &out, Int indent) } void formals::addToSignature(signature& sig, - coenv &e, bool encodeDefVal, bool tacit) + coenv &e, bool encodeDefVal, ErrorMode tacit) { for (list<formal *>::iterator p = fields.begin(); p != fields.end(); ++p) { formal& f=**p; @@ -78,7 +78,7 @@ void formals::addToSignature(signature& sig, } if (rest) { - if (!tacit && rest->getDefaultValue()) { + if (tacit!=ErrorMode::SUPPRESS && rest->getDefaultValue()) { em.error(rest->getPos()); em << "rest parameters cannot have default values"; } @@ -89,7 +89,7 @@ void formals::addToSignature(signature& sig, // Returns the types of each parameter as a signature. // encodeDefVal means that it will also encode information regarding // the default values into the signature -signature *formals::getSignature(coenv &e, bool encodeDefVal, bool tacit) +signature *formals::getSignature(coenv &e, bool encodeDefVal, ErrorMode tacit) { signature *sig = new signature; addToSignature(*sig,e,encodeDefVal,tacit); @@ -101,7 +101,7 @@ signature *formals::getSignature(coenv &e, bool encodeDefVal, bool tacit) // value of types::ty *result. function *formals::getType(types::ty *result, coenv &e, bool encodeDefVal, - bool tacit) + ErrorMode tacit) { function *ft = new function(result); addToSignature(ft->sig,e,encodeDefVal,tacit); @@ -177,7 +177,7 @@ void formal::transAsVar(coenv &e, Int index) { // Suppress error messages because they will already be reported // when the formals are translated to yield the type earlier.
- types::ty *t = getType(e, true); + types::ty *t = getType(e, ErrorMode::SUPPRESS); varEntry *v = new varEntry(t, a, 0, getPos()); // Translate the default argument before adding the formal to the @@ -212,12 +212,12 @@ void fundef::prettyprint(ostream &out, Int indent) body->prettyprint(out, indent+1); } -function *fundef::transType(coenv &e, bool tacit) { +function *fundef::transType(coenv &e, ErrorMode tacit) { bool encodeDefVal=true; return params->getType(result->trans(e, tacit), e, encodeDefVal, tacit); } -function *fundef::transTypeAndAddOps(coenv &e, record *r, bool tacit) { +function *fundef::transTypeAndAddOps(coenv &e, record *r, ErrorMode tacit) { result->addOps(e,r); params->addOps(e,r); @@ -284,7 +284,7 @@ types::ty *fundef::trans(coenv &e) { // new guide[] (guide f(int)) { // return sequence(f, 10); // }; - function *ft=transTypeAndAddOps(e, (record *)0, false); + function *ft=transTypeAndAddOps(e, (record *)0, ErrorMode::NORMAL); assert(ft); baseTrans(e, ft); @@ -307,7 +307,7 @@ void fundec::trans(coenv &e) void fundec::transAsField(coenv &e, record *r) { - function *ft = fun.transTypeAndAddOps(e, r, false); + function *ft = fun.transTypeAndAddOps(e, r, ErrorMode::NORMAL); assert(ft); createVar(getPos(), e, r, id, ft, fun.makeVarInit(ft)); diff --git a/fundec.h b/fundec.h index f90d2c3f2..3e499311b 100644 --- a/fundec.h +++ b/fundec.h @@ -30,13 +30,14 @@ class formal : public absyn { virtual void prettyprint(ostream &out, Int indent) override; // Build the corresponding types::formal to put into a signature. - types::formal trans(coenv &e, bool encodeDefVal, bool tacit=false); + types::formal + trans(coenv& e, bool encodeDefVal, ErrorMode tacit= ErrorMode::NORMAL); // Add the formal parameter to the environment to prepare for the // function body's translation. virtual void transAsVar(coenv &e, Int index); - types::ty *getType(coenv &e, bool tacit=false); + types::ty *getType(coenv &e, ErrorMode tacit=ErrorMode::NORMAL); absyntax::astType *getAbsyntaxType() { return base; } @@ -84,7 +85,7 @@ class formals : public absyn { bool keywordOnly; void addToSignature(types::signature& sig, - coenv &e, bool encodeDefVal, bool tacit); + coenv &e, bool encodeDefVal, ErrorMode tacit); public: formals(position pos) : absyn(pos), rest(0), keywordOnly(false) {} @@ -126,13 +127,13 @@ class formals : public absyn { // the default values into the signature types::signature *getSignature(coenv &e, bool encodeDefVal = false, - bool tacit = false); + ErrorMode tacit = ErrorMode::NORMAL); // Returns the corresponding function type, assuming it has a return // value of "result." 
types::function *getType(types::ty *result, coenv &e, bool encodeDefVal = false, - bool tacit = false); + ErrorMode tacit = ErrorMode::NORMAL); mem::vector *getFields(); @@ -167,10 +168,11 @@ class fundef : public exp { virtual void baseTrans(coenv &e, types::function *ft); virtual types::ty *trans(coenv &e) override; - virtual types::function *transType(coenv &e, bool tacit); - virtual types::function *transTypeAndAddOps(coenv &e, record *r, bool tacit); + virtual types::function *transType(coenv &e, ErrorMode tacit); + virtual types::function* + transTypeAndAddOps(coenv& e, record* r, ErrorMode tacit); virtual types::ty *getType(coenv &e) override { - return transType(e, true); + return transType(e, ErrorMode::SUPPRESS); } void createSymMap(AsymptoteLsp::SymbolContext* symContext) override; diff --git a/hashing.cc b/hashing.cc new file mode 100644 index 000000000..7e2f45b3f --- /dev/null +++ b/hashing.cc @@ -0,0 +1,64 @@ +#include "hashing.h" + +#include <iostream> // For Debugging ONLY +#include <algorithm> +#include <array> +#include <random> + +#include <highwayhash/highwayhash_target.h> +#include <highwayhash/instruction_sets.h> + +namespace hashing { +using namespace highwayhash; + +uint64_t constexpr shiftLeftDefined(uint64_t x, int8_t shift) { + return shift >= 64 ? 0 : x << shift; +} + +uint64_t random_bits(int8_t bits) { + static std::random_device *rd = new std::random_device(); + static auto *gen = new std::mt19937_64((*rd)()); + std::uniform_int_distribution<uint64_t> dist( + 0, shiftLeftDefined(1, bits) - 1); + return dist(*gen); +} + +uint64_t hashSpan(span<const char> s) { + HH_ALIGNAS(32) static const HHKey key = {random_bits(64), random_bits(64), + random_bits(64), random_bits(64)}; + HHResult64 result; + InstructionSets::Run<HighwayHash>(key, s.data(), s.size(), &result); + return result & (shiftLeftDefined(1, 62) - 1); +} + +uint64_t hashSpan(span<const uint64_t> s) { + span<const char> sChar = {reinterpret_cast<const char *>(s.data()), + s.size() * (sizeof(uint64_t) / sizeof(char))}; + return hashSpan(sChar); +}
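The split between the two key strategies matters: hashSpan and hashInt are keyed from std::random_device once per process, so their output changes between runs (fine for in-memory hash tables, wrong for anything persisted), while fingerprint below uses a fixed public key and is stable across runs. A hypothetical caller, assuming `span` is the (pointer, length) view type supplied via common.h:

```
#include <array>
#include <cstdint>
#include "hashing.h"

// Per-process hash: suitable as a hash-table hash, varies between runs.
uint64_t hashBytes(const char *data, size_t n) {
  return hashing::hashSpan(span<const char>{data, n});
}

// Stable 256-bit id: same input gives the same result on every run, but it
// is not secret, since the fingerprint key is public.
std::array<uint64_t, 4> stableId(const char *data, size_t n) {
  return hashing::fingerprint(span<const char>{data, n});
}
```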
+ +std::array<uint64_t, 4> fingerprint(span<const char> s) { + // The following key was generated using the Python `secrets` module. + // However, since the key is public, the resulting hash is not secure. + // (While HighwayHash makes cryptographic claims, those claims rely on + // the secrecy of the key.) + HH_ALIGNAS(32) static constexpr HHKey key= { + UINT64_C(0x6e1b31ab5e83c15a), + UINT64_C(0x6648d2208b67c4af), + UINT64_C(0xcddc6e8f557f7103), + UINT64_C(0x0729a6dd6e86d99a) + }; + HHResult256 result; + InstructionSets::Run<HighwayHash>(key, s.data(), s.size(), &result); + std::array<uint64_t, 4> fingerprint; + std::copy_n(result, 4, fingerprint.begin()); + return fingerprint; +} + +uint64_t hashInt(uint64_t i) { + span<const uint64_t> s = {&i, 1}; + return hashSpan(s); +} + + +} // namespace hashing diff --git a/hashing.h b/hashing.h new file mode 100644 index 000000000..724c9d392 --- /dev/null +++ b/hashing.h @@ -0,0 +1,13 @@ +#include <array> +#include <cstdint> + +#include "common.h" + +namespace hashing { + +uint64_t hashSpan(span<const char> s); +uint64_t hashSpan(span<const uint64_t> s); +uint64_t hashInt(uint64_t i); +std::array<uint64_t, 4> fingerprint(span<const char> s); + +} // namespace hashing \ No newline at end of file diff --git a/highwayhash/.gitignore b/highwayhash/.gitignore new file mode 100644 index 000000000..1da3cef1b --- /dev/null +++ b/highwayhash/.gitignore @@ -0,0 +1,13 @@ +bin +lib +obj +deps.mk +OWNERS +*.a +*.o +benchmark +nanobenchmark_example +profiler_example +sip_hash_test +vector_test +highwayhash_test diff --git a/highwayhash/.gitrepo b/highwayhash/.gitrepo new file mode 100644 index 000000000..58fee452f --- /dev/null +++ b/highwayhash/.gitrepo @@ -0,0 +1,12 @@ +; DO NOT EDIT (unless you know what you are doing) +; +; This subdirectory is a git "subrepo", and this file is maintained by the +; git-subrepo command. See https://github.com/ingydotnet/git-subrepo#readme +; +[subrepo] + remote = https://github.com/google/highwayhash.git + branch = master + commit = 5ad3bf8444cfc663b11bf367baaa31f36e7ff7c8 + parent = 4c385fa31d75be1faebf40a246d57a10b944c6fb + method = merge + cmdver = 0.4.6 diff --git a/highwayhash/.travis.yml b/highwayhash/.travis.yml new file mode 100644 index 000000000..e05097581 --- /dev/null +++ b/highwayhash/.travis.yml @@ -0,0 +1,10 @@ +language: cpp + +dist: trusty + +compiler: + - clang + - gcc + +script: + - make diff --git a/highwayhash/CMakeLists.txt b/highwayhash/CMakeLists.txt new file mode 100644 index 000000000..1af921225 --- /dev/null +++ b/highwayhash/CMakeLists.txt @@ -0,0 +1,251 @@ + +project(highwayhash C CXX) + +cmake_minimum_required(VERSION 3.18) + +# BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to make +# it prominent in the GUI. +option(BUILD_SHARED_LIBS "Build library as shared." OFF) + +# Force PIC on unix when building shared libs +# see: https://en.wikipedia.org/wiki/Position-independent_code +if(BUILD_SHARED_LIBS AND UNIX) + option(CMAKE_POSITION_INDEPENDENT_CODE "Build with Position Independent Code."
ON) +endif() + + +set(PROCESSOR_IS_ARM FALSE) +set(PROCESSOR_IS_AARCH64 FALSE) +set(PROCESSOR_IS_X86 FALSE) +set(PROCESSOR_IS_POWER FALSE) + +message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64)") + set(PROCESSOR_IS_AARCH64 TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm") + set(PROCESSOR_IS_ARM TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") + set(PROCESSOR_IS_X86 TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)") + set(PROCESSOR_IS_POWER TRUE) +endif() + +message(STATUS "Processor: ARM=${PROCESSOR_IS_ARM}, AARCH64=${PROCESSOR_IS_AARCH64}, X86=${PROCESSOR_IS_X86}, POWER=${PROCESSOR_IS_POWER}") + + +if(CMAKE_COMPILER_IS_GNUCXX OR CLANG) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -O3 -fPIC -pthread -Wno-maybe-uninitialized") + if(PROCESSOR_IS_ARM) + # aarch64 and ARM use the same code, although ARM usually needs an extra flag for NEON. + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard -march=armv7-a -mfpu=neon") + endif() +endif() + + +# +# library : highwayhash +# + +set(HH_INCLUDES + ${PROJECT_SOURCE_DIR}/highwayhash/c_bindings.h + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash.h +) + +set(HH_SOURCES + ${PROJECT_SOURCE_DIR}/highwayhash/c_bindings.cc + ${PROJECT_SOURCE_DIR}/highwayhash/hh_portable.cc + ${PROJECT_SOURCE_DIR}/highwayhash/arch_specific.cc + + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_target.cc + ${PROJECT_SOURCE_DIR}/highwayhash/instruction_sets.cc + + ${PROJECT_SOURCE_DIR}/highwayhash/scalar_sip_tree_hash.cc + ${PROJECT_SOURCE_DIR}/highwayhash/sip_hash.cc + ${PROJECT_SOURCE_DIR}/highwayhash/sip_tree_hash.cc + + ${PROJECT_SOURCE_DIR}/highwayhash/hh_portable.h + ${PROJECT_SOURCE_DIR}/highwayhash/state_helpers.h + + ${PROJECT_SOURCE_DIR}/highwayhash/arch_specific.h + ${PROJECT_SOURCE_DIR}/highwayhash/compiler_specific.h + ${PROJECT_SOURCE_DIR}/highwayhash/load3.h + ${PROJECT_SOURCE_DIR}/highwayhash/vector128.h + ${PROJECT_SOURCE_DIR}/highwayhash/vector256.h + ${PROJECT_SOURCE_DIR}/highwayhash/endianess.h + ${PROJECT_SOURCE_DIR}/highwayhash/iaca.h + ${PROJECT_SOURCE_DIR}/highwayhash/hh_types.h + ${PROJECT_SOURCE_DIR}/highwayhash/hh_buffer.h + + ${PROJECT_SOURCE_DIR}/highwayhash/scalar_sip_tree_hash.h + ${PROJECT_SOURCE_DIR}/highwayhash/sip_hash.h + ${PROJECT_SOURCE_DIR}/highwayhash/sip_tree_hash.h +) + +if(PROCESSOR_IS_ARM OR PROCESSOR_IS_AARCH64) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_neon.cc) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_neon.h) + +elseif(PROCESSOR_IS_POWER) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_vsx.cc) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_vsx.h) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/benchmark.cc + PROPERTIES COMPILE_FLAGS -mvsx) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/hh_vsx.cc + PROPERTIES COMPILE_FLAGS -mvsx) + +elseif(PROCESSOR_IS_X86) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_avx2.cc) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_sse41.cc) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_avx2.h) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_sse41.h) + + # TODO: Portability: Have AVX2 be optional so benchmarking can be done on older machines. 
+ set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/benchmark.cc + PROPERTIES COMPILE_FLAGS -mavx2) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/sip_tree_hash.cc + PROPERTIES COMPILE_FLAGS -mavx2) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/hh_avx2.cc + PROPERTIES COMPILE_FLAGS -mavx2) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/hh_sse41.cc + PROPERTIES COMPILE_FLAGS -msse4.1) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/hh_portable.cc + PROPERTIES COMPILE_FLAGS -DHH_TARGET_NAME=Portable) + +else() + # Unknown architecture. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHH_DISABLE_TARGET_SPECIFIC") +endif() + + +add_library(highwayhash ${HH_INCLUDES} ${HH_SOURCES}) +set_target_properties(highwayhash PROPERTIES PUBLIC_HEADER "${HH_INCLUDES}") + +target_include_directories(highwayhash + PUBLIC $ +) +target_include_directories(highwayhash + PUBLIC $ +) + +if(NOT WIN32 AND NOT ANDROID) + target_link_libraries(highwayhash pthread) +endif() + +add_library(highwayhash::highwayhash ALIAS highwayhash) + + +# +# Tests & Similar +# + +add_library(nanobenchmark OBJECT + ${PROJECT_SOURCE_DIR}/highwayhash/nanobenchmark.h + ${PROJECT_SOURCE_DIR}/highwayhash/nanobenchmark.cc + + ${PROJECT_SOURCE_DIR}/highwayhash/instruction_sets.h + ${PROJECT_SOURCE_DIR}/highwayhash/os_specific.h + ${PROJECT_SOURCE_DIR}/highwayhash/profiler.h + ${PROJECT_SOURCE_DIR}/highwayhash/tsc_timer.h + + ${PROJECT_SOURCE_DIR}/highwayhash/instruction_sets.cc + ${PROJECT_SOURCE_DIR}/highwayhash/os_specific.cc +) +target_include_directories(nanobenchmark PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + + +add_executable(highwayhash_test) +target_sources(highwayhash_test PRIVATE + + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test.cc + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_portable.cc + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_target.h +) +target_link_libraries(highwayhash_test highwayhash nanobenchmark) + + +add_executable(vector_test) +target_sources(vector_test PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test.cc + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_portable.cc + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_target.h +) +target_link_libraries(vector_test highwayhash nanobenchmark) + + +if(PROCESSOR_IS_ARM OR PROCESSOR_IS_AARCH64) + target_sources(highwayhash_test PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_neon.cc + ) + target_sources(vector_test PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_neon.cc + ) + +elseif(PROCESSOR_IS_X86) + target_sources(highwayhash_test PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_avx2.cc + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_sse41.cc + ) + target_sources(vector_test PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_avx2.cc + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_sse41.cc + ) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_avx2.cc + PROPERTIES COMPILE_FLAGS -mavx2) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_sse41.cc + PROPERTIES COMPILE_FLAGS -msse4.1) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_avx2.cc + PROPERTIES COMPILE_FLAGS -mavx2) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_sse41.cc + PROPERTIES COMPILE_FLAGS -msse4.1) + +elseif(PROCESSOR_IS_POWER) + target_sources(highwayhash_test PRIVATE + 
${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_vsx.cc + ) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_vsx.cc + PROPERTIES COMPILE_FLAGS -mvsx) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test.cc + PROPERTIES COMPILE_FLAGS -DHH_DISABLE_TARGET_SPECIFIC) + +endif() + + +add_executable(sip_hash_test) +target_sources(sip_hash_test PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/sip_hash_test.cc +) +target_link_libraries(sip_hash_test highwayhash) + + +add_executable(example) +target_sources(example PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/example.cc + ) +target_link_libraries(example highwayhash) + diff --git a/highwayhash/CONTRIBUTING b/highwayhash/CONTRIBUTING new file mode 100644 index 000000000..bd6072591 --- /dev/null +++ b/highwayhash/CONTRIBUTING @@ -0,0 +1,27 @@ +Want to contribute? Great! First, read this page (including the small print at the end). + +### Before you contribute +Before we can use your code, you must sign the +[Google Individual Contributor License Agreement] +(https://cla.developers.google.com/about/google-individual) +(CLA), which you can do online. The CLA is necessary mainly because you own the +copyright to your changes, even after your contribution becomes part of our +codebase, so we need your permission to use and distribute your code. We also +need to be sure of various other things-for instance that you'll tell us if you +know that your code infringes on other people's patents. You don't have to sign +the CLA until after you've submitted your code for review and a member has +approved it, but you must do it before we can put your code into our codebase. +Before you start working on a larger contribution, you should get in touch with +us first through the issue tracker with your idea so that we can help out and +possibly guide you. Coordinating up front makes it much easier to avoid +frustration later on. + +### Code reviews +All submissions, including submissions by project members, require review. We +use Github pull requests for this purpose. + +### The small print +Contributions made by corporations are covered by a different agreement than +the one above, the +[Software Grant and Corporate Contributor License Agreement] +(https://cla.developers.google.com/about/google-corporate). diff --git a/highwayhash/LICENSE b/highwayhash/LICENSE new file mode 100644 index 000000000..6b0b1270f --- /dev/null +++ b/highwayhash/LICENSE @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/highwayhash/Makefile b/highwayhash/Makefile new file mode 100644 index 000000000..a312bc263 --- /dev/null +++ b/highwayhash/Makefile @@ -0,0 +1,140 @@ +# We assume X64 unless HH_POWER, HH_ARM, or HH_AARCH64 are defined. + +override CPPFLAGS += -I. 
+override CXXFLAGS += -std=c++11 -Wall -O3 -fPIC -pthread +override LDFLAGS += -pthread + +PREFIX ?= /usr/local +INCDIR ?= $(PREFIX)/include +LIBDIR ?= $(PREFIX)/lib + +SIP_OBJS := $(addprefix obj/, \ + sip_hash.o \ + sip_tree_hash.o \ + scalar_sip_tree_hash.o \ +) + +DISPATCHER_OBJS := $(addprefix obj/, \ + arch_specific.o \ + instruction_sets.o \ + nanobenchmark.o \ + os_specific.o \ +) + +HIGHWAYHASH_OBJS := $(DISPATCHER_OBJS) obj/hh_portable.o +HIGHWAYHASH_TEST_OBJS := $(DISPATCHER_OBJS) obj/highwayhash_test_portable.o +VECTOR_TEST_OBJS := $(DISPATCHER_OBJS) obj/vector_test_portable.o + +# aarch64 and ARM use the same code, although ARM usually needs an extra flag for NEON. +ifdef HH_ARM +CXXFLAGS += -mfloat-abi=hard -march=armv7-a -mfpu=neon +HH_AARCH64 = 1 +endif + +ifdef HH_AARCH64 +HH_X64 = +HIGHWAYHASH_OBJS += obj/hh_neon.o +HIGHWAYHASH_TEST_OBJS += obj/highwayhash_test_neon.o +VECTOR_TEST_OBJS += obj/vector_test_neon.o +else +ifdef HH_POWER +HH_X64 = +HIGHWAYHASH_OBJS += obj/hh_vsx.o +HIGHWAYHASH_TEST_OBJS += obj/highwayhash_test_vsx.o +else +HH_X64 = 1 +HIGHWAYHASH_OBJS += obj/hh_avx2.o obj/hh_sse41.o +HIGHWAYHASH_TEST_OBJS += obj/highwayhash_test_avx2.o obj/highwayhash_test_sse41.o +VECTOR_TEST_OBJS += obj/vector_test_avx2.o obj/vector_test_sse41.o +endif +endif + +# In case highwayhash_test defines PRINT_RESULTS. +HIGHWAYHASH_TEST_OBJS += $(HIGHWAYHASH_OBJS) + +all: $(addprefix bin/, \ + profiler_example nanobenchmark_example vector_test sip_hash_test \ + highwayhash_test benchmark) lib/libhighwayhash.a + +obj/%.o: highwayhash/%.cc + @mkdir -p -- $(dir $@) + $(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@ + +bin/%: obj/%.o + @mkdir -p -- $(dir $@) + $(CXX) $(LDFLAGS) $^ -o $@ + +.DELETE_ON_ERROR: +deps.mk: $(wildcard highwayhash/*.cc) $(wildcard highwayhash/*.h) Makefile + set -eu; for file in highwayhash/*.cc; do \ + target=obj/$${file##*/}; target=$${target%.*}.o; \ + [ "$$target" = "obj/highwayhash_target.o" ] || \ + [ "$$target" = "obj/data_parallel_benchmark.o" ] || \ + [ "$$target" = "obj/data_parallel_test.o" ] || \ + $(CXX) -c $(CPPFLAGS) $(CXXFLAGS) -DHH_DISABLE_TARGET_SPECIFIC -MM -MT \ + "$$target" "$$file"; \ + done | sed -e ':b' -e 's-../[^./]*/--' -e 'tb' >$@ +-include deps.mk + +bin/profiler_example: $(DISPATCHER_OBJS) + +bin/nanobenchmark_example: $(DISPATCHER_OBJS) obj/nanobenchmark.o + +ifdef HH_X64 +# TODO: Portability: Have AVX2 be optional so benchmarking can be done on older machines. 
+obj/sip_tree_hash.o: CXXFLAGS+=-mavx2 +# (Compiled from same source file with different compiler flags) +obj/highwayhash_test_avx2.o: CXXFLAGS+=-mavx2 +obj/highwayhash_test_sse41.o: CXXFLAGS+=-msse4.1 +obj/hh_avx2.o: CXXFLAGS+=-mavx2 +obj/hh_sse41.o: CXXFLAGS+=-msse4.1 +obj/vector_test_avx2.o: CXXFLAGS+=-mavx2 +obj/vector_test_sse41.o: CXXFLAGS+=-msse4.1 + +obj/benchmark.o: CXXFLAGS+=-mavx2 +endif + +ifdef HH_POWER +obj/highwayhash_test_vsx.o: CXXFLAGS+=-mvsx +obj/hh_vsx.o: CXXFLAGS+=-mvsx +obj/benchmark.o: CXXFLAGS+=-mvsx +# Skip file - vector library/test not supported on PPC +obj/vector_test_target.o: CXXFLAGS+=-DHH_DISABLE_TARGET_SPECIFIC +obj/vector_test.o: CXXFLAGS+=-DHH_DISABLE_TARGET_SPECIFIC +endif + +lib/libhighwayhash.a: $(SIP_OBJS) $(HIGHWAYHASH_OBJS) obj/c_bindings.o + @mkdir -p -- $(dir $@) + $(AR) rcs $@ $^ + +lib/libhighwayhash.so: $(SIP_OBJS) $(HIGHWAYHASH_OBJS) obj/c_bindings.o + @mkdir -p -- $(dir $@) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -shared $^ -o $@.0 -Wl,-soname,libhighwayhash.so.0 + @cd $(dir $@); ln -s libhighwayhash.so.0 libhighwayhash.so + +bin/highwayhash_test: $(HIGHWAYHASH_TEST_OBJS) + +bin/benchmark: obj/benchmark.o $(HIGHWAYHASH_TEST_OBJS) +bin/benchmark: $(SIP_OBJS) $(HIGHWAYHASH_OBJS) +bin/vector_test: $(VECTOR_TEST_OBJS) + +clean: + [ ! -d obj ] || $(RM) -r -- obj/ + +distclean: clean + [ ! -d bin ] || $(RM) -r -- bin/ + [ ! -d lib ] || $(RM) -r -- lib/ + +# Mode bits are from issue #58, thanks to yurivict for suggesting. +# Also added owner-write for stripping the .so in post-install. +install: lib/libhighwayhash.a lib/libhighwayhash.so + mkdir -p $(DESTDIR)/$(LIBDIR) + mkdir -p $(DESTDIR)/$(INCDIR)/highwayhash + install -m0444 lib/libhighwayhash.a $(DESTDIR)/$(LIBDIR) + install -m0755 lib/libhighwayhash.so $(DESTDIR)/$(LIBDIR) + install -m0444 highwayhash/*.h $(DESTDIR)/$(INCDIR)/highwayhash/ + +post-install: + ${STRIP_CMD} $(DESTDIR)/$(LIBDIR)/libhighwayhash.so + +.PHONY: clean distclean all install post-install diff --git a/highwayhash/README.md b/highwayhash/README.md new file mode 100644 index 000000000..d59f7ab6d --- /dev/null +++ b/highwayhash/README.md @@ -0,0 +1,404 @@ +Strong (well-distributed and unpredictable) hashes: + +* Portable implementation of + [SipHash](https://www.131002.net/siphash/siphash.pdf) +* HighwayHash, a 5x faster SIMD hash with [security + claims](https://arxiv.org/abs/1612.06257) + +## Quick Start + +To build on a Linux or Mac platform, simply run `make`. For Windows, we provide +a Visual Studio 2015 project in the `msvc` subdirectory. + +Run `benchmark` for speed measurements. `sip_hash_test` and `highwayhash_test` +ensure the implementations return known-good values for a given set of inputs. 
+ +64-bit SipHash for any CPU: + +``` + #include "highwayhash/sip_hash.h" + using namespace highwayhash; + HH_ALIGNAS(16) const HH_U64 key2[2] = {1234, 5678}; + char in[8] = {1}; + return SipHash(key2, in, 8); +``` + +64, 128 or 256 bit HighwayHash for the CPU determined by compiler flags: + +``` + #include "highwayhash/highwayhash.h" + using namespace highwayhash; + HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4}; + char in[8] = {1}; + HHResult64 result; // or HHResult128 or HHResult256 + HHStateT<HH_TARGET> state(key); + HighwayHashT(&state, in, 8, &result); +``` + +64, 128 or 256 bit HighwayHash for the CPU on which we're currently running: + +``` + #include "highwayhash/highwayhash_target.h" + #include "highwayhash/instruction_sets.h" + using namespace highwayhash; + HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4}; + char in[8] = {1}; + HHResult64 result; // or HHResult128 or HHResult256 + InstructionSets::Run<HighwayHash>(key, in, 8, &result); +``` + +C-callable 64-bit HighwayHash for the CPU on which we're currently running: + + #include "highwayhash/c_bindings.h" + const uint64_t key[4] = {1, 2, 3, 4}; + char in[8] = {1}; + return HighwayHash64(key, in, 8); + +Printing a 256-bit result in a hexadecimal format similar to sha1sum: + + HHResult256 result; + printf("%016"PRIx64"%016"PRIx64"%016"PRIx64"%016"PRIx64"\n", + result[3], result[2], result[1], result[0]); + +## Introduction + +Hash functions are widely used, so it is desirable to increase their speed and +security. This package provides two 'strong' (well-distributed and +unpredictable) hash functions: a faster version of SipHash, and an even faster +algorithm we call HighwayHash. + +SipHash is a fast but 'cryptographically strong' pseudo-random function by +Aumasson and Bernstein [https://www.131002.net/siphash/siphash.pdf]. + +HighwayHash is a new way of mixing inputs which may inspire new +cryptographically strong hashes. Large inputs are processed at a rate of 0.24 +cycles per byte, and latency remains low even for small inputs. HighwayHash is +faster than SipHash for all input sizes, with 5 times higher throughput at 1 +KiB. We discuss design choices and provide statistical analysis and preliminary +cryptanalysis in https://arxiv.org/abs/1612.06257. + +## Applications + +Unlike prior strong hashes, these functions are fast enough to be recommended +as safer replacements for weak hashes in many applications. The additional CPU +cost appears affordable, based on profiling data indicating C++ hash functions +account for less than 0.25% of CPU usage. + +Hash-based selection of random subsets is useful for A/B experiments and similar +applications. Such random generators are idempotent (repeatable and +deterministic), which is helpful for parallel algorithms and testing. To avoid +bias, it is important that the hash function be unpredictable and +indistinguishable from a uniform random generator. We have verified the bit +distribution and avalanche properties of SipHash and HighwayHash. + +64-bit hashes are also useful for authenticating short-lived messages such as +network/RPC packets. This requires that the hash function withstand +differential, length extension and other attacks. We have published a formal +security analysis for HighwayHash. New cryptanalysis tools may still need to be +developed for further analysis. + +Strong hashes are also important parts of methods for protecting hash tables +against unacceptable worst-case behavior and denial of service attacks +(see "hash flooding" below).
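A sketch of the 'random subset' use mentioned above (hypothetical helper; a real deployment would keep the key secret, and 2^64 % 100 != 0 introduces only negligible bias):

```
#include <cstdint>
#include "highwayhash/sip_hash.h"
using namespace highwayhash;

// Deterministically place ~10% of ids into an experiment arm.
bool InExperiment(uint64_t id) {
  HH_ALIGNAS(16) static const HH_U64 key2[2] = {1234, 5678};
  const uint64_t h =
      SipHash(key2, reinterpret_cast<const char*>(&id), sizeof(id));
  return h % 100 < 10;  // idempotent: same id, same answer
}
```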
+ +128 and 256-bit hashes can be useful for verifying data integrity (checksums). + +## SipHash + +Our SipHash implementation is a fast and portable drop-in replacement for +the reference C code. Outputs are identical for the given test cases (messages +between 0 and 63 bytes). + +Interestingly, it is about twice as fast as a SIMD implementation using SSE4.1 +(https://goo.gl/80GBSD). This is presumably due to the lack of SIMD bit rotate +instructions prior to AVX-512. + +SipHash13 is a faster but weaker variant with one mixing round per update and +three during finalization. + +We also provide a data-parallel 'tree hash' variant that enables efficient SIMD +while retaining safety guarantees. This is about twice as fast as SipHash, but +does not return the same results. + +## HighwayHash + +We have devised a new way of mixing inputs with SIMD multiply and permute +instructions. The multiplications are 32x32 -> 64 bits and therefore infeasible +to reverse. Permuting equalizes the distribution of the resulting bytes. + +The internal state is quite large (1024 bits) but fits within SIMD registers. +Due to limitations of the AVX2 instruction set, the registers are partitioned +into two 512-bit halves that remain independent until the reduce phase. The +algorithm outputs 64 bit digests or up to 256 bits at no extra cost. + +In addition to high throughput, the algorithm is designed for low finalization +cost. The result is more than twice as fast as SipTreeHash. + +We also provide an SSE4.1 version (80% as fast for large inputs and 95% as fast +for short inputs), an implementation for VSX on POWER and a portable version +(10% as fast). A third-party ARM implementation is referenced below. + +Statistical analyses and preliminary cryptanalysis are given in +https://arxiv.org/abs/1612.06257. + +## Versioning and stability + +Now that 21 months have elapsed since their initial release, we have declared +all (64/128/256 bit) variants of HighwayHash frozen, i.e. unchanging forever. + +SipHash and HighwayHash are 'fingerprint functions' whose input -> hash +mapping will not change. This is important for applications that write hashes to +persistent storage. + +## Speed measurements + +To measure the CPU cost of a hash function, we can either create an artificial +'microbenchmark' (easier to control, but probably not representative of the +actual runtime), or insert instrumentation directly into an application (risks +influencing the results through observer overhead). We provide novel variants of +both approaches that mitigate their respective disadvantages. + +profiler.h uses software write-combining to stream program traces to memory +with minimal overhead. These can be analyzed offline, or when memory is full, +to learn how much time was spent in each (possibly nested) zone. + +nanobenchmark.h enables cycle-accurate measurements of very short functions. +It uses CPU fences and robust statistics to minimize variability, and also +avoids unrealistic branch prediction effects. + +We compile the 64-bit C++ implementations with a patched GCC 4.9 and run on a +single idle core of a Xeon E5-2690 v3 clocked at 2.6 GHz. 
CPU cost is measured +as cycles per byte for various input sizes: + +Algorithm | 8 | 31 | 32 | 63 | 64 | 1024 +---------------- | ----- | ---- | ---- | ---- | ---- | ---- +HighwayHashAVX2 | 7.34 | 1.81 | 1.71 | 1.04 | 0.95 | 0.24 +HighwayHashSSE41 | 8.00 | 2.11 | 1.75 | 1.13 | 0.96 | 0.30 +SipTreeHash | 16.51 | 4.57 | 4.09 | 2.22 | 2.29 | 0.57 +SipTreeHash13 | 12.33 | 3.47 | 3.06 | 1.68 | 1.63 | 0.33 +SipHash | 8.13 | 2.58 | 2.73 | 1.87 | 1.93 | 1.26 +SipHash13 | 6.96 | 2.09 | 2.12 | 1.32 | 1.33 | 0.68 + +SipTreeHash is slower than SipHash for small inputs because it processes blocks +of 32 bytes. AVX2 and SSE4.1 HighwayHash are faster than SipHash for all input +sizes due to their highly optimized handling of partial vectors. + +Note that previous measurements included the initialization of their input, +which dramatically increased timings especially for small inputs. + +## CPU requirements + +SipTreeHash(13) requires an AVX2-capable CPU (e.g. Haswell). HighwayHash +includes a dispatcher that chooses the implementation (AVX2, SSE4.1, VSX or +portable) at runtime, as well as a directly callable function template that can +only run on the CPU for which it was built. SipHash(13) and +ScalarSipTreeHash(13) have no particular CPU requirements. + +### AVX2 vs SSE4 + +When both AVX2 and SSE4 are available, the decision whether to use AVX2 is +non-obvious. AVX2 vectors are twice as wide, but require a higher power license +(integer multiplications count as 'heavy' instructions) and can thus reduce the +clock frequency of the core or entire socket(!) on Haswell systems. This +partially explains the observed 1.25x (not 2x) speedup over SSE4. Moreover, it +is inadvisable to only sporadically use AVX2 instructions because there is also +a ~56K cycle warmup period during which AVX2 operations are slower, and Haswell +can even stall during this period. Thus, we recommend avoiding AVX2 for +infrequent hashing if the rest of the application is also not using AVX2. For +any input larger than 1 MiB, it is probably worthwhile to enable AVX2. + +### SIMD implementations + +Our x86 implementations use custom vector classes with overloaded operators +(e.g. `const V4x64U a = b + c`) for type-safety and improved readability vs. +compiler intrinsics (e.g. `const __m256i a = _mm256_add_epi64(b, c)`). +The VSX implementation uses built-in vector types alongside Altivec intrinsics. +A high-performance third-party ARM implementation is mentioned below. + +### Dispatch + +Our instruction_sets dispatcher avoids running newer instructions on older CPUs +that do not support them. However, intrinsics, and therefore also any vector +classes that use them, require (on GCC < 4.9 or Clang < 3.9) a compiler flag +that also allows the compiler to generate code for that CPU. This means the +intrinsics must be placed in separate translation units that are compiled with +the required flags. It is important that these source files and their headers +not define any inline functions, because that might break the one definition +rule and cause crashes. + +To minimize dispatch overhead when hashes are computed often (e.g. in a loop), +we can inline the hash function into its caller using templates. The dispatch +overhead will only be paid once (e.g. before the loop). The template mechanism +also avoids duplicating code in each CPU-specific implementation. + +## Defending against hash flooding + +To mitigate hash flooding attacks, we need to take both the hash function and +the data structure into account. 
+ +We wish to defend (web) services that utilize hash sets/maps against +denial-of-service attacks. Such data structures assign attacker-controlled +input messages `m` to a hash table bin `b` by computing the hash `H(s, m)` +using a hash function `H` seeded by `s`, and mapping it to a bin with some +narrowing function `b = R(h)`, discussed below. + +Attackers may attempt to trigger 'flooding' (excessive work in insertions or +lookups) by finding multiple `m` that map to the same bin. If the attacker has +local access, they can do far worse, so we assume the attacker can only issue +remote requests. If the attacker is able to send large numbers of requests, +they can already deny service, so we need only ensure the attacker's cost is +sufficiently large compared to the service's provisioning. + +If the hash function is 'weak', attackers can easily generate 'hash collisions' +(inputs mapping to the same hash values) that are independent of the seed. In +other words, certain input messages will cause collisions regardless of the seed +value. The author of SipHash has published C++ programs to generate such +'universal (key-independent) multicollisions' for CityHash and Murmur. Similar +'differential' attacks are likely possible for any hash function consisting only +of reversible operations (e.g. addition/multiplication/rotation) with a constant +operand. `n` requests with such inputs cause `n^2` work for an unprotected hash +table, which is unacceptable. + +By contrast, 'strong' hashes such as SipHash or HighwayHash require infeasible +attacker effort to find a hash collision (an expected 2^32 guesses of `m` per +the birthday paradox) or recover the seed (2^63 requests). These security claims +assume the seed is secret. It is reasonable to suppose `s` is initially unknown +to attackers, e.g. generated on startup or even per-connection. A timing attack +by Wool/Bar-Yosef recovers 13-bit seeds by testing all 8K possibilities using +millions of requests, which takes several days (even assuming unrealistic 150 us +round-trip times). It appears infeasible to recover 64-bit seeds in this way. + +However, attackers are only looking for multiple `m` mapping to the same bin +rather than identical hash values. We assume they know or are able to discover +the hash table size `p`. It is common to choose `p = 2^i` to enable an efficient +`R(h) := h & (p - 1)`, which simply retains the lower hash bits. It may be +easier for attackers to compute partial collisions where only the lower `i` bits +match. This can be prevented by choosing a prime `p` so that `R(h) := h % p` +incorporates all hash bits. The costly modulo operation can be avoided by +multiplying with the inverse (https://goo.gl/l7ASm8). An interesting alternative +suggested by Kyoung Jae Seo chooses a random subset of the `h` bits. Such an `R` +function can be computed in just 3 cycles using PEXT from the BMI2 instruction +set. This is expected to defend against SAT-solver attacks on the hash bits at a +slightly lower cost than the multiplicative inverse method, and still allows +power-of-two table sizes. + +Summary thus far: given a strong hash function and secret seed, it appears +infeasible for attackers to generate hash collisions because `s` and/or `R` are +unknown. However, they can still observe the timings of data structure +operations for various `m`. With typical table sizes of 2^10 to 2^17 entries, +attackers can detect some 'bin collisions' (inputs mapping to the same bin). 
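+
+For reference, the three narrowing functions `R(h)` discussed above might look
+as follows; this is our sketch, the `Narrow*` names are not part of the
+library, and the PEXT variant assumes a BMI2-capable x86 CPU:
+
+```
+#include <cstdint>
+#include <immintrin.h>  // _pext_u64; requires BMI2 (compile with -mbmi2)
+
+// Power-of-two table size p: keeps only the lower i bits of the hash.
+uint64_t NarrowMask(uint64_t h, uint64_t p) { return h & (p - 1); }
+
+// Prime table size p: every hash bit influences the bin index.
+uint64_t NarrowMod(uint64_t h, uint64_t p) { return h % p; }
+
+// Random bit subset: "subset" has i randomly chosen bits set; PEXT gathers
+// the corresponding hash bits into the low i bits of the result.
+uint64_t NarrowPext(uint64_t h, uint64_t subset) { return _pext_u64(h, subset); }
+```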
+
+Although detecting such bin collisions is costly for the attacker, they can
+then send many instances of such inputs, so we need to limit the resulting
+work for our data structure.
+
+Hash tables with separate chaining typically store bin entries in a linked
+list, so worst-case inputs lead to unacceptable linear-time lookup cost. We
+instead seek optimal asymptotic worst-case complexity for each operation
+(insertion, deletion and lookups), which is a constant factor times the
+logarithm of the data structure size. This naturally leads to a tree-like data
+structure for each bin. The Java 8 HashMap replaces a bin's linked list with a
+tree only when needed, which adds cost and complexity for deciding whether a
+bin is currently a list or a tree.
+
+Our first proposal (suggested by GitHub user funny-falcon) avoids this
+overhead by always storing one tree per bin. It may also be worthwhile to
+store the first entry directly in the bin, which avoids allocating any tree
+nodes in the common case where bins are sparsely populated. What kind of tree
+should be used?
+
+Given that SipHash and HighwayHash provide high-quality randomness, a simple
+non-balancing binary search tree could perform reasonably well, depending on
+the expected attack surface.
+[Wikipedia says](https://en.wikipedia.org/wiki/Binary_search_tree#Definition)
+> After a long intermixed sequence of random insertion and deletion, the
+> expected height of the tree approaches square root of the number of keys,
+> √n, which grows much faster than log n.
+
+While `O(√n)` is much larger than `O(log n)`, it is still far smaller than
+`O(n)`. It also complicates the timing attack, because the cost of operations
+on a colliding bin grows much more slowly.
+
+If stronger safety guarantees are needed, then a balanced tree should be used.
+Scapegoat and splay trees only offer amortized complexity guarantees, whereas
+treaps require an entropy source and have higher constant factors in practice.
+Self-balancing structures such as 2-3 or red-black trees require additional
+bookkeeping information. We can hope to reduce rebalancing cost by realizing
+that the output bits of strong `H` functions are uniformly distributed. When
+using them as keys instead of the original message `m`, recent relaxed
+balancing schemes such as left-leaning red-black or weak AVL trees may require
+fewer tree rotations to maintain their invariants. Note that `H` already
+determines the bin, so we should only use the remaining bits. 64-bit hashes
+are likely sufficient for this purpose, and HighwayHash generates up to 256
+bits. It seems unlikely that attackers can craft inputs resulting in worst
+cases for both the bin index and tree key without being able to generate hash
+collisions, which would contradict the security claims of strong hashes. Even
+if they succeed, the relaxed tree balancing still guarantees an upper bound on
+height and therefore the worst-case operation cost. For the AVL variant, the
+constant factors are slightly lower than for red-black trees.
+
+The second proposed approach uses augmented/de-amortized cuckoo hash tables
+(https://goo.gl/PFwwkx). These guarantee worst-case `log n` bounds for all
+operations, but only if the hash function is 'indistinguishable from random'
+(uniformly distributed regardless of the input distribution), which is claimed
+for SipHash and HighwayHash but certainly not for weak hashes.
+
+Both alternatives retain good average case performance and defend against
+flooding by limiting the amount of extra work an attacker can cause.
The first +approach guarantees an upper bound of `log n` additional work even if the hash +function is compromised. + +In summary, a strong hash function is not, by itself, sufficient to protect a +chained hash table from flooding attacks. However, strong hash functions are +important parts of two schemes for preventing denial of service. Using weak hash +functions can slightly accelerate the best-case and average-case performance of +a service, but at the risk of greatly reduced attack costs and worst-case +performance. + +## Third-party implementations / bindings + +Thanks to Damian Gryski and Frank Wessels for making us aware of these +third-party implementations or bindings. Please feel free to get in touch or +raise an issue and we'll add yours as well. + +By | Language | URL +--- | --- | --- +Damian Gryski | Go and x64 assembly | https://github.com/dgryski/go-highway/ +Simon Abdullah | NPM package | https://www.npmjs.com/package/highwayhash-nodejs +Lovell Fuller | node.js bindings | https://github.com/lovell/highwayhash +Andreas Sonnleitner | [WebAssembly](https://github.com/asonnleitner/highwayhash-wasm) and NPM package | https://www.npmjs.com/package/highwayhash-wasm +Nick Babcock | Rust port | https://github.com/nickbabcock/highway-rs +Caleb Zulawski | Rust portable SIMD | https://github.com/calebzulawski/autobahn-hash +Vinzent Steinberg | Rust bindings | https://github.com/vks/highwayhash-rs +Frank Wessels & Andreas Auernhammer | Go and ARM assembly | https://github.com/minio/highwayhash +Phil Demetriou | Python 3 bindings | https://github.com/kpdemetriou/highwayhash-cffi +Jonathan Beard | C++20 constexpr | https://gist.github.com/jonathan-beard/632017faa1d9d1936eb5948ac9186657 +James Cook | Ruby bindings | https://github.com/jamescook/highwayhash + +## Modules + +### Hashes + +* c_bindings.h declares C-callable versions of SipHash/HighwayHash. +* sip_hash.cc is the compatible implementation of SipHash, and also provides + the final reduction for sip_tree_hash. +* sip_tree_hash.cc is the faster but incompatible SIMD j-lanes tree hash. +* scalar_sip_tree_hash.cc is a non-SIMD version. +* state_helpers.h simplifies the implementation of the SipHash variants. +* highwayhash.h is our new, fast hash function. +* hh_{avx2,sse41,vsx,portable}.h are its various implementations. +* highwayhash_target.h chooses the best available implementation at runtime. + +### Infrastructure + +* arch_specific.h offers byte swapping and CPUID detection. +* compiler_specific.h defines some compiler-dependent language extensions. +* data_parallel.h provides a C++11 ThreadPool and PerThread (similar to + OpenMP). +* instruction_sets.h and targets.h enable efficient CPU-specific dispatching. +* nanobenchmark.h measures elapsed times with < 1 cycle variability. +* os_specific.h sets thread affinity and priority for benchmarking. +* profiler.h is a low-overhead, deterministic hierarchical profiler. +* tsc_timer.h obtains high-resolution timestamps without CPU reordering. +* vector256.h and vector128.h contain wrapper classes for AVX2 and SSE4.1. + +By Jan Wassenberg and Jyrki Alakuijala +, updated 2023-03-29 + +This is not an official Google product. diff --git a/highwayhash/c/highwayhash.c b/highwayhash/c/highwayhash.c new file mode 100644 index 000000000..bf4863ecd --- /dev/null +++ b/highwayhash/c/highwayhash.c @@ -0,0 +1,261 @@ +#include "c/highwayhash.h" + +#include +#include +#include + +/* +This code is compatible with C90 with the additional requirement of +supporting uint64_t. 
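+Byte order does not matter: Read64 below assembles each 64-bit lane from
+individual bytes in little-endian order, so results are identical on
+big-endian hosts.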
+*/ + +/*////////////////////////////////////////////////////////////////////////////*/ +/* Internal implementation */ +/*////////////////////////////////////////////////////////////////////////////*/ + +void HighwayHashReset(const uint64_t key[4], HighwayHashState* state) { + state->mul0[0] = 0xdbe6d5d5fe4cce2full; + state->mul0[1] = 0xa4093822299f31d0ull; + state->mul0[2] = 0x13198a2e03707344ull; + state->mul0[3] = 0x243f6a8885a308d3ull; + state->mul1[0] = 0x3bd39e10cb0ef593ull; + state->mul1[1] = 0xc0acf169b5f18a8cull; + state->mul1[2] = 0xbe5466cf34e90c6cull; + state->mul1[3] = 0x452821e638d01377ull; + state->v0[0] = state->mul0[0] ^ key[0]; + state->v0[1] = state->mul0[1] ^ key[1]; + state->v0[2] = state->mul0[2] ^ key[2]; + state->v0[3] = state->mul0[3] ^ key[3]; + state->v1[0] = state->mul1[0] ^ ((key[0] >> 32) | (key[0] << 32)); + state->v1[1] = state->mul1[1] ^ ((key[1] >> 32) | (key[1] << 32)); + state->v1[2] = state->mul1[2] ^ ((key[2] >> 32) | (key[2] << 32)); + state->v1[3] = state->mul1[3] ^ ((key[3] >> 32) | (key[3] << 32)); +} + +static void ZipperMergeAndAdd(const uint64_t v1, const uint64_t v0, + uint64_t* add1, uint64_t* add0) { + *add0 += (((v0 & 0xff000000ull) | (v1 & 0xff00000000ull)) >> 24) | + (((v0 & 0xff0000000000ull) | (v1 & 0xff000000000000ull)) >> 16) | + (v0 & 0xff0000ull) | ((v0 & 0xff00ull) << 32) | + ((v1 & 0xff00000000000000ull) >> 8) | (v0 << 56); + *add1 += (((v1 & 0xff000000ull) | (v0 & 0xff00000000ull)) >> 24) | + (v1 & 0xff0000ull) | ((v1 & 0xff0000000000ull) >> 16) | + ((v1 & 0xff00ull) << 24) | ((v0 & 0xff000000000000ull) >> 8) | + ((v1 & 0xffull) << 48) | (v0 & 0xff00000000000000ull); +} + +static void Update(const uint64_t lanes[4], HighwayHashState* state) { + int i; + for (i = 0; i < 4; ++i) { + state->v1[i] += state->mul0[i] + lanes[i]; + state->mul0[i] ^= (state->v1[i] & 0xffffffff) * (state->v0[i] >> 32); + state->v0[i] += state->mul1[i]; + state->mul1[i] ^= (state->v0[i] & 0xffffffff) * (state->v1[i] >> 32); + } + ZipperMergeAndAdd(state->v1[1], state->v1[0], &state->v0[1], &state->v0[0]); + ZipperMergeAndAdd(state->v1[3], state->v1[2], &state->v0[3], &state->v0[2]); + ZipperMergeAndAdd(state->v0[1], state->v0[0], &state->v1[1], &state->v1[0]); + ZipperMergeAndAdd(state->v0[3], state->v0[2], &state->v1[3], &state->v1[2]); +} + +static uint64_t Read64(const uint8_t* src) { + return (uint64_t)src[0] | ((uint64_t)src[1] << 8) | + ((uint64_t)src[2] << 16) | ((uint64_t)src[3] << 24) | + ((uint64_t)src[4] << 32) | ((uint64_t)src[5] << 40) | + ((uint64_t)src[6] << 48) | ((uint64_t)src[7] << 56); +} + +void HighwayHashUpdatePacket(const uint8_t* packet, HighwayHashState* state) { + uint64_t lanes[4]; + lanes[0] = Read64(packet + 0); + lanes[1] = Read64(packet + 8); + lanes[2] = Read64(packet + 16); + lanes[3] = Read64(packet + 24); + Update(lanes, state); +} + +static void Rotate32By(uint64_t count, uint64_t lanes[4]) { + int i; + for (i = 0; i < 4; ++i) { + uint32_t half0 = lanes[i] & 0xffffffff; + uint32_t half1 = (lanes[i] >> 32); + lanes[i] = (half0 << count) | (half0 >> (32 - count)); + lanes[i] |= (uint64_t)((half1 << count) | (half1 >> (32 - count))) << 32; + } +} + +void HighwayHashUpdateRemainder(const uint8_t* bytes, const size_t size_mod32, + HighwayHashState* state) { + int i; + const size_t size_mod4 = size_mod32 & 3; + const uint8_t* remainder = bytes + (size_mod32 & ~3); + uint8_t packet[32] = {0}; + for (i = 0; i < 4; ++i) { + state->v0[i] += ((uint64_t)size_mod32 << 32) + size_mod32; + } + Rotate32By(size_mod32, state->v1); + for (i 
= 0; i < remainder - bytes; i++) { + packet[i] = bytes[i]; + } + if (size_mod32 & 16) { + for (i = 0; i < 4; i++) { + packet[28 + i] = remainder[i + size_mod4 - 4]; + } + } else { + if (size_mod4) { + packet[16 + 0] = remainder[0]; + packet[16 + 1] = remainder[size_mod4 >> 1]; + packet[16 + 2] = remainder[size_mod4 - 1]; + } + } + HighwayHashUpdatePacket(packet, state); +} + +static void Permute(const uint64_t v[4], uint64_t* permuted) { + permuted[0] = (v[2] >> 32) | (v[2] << 32); + permuted[1] = (v[3] >> 32) | (v[3] << 32); + permuted[2] = (v[0] >> 32) | (v[0] << 32); + permuted[3] = (v[1] >> 32) | (v[1] << 32); +} + +void PermuteAndUpdate(HighwayHashState* state) { + uint64_t permuted[4]; + Permute(state->v0, permuted); + Update(permuted, state); +} + +static void ModularReduction(uint64_t a3_unmasked, uint64_t a2, uint64_t a1, + uint64_t a0, uint64_t* m1, uint64_t* m0) { + uint64_t a3 = a3_unmasked & 0x3FFFFFFFFFFFFFFFull; + *m1 = a1 ^ ((a3 << 1) | (a2 >> 63)) ^ ((a3 << 2) | (a2 >> 62)); + *m0 = a0 ^ (a2 << 1) ^ (a2 << 2); +} + +static uint64_t HighwayHashFinalize64(HighwayHashState* state) { + int i; + for (i = 0; i < 4; i++) { + PermuteAndUpdate(state); + } + return state->v0[0] + state->v1[0] + state->mul0[0] + state->mul1[0]; +} + +static void HighwayHashFinalize128(HighwayHashState* state, uint64_t hash[2]) { + int i; + for (i = 0; i < 6; i++) { + PermuteAndUpdate(state); + } + hash[0] = state->v0[0] + state->mul0[0] + state->v1[2] + state->mul1[2]; + hash[1] = state->v0[1] + state->mul0[1] + state->v1[3] + state->mul1[3]; +} + +static void HighwayHashFinalize256(HighwayHashState* state, uint64_t hash[4]) { + int i; + /* We anticipate that 256-bit hashing will be mostly used with long messages + because storing and using the 256-bit hash (in contrast to 128-bit) + carries a larger additional constant cost by itself. Doing extra rounds + here hardly increases the per-byte cost of long messages. 
*/ + for (i = 0; i < 10; i++) { + PermuteAndUpdate(state); + } + ModularReduction(state->v1[1] + state->mul1[1], state->v1[0] + state->mul1[0], + state->v0[1] + state->mul0[1], state->v0[0] + state->mul0[0], + &hash[1], &hash[0]); + ModularReduction(state->v1[3] + state->mul1[3], state->v1[2] + state->mul1[2], + state->v0[3] + state->mul0[3], state->v0[2] + state->mul0[2], + &hash[3], &hash[2]); +} + +/*////////////////////////////////////////////////////////////////////////////*/ +/* Non-cat API: single call on full data */ +/*////////////////////////////////////////////////////////////////////////////*/ + +static void ProcessAll(const uint8_t* data, size_t size, const uint64_t key[4], + HighwayHashState* state) { + size_t i; + HighwayHashReset(key, state); + for (i = 0; i + 32 <= size; i += 32) { + HighwayHashUpdatePacket(data + i, state); + } + if ((size & 31) != 0) HighwayHashUpdateRemainder(data + i, size & 31, state); +} + +uint64_t HighwayHash64(const uint8_t* data, size_t size, + const uint64_t key[4]) { + HighwayHashState state; + ProcessAll(data, size, key, &state); + return HighwayHashFinalize64(&state); +} + +void HighwayHash128(const uint8_t* data, size_t size, + const uint64_t key[4], uint64_t hash[2]) { + HighwayHashState state; + ProcessAll(data, size, key, &state); + HighwayHashFinalize128(&state, hash); +} + +void HighwayHash256(const uint8_t* data, size_t size, + const uint64_t key[4], uint64_t hash[4]) { + HighwayHashState state; + ProcessAll(data, size, key, &state); + HighwayHashFinalize256(&state, hash); +} + +/*////////////////////////////////////////////////////////////////////////////*/ +/* Cat API: allows appending with multiple calls */ +/*////////////////////////////////////////////////////////////////////////////*/ + +void HighwayHashCatStart(const uint64_t key[4], HighwayHashCat* state) { + HighwayHashReset(key, &state->state); + state->num = 0; +} + +void HighwayHashCatAppend(const uint8_t* bytes, size_t num, + HighwayHashCat* state) { + size_t i; + if (state->num != 0) { + size_t num_add = num > (32u - state->num) ? 
(32u - state->num) : num;
+    for (i = 0; i < num_add; i++) {
+      state->packet[state->num + i] = bytes[i];
+    }
+    state->num += num_add;
+    num -= num_add;
+    bytes += num_add;
+    if (state->num == 32) {
+      HighwayHashUpdatePacket(state->packet, &state->state);
+      state->num = 0;
+    }
+  }
+  while (num >= 32) {
+    HighwayHashUpdatePacket(bytes, &state->state);
+    num -= 32;
+    bytes += 32;
+  }
+  for (i = 0; i < num; i++) {
+    state->packet[state->num] = bytes[i];
+    state->num++;
+  }
+}
+
+uint64_t HighwayHashCatFinish64(const HighwayHashCat* state) {
+  HighwayHashState copy = state->state;
+  if (state->num) {
+    HighwayHashUpdateRemainder(state->packet, state->num, &copy);
+  }
+  return HighwayHashFinalize64(&copy);
+}
+
+void HighwayHashCatFinish128(const HighwayHashCat* state, uint64_t hash[2]) {
+  HighwayHashState copy = state->state;
+  if (state->num) {
+    HighwayHashUpdateRemainder(state->packet, state->num, &copy);
+  }
+  HighwayHashFinalize128(&copy, hash);
+}
+
+void HighwayHashCatFinish256(const HighwayHashCat* state, uint64_t hash[4]) {
+  HighwayHashState copy = state->state;
+  if (state->num) {
+    HighwayHashUpdateRemainder(state->packet, state->num, &copy);
+  }
+  HighwayHashFinalize256(&copy, hash);
+}
diff --git a/highwayhash/c/highwayhash.h b/highwayhash/c/highwayhash.h
new file mode 100644
index 000000000..10c877fdc
--- /dev/null
+++ b/highwayhash/c/highwayhash.h
@@ -0,0 +1,100 @@
+#ifndef C_HIGHWAYHASH_H_
+#define C_HIGHWAYHASH_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/*//////////////////////////////////////////////////////////////////////////*/
+/* Low-level API, use for implementing streams etc... */
+/*//////////////////////////////////////////////////////////////////////////*/
+
+typedef struct {
+  uint64_t v0[4];
+  uint64_t v1[4];
+  uint64_t mul0[4];
+  uint64_t mul1[4];
+} HighwayHashState;
+
+/* Initializes state with given key */
+static void HighwayHashReset(const uint64_t key[4], HighwayHashState* state);
+/* Takes a packet of 32 bytes */
+void HighwayHashUpdatePacket(const uint8_t* packet, HighwayHashState* state);
+/* Adds the final 1..31 bytes, do not use if 0 remain */
+void HighwayHashUpdateRemainder(const uint8_t* bytes, const size_t size_mod32,
+                                HighwayHashState* state);
+/* Compute final hash value. Makes state invalid.
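+   The 64-, 128- and 256-bit finalizers perform 4, 6 and 10 additional
+   permute-and-update rounds, respectively, before reducing the state to the
+   output (see HighwayHashFinalize* in the .c file).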
+*/
+static uint64_t HighwayHashFinalize64(HighwayHashState* state);
+static void HighwayHashFinalize128(HighwayHashState* state, uint64_t hash[2]);
+static void HighwayHashFinalize256(HighwayHashState* state, uint64_t hash[4]);
+
+/*//////////////////////////////////////////////////////////////////////////*/
+/* Non-cat API: single call on full data */
+/*//////////////////////////////////////////////////////////////////////////*/
+
+uint64_t HighwayHash64(const uint8_t* data, size_t size,
+                       const uint64_t key[4]);
+
+void HighwayHash128(const uint8_t* data, size_t size,
+                    const uint64_t key[4], uint64_t hash[2]);
+
+void HighwayHash256(const uint8_t* data, size_t size,
+                    const uint64_t key[4], uint64_t hash[4]);
+
+/*//////////////////////////////////////////////////////////////////////////*/
+/* Cat API: allows appending with multiple calls */
+/*//////////////////////////////////////////////////////////////////////////*/
+
+typedef struct {
+  HighwayHashState state;
+  uint8_t packet[32];
+  int num;
+} HighwayHashCat;
+
+/* Allocates new state for a new streaming hash computation */
+void HighwayHashCatStart(const uint64_t key[4], HighwayHashCat* state);
+
+void HighwayHashCatAppend(const uint8_t* bytes, size_t num,
+                          HighwayHashCat* state);
+
+/* Computes final hash value */
+uint64_t HighwayHashCatFinish64(const HighwayHashCat* state);
+void HighwayHashCatFinish128(const HighwayHashCat* state, uint64_t hash[2]);
+void HighwayHashCatFinish256(const HighwayHashCat* state, uint64_t hash[4]);
+
+/*
+Usage examples:
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+
+void Example64() {
+  uint64_t key[4] = {1, 2, 3, 4};
+  const char* text = "Hello world!";
+  size_t size = strlen(text);
+  uint64_t hash = HighwayHash64((const uint8_t*)text, size, key);
+  printf("%016"PRIx64"\n", hash);
+}
+
+void Example64Cat() {
+  uint64_t key[4] = {1, 2, 3, 4};
+  HighwayHashCat state;
+  uint64_t hash;
+
+  HighwayHashCatStart(key, &state);
+
+  HighwayHashCatAppend((const uint8_t*)"Hello", 5, &state);
+  HighwayHashCatAppend((const uint8_t*)" world!", 7, &state);
+
+  hash = HighwayHashCatFinish64(&state);
+  printf("%016"PRIx64"\n", hash);
+}
+*/
+
+#if defined(__cplusplus) || defined(c_plusplus)
+} /* extern "C" */
+#endif
+
+#endif /* C_HIGHWAYHASH_H_ */
diff --git a/highwayhash/c/highwayhash_test.c b/highwayhash/c/highwayhash_test.c
new file mode 100644
index 000000000..9f9ee3367
--- /dev/null
+++ b/highwayhash/c/highwayhash_test.c
@@ -0,0 +1,70 @@
+#include "c/highwayhash.h"
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define kMaxSize 64
+
+static const uint64_t kTestKey1[4] = {
+  0x0706050403020100ull, 0x0F0E0D0C0B0A0908ull,
+  0x1716151413121110ull, 0x1F1E1D1C1B1A1918ull
+};
+
+static const uint64_t kTestKey2[4] = {
+  1ull, 2ull, 3ull, 4ull
+};
+
+const uint64_t kExpected64[kMaxSize + 1] = {
+  0x907A56DE22C26E53ull, 0x7EAB43AAC7CDDD78ull, 0xB8D0569AB0B53D62ull,
+  0x5C6BEFAB8A463D80ull, 0xF205A46893007EDAull, 0x2B8A1668E4A94541ull,
+  0xBD4CCC325BEFCA6Full, 0x4D02AE1738F59482ull, 0xE1205108E55F3171ull,
+  0x32D2644EC77A1584ull, 0xF6E10ACDB103A90Bull, 0xC3BBF4615B415C15ull,
+  0x243CC2040063FA9Cull, 0xA89A58CE65E641FFull, 0x24B031A348455A23ull,
+  0x40793F86A449F33Bull, 0xCFAB3489F97EB832ull, 0x19FE67D2C8C5C0E2ull,
+  0x04DD90A69C565CC2ull, 0x75D9518E2371C504ull, 0x38AD9B1141D3DD16ull,
+  0x0264432CCD8A70E0ull, 0xA9DB5A6288683390ull, 0xD7B05492003F028Cull,
+  0x205F615AEA59E51Eull, 0xEEE0C89621052884ull, 0x1BFC1A93A7284F4Full,
+  0x512175B5B70DA91Dull, 0xF71F8976A0A2C639ull, 0xAE093FEF1F84E3E7ull,
+  0x22CA92B01161860Full, 0x9FC7007CCF035A68ull,
+  0xA0C964D9ECD580FCull, 0x2C90F73CA03181FCull, 0x185CF84E5691EB9Eull,
+  0x4FC1F5EF2752AA9Bull, 0xF5B7391A5E0A33EBull, 0xB9B84B83B4E96C9Cull,
+  0x5E42FE712A5CD9B4ull, 0xA150F2F90C3F97DCull, 0x7FA522D75E2D637Dull,
+  0x181AD0CC0DFFD32Bull, 0x3889ED981E854028ull, 0xFB4297E8C586EE2Dull,
+  0x6D064A45BB28059Cull, 0x90563609B3EC860Cull, 0x7AA4FCE94097C666ull,
+  0x1326BAC06B911E08ull, 0xB926168D2B154F34ull, 0x9919848945B1948Dull,
+  0xA2A98FC534825EBEull, 0xE9809095213EF0B6ull, 0x582E5483707BC0E9ull,
+  0x086E9414A88A6AF5ull, 0xEE86B98D20F6743Dull, 0xF89B7FF609B1C0A7ull,
+  0x4C7D9CC19E22C3E8ull, 0x9A97005024562A6Full, 0x5DD41CF423E6EBEFull,
+  0xDF13609C0468E227ull, 0x6E0DA4F64188155Aull, 0xB755BA4B50D7D4A1ull,
+  0x887A3484647479BDull, 0xAB8EEBE9BF2139A0ull, 0x75542C5D4CD2A6FFull};
+
+void TestHash64(uint64_t expected, const uint8_t* data, size_t size,
+                const uint64_t* key) {
+  uint64_t hash = HighwayHash64(data, size, key);
+  if (expected != hash) {
+    printf("Test failed: expected %016"PRIx64", got %016"PRIx64", size: %d\n",
+           expected, hash, (int) size);
+    exit(1);
+  }
+}
+
+int main() {
+  uint8_t data[kMaxSize + 1] = {0};
+  int i;
+  for (i = 0; i <= kMaxSize; i++) {
+    data[i] = i;
+    TestHash64(kExpected64[i], data, i, kTestKey1);
+  }
+
+  for (i = 0; i < 33; i++) {
+    data[i] = 128 + i;
+  }
+  TestHash64(0x53c516cce478cad7ull, data, 33, kTestKey2);
+
+  /* 128-bit and 256-bit tests to be added when they are declared frozen in the
+     C++ version */
+
+  printf("Test success\n");
+  return 0;
+}
diff --git a/highwayhash/google3/third_party/highwayhash/WORKSPACE b/highwayhash/google3/third_party/highwayhash/WORKSPACE
new file mode 100644
index 000000000..cca464c25
--- /dev/null
+++ b/highwayhash/google3/third_party/highwayhash/WORKSPACE
@@ -0,0 +1 @@
+workspace(name = "highwayhash")
diff --git a/highwayhash/highwayhash.3 b/highwayhash/highwayhash.3
new file mode 100644
index 000000000..54f3d1d93
--- /dev/null
+++ b/highwayhash/highwayhash.3
@@ -0,0 +1,107 @@
+.TH highwayhash 3 "April 25, 2017"
+
+.SH NAME
+highwayhash \- fast strong 64-bit hash functions
+
+.SH SYNOPSIS
+
+.B #include <highwayhash/c_bindings.h> /* C */
+
+  uint64_t SipHashC(const uint64_t* key, const char* bytes, const uint64_t size);
+
+  uint64_t SipHash13C(const uint64_t* key, const char* bytes, const uint64_t size);
+
+  uint64_t HighwayHash64(const HHKey key, const char* bytes, const uint64_t size);
+
+.B #include <highwayhash/highwayhash.h> /* C++ */
+
+  using namespace highwayhash;
+
+  void HighwayHashT(State* HH_RESTRICT state,
+                    const char* HH_RESTRICT bytes, const size_t size,
+                    Result* HH_RESTRICT hash);
+
+.B #include <highwayhash/sip_hash.h> /* C++ */
+
+  using namespace highwayhash;
+
+  HH_U64 SipHash(const SipHashState::Key& key, const char* bytes,
+                 const HH_U64 size);
+
+Link with
+.I
+-lhighwayhash
+
+.SH DESCRIPTION
+
+Hash functions are widely used, so it is desirable to increase their speed and
+security. This package provides two 'strong' (well-distributed and
+unpredictable) hash functions: a faster version of SipHash, and an even faster
+algorithm we call HighwayHash.
+
+SipHash is a fast but 'cryptographically strong' pseudo-random function by
+Aumasson and Bernstein [https://www.131002.net/siphash/siphash.pdf].
+
+HighwayHash is a new way of mixing inputs which may inspire new
+cryptographically strong hashes. Large inputs are processed at a rate of 0.24
+cycles per byte, and latency remains low even for small inputs. HighwayHash is
+faster than SipHash for all input sizes, with 5 times higher throughput at 1
+KiB. We discuss design choices and provide statistical analysis and preliminary
+cryptanalysis in https://arxiv.org/abs/1612.06257.
+
+.I
+Note: SipHash expects a uint64_t[2] key, while HighwayHash expects a
+uint64_t[4] key.
+
+.SH EXAMPLES
+
+64-bit SipHash for any CPU:
+
+    #include "highwayhash/sip_hash.h"
+    using namespace highwayhash;
+    HH_ALIGNAS(16) const HH_U64 key2[2] = {1234, 5678};
+    char in[8] = {1};
+    return SipHash(key2, in, 8);
+
+64, 128 or 256 bit HighwayHash for the CPU determined by compiler flags:
+
+    #include "highwayhash/highwayhash.h"
+    using namespace highwayhash;
+    HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4};
+    char in[8] = {1};
+    HHResult64 result;  // or HHResult128 or HHResult256
+    HHStateT<HH_TARGET> state(key);
+    HighwayHashT(&state, in, 8, &result);
+
+64, 128 or 256 bit HighwayHash for the CPU on which we're currently running:
+
+    #include "highwayhash/highwayhash_target.h"
+    #include "highwayhash/instruction_sets.h"
+    using namespace highwayhash;
+    HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4};
+    char in[8] = {1};
+    HHResult64 result;  // or HHResult128 or HHResult256
+    InstructionSets::Run<HighwayHash>(key, in, 8, &result);
+
+C-callable 64-bit HighwayHash for the CPU on which we're currently running:
+
+    #include "highwayhash/c_bindings.h"
+    const uint64_t key[4] = {1, 2, 3, 4};
+    char in[8] = {1};
+    return HighwayHash64(key, in, 8);
+
+.SH SEE ALSO
+
+/usr/include/highwayhash/c_bindings.h (C)
+
+/usr/include/highwayhash/highwayhash.h (C++)
+
+.SH BUGS
+
+https://github.com/google/highwayhash/issues
+
+.SH AUTHOR
+
+Upstream authors are Jan Wassenberg and Jyrki Alakuijala, updated 2017-02-07.
+
+This manpage was created by Adam Borowski and completed by Zhou Mo according
+to the upstream readme and header files.
\ No newline at end of file
diff --git a/highwayhash/highwayhash/arch_specific.cc b/highwayhash/highwayhash/arch_specific.cc
new file mode 100644
index 000000000..2a05860ce
--- /dev/null
+++ b/highwayhash/highwayhash/arch_specific.cc
@@ -0,0 +1,193 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "highwayhash/arch_specific.h" + +#include + +#if HH_ARCH_X64 && !HH_MSC_VERSION +#include +#endif + +#if HH_ARCH_PPC +#if __GLIBC__ +#include // __ppc_get_timebase_freq +#elif __FreeBSD__ +// clang-format off +#include +#include /* must come after sys/types.h */ +// clang-format on +#endif +#endif + +#include // memcpy +#include + +namespace highwayhash { + +const char* TargetName(const TargetBits target_bit) { + switch (target_bit) { + case HH_TARGET_Portable: + return "Portable"; + case HH_TARGET_SSE41: + return "SSE41"; + case HH_TARGET_AVX2: + return "AVX2"; + case HH_TARGET_VSX: + return "VSX"; + case HH_TARGET_NEON: + return "NEON"; + default: + return nullptr; // zero, multiple, or unknown bits + } +} + +#if HH_ARCH_X64 + +namespace { + +std::string BrandString() { + char brand_string[49]; + uint32_t abcd[4]; + + // Check if brand string is supported (it is on all reasonable Intel/AMD) + Cpuid(0x80000000U, 0, abcd); + if (abcd[0] < 0x80000004U) { + return std::string(); + } + + for (int i = 0; i < 3; ++i) { + Cpuid(0x80000002U + i, 0, abcd); + memcpy(brand_string + i * 16, &abcd, sizeof(abcd)); + } + brand_string[48] = 0; + return brand_string; +} + +} // namespace + +void Cpuid(const uint32_t level, const uint32_t count, + uint32_t* HH_RESTRICT abcd) { +#if HH_MSC_VERSION + int regs[4]; + __cpuidex(regs, level, count); + for (int i = 0; i < 4; ++i) { + abcd[i] = regs[i]; + } +#else + uint32_t a, b, c, d; + __cpuid_count(level, count, a, b, c, d); + abcd[0] = a; + abcd[1] = b; + abcd[2] = c; + abcd[3] = d; +#endif +} + +uint32_t ApicId() { + uint32_t abcd[4]; + Cpuid(1, 0, abcd); + return abcd[1] >> 24; // ebx +} + +#endif // HH_ARCH_X64 + +namespace { + +double DetectNominalClockRate() { +#if HH_ARCH_X64 + const std::string& brand_string = BrandString(); + // Brand strings include the maximum configured frequency. These prefixes are + // defined by Intel CPUID documentation. + const char* prefixes[3] = {"MHz", "GHz", "THz"}; + const double multipliers[3] = {1E6, 1E9, 1E12}; + for (size_t i = 0; i < 3; ++i) { + const size_t pos_prefix = brand_string.find(prefixes[i]); + if (pos_prefix != std::string::npos) { + const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); + if (pos_space != std::string::npos) { + const std::string digits = + brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); + return std::stod(digits) * multipliers[i]; + } + } + } +#elif HH_ARCH_PPC + double freq = -1; +#if __linux__ + char line[200]; + char* s; + char* value; + + FILE* f = fopen("/proc/cpuinfo", "r"); + if (f != nullptr) { + while (fgets(line, sizeof(line), f) != nullptr) { + // NOTE: the ':' is the only character we can rely on + if (!(value = strchr(line, ':'))) continue; + // terminate the valuename + *value++ = '\0'; + // skip any leading spaces + while (*value == ' ') value++; + if ((s = strchr(value, '\n'))) *s = '\0'; + + if (!strncasecmp(line, "clock", strlen("clock")) && + sscanf(value, "%lf", &freq) == 1) { + freq *= 1E6; + break; + } + } + fclose(f); + return freq; + } +#elif __FreeBSD__ + size_t length = sizeof(freq); + sysctlbyname("dev.cpu.0.freq", &freq, &length, NULL, 0); + freq *= 1E6; + return freq; +#endif +#endif + + return 0.0; +} + +} // namespace + +double NominalClockRate() { + // Thread-safe caching - this is called several times. 
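+  // (C++11 guarantees thread-safe initialization of function-local statics,
+  // so concurrent first calls are safe.)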
+ static const double cycles_per_second = DetectNominalClockRate(); + return cycles_per_second; +} + +double InvariantTicksPerSecond() { +#if HH_ARCH_PPC +#if __GLIBC__ + static const double cycles_per_second = __ppc_get_timebase_freq(); +#elif __FreeBSD__ + double cycles_per_second = 0; + size_t length = sizeof(cycles_per_second); + sysctlbyname("kern.timecounter.tc.timebase.frequency", &cycles_per_second, + &length, NULL, 0); +#elif __OpenBSD__ + /* There is currently no method of retrieving this via userland. + * This value is correct for Power8 and Power9. + */ + static const double cycles_per_second = 512000000; +#endif + return cycles_per_second; +#else + return NominalClockRate(); +#endif +} + +} // namespace highwayhash diff --git a/highwayhash/highwayhash/arch_specific.h b/highwayhash/highwayhash/arch_specific.h new file mode 100644 index 000000000..0b8c38417 --- /dev/null +++ b/highwayhash/highwayhash/arch_specific.h @@ -0,0 +1,179 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_ARCH_SPECIFIC_H_ +#define HIGHWAYHASH_ARCH_SPECIFIC_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. +// +// Background: older GCC/Clang require flags such as -mavx2 before AVX2 SIMD +// intrinsics can be used. These intrinsics are only used within blocks that +// first verify CPU capabilities. However, the flag also allows the compiler to +// generate AVX2 code in other places. This can violate the One Definition Rule, +// which requires multiple instances of a function with external linkage +// (e.g. extern inline in a header) to be "equivalent". To prevent the resulting +// crashes on non-AVX2 CPUs, any header (transitively) included from a +// translation unit compiled with different flags is "restricted". This means +// all function definitions must have internal linkage (e.g. static inline), or +// reside in namespace HH_TARGET_NAME, which expands to a name unique to the +// current compiler flags. +// +// Most C system headers are safe to include, but C++ headers should generally +// be avoided because they often do not specify static linkage and cannot +// reliably be wrapped in a namespace. 
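+//
+// For example, a non-static inline function defined in such a header could be
+// compiled into AVX2 code in one translation unit; the linker keeps a single
+// copy, which would then fault with an illegal instruction when reached on a
+// non-AVX2 CPU.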
+ +#include "highwayhash/compiler_specific.h" + +#include + +#if HH_MSC_VERSION +#include // _byteswap_* +#endif + +namespace highwayhash { + +#if defined(__x86_64__) || defined(_M_X64) +#define HH_ARCH_X64 1 +#else +#define HH_ARCH_X64 0 +#endif + +#if defined(__aarch64__) || defined(__arm64__) +#define HH_ARCH_AARCH64 1 +#else +#define HH_ARCH_AARCH64 0 +#endif + +#ifdef __arm__ +#define HH_ARCH_ARM 1 +#else +#define HH_ARCH_ARM 0 +#endif + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define HH_ARCH_NEON 1 +#else +#define HH_ARCH_NEON 0 +#endif + +#if defined(__powerpc64__) || defined(_M_PPC) +#define HH_ARCH_PPC 1 +#else +#define HH_ARCH_PPC 0 +#endif + +// Target := instruction set extension(s) such as SSE41. A translation unit can +// only provide a single target-specific implementation because they require +// different compiler flags. + +// Either the build system specifies the target by defining HH_TARGET_NAME +// (which is necessary for Portable on X64, and SSE41 on MSVC), or we'll choose +// the most efficient one that can be compiled given the current flags: +#ifndef HH_TARGET_NAME + +// To avoid excessive code size and dispatch overhead, we only support a few +// groups of extensions, e.g. FMA+BMI2+AVX+AVX2 =: "AVX2". These names must +// match the HH_TARGET_* suffixes below. +#ifdef __AVX2__ +#define HH_TARGET_NAME AVX2 +// MSVC does not set SSE4_1, but it does set AVX; checking for the latter means +// we at least get SSE4 on machines supporting AVX but not AVX2. +// https://stackoverflow.com/questions/18563978/detect-the-availability-of-sse-sse2-instruction-set-in-visual-studio +#elif defined(__SSE4_1__) || (HH_MSC_VERSION != 0 && defined(__AVX__)) +#define HH_TARGET_NAME SSE41 +#elif defined(__VSX__) +#define HH_TARGET_NAME VSX +#elif HH_ARCH_NEON +#define HH_TARGET_NAME NEON +#else +#define HH_TARGET_NAME Portable +#endif + +#endif // HH_TARGET_NAME + +#define HH_CONCAT(first, second) first##second +// Required due to macro expansion rules. +#define HH_EXPAND_CONCAT(first, second) HH_CONCAT(first, second) +// Appends HH_TARGET_NAME to "identifier_prefix". +#define HH_ADD_TARGET_SUFFIX(identifier_prefix) \ + HH_EXPAND_CONCAT(identifier_prefix, HH_TARGET_NAME) + +// HH_TARGET expands to an integer constant. Typical usage: HHStateT. +// This ensures your code will work correctly when compiler flags are changed, +// and benefit from subsequently added targets/specializations. +#define HH_TARGET HH_ADD_TARGET_SUFFIX(HH_TARGET_) + +// Deprecated former name of HH_TARGET; please use HH_TARGET instead. +#define HH_TARGET_PREFERRED HH_TARGET + +// Associate targets with integer literals so the preprocessor can compare them +// with HH_TARGET. Do not instantiate templates with these values - use +// HH_TARGET instead. Must be unique powers of two, see TargetBits. Always +// defined even if unavailable on this HH_ARCH to allow calling TargetName. +// The suffixes must match the HH_TARGET_NAME identifiers. +#define HH_TARGET_Portable 1 +#define HH_TARGET_SSE41 2 +#define HH_TARGET_AVX2 4 +#define HH_TARGET_VSX 8 +#define HH_TARGET_NEON 16 + +// Bit array for one or more HH_TARGET_*. Used to indicate which target(s) are +// supported or were called by InstructionSets::RunAll. +using TargetBits = unsigned; + +namespace HH_TARGET_NAME { + +// Calls func(bit_value) for every nonzero bit in "bits". 
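+// The loop isolates the lowest set bit via bits & (~bits + 1) (two's
+// complement) and then clears it, so e.g.
+// ForeachTarget(HH_TARGET_SSE41 | HH_TARGET_AVX2, func) invokes
+// func(HH_TARGET_SSE41) and then func(HH_TARGET_AVX2).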
+template +void ForeachTarget(TargetBits bits, const Func& func) { + while (bits != 0) { + const TargetBits lowest = bits & (~bits + 1); + func(lowest); + bits &= ~lowest; + } +} + +} // namespace HH_TARGET_NAME + +// Returns a brief human-readable string literal identifying one of the above +// bits, or nullptr if zero, multiple, or unknown bits are set. +const char* TargetName(const TargetBits target_bit); + +// Returns the nominal (without Turbo Boost) CPU clock rate [Hertz]. Useful for +// (roughly) characterizing the CPU speed. +double NominalClockRate(); + +// Returns tsc_timer frequency, useful for converting ticks to seconds. This is +// unaffected by CPU throttling ("invariant"). Thread-safe. Returns timebase +// frequency on PPC and NominalClockRate on all other platforms. +double InvariantTicksPerSecond(); + +#if HH_ARCH_X64 + +// Calls CPUID instruction with eax=level and ecx=count and returns the result +// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd). +void Cpuid(const uint32_t level, const uint32_t count, + uint32_t* HH_RESTRICT abcd); + +// Returns the APIC ID of the CPU on which we're currently running. +uint32_t ApicId(); + +#endif // HH_ARCH_X64 + +} // namespace highwayhash + +#endif // HIGHWAYHASH_ARCH_SPECIFIC_H_ diff --git a/highwayhash/highwayhash/benchmark.cc b/highwayhash/highwayhash/benchmark.cc new file mode 100644 index 000000000..7cc304ffc --- /dev/null +++ b/highwayhash/highwayhash/benchmark.cc @@ -0,0 +1,331 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Measures hash function throughput for various input sizes. + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" +#include "highwayhash/instruction_sets.h" +#include "highwayhash/nanobenchmark.h" +#include "highwayhash/robust_statistics.h" + +// Which functions to enable (includes check for compiler support) +#define BENCHMARK_SIP 0 +#define BENCHMARK_SIP_TREE 0 +#define BENCHMARK_HIGHWAY 1 +#define BENCHMARK_HIGHWAY_CAT 1 +#define BENCHMARK_FARM 0 +#define BENCHMARK_INTERNAL 0 + +#include "highwayhash/highwayhash_test_target.h" +#if BENCHMARK_SIP +#include "highwayhash/sip_hash.h" +#endif +#if BENCHMARK_SIP_TREE +#include "highwayhash/scalar_sip_tree_hash.h" +#include "highwayhash/sip_tree_hash.h" +#endif +#if BENCHMARK_FARM +#include "third_party/farmhash/src/farmhash.h" +#endif + +#if BENCHMARK_INTERNAL +// Placeholder for include +#endif + +namespace highwayhash { +namespace { + +// Stores time measurements from benchmarks, with support for printing them +// as LaTeX figures or tables. +class Measurements { + public: + void Add(const char* caption, const size_t bytes, const double cycles) { + const float cpb = static_cast(cycles / bytes); + results_.emplace_back(caption, static_cast(bytes), cpb); + } + + // Prints results as a LaTeX table (only for in_sizes matching the + // desired values). 
+ void PrintTable(const std::vector& in_sizes) { + std::vector unique = in_sizes; + std::sort(unique.begin(), unique.end()); + unique.erase(std::unique(unique.begin(), unique.end()), unique.end()); + + printf("\\begin{tabular}{"); + for (size_t i = 0; i < unique.size() + 1; ++i) { + printf("%s", i == 0 ? "r" : "|r"); + } + printf("}\n\\toprule\nAlgorithm"); + for (const size_t in_size : unique) { + printf(" & %zu", in_size); + } + printf("\\\\\n\\midrule\n"); + + const SpeedsForCaption cpb_for_caption = SortByCaptionFilterBySize(unique); + for (const auto& item : cpb_for_caption) { + printf("%22s", item.first.c_str()); + for (const float cpb : item.second) { + printf(" & %5.2f", cpb); + } + printf("\\\\\n"); + } + } + + // Prints results suitable for pgfplots. + void PrintPlots() { + const SpeedsForCaption cpb_for_caption = SortByCaption(); + assert(!cpb_for_caption.empty()); + const size_t num_sizes = cpb_for_caption.begin()->second.size(); + + printf("Size "); + // Flatten per-caption vectors into one iterator. + std::vector::const_iterator> iterators; + for (const auto& item : cpb_for_caption) { + printf("%21s ", item.first.c_str()); + assert(item.second.size() == num_sizes); + iterators.push_back(item.second.begin()); + } + printf("\n"); + + const std::vector& sizes = UniqueSizes(); + assert(num_sizes == sizes.size()); + for (int i = 0; i < static_cast(num_sizes); ++i) { + printf("%d ", sizes[i]); + for (auto& it : iterators) { + printf("%5.2f ", 1.0f / *it); // bytes per cycle + ++it; + } + printf("\n"); + } + } + + private: + struct Result { + Result(const char* caption, const int in_size, const float cpb) + : caption(caption), in_size(in_size), cpb(cpb) {} + + // Algorithm name. + std::string caption; + // Size of the input data [bytes]. + int in_size; + // Measured throughput [cycles per byte]. + float cpb; + }; + + // Returns set of all input sizes for the first column of a size/speed plot. + std::vector UniqueSizes() { + std::vector sizes; + sizes.reserve(results_.size()); + for (const Result& result : results_) { + sizes.push_back(result.in_size); + } + std::sort(sizes.begin(), sizes.end()); + sizes.erase(std::unique(sizes.begin(), sizes.end()), sizes.end()); + return sizes; + } + + using SpeedsForCaption = std::map>; + + SpeedsForCaption SortByCaption() const { + SpeedsForCaption cpb_for_caption; + for (const Result& result : results_) { + cpb_for_caption[result.caption].push_back(result.cpb); + } + return cpb_for_caption; + } + + // Only includes measurement results matching one of the given sizes. 
+ SpeedsForCaption SortByCaptionFilterBySize( + const std::vector& in_sizes) const { + SpeedsForCaption cpb_for_caption; + for (const Result& result : results_) { + for (const size_t in_size : in_sizes) { + if (result.in_size == static_cast(in_size)) { + cpb_for_caption[result.caption].push_back(result.cpb); + } + } + } + return cpb_for_caption; + } + + std::vector results_; +}; + +void AddMeasurements(DurationsForInputs* input_map, const char* caption, + Measurements* measurements) { + for (size_t i = 0; i < input_map->num_items; ++i) { + const DurationsForInputs::Item& item = input_map->items[i]; + std::vector durations(item.durations, + item.durations + item.num_durations); + const float median_ticks = Median(&durations); + const float variability = MedianAbsoluteDeviation(durations, median_ticks); + const double median_cpu_cycles = + (median_ticks / InvariantTicksPerSecond()) * NominalClockRate(); + printf("%s %4zu: median=%6.1f ticks; median L1 norm =%4.1f ticks\n", + caption, item.input, median_ticks, variability); + measurements->Add(caption, item.input, median_cpu_cycles); + } + input_map->num_items = 0; +} + +#if BENCHMARK_SIP || BENCHMARK_FARM || BENCHMARK_INTERNAL || \ + (BENCHMARK_SIP_TREE && defined(__AVX2__)) + +void MeasureAndAdd(DurationsForInputs* input_map, const char* caption, + const Func func, Measurements* measurements) { + MeasureDurations(func, input_map); + AddMeasurements(input_map, caption, measurements); +} + +#endif + +// InstructionSets::RunAll callback. +void AddMeasurementsWithPrefix(const char* prefix, const char* target_name, + DurationsForInputs* input_map, void* context) { + std::string caption(prefix); + caption += target_name; + AddMeasurements(input_map, caption.c_str(), + static_cast(context)); +} + +#if BENCHMARK_SIP + +uint64_t RunSip(const void*, const size_t size) { + HH_ALIGNAS(16) const HH_U64 key2[2] = {0, 1}; + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return SipHash(key2, in, size); +} + +uint64_t RunSip13(const void*, const size_t size) { + HH_ALIGNAS(16) const HH_U64 key2[2] = {0, 1}; + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return SipHash13(key2, in, size); +} + +#endif + +#if BENCHMARK_SIP_TREE + +uint64_t RunSipTree(const void*, const size_t size) { + HH_ALIGNAS(32) const HH_U64 key4[4] = {0, 1, 2, 3}; + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return SipTreeHash(key4, in, size); +} + +uint64_t RunSipTree13(const void*, const size_t size) { + HH_ALIGNAS(32) const HH_U64 key4[4] = {0, 1, 2, 3}; + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return SipTreeHash13(key4, in, size); +} + +#endif + +#if BENCHMARK_FARM + +uint64_t RunFarm(const void*, const size_t size) { + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return farmhash::Fingerprint64(reinterpret_cast(in), size); +} + +#endif + +#if BENCHMARK_INTERNAL +uint64_t RunInternal(const void*, const size_t size) { + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return in[rand() % size]; +} +#endif + +void AddMeasurements(const std::vector& in_sizes, + Measurements* measurements) { + DurationsForInputs input_map(in_sizes.data(), in_sizes.size(), 40); +#if BENCHMARK_SIP + MeasureAndAdd(&input_map, "SipHash", &RunSip, measurements); + MeasureAndAdd(&input_map, "SipHash13", &RunSip13, measurements); +#endif + +#if BENCHMARK_SIP_TREE && defined(__AVX2__) + MeasureAndAdd(&input_map, "SipTreeHash", &RunSipTree, measurements); + 
MeasureAndAdd(&input_map, "SipTreeHash13", &RunSipTree13, measurements); +#endif + +#if BENCHMARK_FARM + MeasureAndAdd(&input_map, "Farm", &RunFarm, measurements); +#endif + +#if BENCHMARK_INTERNAL + MeasureAndAdd(&input_map, "Internal", &RunInternal, measurements); +#endif + +#if BENCHMARK_HIGHWAY + InstructionSets::RunAll( + &input_map, &AddMeasurementsWithPrefix, measurements); +#endif + +#if BENCHMARK_HIGHWAY_CAT + InstructionSets::RunAll( + &input_map, &AddMeasurementsWithPrefix, measurements); +#endif +} + +void PrintTable() { + const std::vector in_sizes = { + 7, 8, 31, 32, 63, 64, kMaxBenchmarkInputSize}; + Measurements measurements; + AddMeasurements(in_sizes, &measurements); + measurements.PrintTable(in_sizes); +} + +void PrintPlots() { + std::vector in_sizes; + for (int num_vectors = 0; num_vectors < 12; ++num_vectors) { + for (int remainder : {0, 9, 18, 27}) { + in_sizes.push_back(num_vectors * 32 + remainder); + assert(in_sizes.back() <= kMaxBenchmarkInputSize); + } + } + + Measurements measurements; + AddMeasurements(in_sizes, &measurements); + measurements.PrintPlots(); +} + +} // namespace +} // namespace highwayhash + +int main(int argc, char* argv[]) { + // No argument or t => table + if (argc < 2 || argv[1][0] == 't') { + highwayhash::PrintTable(); + } else if (argv[1][0] == 'p') { + highwayhash::PrintPlots(); + } + return 0; +} diff --git a/highwayhash/highwayhash/c_bindings.cc b/highwayhash/highwayhash/c_bindings.cc new file mode 100644 index 000000000..7e0488fb4 --- /dev/null +++ b/highwayhash/highwayhash/c_bindings.cc @@ -0,0 +1,35 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "highwayhash/c_bindings.h" + +#include "highwayhash/highwayhash_target.h" +#include "highwayhash/instruction_sets.h" + +using highwayhash::InstructionSets; +using highwayhash::HighwayHash; + +extern "C" { + +// Ideally this would reside in highwayhash_target.cc, but that file is +// compiled multiple times and we must only define this function once. +uint64_t HighwayHash64(const HHKey key, const char* bytes, + const uint64_t size) { + HHResult64 result; + InstructionSets::Run(*reinterpret_cast(key), bytes, + size, &result); + return result; +} + +} // extern "C" diff --git a/highwayhash/highwayhash/c_bindings.h b/highwayhash/highwayhash/c_bindings.h new file mode 100644 index 000000000..903aabc0f --- /dev/null +++ b/highwayhash/highwayhash/c_bindings.h @@ -0,0 +1,57 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_C_BINDINGS_H_ +#define HIGHWAYHASH_C_BINDINGS_H_ + +// C-callable function prototypes, documented in the other header files. + +#include + +#include "hh_types.h" + +#ifdef __cplusplus +extern "C" { + +// Bring the symbols out of the namespace. +using highwayhash::HHKey; +using highwayhash::HHPacket; +using highwayhash::HHResult128; +using highwayhash::HHResult256; +using highwayhash::HHResult64; +#endif + +uint64_t SipHashC(const uint64_t* key, const char* bytes, const uint64_t size); +uint64_t SipHash13C(const uint64_t* key, const char* bytes, + const uint64_t size); + +// Uses the best implementation of HighwayHash for the current CPU and +// calculates 64-bit hash of given data. +uint64_t HighwayHash64(const HHKey key, const char* bytes, const uint64_t size); + +// Defined by highwayhash_target.cc, which requires a _Target* suffix. +uint64_t HighwayHash64_TargetPortable(const HHKey key, const char* bytes, + const uint64_t size); +uint64_t HighwayHash64_TargetSSE41(const HHKey key, const char* bytes, + const uint64_t size); +uint64_t HighwayHash64_TargetAVX2(const HHKey key, const char* bytes, + const uint64_t size); +uint64_t HighwayHash64_TargetVSX(const HHKey key, const char* bytes, + const uint64_t size); + +#ifdef __cplusplus +} +#endif + +#endif // HIGHWAYHASH_C_BINDINGS_H_ diff --git a/highwayhash/highwayhash/compiler_specific.h b/highwayhash/highwayhash/compiler_specific.h new file mode 100644 index 000000000..4789f9a61 --- /dev/null +++ b/highwayhash/highwayhash/compiler_specific.h @@ -0,0 +1,90 @@ +// Copyright 2015 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_COMPILER_SPECIFIC_H_ +#define HIGHWAYHASH_COMPILER_SPECIFIC_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +// Compiler + +// #if is shorter and safer than #ifdef. *_VERSION are zero if not detected, +// otherwise 100 * major + minor version. Note that other packages check for +// #ifdef COMPILER_MSVC, so we cannot use that same name. 
+
+#ifdef _MSC_VER
+#define HH_MSC_VERSION _MSC_VER
+#else
+#define HH_MSC_VERSION 0
+#endif
+
+#ifdef __GNUC__
+#define HH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#define HH_GCC_VERSION 0
+#endif
+
+#ifdef __clang__
+#define HH_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
+#else
+#define HH_CLANG_VERSION 0
+#endif
+
+//-----------------------------------------------------------------------------
+
+#if HH_GCC_VERSION && HH_GCC_VERSION < 408
+#define HH_ALIGNAS(multiple) __attribute__((aligned(multiple)))
+#else
+#define HH_ALIGNAS(multiple) alignas(multiple)  // C++11
+#endif
+
+#if HH_MSC_VERSION
+#define HH_RESTRICT __restrict
+#elif HH_GCC_VERSION
+#define HH_RESTRICT __restrict__
+#else
+#define HH_RESTRICT
+#endif
+
+#if HH_MSC_VERSION
+#define HH_INLINE __forceinline
+#define HH_NOINLINE __declspec(noinline)
+#else
+#define HH_INLINE inline
+#define HH_NOINLINE __attribute__((noinline))
+#endif
+
+#if HH_MSC_VERSION
+// Unsupported, __assume is not the same.
+#define HH_LIKELY(expr) expr
+#define HH_UNLIKELY(expr) expr
+#else
+#define HH_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define HH_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#endif
+
+#if HH_MSC_VERSION
+#include <intrin.h>
+#pragma intrinsic(_ReadWriteBarrier)
+#define HH_COMPILER_FENCE _ReadWriteBarrier()
+#elif HH_GCC_VERSION
+#define HH_COMPILER_FENCE asm volatile("" : : : "memory")
+#else
+#define HH_COMPILER_FENCE
+#endif
+
+#endif  // HIGHWAYHASH_COMPILER_SPECIFIC_H_
diff --git a/highwayhash/highwayhash/data_parallel.h b/highwayhash/highwayhash/data_parallel.h
new file mode 100644
index 000000000..d72afc953
--- /dev/null
+++ b/highwayhash/highwayhash/data_parallel.h
@@ -0,0 +1,341 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_DATA_PARALLEL_H_
+#define HIGHWAYHASH_DATA_PARALLEL_H_
+
+// Portable C++11 alternative to OpenMP for data-parallel computations:
+// provides low-overhead ThreadPool, plus PerThread with support for reduction.
+
+#include <stdio.h>
+#include <algorithm>  // find_if
+#include <atomic>
+#include <condition_variable>  //NOLINT
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <memory>
+#include <mutex>   //NOLINT
+#include <thread>  //NOLINT
+#include <utility>
+#include <vector>
+
+#define DATA_PARALLEL_CHECK(condition)                           \
+  while (!(condition)) {                                         \
+    printf("data_parallel check failed at line %d\n", __LINE__); \
+    abort();                                                     \
+  }
+
+namespace highwayhash {
+
+// Highly scalable thread pool, especially suitable for data-parallel
+// computations in the fork-join model, where clients need to know when all
+// tasks have completed.
+//
+// Thread pools usually store small numbers of heterogeneous tasks in a queue.
+// When tasks are identical or differ only by an integer input parameter, it is
+// much faster to store just one function of an integer parameter and call it
+// for each value.
+//
+// This thread pool can efficiently load-balance millions of tasks using an
+// atomic counter, thus avoiding per-task syscalls.
+// With 48 hyperthreads and 1M tasks that add to an atomic counter, overall
+// runtime is 10-20x higher when using std::async, and up to 200x for a
+// queue-based ThreadPool.
+//
+// Usage:
+// ThreadPool pool;
+// pool.Run(0, 1000000, [](const int i) { Func1(i); });
+// // When Run returns, all of its tasks have finished.
+//
+// pool.RunTasks({Func2, Func3, Func4});
+// // The destructor waits until all worker threads have exited cleanly.
+class ThreadPool {
+ public:
+  // Starts the given number of worker threads and blocks until they are ready.
+  // "num_threads" defaults to one per hyperthread.
+  explicit ThreadPool(
+      const int num_threads = std::thread::hardware_concurrency())
+      : num_threads_(num_threads) {
+    DATA_PARALLEL_CHECK(num_threads_ > 0);
+    threads_.reserve(num_threads_);
+    for (int i = 0; i < num_threads_; ++i) {
+      threads_.emplace_back(ThreadFunc, this);
+    }
+
+    padding_[0] = 0;  // avoid unused member warning.
+
+    WorkersReadyBarrier();
+  }
+
+  ThreadPool(const ThreadPool&) = delete;
+  ThreadPool& operator=(const ThreadPool&) = delete;
+
+  // Waits for all threads to exit.
+  ~ThreadPool() {
+    StartWorkers(kWorkerExit);
+
+    for (std::thread& thread : threads_) {
+      thread.join();
+    }
+  }
+
+  // Runs func(i) on worker thread(s) for every i in [begin, end).
+  // Not thread-safe - no two calls to Run and RunTasks may overlap.
+  // Subsequent calls will reuse the same threads.
+  //
+  // Precondition: 0 <= begin <= end.
+  template <class Func>
+  void Run(const int begin, const int end, const Func& func) {
+    DATA_PARALLEL_CHECK(0 <= begin && begin <= end);
+    if (begin == end) {
+      return;
+    }
+    const WorkerCommand worker_command = (WorkerCommand(end) << 32) + begin;
+    // Ensure the inputs do not result in a reserved command.
+    DATA_PARALLEL_CHECK(worker_command != kWorkerWait);
+    DATA_PARALLEL_CHECK(worker_command != kWorkerExit);
+
+    // If Func is large (many captures), this will allocate memory, but it is
+    // still slower to use a std::ref wrapper.
+    task_ = func;
+    num_reserved_.store(0);
+
+    StartWorkers(worker_command);
+    WorkersReadyBarrier();
+  }
+
+  // Runs each task (closure, typically a lambda function) on worker thread(s).
+  // Not thread-safe - no two calls to Run and RunTasks may overlap.
+  // Subsequent calls will reuse the same threads.
+  //
+  // This is a more conventional interface for heterogeneous tasks that may be
+  // independent/unrelated.
+  void RunTasks(const std::vector<std::function<void(void)>>& tasks) {
+    Run(0, static_cast<int>(tasks.size()),
+        [&tasks](const int i) { tasks[i](); });
+  }
+
+  // Statically (and deterministically) splits [begin, end) into ranges and
+  // calls "func" for each of them. Useful when "func" involves some overhead
+  // (e.g. for PerThread::Get or random seeding) that should be amortized over
+  // a range of values. "func" is void(int chunk, uint32_t begin, uint32_t end).
+  template <class Func>
+  void RunRanges(const uint32_t begin, const uint32_t end, const Func& func) {
+    const uint32_t length = end - begin;
+
+    // Use constant rather than num_threads_ for machine-independent splitting.
+    const uint32_t chunk = std::max(1U, (length + 127) / 128);
+    std::vector<std::pair<uint32_t, uint32_t>> ranges;  // begin/end
+    ranges.reserve(length / chunk + 1);
+    for (uint32_t i = 0; i < length; i += chunk) {
+      ranges.emplace_back(begin + i, begin + std::min(i + chunk, length));
+    }
+
+    Run(0, static_cast<int>(ranges.size()), [&ranges, func](const int i) {
+      func(i, ranges[i].first, ranges[i].second);
+    });
+  }
+
+ private:
+  // After construction and between calls to Run, workers are "ready", i.e.
+  // waiting on worker_start_cv_. They are "started" by sending a "command"
+  // and notifying all worker_start_cv_ waiters. (That is why all workers
+  // must be ready/waiting - otherwise, the notification will not reach all of
+  // them and the main thread waits in vain for them to report readiness.)
+  using WorkerCommand = uint64_t;
+
+  // Special values; all others encode the begin/end parameters.
+  static constexpr WorkerCommand kWorkerWait = 0;
+  static constexpr WorkerCommand kWorkerExit = ~0ULL;
+
+  void WorkersReadyBarrier() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    workers_ready_cv_.wait(
+        lock, [this]() { return workers_ready_ == num_threads_; });
+    workers_ready_ = 0;
+  }
+
+  // Precondition: all workers are ready.
+  void StartWorkers(const WorkerCommand worker_command) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    worker_start_command_ = worker_command;
+    // Workers will need this lock, so release it before they wake up.
+    lock.unlock();
+    worker_start_cv_.notify_all();
+  }
+
+  // Attempts to reserve and perform some work from the global range of tasks,
+  // which is encoded within "command". Returns after all tasks are reserved.
+  static void RunRange(ThreadPool* self, const WorkerCommand command) {
+    const int begin = command & 0xFFFFFFFF;
+    const int end = command >> 32;
+    const int num_tasks = end - begin;
+
+    // OpenMP introduced several "schedule" strategies:
+    // "single" (static assignment of exactly one chunk per thread): slower.
+    // "dynamic" (allocates k tasks at a time): competitive for well-chosen k.
+    // "guided" (allocates k tasks, decreases k): computing k = remaining/n
+    //   is faster than halving k each iteration. We prefer this strategy
+    //   because it avoids user-specified parameters.
+
+    for (;;) {
+      const int num_reserved = self->num_reserved_.load();
+      const int num_remaining = num_tasks - num_reserved;
+      const int my_size =
+          std::max(num_remaining / (self->num_threads_ * 2), 1);
+      const int my_begin = begin + self->num_reserved_.fetch_add(my_size);
+      const int my_end = std::min(my_begin + my_size, begin + num_tasks);
+      // Another thread already reserved the last task.
+      if (my_begin >= my_end) {
+        break;
+      }
+      for (int i = my_begin; i < my_end; ++i) {
+        self->task_(i);
+      }
+    }
+  }
+
+  static void ThreadFunc(ThreadPool* self) {
+    // Until kWorkerExit command received:
+    for (;;) {
+      std::unique_lock<std::mutex> lock(self->mutex_);
+      // Notify main thread that this thread is ready.
+      if (++self->workers_ready_ == self->num_threads_) {
+        self->workers_ready_cv_.notify_one();
+      }
+    RESUME_WAIT:
+      // Wait for a command.
+      self->worker_start_cv_.wait(lock);
+      const WorkerCommand command = self->worker_start_command_;
+      switch (command) {
+        case kWorkerWait:    // spurious wakeup:
+          goto RESUME_WAIT;  // lock still held, avoid incrementing ready.
+        case kWorkerExit:
+          return;  // exits thread
+      }
+
+      lock.unlock();
+      RunRange(self, command);
+    }
+  }
+
+  const int num_threads_;
+
+  // Unmodified after ctor, but cannot be const because we call thread::join().
+  std::vector<std::thread> threads_;
+
+  std::mutex mutex_;  // guards both cv and their variables.
+  std::condition_variable workers_ready_cv_;
+  int workers_ready_ = 0;
+  std::condition_variable worker_start_cv_;
+  WorkerCommand worker_start_command_;
+
+  // Written by main thread, read by workers (after mutex lock/unlock).
+  std::function<void(int)> task_;
+
+  // Updated by workers; alignment/padding avoids false sharing.
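+  // (64-byte alignment matches the typical x86 cache-line size; the int
+  // padding fills out the rest of the line so that workers hammering
+  // num_reserved_ do not invalidate the members above.)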
+  alignas(64) std::atomic<int> num_reserved_{0};
+  int padding_[15];
+};
+
+// Thread-local storage with support for reduction (combining into one result).
+// The "T" type must be unique to the call site because the list of threads'
+// copies is a static member. (With knowledge of the underlying threads, we
+// could eliminate this list and T allocations, but that is difficult to
+// arrange and we prefer this to be usable independently of ThreadPool.)
+//
+// Usage:
+// for (int i = 0; i < N; ++i) {
+//   // in each thread:
+//   T& my_copy = PerThread<T>::Get();
+//   my_copy.Modify();
+//
+//   // single-threaded:
+//   T& combined = PerThread<T>::Reduce();
+//   Use(combined);
+//   PerThread<T>::Destroy();
+// }
+//
+// T is duck-typed and implements the following interface:
+//
+// // Returns true if T is default-initialized or Destroy was called without
+// // any subsequent re-initialization.
+// bool IsNull() const;
+//
+// // Releases any resources. Postcondition: IsNull() == true.
+// void Destroy();
+//
+// // Merges in data from "victim". Precondition: !IsNull() && !victim.IsNull().
+// void Assimilate(const T& victim);
+template <class T>
+class PerThread {
+ public:
+  // Returns reference to this thread's T instance (dynamically allocated,
+  // so its address is unique). Callers are responsible for any initialization
+  // beyond the default ctor.
+  static T& Get() {
+    static thread_local T* t;
+    if (t == nullptr) {
+      t = new T;
+      static std::mutex mutex;
+      std::lock_guard<std::mutex> lock(mutex);
+      Threads().push_back(t);
+    }
+    return *t;
+  }
+
+  // Returns vector of all per-thread T. Used inside Reduce() or by clients
+  // that require direct access to T instead of Assimilating them.
+  // Function wrapper avoids separate static member variable definition.
+  static std::vector<T*>& Threads() {
+    static std::vector<T*> threads;
+    return threads;
+  }
+
+  // Returns the first non-null T after assimilating all other threads' T
+  // into it. Precondition: at least one non-null T exists (caller must have
+  // called Get() and initialized the result).
+  static T& Reduce() {
+    std::vector<T*>& threads = Threads();
+
+    // Find first non-null T
+    const auto it = std::find_if(threads.begin(), threads.end(),
+                                 [](const T* t) { return !t->IsNull(); });
+    if (it == threads.end()) {
+      abort();
+    }
+    T* const first = *it;
+
+    for (const T* t : threads) {
+      if (t != first && !t->IsNull()) {
+        first->Assimilate(*t);
+      }
+    }
+    return *first;
+  }
+
+  // Calls each thread's T::Destroy to release resources and/or prepare for
+  // reuse by the same threads/ThreadPool. Note that all T remain allocated
+  // (we need thread-independent pointers for iterating over each thread's T,
+  // and deleting them would leave dangling pointers in each thread, which is
+  // unacceptable because the same thread may call Get() again later.)
+  static void Destroy() {
+    for (T* t : Threads()) {
+      t->Destroy();
+    }
+  }
+};
+
+}  // namespace highwayhash
+
+#endif  // HIGHWAYHASH_DATA_PARALLEL_H_
diff --git a/highwayhash/highwayhash/data_parallel_benchmark.cc b/highwayhash/highwayhash/data_parallel_benchmark.cc
new file mode 100644
index 000000000..b8817c5c1
--- /dev/null
+++ b/highwayhash/highwayhash/data_parallel_benchmark.cc
@@ -0,0 +1,157 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <atomic>
+#include <cmath>
+#include <future>  //NOLINT
+#include <thread>
+
+#include "testing/base/public/gunit.h"
+#include "third_party/absl/container/btree_set.h"
+#include "third_party/absl/time/clock.h"
+#include "third_party/absl/time/time.h"
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/data_parallel.h"
+#include "thread/threadpool.h"
+
+namespace highwayhash {
+namespace {
+
+constexpr int kBenchmarkTasks = 1000000;
+
+// Returns elapsed time [nanoseconds] for std::async.
+double BenchmarkAsync(uint64_t* total) {
+  const absl::Time t0 = absl::Now();
+  std::atomic<uint64_t> sum1{0};
+  std::atomic<uint64_t> sum2{0};
+
+  std::vector<std::future<void>> futures;
+  futures.reserve(kBenchmarkTasks);
+  for (int i = 0; i < kBenchmarkTasks; ++i) {
+    futures.push_back(std::async(
+        [&sum1, &sum2](const int i) {
+          sum1.fetch_add(i);
+          sum2.fetch_add(1);
+        },
+        i));
+  }
+
+  for (auto& future : futures) {
+    future.get();
+  }
+
+  const absl::Time t1 = absl::Now();
+  *total = sum1.load() + sum2.load();
+  return absl::ToDoubleNanoseconds(t1 - t0);
+}
+
+// Returns elapsed time [nanoseconds] for (atomic) ThreadPool.
+double BenchmarkPoolA(uint64_t* total) {
+  const absl::Time t0 = absl::Now();
+  std::atomic<uint64_t> sum1{0};
+  std::atomic<uint64_t> sum2{0};
+
+  ThreadPool pool;
+  pool.Run(0, kBenchmarkTasks, [&sum1, &sum2](const int i) {
+    sum1.fetch_add(i);
+    sum2.fetch_add(1);
+  });
+
+  const absl::Time t1 = absl::Now();
+  *total = sum1.load() + sum2.load();
+  return absl::ToDoubleNanoseconds(t1 - t0);
+}
+
+// Returns elapsed time [nanoseconds] for ::ThreadPool.
+double BenchmarkPoolG(uint64_t* total) {
+  const absl::Time t0 = absl::Now();
+  std::atomic<uint64_t> sum1{0};
+  std::atomic<uint64_t> sum2{0};
+
+  {
+    ::ThreadPool pool(std::thread::hardware_concurrency());
+    pool.StartWorkers();
+    for (int i = 0; i < kBenchmarkTasks; ++i) {
+      pool.Schedule([&sum1, &sum2, i]() {
+        sum1.fetch_add(i);
+        sum2.fetch_add(1);
+      });
+    }
+  }
+
+  const absl::Time t1 = absl::Now();
+  *total = sum1.load() + sum2.load();
+  return absl::ToDoubleNanoseconds(t1 - t0);
+}
+
+// Compares ThreadPool speed to std::async and ::ThreadPool.
+TEST(DataParallelTest, Benchmarks) {
+  uint64_t sum1, sum2, sum3;
+  const double async_ns = BenchmarkAsync(&sum1);
+  const double poolA_ns = BenchmarkPoolA(&sum2);
+  const double poolG_ns = BenchmarkPoolG(&sum3);
+
+  printf("Async %11.0f ns\nPoolA %11.0f ns\nPoolG %11.0f ns\n", async_ns,
+         poolA_ns, poolG_ns);
+  // baseline 20x, 10x with asan or msan, 5x with tsan
+  EXPECT_GT(async_ns, poolA_ns * 4);
+  // baseline 200x, 180x with asan, 70x with msan, 50x with tsan.
+  EXPECT_GT(poolG_ns, poolA_ns * 20);
+
+  // Should reach same result.
+  EXPECT_EQ(sum1, sum2);
+  EXPECT_EQ(sum2, sum3);
+}
+
+#if HH_ARCH_X64
+// Ensures multiple hardware threads are used (decided by the OS scheduler).
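+// Each worker records its APIC ID, a per-core identifier on x86, so seeing
+// more than one distinct ID shows that the pool really ran on several cores.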
+TEST(DataParallelTest, TestApicIds) {
+  for (int num_threads = 1; num_threads <= std::thread::hardware_concurrency();
+       ++num_threads) {
+    ThreadPool pool(num_threads);
+
+    std::mutex mutex;
+    absl::btree_set<uint32_t> ids;
+    double total = 0.0;
+    pool.Run(0, 2 * num_threads, [&mutex, &ids, &total](const int i) {
+      // Useless computations to keep the processor busy so that threads
+      // can't just reuse the same processor.
+      double sum = 0.0;
+      for (int rep = 0; rep < 900 * (i + 30); ++rep) {
+        sum += pow(rep, 0.5);
+      }
+
+      mutex.lock();
+      ids.insert(ApicId());
+      total += sum;
+      mutex.unlock();
+    });
+
+    // No core ID / APIC ID available
+    if (num_threads > 1 && ids.size() == 1) {
+      EXPECT_EQ(0, *ids.begin());
+    } else {
+      // (The Linux scheduler doesn't use all available HTs, but the
+      // computations should at least keep most cores busy.)
+      EXPECT_GT(ids.size() + 2, num_threads / 4);
+    }
+
+    // (Ensure the busy-work is not elided.)
+    EXPECT_GT(total, 1E4);
+  }
+}
+#endif  // HH_ARCH_X64
+
+}  // namespace
+}  // namespace highwayhash
diff --git a/highwayhash/highwayhash/data_parallel_test.cc b/highwayhash/highwayhash/data_parallel_test.cc
new file mode 100644
index 000000000..2728b7d3a
--- /dev/null
+++ b/highwayhash/highwayhash/data_parallel_test.cc
@@ -0,0 +1,175 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+#include <atomic>
+
+#include "testing/base/public/gunit.h"
+#include "highwayhash/data_parallel.h"
+
+namespace highwayhash {
+namespace {
+
+int PopulationCount(uint64_t bits) {
+  int num_set = 0;
+  while (bits != 0) {
+    num_set += bits & 1;
+    bits >>= 1;
+  }
+  return num_set;
+}
+
+std::atomic<int> func_counts{0};
+
+void Func2() {
+  usleep(200000);
+  func_counts.fetch_add(4);
+}
+
+void Func3() {
+  usleep(300000);
+  func_counts.fetch_add(16);
+}
+
+void Func4() {
+  usleep(400000);
+  func_counts.fetch_add(256);
+}
+
+// Exercises the RunTasks feature (running arbitrary tasks/closures)
+TEST(DataParallelTest, TestRunTasks) {
+  ThreadPool pool(4);
+  pool.RunTasks({Func2, Func3, Func4});
+  EXPECT_EQ(276, func_counts.load());
+}
+
+// Ensures task parameter is in bounds, every parameter is reached,
+// pool can be reused (multiple consecutive Run calls), pool can be destroyed
+// (joining with its threads).
+TEST(DataParallelTest, TestPool) {
+  for (int num_threads = 1; num_threads <= 18; ++num_threads) {
+    ThreadPool pool(num_threads);
+    for (int num_tasks = 0; num_tasks < 32; ++num_tasks) {
+      std::vector<int> mementos(num_tasks, 0);
+      for (int begin = 0; begin < 32; ++begin) {
+        std::fill(mementos.begin(), mementos.end(), 0);
+        pool.Run(begin, begin + num_tasks,
+                 [begin, num_tasks, &mementos](const int i) {
+                   // Parameter is in the given range
+                   EXPECT_GE(i, begin);
+                   EXPECT_LT(i, begin + num_tasks);
+
+                   // Store mementos to be sure we visited each i.
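+                   // (1000 + i is distinct from the zero fill, so a skipped
+                   // or repeated i would be detected below.)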
+                   mementos.at(i - begin) = 1000 + i;
+                 });
+        for (int i = begin; i < begin + num_tasks; ++i) {
+          EXPECT_EQ(1000 + i, mementos.at(i - begin));
+        }
+      }
+    }
+  }
+}
+
+TEST(DataParallelTest, TestRunRanges) {
+  for (int num_threads = 1; num_threads <= 18; ++num_threads) {
+    ThreadPool pool(num_threads);
+    for (int num_tasks = 0; num_tasks < 32; ++num_tasks) {
+      std::vector<int> mementos(num_tasks, 0);
+      for (int begin = 0; begin < 32; ++begin) {
+        std::fill(mementos.begin(), mementos.end(), 0);
+        pool.RunRanges(begin, begin + num_tasks,
+                       [begin, num_tasks, &mementos](const int chunk,
+                                                     const uint32_t my_begin,
+                                                     const uint32_t my_end) {
+                         for (uint32_t i = my_begin; i < my_end; ++i) {
+                           // Parameter is in the given range
+                           EXPECT_GE(i, begin);
+                           EXPECT_LT(i, begin + num_tasks);
+
+                           // Store mementos to be sure we visited each i.
+                           mementos.at(i - begin) = 1000 + i;
+                         }
+                       });
+        for (int i = begin; i < begin + num_tasks; ++i) {
+          EXPECT_EQ(1000 + i, mementos.at(i - begin));
+        }
+      }
+    }
+  }
+}
+
+// Ensures each of N threads processes exactly 1 of N tasks, i.e. the
+// work distribution is perfectly fair for small counts.
+TEST(DataParallelTest, TestSmallAssignments) {
+  for (int num_threads = 1; num_threads <= 64; ++num_threads) {
+    ThreadPool pool(num_threads);
+
+    std::atomic<int> counter{0};
+    // (Avoid mutex because it may perturb the worker thread scheduling)
+    std::atomic<uint64_t> id_bits{0};
+
+    pool.Run(0, num_threads, [&counter, num_threads, &id_bits](const int i) {
+      const int id = counter.fetch_add(1);
+      EXPECT_LT(id, num_threads);
+      uint64_t bits = id_bits.load(std::memory_order_relaxed);
+      while (!id_bits.compare_exchange_weak(bits, bits | (1ULL << id))) {
+      }
+    });
+
+    const int num_participants = PopulationCount(id_bits.load());
+    EXPECT_EQ(num_threads, num_participants);
+  }
+}
+
+// Test payload for PerThread.
+struct CheckUniqueIDs {
+  bool IsNull() const { return false; }
+  void Destroy() { id_bits = 0; }
+  void Assimilate(const CheckUniqueIDs& victim) {
+    // Cannot overlap because each PerThread has unique bits.
+    EXPECT_EQ(0, id_bits & victim.id_bits);
+    id_bits |= victim.id_bits;
+  }
+
+  uint64_t id_bits = 0;
+};
+
+// Ensures each thread has a PerThread instance, that they are successfully
+// combined/reduced into a single result, and that reuse is possible after
+// Destroy().
+TEST(DataParallelTest, TestPerThread) {
+  // We use a uint64_t bit array for convenience => no more than 64 threads.
+  const int max_threads = std::min(64U, std::thread::hardware_concurrency());
+  for (int num_threads = 1; num_threads <= max_threads; ++num_threads) {
+    ThreadPool pool(num_threads);
+
+    std::atomic<int> counter{0};
+    pool.Run(0, num_threads, [&counter, num_threads](const int i) {
+      const int id = counter.fetch_add(1);
+      EXPECT_LT(id, num_threads);
+      PerThread<CheckUniqueIDs>::Get().id_bits |= 1ULL << id;
+    });
+
+    // Verify each thread's bit is set.
+    const uint64_t all_bits = PerThread<CheckUniqueIDs>::Reduce().id_bits;
+    // Avoid shifting by 64 (undefined).
+    const uint64_t expected =
+        num_threads == 64 ? ~0ULL : (1ULL << num_threads) - 1;
+    EXPECT_EQ(expected, all_bits);
+    PerThread<CheckUniqueIDs>::Destroy();
+  }
+}
+
+}  // namespace
+}  // namespace highwayhash
diff --git a/highwayhash/highwayhash/endianess.h b/highwayhash/highwayhash/endianess.h
new file mode 100644
index 000000000..776a02fa2
--- /dev/null
+++ b/highwayhash/highwayhash/endianess.h
@@ -0,0 +1,108 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_ENDIANESS_H_
+#define HIGHWAYHASH_ENDIANESS_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stdint.h>
+
+#if defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN)
+
+  /* Someone has already included <endian.h> or equivalent. */
+
+#elif defined(__LITTLE_ENDIAN__)
+
+#  define HH_IS_LITTLE_ENDIAN 1
+#  define HH_IS_BIG_ENDIAN 0
+#  ifdef __BIG_ENDIAN__
+#    error "Platform is both little and big endian?"
+#  endif
+
+#elif defined(__BIG_ENDIAN__)
+
+#  define HH_IS_LITTLE_ENDIAN 0
+#  define HH_IS_BIG_ENDIAN 1
+
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
+      defined(__ORDER_LITTLE_ENDIAN__)
+
+#  define HH_IS_LITTLE_ENDIAN (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#  define HH_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+
+#elif defined(__linux__) || defined(__CYGWIN__) || defined( __GNUC__ ) || \
+      defined( __GNU_LIBRARY__ )
+
+#  include <endian.h>
+
+#elif defined(__OpenBSD__) || defined(__NetBSD__) || defined(__FreeBSD__) || \
+      defined(__DragonFly__)
+
+#  include <sys/endian.h>
+
+#elif defined(_WIN32)
+
+#define HH_IS_LITTLE_ENDIAN 1
+#define HH_IS_BIG_ENDIAN 0
+
+#else
+
+#  error "Unsupported platform. Cannot determine byte order."
+
+#endif
+
+
+#ifndef HH_IS_LITTLE_ENDIAN
+#  define HH_IS_LITTLE_ENDIAN (BYTE_ORDER == LITTLE_ENDIAN)
+#  define HH_IS_BIG_ENDIAN (BYTE_ORDER == BIG_ENDIAN)
+#endif
+
+
+namespace highwayhash {
+
+#if HH_IS_LITTLE_ENDIAN
+
+static inline uint32_t le32_from_host(uint32_t x) { return x; }
+static inline uint32_t host_from_le32(uint32_t x) { return x; }
+static inline uint64_t le64_from_host(uint64_t x) { return x; }
+static inline uint64_t host_from_le64(uint64_t x) { return x; }
+
+#elif !HH_IS_BIG_ENDIAN
+
+#  error "Unsupported byte order."
+
+#elif defined(_WIN16) || defined(_WIN32) || defined(_WIN64)
+
+#include <stdlib.h>
+
+static inline uint32_t host_from_le32(uint32_t x) { return _byteswap_ulong(x); }
+static inline uint32_t le32_from_host(uint32_t x) { return _byteswap_ulong(x); }
+static inline uint64_t host_from_le64(uint64_t x) { return _byteswap_uint64(x); }
+static inline uint64_t le64_from_host(uint64_t x) { return _byteswap_uint64(x); }
+
+#else
+
+static inline uint32_t host_from_le32(uint32_t x) { return __builtin_bswap32(x); }
+static inline uint32_t le32_from_host(uint32_t x) { return __builtin_bswap32(x); }
+static inline uint64_t host_from_le64(uint64_t x) { return __builtin_bswap64(x); }
+static inline uint64_t le64_from_host(uint64_t x) { return __builtin_bswap64(x); }
+
+#endif
+
+}  // namespace highwayhash
+
+#endif  // HIGHWAYHASH_ENDIANESS_H_
diff --git a/highwayhash/highwayhash/example.cc b/highwayhash/highwayhash/example.cc
new file mode 100644
index 000000000..e3939dd4a
--- /dev/null
+++ b/highwayhash/highwayhash/example.cc
@@ -0,0 +1,40 @@
+// Minimal usage example: prints a hash. Tested on x86, ppc, arm.
+
+#include <cstddef>
+#include <cstring>
+#include <iostream>
+
+#include "highwayhash/highwayhash.h"
+
+using namespace highwayhash;
+
+int main(int argc, char* argv[]) {
+  // We read from the args on purpose, to ensure a compile time constant will
+  // not be used, for verifying assembly on the supported platforms.
+  if (argc != 2) {
+    std::cout << "Please provide 1 argument with a text to hash" << std::endl;
+    return 1;
+  }
+
+  // Please use a different key to ensure your hashes aren't identical.
+  HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4};
+
+  // Aligning inputs to 32 bytes may help but is not required.
+  const char* in = argv[1];
+  const size_t size = strlen(in);
+
+  // Type determines the hash size; can also be HHResult128 or HHResult256.
+  HHResult64 result;
+
+  // HH_TARGET_PREFERRED expands to the best specialization available for the
+  // CPU detected via compiler flags (e.g. AVX2 #ifdef __AVX2__).
+  HHStateT<HH_TARGET_PREFERRED> state(key);
+  HighwayHashT(&state, in, size, &result);
+  std::cout << "Hash   : " << result << std::endl;
+
+  HighwayHashCatT<HH_TARGET_PREFERRED> cat(key);
+  cat.Append(in, size);
+  cat.Finalize(&result);
+  std::cout << "HashCat: " << result << std::endl;
+  return 0;
+}
diff --git a/highwayhash/highwayhash/hh_avx2.cc b/highwayhash/highwayhash/hh_avx2.cc
new file mode 100644
index 000000000..7e3ddff0d
--- /dev/null
+++ b/highwayhash/highwayhash/hh_avx2.cc
@@ -0,0 +1,19 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME AVX2
+#include "highwayhash/highwayhash_target.cc"
diff --git a/highwayhash/highwayhash/hh_avx2.h b/highwayhash/highwayhash/hh_avx2.h
new file mode 100644
index 000000000..db44f533c
--- /dev/null
+++ b/highwayhash/highwayhash/hh_avx2.h
@@ -0,0 +1,381 @@
+// Copyright 2015-2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_AVX2_H_
+#define HIGHWAYHASH_HH_AVX2_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_buffer.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/load3.h"
+#include "highwayhash/vector128.h"
+#include "highwayhash/vector256.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents (otherwise compilation fails because -mavx2 is not specified).
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+namespace highwayhash {
+// See vector128.h for why this namespace is necessary; matching it here makes
+// it easier to use the vector128 symbols, but requires textual inclusion.
+namespace HH_TARGET_NAME {
+
+class HHStateAVX2 {
+ public:
+  explicit HH_INLINE HHStateAVX2(const HHKey key_lanes) { Reset(key_lanes); }
+
+  HH_INLINE void Reset(const HHKey key_lanes) {
+    // "Nothing up my sleeve" numbers, concatenated hex digits of Pi from
+    // http://www.numberworld.org/digits/Pi/, retrieved Feb 22, 2016.
+    //
+    // We use this python code to generate the fourth number to have
+    // more even mixture of bits:
+    /*
+def x(a,b,c):
+  retval = 0
+  for i in range(64):
+    count = ((a >> i) & 1) + ((b >> i) & 1) + ((c >> i) & 1)
+    if (count <= 1):
+      retval |= 1 << i
+  return retval
+    */
+    const V4x64U init0(0x243f6a8885a308d3ull, 0x13198a2e03707344ull,
+                       0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full);
+    const V4x64U init1(0x452821e638d01377ull, 0xbe5466cf34e90c6cull,
+                       0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull);
+    const V4x64U key = LoadUnaligned<V4x64U>(key_lanes);
+    v0 = key ^ init0;
+    v1 = Rotate64By32(key) ^ init1;
+    mul0 = init0;
+    mul1 = init1;
+  }
+
+  HH_INLINE void Update(const HHPacket& packet_bytes) {
+    const uint64_t* HH_RESTRICT packet =
+        reinterpret_cast<const uint64_t*>(packet_bytes);
+    Update(LoadUnaligned<V4x64U>(packet));
+  }
+
+  HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+    // 'Length padding' differentiates zero-valued inputs that have the same
+    // size/32. mod32 is sufficient because each Update behaves as if a
+    // counter were injected, because the state is large and mixed thoroughly.
+    const V8x32U size256(
+        _mm256_broadcastd_epi32(_mm_cvtsi64_si128(size_mod32)));
+    // Equivalent to storing size_mod32 in packet.
+    v0 += V4x64U(size256);
+    // Boosts the avalanche effect of mod32.
+    v1 = Rotate32By(v1, size256);
+
+    const char* remainder = bytes + (size_mod32 & ~3);
+    const size_t size_mod4 = size_mod32 & 3;
+
+    const V4x32U size(_mm256_castsi256_si128(size256));
+
+    // (Branching is faster than a single _mm256_maskload_epi32.)
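+    // The two branches below load whole 4-byte lanes via a masked load and
+    // pick up the trailing 0..3 bytes with Load3, so no byte outside
+    // [bytes, bytes + size_mod32) is accessed.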
+    if (HH_UNLIKELY(size_mod32 & 16)) {  // 16..31 bytes left
+      const V4x32U packetL =
+          LoadUnaligned<V4x32U>(reinterpret_cast<const uint32_t*>(bytes));
+
+      const V4x32U int_mask = IntMask<16>()(size);
+      const V4x32U int_lanes = MaskedLoadInt(bytes + 16, int_mask);
+      const uint32_t last4 =
+          Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+
+      // The upper four bytes of packetH are zero, so insert there.
+      const V4x32U packetH(_mm_insert_epi32(int_lanes, last4, 3));
+      Update(packetH, packetL);
+    } else {  // size_mod32 < 16
+      const V4x32U int_mask = IntMask<0>()(size);
+      const V4x32U packetL = MaskedLoadInt(bytes, int_mask);
+      const uint64_t last3 =
+          Load3()(Load3::AllowUnordered(), remainder, size_mod4);
+
+      // Rather than insert into packetL[3], it is faster to initialize
+      // the otherwise empty packetH.
+      const V4x32U packetH(_mm_cvtsi64_si128(last3));
+      Update(packetH, packetL);
+    }
+  }
+
+  HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
+    // Mix together all lanes. It is slightly better to permute v0 than v1;
+    // it will be added to v1.
+    Update(Permute(v0));
+    Update(Permute(v0));
+    Update(Permute(v0));
+    Update(Permute(v0));
+
+    const V2x64U sum0(_mm256_castsi256_si128(v0 + mul0));
+    const V2x64U sum1(_mm256_castsi256_si128(v1 + mul1));
+    const V2x64U hash = sum0 + sum1;
+    // Each lane is sufficiently mixed, so just truncate to 64 bits.
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(result), hash);
+  }
+
+  HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
+    for (int n = 0; n < 6; n++) {
+      Update(Permute(v0));
+    }
+
+    const V2x64U sum0(_mm256_castsi256_si128(v0 + mul0));
+    const V2x64U sum1(_mm256_extracti128_si256(v1 + mul1, 1));
+    const V2x64U hash = sum0 + sum1;
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(result), hash);
+  }
+
+  HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
+    for (int n = 0; n < 10; n++) {
+      Update(Permute(v0));
+    }
+
+    const V4x64U sum0 = v0 + mul0;
+    const V4x64U sum1 = v1 + mul1;
+    const V4x64U hash = ModularReduction(sum1, sum0);
+    StoreUnaligned(hash, &(*result)[0]);
+  }
+
+  // "buffer" must be 32-byte aligned.
+  static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer) {
+    const __m256i zero = _mm256_setzero_si256();
+    _mm256_store_si256(reinterpret_cast<__m256i*>(buffer), zero);
+  }
+
+  // "buffer" must be 32-byte aligned.
+  static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
+                                    const size_t size_mod32,
+                                    char* HH_RESTRICT buffer) {
+    const V4x32U size(size_mod32);
+    const uint32_t* const HH_RESTRICT from_u32 =
+        reinterpret_cast<const uint32_t*>(from);
+    uint32_t* const HH_RESTRICT buffer_u32 =
+        reinterpret_cast<uint32_t*>(buffer);
+    if (HH_UNLIKELY(size_mod32 & 16)) {  // Copying 16..31 bytes
+      const V4x32U inL = LoadUnaligned<V4x32U>(from_u32);
+      Store(inL, buffer_u32);
+      const V4x32U inH = Load0To16<16, Load3::AllowReadBefore>(
+          from + 16, size_mod32 - 16, size);
+      Store(inH, buffer_u32 + V4x32U::N);
+    } else {  // Copying 0..15 bytes
+      const V4x32U inL = Load0To16<>(from, size_mod32, size);
+      Store(inL, buffer_u32);
+      // No need to change upper 16 bytes of buffer.
+    }
+  }
+
+  // "buffer" must be 32-byte aligned.
+  static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
+                                      const size_t size_mod32,
+                                      char* HH_RESTRICT buffer,
+                                      const size_t buffer_valid) {
+    const V4x32U size(size_mod32);
+    uint32_t* const HH_RESTRICT buffer_u32 =
+        reinterpret_cast<uint32_t*>(buffer);
+    // buffer_valid + size <= 32 => appending 0..16 bytes inside upper 16 bytes.
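+    // (If bit 4 of buffer_valid is set, the lower 16 buffer bytes are already
+    // full, so only the upper half needs to change.)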
+    if (HH_UNLIKELY(buffer_valid & 16)) {
+      const V4x32U suffix = Load0To16<>(from, size_mod32, size);
+      const V4x32U bufferH = Load<V4x32U>(buffer_u32 + V4x32U::N);
+      const V4x32U outH = Concatenate(bufferH, buffer_valid - 16, suffix);
+      Store(outH, buffer_u32 + V4x32U::N);
+    } else {  // Appending 0..32 bytes starting at offset 0..15.
+      const V4x32U bufferL = Load<V4x32U>(buffer_u32);
+      const V4x32U suffixL = Load0To16<>(from, size_mod32, size);
+      const V4x32U outL = Concatenate(bufferL, buffer_valid, suffixL);
+      Store(outL, buffer_u32);
+      const size_t offsetH = sizeof(V4x32U) - buffer_valid;
+      // Do we have enough input to start filling the upper 16 buffer bytes?
+      if (size_mod32 > offsetH) {
+        const size_t sizeH = size_mod32 - offsetH;
+        const V4x32U outH = Load0To16<>(from + offsetH, sizeH, V4x32U(sizeH));
+        Store(outH, buffer_u32 + V4x32U::N);
+      }
+    }
+  }
+
+  // "buffer" must be 32-byte aligned.
+  HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
+                                 const size_t size_mod32,
+                                 const char* HH_RESTRICT buffer,
+                                 const size_t buffer_valid) {
+    const V4x32U size(size_mod32);
+    const uint32_t* const HH_RESTRICT buffer_u32 =
+        reinterpret_cast<const uint32_t*>(buffer);
+    // buffer_valid + size <= 32 => appending 0..16 bytes inside upper 16 bytes.
+    if (HH_UNLIKELY(buffer_valid & 16)) {
+      const V4x32U suffix = Load0To16<>(from, size_mod32, size);
+      const V4x32U packetL = Load<V4x32U>(buffer_u32);
+      const V4x32U bufferH = Load<V4x32U>(buffer_u32 + V4x32U::N);
+      const V4x32U packetH = Concatenate(bufferH, buffer_valid - 16, suffix);
+      Update(packetH, packetL);
+    } else {  // Appending 0..32 bytes starting at offset 0..15.
+      const V4x32U bufferL = Load<V4x32U>(buffer_u32);
+      const V4x32U suffixL = Load0To16<>(from, size_mod32, size);
+      const V4x32U packetL = Concatenate(bufferL, buffer_valid, suffixL);
+      const size_t offsetH = sizeof(V4x32U) - buffer_valid;
+      V4x32U packetH = packetL - packetL;  // zero
+      // Do we have enough input to start filling the upper 16 packet bytes?
+      if (size_mod32 > offsetH) {
+        const size_t sizeH = size_mod32 - offsetH;
+        packetH = Load0To16<>(from + offsetH, sizeH, V4x32U(sizeH));
+      }
+
+      Update(packetH, packetL);
+    }
+  }
+
+ private:
+  static HH_INLINE V4x32U MaskedLoadInt(const char* from,
+                                        const V4x32U& int_mask) {
+    // No faults will be raised when reading n=0..3 ints from "from" provided
+    // int_mask[n] = 0.
+    const int* HH_RESTRICT int_from = reinterpret_cast<const int*>(from);
+    return V4x32U(_mm_maskload_epi32(int_from, int_mask));
+  }
+
+  // Loads <= 16 bytes without accessing any byte outside [from, from + size).
+  // from[i] is loaded into lane i; from[i >= size] is undefined.
+  template <uint32_t kSizeOffset = 0,
+            class Load3Policy = Load3::AllowReadBefore>
+  static HH_INLINE V4x32U Load0To16(const char* from, const size_t size_mod32,
+                                    const V4x32U& size) {
+    const char* remainder = from + (size_mod32 & ~3);
+    const uint64_t last3 = Load3()(Load3Policy(), remainder, size_mod32 & 3);
+    const V4x32U int_mask = IntMask<kSizeOffset>()(size);
+    const V4x32U int_lanes = MaskedLoadInt(from, int_mask);
+    return Insert4AboveMask(last3, int_mask, int_lanes);
+  }
+
+  static HH_INLINE V4x64U Rotate64By32(const V4x64U& v) {
+    return V4x64U(_mm256_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
+  }
+
+  // Rotates 32-bit lanes by "count" bits.
+  static HH_INLINE V4x64U Rotate32By(const V4x64U& v, const V8x32U& count) {
+    // Use variable shifts because sll_epi32 has 4 cycle latency (presumably
+    // to broadcast the shift count).
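+    // Left and right variable shifts are OR-ed together to form the rotate.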
+    const V4x64U shifted_left(_mm256_sllv_epi32(v, count));
+    const V4x64U shifted_right(_mm256_srlv_epi32(v, V8x32U(32) - count));
+    return shifted_left | shifted_right;
+  }
+
+  static HH_INLINE V4x64U Permute(const V4x64U& v) {
+    // For complete mixing, we need to swap the upper and lower 128-bit halves;
+    // we also swap all 32-bit halves. This is faster than extracti128 plus
+    // inserti128 followed by Rotate64By32.
+    const V4x64U indices(0x0000000200000003ull, 0x0000000000000001ull,
+                         0x0000000600000007ull, 0x0000000400000005ull);
+    return V4x64U(_mm256_permutevar8x32_epi32(v, indices));
+  }
+
+  static HH_INLINE V4x64U MulLow32(const V4x64U& a, const V4x64U& b) {
+    return V4x64U(_mm256_mul_epu32(a, b));
+  }
+
+  static HH_INLINE V4x64U ZipperMerge(const V4x64U& v) {
+    // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+    // varying degrees. In descending order of goodness, bytes
+    // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+    // As expected, the upper and lower bytes are much worse.
+    // For each 64-bit lane, our objectives are:
+    // 1) maximizing and equalizing total goodness across the four lanes.
+    // 2) mixing with bytes from the neighboring lane (AVX-2 makes it difficult
+    //    to cross the 128-bit wall, but PermuteAndUpdate takes care of that);
+    // 3) placing the worst bytes in the upper 32 bits because those will not
+    //    be used in the next 32x32 multiplication.
+    const uint64_t hi = 0x070806090D0A040Bull;
+    const uint64_t lo = 0x000F010E05020C03ull;
+    return V4x64U(_mm256_shuffle_epi8(v, V4x64U(hi, lo, hi, lo)));
+  }
+
+  // Updates four hash lanes in parallel by injecting four 64-bit packets.
+  HH_INLINE void Update(const V4x64U& packet) {
+    v1 += packet;
+    v1 += mul0;
+    mul0 ^= MulLow32(v1, v0 >> 32);
+    HH_COMPILER_FENCE;
+    v0 += mul1;
+    mul1 ^= MulLow32(v0, v1 >> 32);
+    HH_COMPILER_FENCE;
+    v0 += ZipperMerge(v1);
+    v1 += ZipperMerge(v0);
+  }
+
+  HH_INLINE void Update(const V4x32U& packetH, const V4x32U& packetL) {
+    const __m256i packetL256 = _mm256_castsi128_si256(packetL);
+    Update(V4x64U(_mm256_inserti128_si256(packetL256, packetH, 1)));
+  }
+
+  // XORs a << 1 and a << 2 into *out after clearing the upper two bits of a.
+  // Also does the same for the upper 128 bit lane "b". Bit shifts are only
+  // possible on independent 64-bit lanes. We therefore insert the upper bits
+  // of a[0] that were lost into a[1]. Thanks to D. Lemire for helpful
+  // comments!
+  static HH_INLINE void XorByShift128Left12(const V4x64U& ba,
+                                            V4x64U* HH_RESTRICT out) {
+    const V4x64U zero = ba ^ ba;
+    const V4x64U top_bits2 = ba >> (64 - 2);
+    const V4x64U ones = ba == ba;              // FF .. FF
+    const V4x64U shifted1_unmasked = ba + ba;  // (avoids needing port0)
+    HH_COMPILER_FENCE;
+
+    // Only the lower halves of top_bits1's 128 bit lanes will be used, so we
+    // can compute it before clearing the upper two bits of ba.
+    const V4x64U top_bits1 = ba >> (64 - 1);
+    const V4x64U upper_8bytes(_mm256_slli_si256(ones, 8));  // F 0 F 0
+    const V4x64U shifted2 = shifted1_unmasked + shifted1_unmasked;
+    HH_COMPILER_FENCE;
+
+    const V4x64U upper_bit_of_128 = upper_8bytes << 63;  // 80..00 80..00
+    const V4x64U new_low_bits2(_mm256_unpacklo_epi64(zero, top_bits2));
+    *out ^= shifted2;
+    HH_COMPILER_FENCE;
+
+    // The result must be as if the upper two bits of the input had been clear,
+    // otherwise we're no longer computing a reduction.
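+    // (For ba << 2 the top two bits fall out of the upper qword on their own;
+    // for ba << 1 only the topmost does, so AndNot clears the remaining
+    // offender at bit 127 of each 128-bit lane.)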
+    const V4x64U shifted1 = AndNot(upper_bit_of_128, shifted1_unmasked);
+    *out ^= new_low_bits2;
+    HH_COMPILER_FENCE;
+
+    const V4x64U new_low_bits1(_mm256_unpacklo_epi64(zero, top_bits1));
+    *out ^= shifted1;
+
+    *out ^= new_low_bits1;
+  }
+
+  // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
+  // Input: two 256-bit numbers a3210 and b3210, interleaved in 2 vectors.
+  // The upper and lower 128-bit halves are processed independently.
+  static HH_INLINE V4x64U ModularReduction(const V4x64U& b32a32,
+                                           const V4x64U& b10a10) {
+    // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
+    V4x64U out = b10a10;
+    XorByShift128Left12(b32a32, &out);
+    return out;
+  }
+
+  V4x64U v0;
+  V4x64U v1;
+  V4x64U mul0;
+  V4x64U mul1;
+};
+
+}  // namespace HH_TARGET_NAME
+}  // namespace highwayhash
+
+#endif  // HH_DISABLE_TARGET_SPECIFIC
+#endif  // HIGHWAYHASH_HH_AVX2_H_
diff --git a/highwayhash/highwayhash/hh_buffer.h b/highwayhash/highwayhash/hh_buffer.h
new file mode 100644
index 000000000..7b1dad0d1
--- /dev/null
+++ b/highwayhash/highwayhash/hh_buffer.h
@@ -0,0 +1,116 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_BUFFER_H_
+#define HIGHWAYHASH_HH_BUFFER_H_
+
+// Helper functions used by hh_avx2 and hh_sse41.
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#if HH_TARGET == HH_TARGET_NEON
+#include "highwayhash/vector_neon.h"
+#else
+#include "highwayhash/vector128.h"
+#endif
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents (otherwise compilation fails because -msse4.1 is not specified).
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+namespace highwayhash {
+// To prevent ODR violations when including this from multiple translation
+// units (TU) that are compiled with different flags, the contents must reside
+// in a namespace whose name is unique to the TU. NOTE: this behavior is
+// incompatible with precompiled modules and requires textual inclusion instead.
+namespace HH_TARGET_NAME {
+
+template <uint32_t kSizeOffset>
+struct IntMask {};  // primary template
+
+template <>
+struct IntMask<0> {
+  // Returns 32-bit lanes : ~0U if that lane can be loaded given "size" bytes.
+  // Typical case: size = 0..16, nothing deducted.
+  HH_INLINE V4x32U operator()(const V4x32U& size) const {
+    // Lane n is valid if size >= (n + 1) * 4; subtract one because we only
+    // have greater-than comparisons and don't want a negated mask.
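+    // Example: size = 6 gives 6 > 3 but not 6 > 7, so only lane 0 is
+    // selected; the trailing two bytes are handled separately by Load3.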
+#if HH_TARGET == HH_TARGET_NEON
+    return V4x32U(vcgtq_u32(size, V4x32U(15, 11, 7, 3)));
+#else
+    return V4x32U(_mm_cmpgt_epi32(size, V4x32U(15, 11, 7, 3)));
+#endif
+  }
+};
+
+template <>
+struct IntMask<16> {
+  // "size" is 16..31; this is for loading the upper half of a packet, so
+  // effectively deduct 16 from size by changing the comparands.
+  HH_INLINE V4x32U operator()(const V4x32U& size) const {
+#if HH_TARGET == HH_TARGET_NEON
+    return V4x32U(vcgtq_u32(size, V4x32U(31, 27, 23, 19)));
+#else
+    return V4x32U(_mm_cmpgt_epi32(size, V4x32U(31, 27, 23, 19)));
+#endif
+  }
+};
+
+// Inserts "bytes4" into "prev" at the lowest i such that mask[i] = 0.
+// Assumes prev[j] == 0 if mask[j] = 0.
+HH_INLINE V4x32U Insert4AboveMask(const uint32_t bytes4, const V4x32U& mask,
+                                  const V4x32U& prev) {
+  // There is no 128-bit shift by a variable count. Using shuffle_epi8 with a
+  // control mask requires a table lookup. We know the shift count is a
+  // multiple of 4 bytes, so we can broadcastd_epi32 and clear all lanes except
+  // those where mask != 0. This works because any upper output lanes need not
+  // be zero.
+  return prev | AndNot(mask, V4x32U(bytes4));
+}
+
+#if HH_TARGET == HH_TARGET_AVX2
+// Shifts "suffix" left by "prefix_len" = 0..15 bytes, clears upper bytes of
+// "prefix", and returns the merged/concatenated bytes.
+HH_INLINE V4x32U Concatenate(const V4x32U& prefix, const size_t prefix_len,
+                             const V4x32U& suffix) {
+  static const uint64_t table[V16x8U::N][V2x64U::N] = {
+      {0x0706050403020100ull, 0x0F0E0D0C0B0A0908ull},
+      {0x06050403020100FFull, 0x0E0D0C0B0A090807ull},
+      {0x050403020100FFFFull, 0x0D0C0B0A09080706ull},
+      {0x0403020100FFFFFFull, 0x0C0B0A0908070605ull},
+      {0x03020100FFFFFFFFull, 0x0B0A090807060504ull},
+      {0x020100FFFFFFFFFFull, 0x0A09080706050403ull},
+      {0x0100FFFFFFFFFFFFull, 0x0908070605040302ull},
+      {0x00FFFFFFFFFFFFFFull, 0x0807060504030201ull},
+      {0xFFFFFFFFFFFFFFFFull, 0x0706050403020100ull},
+      {0xFFFFFFFFFFFFFFFFull, 0x06050403020100FFull},
+      {0xFFFFFFFFFFFFFFFFull, 0x050403020100FFFFull},
+      {0xFFFFFFFFFFFFFFFFull, 0x0403020100FFFFFFull},
+      {0xFFFFFFFFFFFFFFFFull, 0x03020100FFFFFFFFull},
+      {0xFFFFFFFFFFFFFFFFull, 0x020100FFFFFFFFFFull},
+      {0xFFFFFFFFFFFFFFFFull, 0x0100FFFFFFFFFFFFull},
+      {0xFFFFFFFFFFFFFFFFull, 0x00FFFFFFFFFFFFFFull}};
+  const V2x64U control = Load<V2x64U>(&table[prefix_len][0]);
+  const V2x64U shifted_suffix(_mm_shuffle_epi8(suffix, control));
+  return V4x32U(_mm_blendv_epi8(shifted_suffix, prefix, control));
+}
+#endif
+}  // namespace HH_TARGET_NAME
+}  // namespace highwayhash
+
+#endif  // HH_DISABLE_TARGET_SPECIFIC
+#endif  // HIGHWAYHASH_HH_BUFFER_H_
diff --git a/highwayhash/highwayhash/hh_neon.cc b/highwayhash/highwayhash/hh_neon.cc
new file mode 100644
index 000000000..981c094db
--- /dev/null
+++ b/highwayhash/highwayhash/hh_neon.cc
@@ -0,0 +1,22 @@
+// Copyright 2017-2019 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME NEON
+// GCC 4.5.4 only defines the former; 5.4 defines both.
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include "highwayhash/highwayhash_target.cc"
+#endif
diff --git a/highwayhash/highwayhash/hh_neon.h b/highwayhash/highwayhash/hh_neon.h
new file mode 100644
index 000000000..286ad7ec0
--- /dev/null
+++ b/highwayhash/highwayhash/hh_neon.h
@@ -0,0 +1,336 @@
+// Copyright 2015-2019 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_NEON_H_
+#define HIGHWAYHASH_HH_NEON_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_buffer.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/load3.h"
+#include "highwayhash/vector_neon.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents.
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+namespace highwayhash {
+
+// See vector_neon.h for why this namespace is necessary; matching it here
+// makes it easier to use the vector_neon symbols, but requires textual
+// inclusion.
+namespace HH_TARGET_NAME {
+
+// J-lanes tree hashing: see https://doi.org/10.4236/jis.2014.53010
+// Uses the same method that SSE4.1 uses, only with NEON used instead.
+class HHStateNEON {
+ public:
+  explicit HH_INLINE HHStateNEON(const HHKey key) { Reset(key); }
+
+  HH_INLINE void Reset(const HHKey key) {
+    // "Nothing up my sleeve numbers"; see HHStateTAVX2.
+    const V2x64U init0L(0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full);
+    const V2x64U init0H(0x243f6a8885a308d3ull, 0x13198a2e03707344ull);
+    const V2x64U init1L(0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull);
+    const V2x64U init1H(0x452821e638d01377ull, 0xbe5466cf34e90c6cull);
+    const V2x64U keyL = LoadUnaligned<V2x64U>(key + 0);
+    const V2x64U keyH = LoadUnaligned<V2x64U>(key + 2);
+    v0L = keyL ^ init0L;
+    v0H = keyH ^ init0H;
+    v1L = Rotate64By32(keyL) ^ init1L;
+    v1H = Rotate64By32(keyH) ^ init1H;
+    mul0L = init0L;
+    mul0H = init0H;
+    mul1L = init1L;
+    mul1H = init1H;
+  }
+
+  HH_INLINE void Update(const HHPacket& packet_bytes) {
+    const uint64_t* HH_RESTRICT packet =
+        reinterpret_cast<const uint64_t*>(packet_bytes);
+    const V2x64U packetL = LoadUnaligned<V2x64U>(packet + 0);
+    const V2x64U packetH = LoadUnaligned<V2x64U>(packet + 2);
+    Update(packetH, packetL);
+  }
+
+  HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+    // 'Length padding' differentiates zero-valued inputs that have the same
+    // size/32. mod32 is sufficient because each Update behaves as if a
+    // counter were injected, because the state is large and mixed thoroughly.
+
+    // We can't use vshl/vsra because it needs a constant expression.
+    // In order to do this right now, we would need a switch statement.
+    const int32x4_t vsize_mod32(
+        vdupq_n_s32(static_cast<int32_t>(size_mod32)));
+    // = size_mod32 - 32 (negative, so vshlq shifts right)
+    const int32x4_t shift_right_amt =
+        vdupq_n_s32(static_cast<int32_t>(size_mod32) + (~32 + 1));
+    // Equivalent to storing size_mod32 in packet.
+    v0L += V2x64U(vreinterpretq_u64_s32(vsize_mod32));
+    v0H += V2x64U(vreinterpretq_u64_s32(vsize_mod32));
+
+    // Boosts the avalanche effect of mod32.
+    v1L = V2x64U(vreinterpretq_u64_u32(
+        vorrq_u32(vshlq_u32(vreinterpretq_u32_u64(v1L), vsize_mod32),
+                  vshlq_u32(vreinterpretq_u32_u64(v1L), shift_right_amt))));
+    v1H = V2x64U(vreinterpretq_u64_u32(
+        vorrq_u32(vshlq_u32(vreinterpretq_u32_u64(v1H), vsize_mod32),
+                  vshlq_u32(vreinterpretq_u32_u64(v1H), shift_right_amt))));
+
+    const size_t size_mod4 = size_mod32 & 3;
+    const char* HH_RESTRICT remainder = bytes + (size_mod32 & ~3);
+
+    if (HH_UNLIKELY(size_mod32 & 16)) {  // 16..31 bytes left
+      const V2x64U packetL =
+          LoadUnaligned<V2x64U>(reinterpret_cast<const uint64_t*>(bytes));
+
+      V2x64U packetH = LoadMultipleOfFour(bytes + 16, size_mod32);
+
+      const uint32_t last4 =
+          Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+
+      // The upper four bytes of packetH are zero, so insert there.
+      packetH = V2x64U(vreinterpretq_u64_u32(
+          vsetq_lane_u32(last4, vreinterpretq_u32_u64(packetH), 3)));
+      Update(packetH, packetL);
+    } else {  // size_mod32 < 16
+      const V2x64U packetL = LoadMultipleOfFour(bytes, size_mod32);
+
+      const uint64_t last4 =
+          Load3()(Load3::AllowUnordered(), remainder, size_mod4);
+
+      // Rather than insert into packetL[3], it is faster to initialize
+      // the otherwise empty packetH.
+      HH_ALIGNAS(16) uint64_t tmp[2] = {last4, 0};
+      const V2x64U packetH(vld1q_u64(tmp));
+      Update(packetH, packetL);
+    }
+  }
+
+  HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
+    // Mix together all lanes.
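+    // (Four rounds suffice for the 64-bit result; the 128- and 256-bit
+    // finalizers below use six and ten.)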
+    for (int n = 0; n < 4; n++) {
+      PermuteAndUpdate();
+    }
+
+    const V2x64U sum0 = v0L + mul0L;
+    const V2x64U sum1 = v1L + mul1L;
+    const V2x64U hash = sum0 + sum1;
+    vst1q_low_u64(reinterpret_cast<uint64_t*>(result), hash);
+  }
+
+  HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
+    for (int n = 0; n < 6; n++) {
+      PermuteAndUpdate();
+    }
+
+    const V2x64U sum0 = v0L + mul0L;
+    const V2x64U sum1 = v1H + mul1H;
+    const V2x64U hash = sum0 + sum1;
+    StoreUnaligned(hash, &(*result)[0]);
+  }
+
+  HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
+    for (int n = 0; n < 10; n++) {
+      PermuteAndUpdate();
+    }
+
+    const V2x64U sum0L = v0L + mul0L;
+    const V2x64U sum1L = v1L + mul1L;
+    const V2x64U sum0H = v0H + mul0H;
+    const V2x64U sum1H = v1H + mul1H;
+    const V2x64U hashL = ModularReduction(sum1L, sum0L);
+    const V2x64U hashH = ModularReduction(sum1H, sum0H);
+    StoreUnaligned(hashL, &(*result)[0]);
+    StoreUnaligned(hashH, &(*result)[2]);
+  }
+
+  static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer_bytes) {
+    for (size_t i = 0; i < sizeof(HHPacket); ++i) {
+      buffer_bytes[i] = 0;
+    }
+  }
+
+  static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
+                                    const size_t size_mod32,
+                                    char* HH_RESTRICT buffer) {
+    for (size_t i = 0; i < size_mod32; ++i) {
+      buffer[i] = from[i];
+    }
+  }
+
+  static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
+                                      const size_t size_mod32,
+                                      char* HH_RESTRICT buffer,
+                                      const size_t buffer_valid) {
+    for (size_t i = 0; i < size_mod32; ++i) {
+      buffer[buffer_valid + i] = from[i];
+    }
+  }
+
+  HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
+                                 const size_t size_mod32,
+                                 const char* HH_RESTRICT buffer,
+                                 const size_t buffer_valid) {
+    HH_ALIGNAS(32) HHPacket tmp;
+    for (size_t i = 0; i < buffer_valid; ++i) {
+      tmp[i] = buffer[i];
+    }
+    for (size_t i = 0; i < size_mod32; ++i) {
+      tmp[buffer_valid + i] = from[i];
+    }
+    Update(tmp);
+  }
+
+ private:
+  // Swap 32-bit halves of each lane (caller swaps 128-bit halves)
+  static HH_INLINE V2x64U Rotate64By32(const V2x64U& v) {
+    return V2x64U(vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64(v))));
+  }
+
+  static HH_INLINE V2x64U ZipperMerge(const V2x64U& v) {
+    // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+    // varying degrees. In descending order of goodness, bytes
+    // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+    // As expected, the upper and lower bytes are much worse.
+    // For each 64-bit lane, our objectives are:
+    // 1) maximizing and equalizing total goodness across each lane's bytes;
+    // 2) mixing with bytes from the neighboring lane;
+    // 3) placing the worst bytes in the upper 32 bits because those will not
+    //    be used in the next 32x32 multiplication.
+
+    // The positions of each byte in the new vector.
+    const uint8_t shuffle_positions[] = {3,  12, 2,  5,  14, 1,  15, 0,
+                                         11, 4,  10, 13, 9,  6,  8,  7};
+    const uint8x16_t tbl = vld1q_u8(shuffle_positions);
+
+    // Note: vqtbl1q_u8 is polyfilled for ARMv7a in vector_neon.h.
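+    // (Output byte i is taken from input byte shuffle_positions[i].)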
+    return V2x64U(
+        vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(v), tbl)));
+  }
+
+  HH_INLINE void Update(const V2x64U& packetH, const V2x64U& packetL) {
+    v1L += packetL;
+    v1H += packetH;
+    v1L += mul0L;
+    v1H += mul0H;
+    // mul0L ^= (v1L & 0xFFFFFFFF) * (v0L >> 32);
+    mul0L ^= V2x64U(vmull_u32(vmovn_u64(v1L), vshrn_n_u64(v0L, 32)));
+    // mul0H ^= (v1H & 0xFFFFFFFF) * (v0H >> 32);
+    mul0H ^= V2x64U(vmull_u32(vmovn_u64(v1H), vshrn_n_u64(v0H, 32)));
+    v0L += mul1L;
+    v0H += mul1H;
+    // mul1L ^= (v0L & 0xFFFFFFFF) * (v1L >> 32);
+    mul1L ^= V2x64U(vmull_u32(vmovn_u64(v0L), vshrn_n_u64(v1L, 32)));
+    // mul1H ^= (v0H & 0xFFFFFFFF) * (v1H >> 32);
+    mul1H ^= V2x64U(vmull_u32(vmovn_u64(v0H), vshrn_n_u64(v1H, 32)));
+    v0L += ZipperMerge(v1L);
+    v0H += ZipperMerge(v1H);
+    v1L += ZipperMerge(v0L);
+    v1H += ZipperMerge(v0H);
+  }
+
+  HH_INLINE void PermuteAndUpdate() {
+    // It is slightly better to permute v0 than v1; it will be added to v1.
+    Update(Rotate64By32(v0L), Rotate64By32(v0H));
+  }
+
+  // Returns zero-initialized vector with the lower "size" = 0, 4, 8 or 12
+  // bytes loaded from "bytes". Serves as a replacement for AVX2 maskload_epi32.
+  static HH_INLINE V2x64U LoadMultipleOfFour(const char* bytes,
+                                             const size_t size) {
+    const uint32_t* words = reinterpret_cast<const uint32_t*>(bytes);
+    // Mask of 1-bits where the final 4 bytes should be inserted (replacement
+    // for variable shift/insert using broadcast+blend).
+    alignas(16) const uint64_t mask_pattern[2] = {0xFFFFFFFFULL, 0};
+    V2x64U mask4(vld1q_u64(mask_pattern));  // 'insert' into lane 0
+    V2x64U ret(vdupq_n_u64(0));
+    if (size & 8) {
+      ret = V2x64U(vld1q_low_u64(reinterpret_cast<const uint64_t*>(words)));
+      // mask4 = 0 ~0 0 0 ('insert' into lane 2)
+      mask4 = V2x64U(vshlq_n_u128(mask4, 8));
+      words += 2;
+    }
+    // Final 4 (possibly after the 8 above); 'insert' into lane 0 or 2 of ret.
+    if (size & 4) {
+      // = 0 word2 0 word2; mask4 will select which lane to keep.
+      const V2x64U broadcast(
+          vreinterpretq_u64_u32(vdupq_n_u32(LoadUnaligned<uint32_t>(words))));
+      // (slightly faster than blendv_epi8)
+      ret |= V2x64U(broadcast & mask4);
+    }
+    return ret;
+  }
+
+  // XORs x << 1 and x << 2 into *out after clearing the upper two bits of x.
+  // Bit shifts are only possible on independent 64-bit lanes. We therefore
+  // insert the upper bits of x[0] that were lost into x[1].
+  // Thanks to D. Lemire for helpful comments!
+  static HH_INLINE void XorByShift128Left12(const V2x64U& x,
+                                            V2x64U* HH_RESTRICT out) {
+    const V4x32U zero(vdupq_n_u32(0));
+    const V2x64U sign_bit128(
+        vreinterpretq_u64_u32(vsetq_lane_u32(0x80000000u, zero, 3)));
+    const V2x64U top_bits2 = x >> (64 - 2);
+    HH_COMPILER_FENCE;
+    const V2x64U shifted1_unmasked = x + x;  // (avoids needing port0)
+
+    // Only the lower half of top_bits1 will be used, so we
+    // can compute it before clearing the upper two bits of x.
+    const V2x64U top_bits1 = x >> (64 - 1);
+    const V2x64U shifted2 = shifted1_unmasked + shifted1_unmasked;
+    HH_COMPILER_FENCE;
+
+    const V2x64U new_low_bits2(vshlq_n_u128(top_bits2, 8));
+    *out ^= shifted2;
+    // The result must be as if the upper two bits of the input had been clear,
+    // otherwise we're no longer computing a reduction.
+    const V2x64U shifted1 = AndNot(sign_bit128, shifted1_unmasked);
+    HH_COMPILER_FENCE;
+
+    const V2x64U new_low_bits1(vshlq_n_u128(top_bits1, 8));
+    *out ^= new_low_bits2;
+    *out ^= shifted1;
+    *out ^= new_low_bits1;
+  }
+
+  // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
+  // Input: a 256-bit number a3210.
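+  // In scalar terms, the reduction below computes
+  //   m = a10 ^ (a32 << 1) ^ (a32 << 2)
+  // over GF(2), using that x^128 == x^2 + x (mod x^128 + x^2 + x); the
+  // 128-bit shifts are emulated on 64-bit lanes by XorByShift128Left12.
+  // (The portable ModularReduction in hh_portable.h spells this out with
+  // explicit uint64_t pairs.)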
+ static HH_INLINE V2x64U ModularReduction(const V2x64U& a32_unmasked, + const V2x64U& a10) { + // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf. + V2x64U out = a10; + XorByShift128Left12(a32_unmasked, &out); + return out; + } + + V2x64U v0L; + V2x64U v0H; + V2x64U v1L; + V2x64U v1H; + V2x64U mul0L; + V2x64U mul0H; + V2x64U mul1L; + V2x64U mul1H; +}; + +} // namespace HH_TARGET_NAME +} // namespace highwayhash + +#endif // HH_DISABLE_TARGET_SPECIFIC +#endif // HIGHWAYHASH_HH_NEON_H_ diff --git a/highwayhash/highwayhash/hh_portable.cc b/highwayhash/highwayhash/hh_portable.cc new file mode 100644 index 000000000..3e0de9ed9 --- /dev/null +++ b/highwayhash/highwayhash/hh_portable.cc @@ -0,0 +1,19 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#define HH_TARGET_NAME Portable +#include "highwayhash/highwayhash_target.cc" diff --git a/highwayhash/highwayhash/hh_portable.h b/highwayhash/highwayhash/hh_portable.h new file mode 100644 index 000000000..ab6e2faf2 --- /dev/null +++ b/highwayhash/highwayhash/hh_portable.h @@ -0,0 +1,302 @@ +// Copyright 2015-2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_HH_PORTABLE_H_ +#define HIGHWAYHASH_HH_PORTABLE_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" +#include "highwayhash/endianess.h" +#include "highwayhash/hh_types.h" +#include "highwayhash/load3.h" + +namespace highwayhash { +// See vector128.h for why this namespace is necessary; we match it here for +// consistency. As a result, this header requires textual inclusion. 
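+// For example, hh_portable.cc (above) defines HH_TARGET_NAME to Portable and
+// then includes highwayhash_target.cc, so this header is compiled once per
+// target, each time with that target's compiler flags.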
+namespace HH_TARGET_NAME {
+
+class HHStatePortable {
+ public:
+  static const int kNumLanes = 4;
+  using Lanes = uint64_t[kNumLanes];
+
+  explicit HH_INLINE HHStatePortable(const HHKey keys) { Reset(keys); }
+
+  HH_INLINE void Reset(const HHKey keys) {
+    static const Lanes init0 = {0xdbe6d5d5fe4cce2full, 0xa4093822299f31d0ull,
+                                0x13198a2e03707344ull, 0x243f6a8885a308d3ull};
+    static const Lanes init1 = {0x3bd39e10cb0ef593ull, 0xc0acf169b5f18a8cull,
+                                0xbe5466cf34e90c6cull, 0x452821e638d01377ull};
+    Lanes rotated_keys;
+    Rotate64By32(keys, &rotated_keys);
+    Copy(init0, &mul0);
+    Copy(init1, &mul1);
+    Xor(init0, keys, &v0);
+    Xor(init1, rotated_keys, &v1);
+  }
+
+  HH_INLINE void Update(const HHPacket& packet) {
+    Lanes packet_lanes;
+    CopyPartial(&packet[0], sizeof(HHPacket),
+                reinterpret_cast<char*>(&packet_lanes));
+    for (int lane = 0; lane < kNumLanes; ++lane) {
+      packet_lanes[lane] = host_from_le64(packet_lanes[lane]);
+    }
+    Update(packet_lanes);
+  }
+
+  HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+    // 'Length padding' differentiates zero-valued inputs that have the same
+    // size/32. mod32 is sufficient because each Update behaves as if a
+    // counter were injected, because the state is large and mixed thoroughly.
+    const uint64_t mod32_pair =
+        (static_cast<uint64_t>(size_mod32) << 32) + size_mod32;
+    for (int lane = 0; lane < kNumLanes; ++lane) {
+      v0[lane] += mod32_pair;
+    }
+    Rotate32By(reinterpret_cast<uint32_t*>(&v1), size_mod32);
+
+    const size_t size_mod4 = size_mod32 & 3;
+    const char* remainder = bytes + (size_mod32 & ~3);
+
+    HH_ALIGNAS(32) HHPacket packet = {0};
+    CopyPartial(bytes, remainder - bytes, &packet[0]);
+
+    if (size_mod32 & 16) {  // 16..31 bytes left
+      // Read the last 0..3 bytes and previous 1..4 into the upper bits.
+      // Insert into the upper four bytes of packet, which are zero.
+      uint32_t last4 =
+          Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+      last4 = host_from_le32(last4);
+
+      CopyPartial(reinterpret_cast<const char*>(&last4), 4, &packet[28]);
+    } else {  // size_mod32 < 16
+      uint64_t last4 = Load3()(Load3::AllowUnordered(), remainder, size_mod4);
+      last4 = host_from_le64(last4);
+
+      // Rather than insert at packet + 28, it is faster to initialize
+      // the otherwise empty packet + 16 with up to 64 bits of padding.
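+      // For example (a sketch, little-endian host): with size_mod32 = 5,
+      // packet[0..3] already hold input bytes 0..3, last4 holds byte 4 in its
+      // low bits, and the copy below sets packet[16] = byte 4, leaving all
+      // other packet bytes zero.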
+      CopyPartial(reinterpret_cast<const char*>(&last4), sizeof(last4),
+                  &packet[16]);
+    }
+    Update(packet);
+  }
+
+  HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
+    for (int n = 0; n < 4; n++) {
+      PermuteAndUpdate();
+    }
+
+    *result = v0[0] + v1[0] + mul0[0] + mul1[0];
+  }
+
+  HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
+    for (int n = 0; n < 6; n++) {
+      PermuteAndUpdate();
+    }
+
+    (*result)[0] = v0[0] + mul0[0] + v1[2] + mul1[2];
+    (*result)[1] = v0[1] + mul0[1] + v1[3] + mul1[3];
+  }
+
+  HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
+    for (int n = 0; n < 10; n++) {
+      PermuteAndUpdate();
+    }
+
+    ModularReduction(v1[1] + mul1[1], v1[0] + mul1[0], v0[1] + mul0[1],
+                     v0[0] + mul0[0], &(*result)[1], &(*result)[0]);
+    ModularReduction(v1[3] + mul1[3], v1[2] + mul1[2], v0[3] + mul0[3],
+                     v0[2] + mul0[2], &(*result)[3], &(*result)[2]);
+  }
+
+  static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer) {
+    for (size_t i = 0; i < sizeof(HHPacket); ++i) {
+      buffer[i] = 0;
+    }
+  }
+
+  static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
+                                    const size_t size_mod32,
+                                    char* HH_RESTRICT buffer) {
+    for (size_t i = 0; i < size_mod32; ++i) {
+      buffer[i] = from[i];
+    }
+  }
+
+  static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
+                                      const size_t size_mod32,
+                                      char* HH_RESTRICT buffer,
+                                      const size_t buffer_valid) {
+    for (size_t i = 0; i < size_mod32; ++i) {
+      buffer[buffer_valid + i] = from[i];
+    }
+  }
+
+  HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
+                                 const size_t size_mod32,
+                                 const char* HH_RESTRICT buffer,
+                                 const size_t buffer_valid) {
+    HH_ALIGNAS(32) HHPacket tmp;
+    for (size_t i = 0; i < buffer_valid; ++i) {
+      tmp[i] = buffer[i];
+    }
+    for (size_t i = 0; i < size_mod32; ++i) {
+      tmp[buffer_valid + i] = from[i];
+    }
+    Update(tmp);
+  }
+
+ private:
+  static HH_INLINE void Copy(const Lanes& source, Lanes* HH_RESTRICT dest) {
+    for (int lane = 0; lane < kNumLanes; ++lane) {
+      (*dest)[lane] = source[lane];
+    }
+  }
+
+  static HH_INLINE void Add(const Lanes& source, Lanes* HH_RESTRICT dest) {
+    for (int lane = 0; lane < kNumLanes; ++lane) {
+      (*dest)[lane] += source[lane];
+    }
+  }
+
+  template <typename LanesOrPointer>
+  static HH_INLINE void Xor(const Lanes& op1, const LanesOrPointer& op2,
+                            Lanes* HH_RESTRICT dest) {
+    for (int lane = 0; lane < kNumLanes; ++lane) {
+      (*dest)[lane] = op1[lane] ^ op2[lane];
+    }
+  }
+
+// Clears all bits except one byte at the given offset.
+#define MASK(v, bytes) ((v) & (0xFFull << ((bytes)*8)))
+
+  // 16-byte permutation; shifting is about 10% faster than byte loads.
+  // Adds zipper-merge result to add*.
+  static HH_INLINE void ZipperMergeAndAdd(const uint64_t v1, const uint64_t v0,
+                                          uint64_t* HH_RESTRICT add1,
+                                          uint64_t* HH_RESTRICT add0) {
+    *add0 += ((MASK(v0, 3) + MASK(v1, 4)) >> 24) +
+             ((MASK(v0, 5) + MASK(v1, 6)) >> 16) + MASK(v0, 2) +
+             (MASK(v0, 1) << 32) + (MASK(v1, 7) >> 8) + (v0 << 56);
+
+    *add1 += ((MASK(v1, 3) + MASK(v0, 4)) >> 24) + MASK(v1, 2) +
+             (MASK(v1, 5) >> 16) + (MASK(v1, 1) << 24) + (MASK(v0, 6) >> 8) +
+             (MASK(v1, 0) << 48) + MASK(v0, 7);
+  }
+
+#undef MASK
+
+  // For inputs that are already in native byte order (e.g. PermuteAndUpdate)
+  HH_INLINE void Update(const Lanes& packet_lanes) {
+    Add(packet_lanes, &v1);
+    Add(mul0, &v1);
+
+    // (Loop is faster than unrolling)
+    for (int lane = 0; lane < kNumLanes; ++lane) {
+      const uint32_t v1_32 = static_cast<uint32_t>(v1[lane]);
+      mul0[lane] ^= v1_32 * (v0[lane] >> 32);
+      v0[lane] += mul1[lane];
+      const uint32_t v0_32 = static_cast<uint32_t>(v0[lane]);
+      mul1[lane] ^= v0_32 * (v1[lane] >> 32);
+    }
+
+    ZipperMergeAndAdd(v1[1], v1[0], &v0[1], &v0[0]);
+    ZipperMergeAndAdd(v1[3], v1[2], &v0[3], &v0[2]);
+
+    ZipperMergeAndAdd(v0[1], v0[0], &v1[1], &v1[0]);
+    ZipperMergeAndAdd(v0[3], v0[2], &v1[3], &v1[2]);
+  }
+
+  static HH_INLINE uint64_t Rotate64By32(const uint64_t x) {
+    return (x >> 32) | (x << 32);
+  }
+
+  template <typename LanesOrPointer>
+  static HH_INLINE void Rotate64By32(const LanesOrPointer& v,
+                                     Lanes* HH_RESTRICT rotated) {
+    for (int i = 0; i < kNumLanes; ++i) {
+      (*rotated)[i] = Rotate64By32(v[i]);
+    }
+  }
+
+  static HH_INLINE void Rotate32By(uint32_t* halves, const uint64_t count) {
+    for (int i = 0; i < 2 * kNumLanes; ++i) {
+      const uint32_t x = halves[i];
+      halves[i] = (x << count) | (x >> (32 - count));
+    }
+  }
+
+  static HH_INLINE void Permute(const Lanes& v, Lanes* HH_RESTRICT permuted) {
+    (*permuted)[0] = Rotate64By32(v[2]);
+    (*permuted)[1] = Rotate64By32(v[3]);
+    (*permuted)[2] = Rotate64By32(v[0]);
+    (*permuted)[3] = Rotate64By32(v[1]);
+  }
+
+  HH_INLINE void PermuteAndUpdate() {
+    Lanes permuted;
+    Permute(v0, &permuted);
+    Update(permuted);
+  }
+
+  // Computes a << kBits for 128-bit a = (a1, a0).
+  // Bit shifts are only possible on independent 64-bit lanes. We therefore
+  // insert the upper bits of a0 that were lost into a1. This is slightly
+  // shorter than Lemire's (a << 1) | (((a >> 8) << 1) << 8) approach.
+  template <int kBits>
+  static HH_INLINE void Shift128Left(uint64_t* HH_RESTRICT a1,
+                                     uint64_t* HH_RESTRICT a0) {
+    const uint64_t shifted1 = (*a1) << kBits;
+    const uint64_t top_bits = (*a0) >> (64 - kBits);
+    *a0 <<= kBits;
+    *a1 = shifted1 | top_bits;
+  }
+
+  // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
+  // Input: a 256-bit number a3210.
+  static HH_INLINE void ModularReduction(const uint64_t a3_unmasked,
+                                         const uint64_t a2, const uint64_t a1,
+                                         const uint64_t a0,
+                                         uint64_t* HH_RESTRICT m1,
+                                         uint64_t* HH_RESTRICT m0) {
+    // The upper two bits must be clear, otherwise a3 << 2 would lose bits,
+    // in which case we're no longer computing a reduction.
+    const uint64_t a3 = a3_unmasked & 0x3FFFFFFFFFFFFFFFull;
+    // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
+    uint64_t a3_shl1 = a3;
+    uint64_t a2_shl1 = a2;
+    uint64_t a3_shl2 = a3;
+    uint64_t a2_shl2 = a2;
+    Shift128Left<1>(&a3_shl1, &a2_shl1);
+    Shift128Left<2>(&a3_shl2, &a2_shl2);
+    *m1 = a1 ^ a3_shl1 ^ a3_shl2;
+    *m0 = a0 ^ a2_shl1 ^ a2_shl2;
+  }
+
+  Lanes v0;
+  Lanes v1;
+  Lanes mul0;
+  Lanes mul1;
+};
+
+}  // namespace HH_TARGET_NAME
+}  // namespace highwayhash
+
+#endif  // HIGHWAYHASH_HH_PORTABLE_H_
diff --git a/highwayhash/highwayhash/hh_sse41.cc b/highwayhash/highwayhash/hh_sse41.cc
new file mode 100644
index 000000000..9d6a0b968
--- /dev/null
+++ b/highwayhash/highwayhash/hh_sse41.cc
@@ -0,0 +1,19 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME SSE41
+#include "highwayhash/highwayhash_target.cc"
diff --git a/highwayhash/highwayhash/hh_sse41.h b/highwayhash/highwayhash/hh_sse41.h
new file mode 100644
index 000000000..333db1d1b
--- /dev/null
+++ b/highwayhash/highwayhash/hh_sse41.h
@@ -0,0 +1,336 @@
+// Copyright 2015-2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_SSE41_H_
+#define HIGHWAYHASH_HH_SSE41_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <cstring>  // memcpy
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_buffer.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/load3.h"
+#include "highwayhash/vector128.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents (otherwise compilation fails because -msse4.1 is not specified).
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+namespace highwayhash {
+// See vector128.h for why this namespace is necessary; matching it here makes
+// it easier to use the vector128 symbols, but requires textual inclusion.
+namespace HH_TARGET_NAME {
+
+template <typename T>
+HH_INLINE T LoadUnaligned(const void* from) {
+  T ret;
+  memcpy(&ret, from, sizeof(ret));
+  return ret;
+}
+
+// J-lanes tree hashing: see https://doi.org/10.4236/jis.2014.53010
+// Uses pairs of SSE4.1 instructions to emulate the AVX-2 algorithm.
+class HHStateSSE41 {
+ public:
+  explicit HH_INLINE HHStateSSE41(const HHKey key) { Reset(key); }
+
+  HH_INLINE void Reset(const HHKey key) {
+    // "Nothing up my sleeve numbers"; see HHStateTAVX2.
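+    // (Several of these lanes, e.g. 0x243f6a8885a308d3, are leading
+    // hexadecimal digits of pi, the same values that seed Blowfish's P-array.)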
+    const V2x64U init0L(0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full);
+    const V2x64U init0H(0x243f6a8885a308d3ull, 0x13198a2e03707344ull);
+    const V2x64U init1L(0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull);
+    const V2x64U init1H(0x452821e638d01377ull, 0xbe5466cf34e90c6cull);
+    const V2x64U keyL = LoadUnaligned<V2x64U>(key + 0);
+    const V2x64U keyH = LoadUnaligned<V2x64U>(key + 2);
+    v0L = keyL ^ init0L;
+    v0H = keyH ^ init0H;
+    v1L = Rotate64By32(keyL) ^ init1L;
+    v1H = Rotate64By32(keyH) ^ init1H;
+    mul0L = init0L;
+    mul0H = init0H;
+    mul1L = init1L;
+    mul1H = init1H;
+  }
+
+  HH_INLINE void Update(const HHPacket& packet_bytes) {
+    const uint64_t* HH_RESTRICT packet =
+        reinterpret_cast<const uint64_t*>(packet_bytes);
+    const V2x64U packetL = LoadUnaligned<V2x64U>(packet + 0);
+    const V2x64U packetH = LoadUnaligned<V2x64U>(packet + 2);
+    Update(packetH, packetL);
+  }
+
+  HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+    // 'Length padding' differentiates zero-valued inputs that have the same
+    // size/32. mod32 is sufficient because each Update behaves as if a
+    // counter were injected, because the state is large and mixed thoroughly.
+    const V4x32U vsize_mod32(static_cast<uint32_t>(size_mod32));
+    // Equivalent to storing size_mod32 in packet.
+    v0L += V2x64U(vsize_mod32);
+    v0H += V2x64U(vsize_mod32);
+    // Boosts the avalanche effect of mod32.
+    Rotate32By(&v1H, &v1L, size_mod32);
+
+    const size_t size_mod4 = size_mod32 & 3;
+    const char* HH_RESTRICT remainder = bytes + (size_mod32 & ~3);
+
+    if (HH_UNLIKELY(size_mod32 & 16)) {  // 16..31 bytes left
+      const V2x64U packetL =
+          LoadUnaligned<V2x64U>(reinterpret_cast<const uint64_t*>(bytes));
+
+      V2x64U packetH = LoadMultipleOfFour(bytes + 16, size_mod32);
+
+      const uint32_t last4 =
+          Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+
+      // The upper four bytes of packetH are zero, so insert there.
+      packetH = V2x64U(_mm_insert_epi32(packetH, last4, 3));
+      Update(packetH, packetL);
+    } else {  // size_mod32 < 16
+      const V2x64U packetL = LoadMultipleOfFour(bytes, size_mod32);
+
+      const uint64_t last4 =
+          Load3()(Load3::AllowUnordered(), remainder, size_mod4);
+
+      // Rather than insert into packetL[3], it is faster to initialize
+      // the otherwise empty packetH.
+      const V2x64U packetH(_mm_cvtsi64_si128(last4));
+      Update(packetH, packetL);
+    }
+  }
+
+  HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
+    // Mix together all lanes.
+ for (int n = 0; n < 4; n++) { + PermuteAndUpdate(); + } + + const V2x64U sum0 = v0L + mul0L; + const V2x64U sum1 = v1L + mul1L; + const V2x64U hash = sum0 + sum1; + _mm_storel_epi64(reinterpret_cast<__m128i*>(result), hash); + } + + HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) { + for (int n = 0; n < 6; n++) { + PermuteAndUpdate(); + } + + const V2x64U sum0 = v0L + mul0L; + const V2x64U sum1 = v1H + mul1H; + const V2x64U hash = sum0 + sum1; + StoreUnaligned(hash, &(*result)[0]); + } + + HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) { + for (int n = 0; n < 10; n++) { + PermuteAndUpdate(); + } + + const V2x64U sum0L = v0L + mul0L; + const V2x64U sum1L = v1L + mul1L; + const V2x64U sum0H = v0H + mul0H; + const V2x64U sum1H = v1H + mul1H; + const V2x64U hashL = ModularReduction(sum1L, sum0L); + const V2x64U hashH = ModularReduction(sum1H, sum0H); + StoreUnaligned(hashL, &(*result)[0]); + StoreUnaligned(hashH, &(*result)[2]); + } + + static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer_bytes) { + __m128i* buffer = reinterpret_cast<__m128i*>(buffer_bytes); + const __m128i zero = _mm_setzero_si128(); + _mm_store_si128(buffer + 0, zero); + _mm_store_si128(buffer + 1, zero); + } + + static HH_INLINE void CopyPartial(const char* HH_RESTRICT from, + const size_t size_mod32, + char* HH_RESTRICT buffer) { + for (size_t i = 0; i < size_mod32; ++i) { + buffer[i] = from[i]; + } + } + + static HH_INLINE void AppendPartial(const char* HH_RESTRICT from, + const size_t size_mod32, + char* HH_RESTRICT buffer, + const size_t buffer_valid) { + for (size_t i = 0; i < size_mod32; ++i) { + buffer[buffer_valid + i] = from[i]; + } + } + + HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from, + const size_t size_mod32, + const char* HH_RESTRICT buffer, + const size_t buffer_valid) { + HH_ALIGNAS(32) HHPacket tmp; + for (size_t i = 0; i < buffer_valid; ++i) { + tmp[i] = buffer[i]; + } + for (size_t i = 0; i < size_mod32; ++i) { + tmp[buffer_valid + i] = from[i]; + } + Update(tmp); + } + + private: + // Swap 32-bit halves of each lane (caller swaps 128-bit halves) + static HH_INLINE V2x64U Rotate64By32(const V2x64U& v) { + return V2x64U(_mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1))); + } + + // Rotates 32-bit lanes by "count" bits. + static HH_INLINE void Rotate32By(V2x64U* HH_RESTRICT vH, + V2x64U* HH_RESTRICT vL, + const uint64_t count) { + // WARNING: the shift count is 64 bits, so we can't reuse vsize_mod32, + // which is broadcast into 32-bit lanes. + const __m128i count_left = _mm_cvtsi64_si128(count); + const __m128i count_right = _mm_cvtsi64_si128(32 - count); + const V2x64U shifted_leftL(_mm_sll_epi32(*vL, count_left)); + const V2x64U shifted_leftH(_mm_sll_epi32(*vH, count_left)); + const V2x64U shifted_rightL(_mm_srl_epi32(*vL, count_right)); + const V2x64U shifted_rightH(_mm_srl_epi32(*vH, count_right)); + *vL = shifted_leftL | shifted_rightL; + *vH = shifted_leftH | shifted_rightH; + } + + static HH_INLINE V2x64U ZipperMerge(const V2x64U& v) { + // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + // varying degrees. In descending order of goodness, bytes + // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + // As expected, the upper and lower bytes are much worse. 
+    // For each 64-bit lane, our objectives are:
+    // 1) maximizing and equalizing total goodness across each lane's bytes;
+    // 2) mixing with bytes from the neighboring lane;
+    // 3) placing the worst bytes in the upper 32 bits because those will not
+    //    be used in the next 32x32 multiplication.
+    const uint64_t hi = 0x070806090D0A040Bull;
+    const uint64_t lo = 0x000F010E05020C03ull;
+    return V2x64U(_mm_shuffle_epi8(v, V2x64U(hi, lo)));
+  }
+
+  HH_INLINE void Update(const V2x64U& packetH, const V2x64U& packetL) {
+    v1L += packetL;
+    v1H += packetH;
+    v1L += mul0L;
+    v1H += mul0H;
+    mul0L ^= V2x64U(_mm_mul_epu32(v1L, Rotate64By32(v0L)));
+    mul0H ^= V2x64U(_mm_mul_epu32(v1H, v0H >> 32));
+    v0L += mul1L;
+    v0H += mul1H;
+    mul1L ^= V2x64U(_mm_mul_epu32(v0L, Rotate64By32(v1L)));
+    mul1H ^= V2x64U(_mm_mul_epu32(v0H, v1H >> 32));
+    v0L += ZipperMerge(v1L);
+    v0H += ZipperMerge(v1H);
+    v1L += ZipperMerge(v0L);
+    v1H += ZipperMerge(v0H);
+  }
+
+  HH_INLINE void PermuteAndUpdate() {
+    // It is slightly better to permute v0 than v1; it will be added to v1.
+    // AVX-2 Permute also swaps 128-bit halves, so swap input operands.
+    Update(Rotate64By32(v0L), Rotate64By32(v0H));
+  }
+
+  // Returns zero-initialized vector with the lower "size" = 0, 4, 8 or 12
+  // bytes loaded from "bytes". Serves as a replacement for AVX2 maskload_epi32.
+  static HH_INLINE V2x64U LoadMultipleOfFour(const char* bytes,
+                                             const size_t size) {
+    const uint32_t* words = reinterpret_cast<const uint32_t*>(bytes);
+    // Mask of 1-bits where the final 4 bytes should be inserted (replacement
+    // for variable shift/insert using broadcast+blend).
+    V2x64U mask4(_mm_cvtsi64_si128(0xFFFFFFFFULL));  // 'insert' into lane 0
+    V2x64U ret(0);
+    if (size & 8) {
+      ret = V2x64U(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(words)));
+      // mask4 = 0 ~0 0 0 ('insert' into lane 2)
+      mask4 = V2x64U(_mm_slli_si128(mask4, 8));
+      words += 2;
+    }
+    // Final 4 (possibly after the 8 above); 'insert' into lane 0 or 2 of ret.
+    if (size & 4) {
+      const __m128i word2 = _mm_cvtsi32_si128(LoadUnaligned<int32_t>(words));
+      // = 0 word2 0 word2; mask4 will select which lane to keep.
+      const V2x64U broadcast(_mm_shuffle_epi32(word2, 0x00));
+      // (slightly faster than blendv_epi8)
+      ret |= V2x64U(broadcast & mask4);
+    }
+    return ret;
+  }
+
+  // XORs x << 1 and x << 2 into *out after clearing the upper two bits of x.
+  // Bit shifts are only possible on independent 64-bit lanes. We therefore
+  // insert the upper bits of x[0] that were lost into x[1].
+  // Thanks to D. Lemire for helpful comments!
+  static HH_INLINE void XorByShift128Left12(const V2x64U& x,
+                                            V2x64U* HH_RESTRICT out) {
+    const V2x64U zero(_mm_setzero_si128());
+    const V2x64U sign_bit128(_mm_insert_epi32(zero, 0x80000000u, 3));
+    const V2x64U top_bits2 = x >> (64 - 2);
+    HH_COMPILER_FENCE;
+    const V2x64U shifted1_unmasked = x + x;  // (avoids needing port0)
+
+    // Only the lower half of top_bits1 will be used, so we
+    // can compute it before clearing the upper two bits of x.
+    const V2x64U top_bits1 = x >> (64 - 1);
+    const V2x64U shifted2 = shifted1_unmasked + shifted1_unmasked;
+    HH_COMPILER_FENCE;
+
+    const V2x64U new_low_bits2(_mm_slli_si128(top_bits2, 8));
+    *out ^= shifted2;
+    // The result must be as if the upper two bits of the input had been clear,
+    // otherwise we're no longer computing a reduction.
+    const V2x64U shifted1 = AndNot(sign_bit128, shifted1_unmasked);
+    HH_COMPILER_FENCE;
+
+    const V2x64U new_low_bits1(_mm_slli_si128(top_bits1, 8));
+    *out ^= new_low_bits2;
+    *out ^= shifted1;
+    *out ^= new_low_bits1;
+  }
+
+  // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
+  // Input: a 256-bit number a3210.
+  static HH_INLINE V2x64U ModularReduction(const V2x64U& a32_unmasked,
+                                           const V2x64U& a10) {
+    // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
+    V2x64U out = a10;
+    XorByShift128Left12(a32_unmasked, &out);
+    return out;
+  }
+
+  V2x64U v0L;
+  V2x64U v0H;
+  V2x64U v1L;
+  V2x64U v1H;
+  V2x64U mul0L;
+  V2x64U mul0H;
+  V2x64U mul1L;
+  V2x64U mul1H;
+};
+
+}  // namespace HH_TARGET_NAME
+}  // namespace highwayhash
+
+#endif  // HH_DISABLE_TARGET_SPECIFIC
+#endif  // HIGHWAYHASH_HH_SSE41_H_
diff --git a/highwayhash/highwayhash/hh_types.h b/highwayhash/highwayhash/hh_types.h
new file mode 100644
index 000000000..f350d70f6
--- /dev/null
+++ b/highwayhash/highwayhash/hh_types.h
@@ -0,0 +1,50 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_TYPES_H_
+#define HIGHWAYHASH_HH_TYPES_H_
+
+// WARNING: included from c_bindings => must be C-compatible.
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stddef.h>  // size_t
+#include <stdint.h>
+
+#ifdef __cplusplus
+namespace highwayhash {
+#endif
+
+// 256-bit secret key that should remain unknown to attackers.
+// We recommend initializing it to a random value.
+typedef uint64_t HHKey[4];
+
+// How much input is hashed by one call to HHStateT::Update.
+typedef char HHPacket[32];
+
+// Hash 'return' types.
+typedef uint64_t HHResult64;  // returned directly
+typedef uint64_t HHResult128[2];
+typedef uint64_t HHResult256[4];
+
+// Called if a test fails, indicating which target and size.
+typedef void (*HHNotify)(const char*, size_t);
+
+#ifdef __cplusplus
+}  // namespace highwayhash
+#endif
+
+#endif  // HIGHWAYHASH_HH_TYPES_H_
diff --git a/highwayhash/highwayhash/hh_vsx.cc b/highwayhash/highwayhash/hh_vsx.cc
new file mode 100644
index 000000000..6479a7a80
--- /dev/null
+++ b/highwayhash/highwayhash/hh_vsx.cc
@@ -0,0 +1,22 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME VSX
+
+#ifdef __VSX__
+#include "highwayhash/highwayhash_target.cc"
+#endif
diff --git a/highwayhash/highwayhash/hh_vsx.h b/highwayhash/highwayhash/hh_vsx.h
new file mode 100644
index 000000000..e503abe1f
--- /dev/null
+++ b/highwayhash/highwayhash/hh_vsx.h
@@ -0,0 +1,335 @@
+// Copyright 2015-2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_VSX_H_
+#define HIGHWAYHASH_HH_VSX_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/load3.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+#include <altivec.h>
+#undef vector
+#undef pixel
+#undef bool
+
+namespace highwayhash {
+
+typedef __vector unsigned long long PPC_VEC_U64;  // NOLINT
+typedef __vector unsigned int PPC_VEC_U32;
+typedef __vector unsigned char PPC_VEC_U8;
+
+// See vector128.h for why this namespace is necessary;
+namespace HH_TARGET_NAME {
+
+// Helper Functions
+
+// gcc doesn't support vec_mule() and vec_mulo() for vector long.
+// Use the generic version, which is defined here only for gcc.
+
+#ifndef __clang__
+static HH_INLINE PPC_VEC_U64 vec_mule(PPC_VEC_U32 a, PPC_VEC_U32 b) {  // NOLINT
+  PPC_VEC_U64 result;  // NOLINT
+#ifdef __LITTLE_ENDIAN__
+  asm("vmulouw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+#else
+  asm("vmuleuw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+#endif
+  return result;
+}
+#endif
+
+// LoadUnaligned uses vec_vsx_ld(offset, address) format,
+// Offset here is number of bytes and is 0 for this implementation.
+static HH_INLINE PPC_VEC_U64
+LoadUnaligned(const uint64_t* const HH_RESTRICT from) {
+  const PPC_VEC_U64* const HH_RESTRICT p =
+      reinterpret_cast<const PPC_VEC_U64*>(from);
+  return vec_vsx_ld(0, p);
+}
+
+static HH_INLINE void StoreUnaligned(const PPC_VEC_U64& hash,
+                                     uint64_t* const HH_RESTRICT to) {
+  PPC_VEC_U64* HH_RESTRICT p = reinterpret_cast<PPC_VEC_U64*>(to);
+  vec_vsx_st(hash, 0, p);
+}
+
+static HH_INLINE PPC_VEC_U64 MultiplyVectors(const PPC_VEC_U64& vec1,
+                                             const PPC_VEC_U64& vec2) {
+  return vec_mule(reinterpret_cast<PPC_VEC_U32>(vec1),
+                  reinterpret_cast<PPC_VEC_U32>(vec2));
+}
+
+// J-lanes tree hashing: see https://doi.org/10.4236/jis.2014.53010
+class HHStateVSX {
+ public:
+  explicit HH_INLINE HHStateVSX(const HHKey key) { Reset(key); }
+
+  HH_INLINE void Reset(const HHKey key) {
+    // "Nothing up my sleeve numbers";
+    const PPC_VEC_U64 init0L = {0xdbe6d5d5fe4cce2full, 0xa4093822299f31d0ull};
+    const PPC_VEC_U64 init0H = {0x13198a2e03707344ull, 0x243f6a8885a308d3ull};
+    const PPC_VEC_U64 init1L = {0x3bd39e10cb0ef593ull, 0xc0acf169b5f18a8cull};
+    const PPC_VEC_U64 init1H = {0xbe5466cf34e90c6cull, 0x452821e638d01377ull};
+    const PPC_VEC_U64 keyL = LoadUnaligned(key);
+    const PPC_VEC_U64 keyH = LoadUnaligned(key + 2);
+    v0L = keyL ^ init0L;
+    v0H = keyH ^ init0H;
+    v1L = Rotate64By32(keyL) ^ init1L;
+    v1H = Rotate64By32(keyH) ^ init1H;
+    mul0L = init0L;
+    mul0H = init0H;
+    mul1L = init1L;
+    mul1H = init1H;
+  }
+
+  HH_INLINE void Update(const HHPacket& packet_bytes) {
+    const uint64_t* HH_RESTRICT packet =
+        reinterpret_cast<const uint64_t*>(packet_bytes);
+    const PPC_VEC_U64 packetL = LoadUnaligned(packet);
+    const PPC_VEC_U64 packetH = LoadUnaligned(packet + 2);
+    Update(packetH, packetL);
+  }
+
+  HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+    // 'Length padding' differentiates zero-valued inputs that have the same
+    // size/32. mod32 is sufficient because each Update behaves as if a
+    // counter were injected, because the state is large and mixed thoroughly.
+    uint32_t size_rounded = static_cast<uint32_t>(size_mod32);
+    PPC_VEC_U32 vsize_mod32 = {size_rounded, size_rounded, size_rounded,
+                               size_rounded};
+    // Equivalent to storing size_mod32 in packet.
+    v0L += reinterpret_cast<PPC_VEC_U64>(vsize_mod32);
+    v0H += reinterpret_cast<PPC_VEC_U64>(vsize_mod32);
+
+    // Boosts the avalanche effect of mod32.
+    Rotate32By(&v1H, &v1L, size_mod32);
+
+    const size_t size_mod4 = size_mod32 & 3;
+    const char* HH_RESTRICT remainder = bytes + (size_mod32 & ~3);
+
+    if (HH_UNLIKELY(size_mod32 & 16)) {  // 16..31 bytes left
+      const PPC_VEC_U64 packetL =
+          vec_vsx_ld(0, reinterpret_cast<const uint64_t*>(bytes));
+
+      PPC_VEC_U64 packetH = LoadMultipleOfFour(bytes + 16, size_mod32);
+
+      const uint32_t last4 =
+          Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+
+      // The upper four bytes of packetH are zero, so insert there.
+      PPC_VEC_U32 packetH_32 = reinterpret_cast<PPC_VEC_U32>(packetH);
+      packetH_32[3] = last4;
+      packetH = reinterpret_cast<PPC_VEC_U64>(packetH_32);
+      Update(packetH, packetL);
+    } else {  // size_mod32 < 16
+      const PPC_VEC_U64 packetL = LoadMultipleOfFour(bytes, size_mod32);
+
+      const uint64_t last4 =
+          Load3()(Load3::AllowUnordered(), remainder, size_mod4);
+
+      // Rather than insert into packetL[3], it is faster to initialize
+      // the otherwise empty packetH.
+      const PPC_VEC_U64 packetH = {last4, 0};
+      Update(packetH, packetL);
+    }
+  }
+
+  HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
+    // Mix together all lanes.
+    for (int n = 0; n < 4; n++) {
+      PermuteAndUpdate();
+    }
+    const PPC_VEC_U64 hash = v0L + v1L + mul0L + mul1L;
+    *result = hash[0];
+  }
+
+  HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
+    for (int n = 0; n < 6; n++) {
+      PermuteAndUpdate();
+    }
+    const PPC_VEC_U64 hash = v0L + mul0L + v1H + mul1H;
+    StoreUnaligned(hash, *result);
+  }
+
+  HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
+    for (int n = 0; n < 10; n++) {
+      PermuteAndUpdate();
+    }
+    const PPC_VEC_U64 sum0L = v0L + mul0L;
+    const PPC_VEC_U64 sum1L = v1L + mul1L;
+    const PPC_VEC_U64 sum0H = v0H + mul0H;
+    const PPC_VEC_U64 sum1H = v1H + mul1H;
+    const PPC_VEC_U64 hashL = ModularReduction(sum1L, sum0L);
+    const PPC_VEC_U64 hashH = ModularReduction(sum1H, sum0H);
+    StoreUnaligned(hashL, *result);
+    StoreUnaligned(hashH, *result + 2);
+  }
+
+  static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer_bytes) {
+    for (size_t i = 0; i < sizeof(HHPacket); ++i) {
+      buffer_bytes[i] = 0;
+    }
+  }
+
+  static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
+                                    const size_t size_mod32,
+                                    char* HH_RESTRICT buffer) {
+    for (size_t i = 0; i < size_mod32; ++i) {
+      buffer[i] = from[i];
+    }
+  }
+
+  static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
+                                      const size_t size_mod32,
+                                      char* HH_RESTRICT buffer,
+                                      const size_t buffer_valid) {
+    for (size_t i = 0; i < size_mod32; ++i) {
+      buffer[buffer_valid + i] = from[i];
+    }
+  }
+
+  HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
+                                 const size_t size_mod32,
+                                 const char* HH_RESTRICT buffer,
+                                 const size_t buffer_valid) {
+    HH_ALIGNAS(32) HHPacket tmp;
+    for (size_t i = 0; i < buffer_valid; ++i) {
+      tmp[i] = buffer[i];
+    }
+    for (size_t i = 0; i < size_mod32; ++i) {
+      tmp[buffer_valid + i] = from[i];
+    }
+    Update(tmp);
+  }
+
+ private:
+  // Swap 32-bit halves of each lane (caller swaps 128-bit halves)
+  static HH_INLINE PPC_VEC_U64 Rotate64By32(const PPC_VEC_U64& v) {
+    PPC_VEC_U64 shuffle_vec = {32, 32};
+    return vec_rl(v, shuffle_vec);
+  }
+
+  // Rotates 32-bit lanes by "count" bits.
+  static HH_INLINE void Rotate32By(PPC_VEC_U64* HH_RESTRICT vH,
+                                   PPC_VEC_U64* HH_RESTRICT vL,
+                                   const uint64_t count) {
+    // WARNING: the shift count is 64 bits, so we can't reuse vsize_mod32,
+    // which is broadcast into 32-bit lanes.
+    uint32_t count_rl = uint32_t(count);
+    PPC_VEC_U32 rot_left = {count_rl, count_rl, count_rl, count_rl};
+    *vL = reinterpret_cast<PPC_VEC_U64>(vec_rl(PPC_VEC_U32(*vL), rot_left));
+    *vH = reinterpret_cast<PPC_VEC_U64>(vec_rl(PPC_VEC_U32(*vH), rot_left));
+  }
+
+  static HH_INLINE PPC_VEC_U64 ZipperMerge(const PPC_VEC_U64& v) {
+    // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+    // varying degrees. In descending order of goodness, bytes
+    // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+    // As expected, the upper and lower bytes are much worse.
+    // For each 64-bit lane, our objectives are:
+    // 1) maximizing and equalizing total goodness across each lane's bytes;
+    // 2) mixing with bytes from the neighboring lane;
+    // 3) placing the worst bytes in the upper 32 bits because those will not
+    //    be used in the next 32x32 multiplication.
+
+    const PPC_VEC_U64 mask = {0x000F010E05020C03ull, 0x070806090D0A040Bull};
+    return vec_vperm(v, v, reinterpret_cast<PPC_VEC_U8>(mask));
+  }
+
+  HH_INLINE void Update(const PPC_VEC_U64& packetH,
+                        const PPC_VEC_U64& packetL) {
+    // Tried rearranging the instructions below and benchmarks are similar
+    v1L += packetL + mul0L;
+    v1H += packetH + mul0H;
+    mul0L ^= MultiplyVectors(v1L, Rotate64By32(v0L));
+    mul0H ^= MultiplyVectors(v1H, v0H >> 32);
+    v0L += mul1L;
+    v0H += mul1H;
+    mul1L ^= MultiplyVectors(v0L, Rotate64By32(v1L));
+    mul1H ^= MultiplyVectors(v0H, v1H >> 32);
+    v0L += ZipperMerge(v1L);
+    v1L += ZipperMerge(v0L);
+    v0H += ZipperMerge(v1H);
+    v1H += ZipperMerge(v0H);
+  }
+
+  HH_INLINE void PermuteAndUpdate() {
+    // Permutes v0L and V0H by swapping 32 bits halves of each lane
+    Update(Rotate64By32(v0L), Rotate64By32(v0H));
+  }
+
+  // Returns zero-initialized vector with the lower "size" = 0, 4, 8 or 12
+  // bytes loaded from "bytes". Serves as a replacement for AVX2 maskload_epi32.
+  static HH_INLINE PPC_VEC_U64 LoadMultipleOfFour(const char* bytes,
+                                                  const size_t size) {
+    const uint32_t* words = reinterpret_cast<const uint32_t*>(bytes);
+    // Updating the entries, as if done by vec_insert function call
+    PPC_VEC_U32 ret = {0, 0, 0, 0};
+    if (size & 8) {
+      ret[0] = words[0];
+      ret[1] = words[1];
+      words += 2;
+      if (size & 4) {
+        ret[2] = words[0];
+      }
+    } else if (size & 4) {
+      ret[0] = words[0];
+    }
+    return reinterpret_cast<PPC_VEC_U64>(ret);
+  }
+
+  // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
+  // Input: a 256-bit number a3210.
+  static HH_INLINE PPC_VEC_U64 ModularReduction(
+      const PPC_VEC_U64& a32_unmasked, const PPC_VEC_U64& a10) {
+    // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
+    PPC_VEC_U64 out = a10;
+    const PPC_VEC_U64 shifted1 = reinterpret_cast<PPC_VEC_U64>(
+        vec_sll(reinterpret_cast<PPC_VEC_U8>(a32_unmasked), vec_splat_u8(1)));
+    const PPC_VEC_U64 shifted2 = reinterpret_cast<PPC_VEC_U64>(
+        vec_sll(reinterpret_cast<PPC_VEC_U8>(a32_unmasked), vec_splat_u8(2)));
+    // The result must be as if the upper two bits of the input had been clear,
+    // otherwise we're no longer computing a reduction.
+    const PPC_VEC_U64 mask = {0xFFFFFFFFFFFFFFFFull, 0x7FFFFFFFFFFFFFFFull};
+    const PPC_VEC_U64 shifted1_masked = shifted1 & mask;
+    out ^= shifted1_masked ^ shifted2;
+    return out;
+  }
+
+  PPC_VEC_U64 v0L;
+  PPC_VEC_U64 v0H;
+  PPC_VEC_U64 v1L;
+  PPC_VEC_U64 v1H;
+  PPC_VEC_U64 mul0L;
+  PPC_VEC_U64 mul0H;
+  PPC_VEC_U64 mul1L;
+  PPC_VEC_U64 mul1H;
+};
+
+}  // namespace HH_TARGET_NAME
+}  // namespace highwayhash
+
+#endif  // HH_DISABLE_TARGET_SPECIFIC
+#endif  // HIGHWAYHASH_HH_VSX_H_
diff --git a/highwayhash/highwayhash/highwayhash.h b/highwayhash/highwayhash/highwayhash.h
new file mode 100644
index 000000000..fea4922b2
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash.h
@@ -0,0 +1,216 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HIGHWAYHASH_H_
+#define HIGHWAYHASH_HIGHWAYHASH_H_
+
+// This header's templates are useful for inlining into other CPU-specific
+// code: template<TargetBits Target> CodeUsingHash() { HighwayHashT<Target>(...); },
+// and can also be instantiated with HH_TARGET when callers don't care about the
+// exact implementation. Otherwise, they are implementation details of the
+// highwayhash_target wrapper. Use that instead if you need to detect the best
+// available implementation at runtime.
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_types.h"
+
+#if HH_ARCH_X64
+#include "highwayhash/iaca.h"
+#endif
+
+// Include exactly one (see arch_specific.h) header, which defines a state
+// object in a target-specific namespace, e.g. AVX2::HHStateAVX2.
+// Attempts to use "computed includes" (#define MACRO "path/or_just_filename",
+// #include MACRO) fail with 'file not found', so we need an #if chain.
+#if HH_TARGET == HH_TARGET_AVX2
+#include "highwayhash/hh_avx2.h"
+#elif HH_TARGET == HH_TARGET_SSE41
+#include "highwayhash/hh_sse41.h"
+#elif HH_TARGET == HH_TARGET_VSX
+#include "highwayhash/hh_vsx.h"
+#elif HH_TARGET == HH_TARGET_NEON
+#include "highwayhash/hh_neon.h"
+#elif HH_TARGET == HH_TARGET_Portable
+#include "highwayhash/hh_portable.h"
+#else
+#error "Unknown target, add its hh_*.h include here."
+#endif
+
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+namespace highwayhash {
+
+// Alias templates (HHStateT) cannot be specialized, so we need a helper struct.
+// Note that hh_*.h don't just specialize HHStateT directly because vector128.h
+// must reside in a distinct namespace (to allow including it from multiple
+// translation units), and it is easier if its users, i.e. the concrete HHState,
+// also reside in that same namespace, which precludes specialization.
+template <TargetBits Target>
+struct HHStateForTarget {};
+
+template <>
+struct HHStateForTarget<HH_TARGET> {
+  // (The namespace is sufficient and the additional HH_TARGET_NAME suffix is
+  // technically redundant, but it makes searching easier.)
+  using type = HH_TARGET_NAME::HH_ADD_TARGET_SUFFIX(HHState);
+};
+
+// Typically used as HHStateT<HH_TARGET>. It would be easier to just have a
+// concrete type HH_STATE, but this alias template is required by the
+// templates in highwayhash_target.cc.
+template <TargetBits Target>
+using HHStateT = typename HHStateForTarget<Target>::type;
+
+// Computes HighwayHash of "bytes" using the implementation chosen by "State".
+//
+// "state" is a HHStateT<> initialized with a key.
+// "bytes" is the data to hash (possibly unaligned).
+// "size" is the number of bytes to hash; we do not read any additional bytes.
+// "hash" is a HHResult* (either 64, 128 or 256 bits).
+//
+// HighwayHash is a strong pseudorandom function with security claims
+// [https://arxiv.org/abs/1612.06257]. It is intended as a safer general-purpose
+// hash, about 4x faster than SipHash and 10x faster than BLAKE2.
+//
+// This template allows callers (e.g. tests) to invoke a specific
+// implementation. It must be compiled with the flags required by the desired
+// implementation. If the entire program cannot be built with these flags, use
+// the wrapper in highwayhash_target.h instead.
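+//
+// A minimal usage sketch (assuming this translation unit is compiled with the
+// target's flags; HH_TARGET then names that target):
+//   HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4};
+//   char in[8] = {1};
+//   HHResult64 result;
+//   HHStateT<HH_TARGET> state(key);
+//   HighwayHashT(&state, in, 8, &result);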
+//
+// Callers wanting to hash multiple pieces of data should duplicate this
+// function, calling HHStateT::Update for each input and only Finalizing once.
+template <class State, typename Result>
+HH_INLINE void HighwayHashT(State* HH_RESTRICT state,
+                            const char* HH_RESTRICT bytes, const size_t size,
+                            Result* HH_RESTRICT hash) {
+  // BeginIACA();
+  const size_t remainder = size & (sizeof(HHPacket) - 1);
+  const size_t truncated = size & ~(sizeof(HHPacket) - 1);
+  for (size_t offset = 0; offset < truncated; offset += sizeof(HHPacket)) {
+    state->Update(*reinterpret_cast<const HHPacket*>(bytes + offset));
+  }
+
+  if (remainder != 0) {
+    state->UpdateRemainder(bytes + truncated, remainder);
+  }
+
+  state->Finalize(hash);
+  // EndIACA();
+}
+
+// Wrapper class for incrementally hashing a series of data ranges. The final
+// result is the same as HighwayHashT of the concatenation of all the ranges.
+// This is useful for computing the hash of cords, iovecs, and similar
+// data structures.
+template <TargetBits Target>
+class HighwayHashCatT {
+ public:
+  HH_INLINE HighwayHashCatT(const HHKey& key) : state_(key) {
+    // Avoids msan uninitialized-memory warnings.
+    HHStateT<Target>::ZeroInitialize(buffer_);
+  }
+
+  // Resets the state of the hasher so it can be used to hash a new string.
+  HH_INLINE void Reset(const HHKey& key) {
+    state_.Reset(key);
+    buffer_usage_ = 0;
+  }
+
+  // Adds "bytes" to the internal buffer, feeding it to HHStateT::Update as
+  // required. Call this as often as desired. Only reads bytes within the
+  // interval [bytes, bytes + num_bytes). "num_bytes" == 0 has no effect.
+  //
+  // Beware that this implies hashing two strings {"A", ""} has the same result
+  // as {"", "A"}. To prevent this when hashing independent fields, you can
+  // append some extra (non-empty) data when a field is empty, or
+  // unconditionally also Append the field length. Either option would ensure
+  // the two examples above result in a different hash.
+  //
+  // There are no alignment requirements.
+  HH_INLINE void Append(const char* HH_RESTRICT bytes, size_t num_bytes) {
+    // BeginIACA();
+    const size_t capacity = sizeof(HHPacket) - buffer_usage_;
+    // New bytes fit within buffer, but still not enough to Update.
+    if (HH_UNLIKELY(num_bytes < capacity)) {
+      HHStateT<Target>::AppendPartial(bytes, num_bytes, buffer_, buffer_usage_);
+      buffer_usage_ += num_bytes;
+      return;
+    }
+
+    // HACK: ensures the state is kept in SIMD registers; otherwise, Update
+    // constantly load/stores its operands, which is much slower.
+    // Restrict-qualified pointers to external state or the state_ member are
+    // not sufficient for keeping this in registers.
+    HHStateT<Target> state_copy = state_;
+
+    // Have prior bytes to flush.
+    const size_t buffer_usage = buffer_usage_;
+    if (HH_LIKELY(buffer_usage != 0)) {
+      // Calls update with prior buffer contents plus new data. Does not modify
+      // the buffer because some implementations can load into SIMD registers
+      // and Append to them directly.
+      state_copy.AppendAndUpdate(bytes, capacity, buffer_, buffer_usage);
+      bytes += capacity;
+      num_bytes -= capacity;
+    }
+
+    // Buffer currently empty => Update directly from the source.
+    while (num_bytes >= sizeof(HHPacket)) {
+      state_copy.Update(*reinterpret_cast<const HHPacket*>(bytes));
+      bytes += sizeof(HHPacket);
+      num_bytes -= sizeof(HHPacket);
+    }
+
+    // Unconditionally assign even if zero because we didn't reset to zero
+    // after the AppendAndUpdate above.
+    buffer_usage_ = num_bytes;
+
+    state_ = state_copy;
+
+    // Store any remainders in buffer, no-op if multiple of a packet.
+    if (HH_LIKELY(num_bytes != 0)) {
+      HHStateT<Target>::CopyPartial(bytes, num_bytes, buffer_);
+    }
+    // EndIACA();
+  }
+
+  // Stores the resulting 64, 128 or 256-bit hash of data previously passed to
+  // Append since construction or a prior call to Reset.
+  template <typename Result>  // HHResult*
+  HH_INLINE void Finalize(Result* HH_RESTRICT hash) const {
+    // BeginIACA();
+    HHStateT<Target> state_copy = state_;
+    const size_t buffer_usage = buffer_usage_;
+    if (HH_LIKELY(buffer_usage != 0)) {
+      state_copy.UpdateRemainder(buffer_, buffer_usage);
+    }
+    state_copy.Finalize(hash);
+    // EndIACA();
+  }
+
+ private:
+  HH_ALIGNAS(64) HHPacket buffer_;
+  HH_ALIGNAS(32) HHStateT<Target> state_;
+  // How many bytes in buffer_ (starting with offset 0) are valid.
+  size_t buffer_usage_ = 0;
+};
+
+}  // namespace highwayhash
+#endif  // HH_DISABLE_TARGET_SPECIFIC
+#endif  // HIGHWAYHASH_HIGHWAYHASH_H_
diff --git a/highwayhash/highwayhash/highwayhash_fuzzer.cc b/highwayhash/highwayhash/highwayhash_fuzzer.cc
new file mode 100644
index 000000000..5234fcb01
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_fuzzer.cc
@@ -0,0 +1,25 @@
+#include "highwayhash/highwayhash_target.h"
+#include "highwayhash/instruction_sets.h"
+
+using highwayhash::HHKey;
+using highwayhash::HHResult64;
+using highwayhash::HighwayHash;
+using highwayhash::InstructionSets;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  if (size < sizeof(uint64_t) * 4) {
+    return 0;
+  }
+
+  // Generate the key.
+  const uint64_t *u64s = reinterpret_cast<const uint64_t *>(data);
+  HH_ALIGNAS(32) const HHKey key = {u64s[0], u64s[1], u64s[2], u64s[3]};
+  data += sizeof(uint64_t) * 4;
+  size -= sizeof(uint64_t) * 4;
+
+  // Compute the hash.
+  HHResult64 result;
+  InstructionSets::Run<HighwayHash>(
+      key, reinterpret_cast<const char *>(data), size, &result);
+  return 0;
+}
diff --git a/highwayhash/highwayhash/highwayhash_target.cc b/highwayhash/highwayhash/highwayhash_target.cc
new file mode 100644
index 000000000..74022f64b
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_target.cc
@@ -0,0 +1,104 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
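+
+// A minimal usage sketch of the dispatcher defined below (the same pattern as
+// highwayhash_fuzzer.cc above):
+//   HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4};
+//   HHResult64 result;
+//   InstructionSets::Run<HighwayHash>(key, bytes, size, &result);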
+ +#include "highwayhash/highwayhash_target.h" + +#include "highwayhash/highwayhash.h" + +#ifndef HH_DISABLE_TARGET_SPECIFIC +namespace highwayhash { + +extern "C" { +uint64_t HH_ADD_TARGET_SUFFIX(HighwayHash64_)(const HHKey key, + const char* bytes, + const uint64_t size) { + HHStateT state(key); + HHResult64 result; + HighwayHashT(&state, bytes, size, &result); + return result; +} +} // extern "C" + +template +void HighwayHash::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const size_t size, + HHResult64* HH_RESTRICT hash) const { + HHStateT state(key); + HighwayHashT(&state, bytes, size, hash); +} + +template +void HighwayHash::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const size_t size, + HHResult128* HH_RESTRICT hash) const { + HHStateT state(key); + HighwayHashT(&state, bytes, size, hash); +} + +template +void HighwayHash::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const size_t size, + HHResult256* HH_RESTRICT hash) const { + HHStateT state(key); + HighwayHashT(&state, bytes, size, hash); +} + +template +void HighwayHashCat::operator()(const HHKey& key, + const StringView* HH_RESTRICT fragments, + const size_t num_fragments, + HHResult64* HH_RESTRICT hash) const { + HighwayHashCatT cat(key); + for (size_t i = 0; i < num_fragments; ++i) { + cat.Append(fragments[i].data, fragments[i].num_bytes); + } + cat.Finalize(hash); +} + +template +void HighwayHashCat::operator()(const HHKey& key, + const StringView* HH_RESTRICT fragments, + const size_t num_fragments, + HHResult128* HH_RESTRICT hash) const { + HighwayHashCatT cat(key); + for (size_t i = 0; i < num_fragments; ++i) { + cat.Append(fragments[i].data, fragments[i].num_bytes); + } + cat.Finalize(hash); +} + +template +void HighwayHashCat::operator()(const HHKey& key, + const StringView* HH_RESTRICT fragments, + const size_t num_fragments, + HHResult256* HH_RESTRICT hash) const { + HighwayHashCatT cat(key); + for (size_t i = 0; i < num_fragments; ++i) { + cat.Append(fragments[i].data, fragments[i].num_bytes); + } + cat.Finalize(hash); +} + +// Instantiate for the current target. +template struct HighwayHash; +template struct HighwayHashCat; + +} // namespace highwayhash +#endif // HH_DISABLE_TARGET_SPECIFIC diff --git a/highwayhash/highwayhash/highwayhash_target.h b/highwayhash/highwayhash/highwayhash_target.h new file mode 100644 index 000000000..08b803f19 --- /dev/null +++ b/highwayhash/highwayhash/highwayhash_target.h @@ -0,0 +1,91 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ +#define HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ + +// Adapter for the InstructionSets::Run dispatcher, which invokes the best +// implementations available on the current CPU. + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. 
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_types.h"
+
+namespace highwayhash {
+
+// Usage: InstructionSets::Run<HighwayHash>(key, bytes, size, hash).
+// This incurs some small dispatch overhead. If the entire program is compiled
+// for the target CPU, you can instead call HighwayHashT directly to avoid any
+// overhead. This template is instantiated in the source file, which is
+// compiled once for every target with the required flags (e.g. -mavx2).
+template <TargetBits Target>
+struct HighwayHash {
+  // Stores a 64/128/256 bit hash of "bytes" using the HighwayHashT
+  // implementation for the "Target" CPU. The hash result is identical
+  // regardless of which implementation is used.
+  //
+  // "key" is a (randomly generated or hard-coded) HHKey.
+  // "bytes" is the data to hash (possibly unaligned).
+  // "size" is the number of bytes to hash; we do not read any additional bytes.
+  // "hash" is a HHResult* (either 64, 128 or 256 bits).
+  //
+  // HighwayHash is a strong pseudorandom function with security claims
+  // [https://arxiv.org/abs/1612.06257]. It is intended as a safer
+  // general-purpose hash, 5x faster than SipHash and 10x faster than BLAKE2.
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const size_t size, HHResult64* HH_RESTRICT hash) const;
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const size_t size, HHResult128* HH_RESTRICT hash) const;
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const size_t size, HHResult256* HH_RESTRICT hash) const;
+};
+
+// Replacement for C++17 std::string_view that avoids dependencies.
+// A struct requires fewer allocations when calling HighwayHashCat with
+// non-const "num_fragments".
+struct StringView {
+  const char* data;  // not necessarily aligned/padded
+  size_t num_bytes;  // possibly zero
+};
+
+// Note: this interface avoids dispatch overhead per fragment.
+template <TargetBits Target>
+struct HighwayHashCat {
+  // Stores a 64/128/256 bit hash of all "num_fragments" "fragments" using the
+  // HighwayHashCatT implementation for "Target". The hash result is identical
+  // to HighwayHash of the flattened data, regardless of Target.
+  //
+  // "key" is a (randomly generated or hard-coded) HHKey.
+  // "fragments" contain unaligned pointers and the number of valid bytes.
+  // "num_fragments" indicates the number of entries in "fragments".
+  // "hash" is a HHResult* (either 64, 128 or 256 bits).
+  void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments,
+                  const size_t num_fragments,
+                  HHResult64* HH_RESTRICT hash) const;
+  void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments,
+                  const size_t num_fragments,
+                  HHResult128* HH_RESTRICT hash) const;
+  void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments,
+                  const size_t num_fragments,
+                  HHResult256* HH_RESTRICT hash) const;
+};
+
+}  // namespace highwayhash
+
+#endif  // HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
diff --git a/highwayhash/highwayhash/highwayhash_test.cc b/highwayhash/highwayhash/highwayhash_test.cc
new file mode 100644
index 000000000..aed9a9eed
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_test.cc
@@ -0,0 +1,391 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Ensures each implementation of HighwayHash returns consistent and unchanging
+// hash values.
+
+#include "highwayhash/highwayhash_test_target.h"
+
+#include <stddef.h>
+#include <atomic>
+#include <cstdio>
+#include <cstdlib>
+
+#ifdef HH_GOOGLETEST
+#include "testing/base/public/gunit.h"
+#endif
+
+#include "highwayhash/data_parallel.h"
+#include "highwayhash/highwayhash_target.h"
+#include "highwayhash/instruction_sets.h"
+
+// Define to nonzero in order to print the (new) golden outputs.
+// WARNING: HighwayHash is frozen, so the golden values must not change.
+#define PRINT_RESULTS 0
+
+namespace highwayhash {
+namespace {
+
+// Known-good outputs are verified for all lengths in [0, 64].
+const size_t kMaxSize = 64;
+
+#if PRINT_RESULTS
+void Print(const HHResult64 result) { printf("0x%016lXull,\n", result); }
+
+// For HHResult128/256.
+template <int kNumLanes>
+void Print(const HHResult64 (&result)[kNumLanes]) {
+  printf("{ ");
+  for (int i = 0; i < kNumLanes; ++i) {
+    if (i != 0) {
+      printf(", ");
+    }
+    printf("0x%016lXull", result[i]);
+  }
+  printf("},\n");
+}
+#endif  // PRINT_RESULTS
+
+// Called when any test fails; exits immediately because one mismatch usually
+// implies many others.
+void OnFailure(const char* target_name, const size_t size) {
+  printf("Mismatch at size %zu for target %s\n", size, target_name);
+#ifdef HH_GOOGLETEST
+  EXPECT_TRUE(false);
+#endif
+  exit(1);
+}
+
+// Verifies every combination of implementation and input size. Returns which
+// targets were run/verified.
+template <typename Result>
+TargetBits VerifyImplementations(const Result (&known_good)[kMaxSize + 1]) {
+  const HHKey key = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL,
+                     0x1716151413121110ULL, 0x1F1E1D1C1B1A1918ULL};
+
+  TargetBits targets = ~0U;
+
+  // For each test input: empty string, 00, 00 01, ...
+  char in[kMaxSize + 1] = {0};
+  // Fast enough that we don't need a thread pool.
+  for (uint64_t size = 0; size <= kMaxSize; ++size) {
+    in[size] = static_cast<char>(size);
+#if PRINT_RESULTS
+    Result actual;
+    targets &= InstructionSets::Run<HighwayHash>(key, in, size, &actual);
+    Print(actual);
+#else
+    const Result* expected = &known_good[size];
+    targets &= InstructionSets::RunAll<HighwayHashTest>(key, in, size, expected,
+                                                        &OnFailure);
+#endif
+  }
+  return targets;
+}
+
+// Cat
+
+void OnCatFailure(const char* target_name, const size_t size) {
+  printf("Cat mismatch at size %zu\n", size);
+#ifdef HH_GOOGLETEST
+  EXPECT_TRUE(false);
+#endif
+  exit(1);
+}
+
+// Returns which targets were run/verified.
+template <typename Result>
+TargetBits VerifyCat(ThreadPool* pool) {
+  // Reversed order vs prior test.
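+  // (Same lanes as the VerifyImplementations key, in reverse order.)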
+  const HHKey key = {0x1F1E1D1C1B1A1918ULL, 0x1716151413121110ULL,
+                     0x0F0E0D0C0B0A0908ULL, 0x0706050403020100ULL};
+
+  const size_t kMaxSize = 3 * 35;
+  char flat[kMaxSize];
+  srand(129);
+  for (size_t size = 0; size < kMaxSize; ++size) {
+    flat[size] = static_cast<char>(rand() & 0xFF);
+  }
+
+  std::atomic<TargetBits> targets{~0U};
+
+  pool->Run(0, kMaxSize, [&key, &flat, &targets](const uint32_t i) {
+    Result dummy;
+    targets.fetch_and(InstructionSets::RunAll<HighwayHashCatTest>(
+        key, flat, i, &dummy, &OnCatFailure));
+  });
+  return targets.load();
+}
+
+// WARNING: HighwayHash is frozen, so the golden values must not change.
+const HHResult64 kExpected64[kMaxSize + 1] = {
+    0x907A56DE22C26E53ull, 0x7EAB43AAC7CDDD78ull, 0xB8D0569AB0B53D62ull,
+    0x5C6BEFAB8A463D80ull, 0xF205A46893007EDAull, 0x2B8A1668E4A94541ull,
+    0xBD4CCC325BEFCA6Full, 0x4D02AE1738F59482ull, 0xE1205108E55F3171ull,
+    0x32D2644EC77A1584ull, 0xF6E10ACDB103A90Bull, 0xC3BBF4615B415C15ull,
+    0x243CC2040063FA9Cull, 0xA89A58CE65E641FFull, 0x24B031A348455A23ull,
+    0x40793F86A449F33Bull, 0xCFAB3489F97EB832ull, 0x19FE67D2C8C5C0E2ull,
+    0x04DD90A69C565CC2ull, 0x75D9518E2371C504ull, 0x38AD9B1141D3DD16ull,
+    0x0264432CCD8A70E0ull, 0xA9DB5A6288683390ull, 0xD7B05492003F028Cull,
+    0x205F615AEA59E51Eull, 0xEEE0C89621052884ull, 0x1BFC1A93A7284F4Full,
+    0x512175B5B70DA91Dull, 0xF71F8976A0A2C639ull, 0xAE093FEF1F84E3E7ull,
+    0x22CA92B01161860Full, 0x9FC7007CCF035A68ull, 0xA0C964D9ECD580FCull,
+    0x2C90F73CA03181FCull, 0x185CF84E5691EB9Eull, 0x4FC1F5EF2752AA9Bull,
+    0xF5B7391A5E0A33EBull, 0xB9B84B83B4E96C9Cull, 0x5E42FE712A5CD9B4ull,
+    0xA150F2F90C3F97DCull, 0x7FA522D75E2D637Dull, 0x181AD0CC0DFFD32Bull,
+    0x3889ED981E854028ull, 0xFB4297E8C586EE2Dull, 0x6D064A45BB28059Cull,
+    0x90563609B3EC860Cull, 0x7AA4FCE94097C666ull, 0x1326BAC06B911E08ull,
+    0xB926168D2B154F34ull, 0x9919848945B1948Dull, 0xA2A98FC534825EBEull,
+    0xE9809095213EF0B6ull, 0x582E5483707BC0E9ull, 0x086E9414A88A6AF5ull,
+    0xEE86B98D20F6743Dull, 0xF89B7FF609B1C0A7ull, 0x4C7D9CC19E22C3E8ull,
+    0x9A97005024562A6Full, 0x5DD41CF423E6EBEFull, 0xDF13609C0468E227ull,
+    0x6E0DA4F64188155Aull, 0xB755BA4B50D7D4A1ull, 0x887A3484647479BDull,
+    0xAB8EEBE9BF2139A0ull, 0x75542C5D4CD2A6FFull};
+
+// WARNING: HighwayHash is frozen, so the golden values must not change.
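+// 128-bit golden values: two 64-bit lanes per entry.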
+const HHResult128 kExpected128[kMaxSize + 1] = { + {0x0FED268F9D8FFEC7ull, 0x33565E767F093E6Full}, + {0xD6B0A8893681E7A8ull, 0xDC291DF9EB9CDCB4ull}, + {0x3D15AD265A16DA04ull, 0x78085638DC32E868ull}, + {0x0607621B295F0BEBull, 0xBFE69A0FD9CEDD79ull}, + {0x26399EB46DACE49Eull, 0x2E922AD039319208ull}, + {0x3250BDC386D12ED8ull, 0x193810906C63C23Aull}, + {0x6F476AB3CB896547ull, 0x7CDE576F37ED1019ull}, + {0x2A401FCA697171B4ull, 0xBE1F03FF9F02796Cull}, + {0xA1E96D84280552E8ull, 0x695CF1C63BEC0AC2ull}, + {0x142A2102F31E63B2ull, 0x1A85B98C5B5000CCull}, + {0x51A1B70E26B6BC5Bull, 0x929E1F3B2DA45559ull}, + {0x88990362059A415Bull, 0xBED21F22C47B7D13ull}, + {0xCD1F1F5F1CAF9566ull, 0xA818BA8CE0F9C8D4ull}, + {0xA225564112FE6157ull, 0xB2E94C78B8DDB848ull}, + {0xBD492FEBD1CC0919ull, 0xCECD1DBC025641A2ull}, + {0x142237A52BC4AF54ull, 0xE0796C0B6E26BCD7ull}, + {0x414460FFD5A401ADull, 0x029EA3D5019F18C8ull}, + {0xC52A4B96C51C9962ull, 0xECB878B1169B5EA0ull}, + {0xD940CA8F11FBEACEull, 0xF93A46D616F8D531ull}, + {0x8AC49D0AE5C0CBF5ull, 0x3FFDBF8DF51D7C93ull}, + {0xAC6D279B852D00A8ull, 0x7DCD3A6BA5EBAA46ull}, + {0xF11621BD93F08A56ull, 0x3173C398163DD9D5ull}, + {0x0C4CE250F68CF89Full, 0xB3123CDA411898EDull}, + {0x15AB97ED3D9A51CEull, 0x7CE274479169080Eull}, + {0xCD001E198D4845B8ull, 0xD0D9D98BD8AA2D77ull}, + {0x34F3D617A0493D79ull, 0x7DD304F6397F7E16ull}, + {0x5CB56890A9F4C6B6ull, 0x130829166567304Full}, + {0x30DA6F8B245BD1C0ull, 0x6F828B7E3FD9748Cull}, + {0xE0580349204C12C0ull, 0x93F6DA0CAC5F441Cull}, + {0xF648731BA5073045ull, 0x5FB897114FB65976ull}, + {0x024F8354738A5206ull, 0x509A4918EB7E0991ull}, + {0x06E7B465E8A57C29ull, 0x52415E3A07F5D446ull}, + {0x1984DF66C1434AAAull, 0x16FC1958F9B3E4B9ull}, + {0x111678AFE0C6C36Cull, 0xF958B59DE5A2849Dull}, + {0x773FBC8440FB0490ull, 0xC96ED5D243658536ull}, + {0x91E3DC710BB6C941ull, 0xEA336A0BC1EEACE9ull}, + {0x25CFE3815D7AD9D4ull, 0xF2E94F8C828FC59Eull}, + {0xB9FB38B83CC288F2ull, 0x7479C4C8F850EC04ull}, + {0x1D85D5C525982B8Cull, 0x6E26B1C16F48DBF4ull}, + {0x8A4E55BD6060BDE7ull, 0x2134D599058B3FD0ull}, + {0x2A958FF994778F36ull, 0xE8052D1AE61D6423ull}, + {0x89233AE6BE453233ull, 0x3ACF9C87D7E8C0B9ull}, + {0x4458F5E27EA9C8D5ull, 0x418FB49BCA2A5140ull}, + {0x090301837ED12A68ull, 0x1017F69633C861E6ull}, + {0x330DD84704D49590ull, 0x339DF1AD3A4BA6E4ull}, + {0x569363A663F2C576ull, 0x363B3D95E3C95EF6ull}, + {0xACC8D08586B90737ull, 0x2BA0E8087D4E28E9ull}, + {0x39C27A27C86D9520ull, 0x8DB620A45160932Eull}, + {0x8E6A4AEB671A072Dull, 0x6ED3561A10E47EE6ull}, + {0x0011D765B1BEC74Aull, 0xD80E6E656EDE842Eull}, + {0x2515D62B936AC64Cull, 0xCE088794D7088A7Dull}, + {0x91621552C16E23AFull, 0x264F0094EB23CCEFull}, + {0x1E21880D97263480ull, 0xD8654807D3A31086ull}, + {0x39D76AAF097F432Dull, 0xA517E1E09D074739ull}, + {0x0F17A4F337C65A14ull, 0x2F51215F69F976D4ull}, + {0xA0FB5CDA12895E44ull, 0x568C3DC4D1F13CD1ull}, + {0x93C8FC00D89C46CEull, 0xBAD5DA947E330E69ull}, + {0x817C07501D1A5694ull, 0x584D6EE72CBFAC2Bull}, + {0x91D668AF73F053BFull, 0xF98E647683C1E0EDull}, + {0x5281E1EF6B3CCF8Bull, 0xBC4CC3DF166083D8ull}, + {0xAAD61B6DBEAAEEB9ull, 0xFF969D000C16787Bull}, + {0x4325D84FC0475879ull, 0x14B919BD905F1C2Dull}, + {0x79A176D1AA6BA6D1ull, 0xF1F720C5A53A2B86ull}, + {0x74BD7018022F3EF0ull, 0x3AEA94A8AD5F4BCBull}, + {0x98BB1F7198D4C4F2ull, 0xE0BC0571DE918FC8ull}}; + +// WARNING: HighwayHash is frozen, so the golden values must not change. 
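+// 256-bit golden values: four 64-bit lanes per entry.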
+const HHResult256 kExpected256[kMaxSize + 1] = { + {0xDD44482AC2C874F5ull, 0xD946017313C7351Full, 0xB3AEBECCB98714FFull, + 0x41DA233145751DF4ull}, + {0xEDB941BCE45F8254ull, 0xE20D44EF3DCAC60Full, 0x72651B9BCB324A47ull, + 0x2073624CB275E484ull}, + {0x3FDFF9DF24AFE454ull, 0x11C4BF1A1B0AE873ull, 0x115169CC6922597Aull, + 0x1208F6590D33B42Cull}, + {0x480AA0D70DD1D95Cull, 0x89225E7C6911D1D0ull, 0x8EA8426B8BBB865Aull, + 0xE23DFBC390E1C722ull}, + {0xC9CFC497212BE4DCull, 0xA85F9DF6AFD2929Bull, 0x1FDA9F211DF4109Eull, + 0x07E4277A374D4F9Bull}, + {0xB4B4F566A4DC85B3ull, 0xBF4B63BA5E460142ull, 0x15F48E68CDDC1DE3ull, + 0x0F74587D388085C6ull}, + {0x6445C70A86ADB9B4ull, 0xA99CFB2784B4CEB6ull, 0xDAE29D40A0B2DB13ull, + 0xB6526DF29A9D1170ull}, + {0xD666B1A00987AD81ull, 0xA4F1F838EB8C6D37ull, 0xE9226E07D463E030ull, + 0x5754D67D062C526Cull}, + {0xF1B905B0ED768BC0ull, 0xE6976FF3FCFF3A45ull, 0x4FBE518DD9D09778ull, + 0xD9A0AFEB371E0D33ull}, + {0x80D8E4D70D3C2981ull, 0xF10FBBD16424F1A1ull, 0xCF5C2DBE9D3F0CD1ull, + 0xC0BFE8F701B673F2ull}, + {0xADE48C50E5A262BEull, 0x8E9492B1FDFE38E0ull, 0x0784B74B2FE9B838ull, + 0x0E41D574DB656DCDull}, + {0xA1BE77B9531807CFull, 0xBA97A7DE6A1A9738ull, 0xAF274CEF9C8E261Full, + 0x3E39B935C74CE8E8ull}, + {0x15AD3802E3405857ull, 0x9D11CBDC39E853A0ull, 0x23EA3E993C31B225ull, + 0x6CD9E9E3CAF4212Eull}, + {0x01C96F5EB1D77C36ull, 0xA367F9C1531F95A6ull, 0x1F94A3427CDADCB8ull, + 0x97F1000ABF3BD5D3ull}, + {0x0815E91EEEFF8E41ull, 0x0E0C28FA6E21DF5Dull, 0x4EAD8E62ED095374ull, + 0x3FFD01DA1C9D73E6ull}, + {0xC11905707842602Eull, 0x62C3DB018501B146ull, 0x85F5AD17FA3406C1ull, + 0xC884F87BD4FEC347ull}, + {0xF51AD989A1B6CD1Full, 0xF7F075D62A627BD9ull, 0x7E01D5F579F28A06ull, + 0x1AD415C16A174D9Full}, + {0x19F4CFA82CA4068Eull, 0x3B9D4ABD3A9275B9ull, 0x8000B0DDE9C010C6ull, + 0x8884D50949215613ull}, + {0x126D6C7F81AB9F5Dull, 0x4EDAA3C5097716EEull, 0xAF121573A7DD3E49ull, + 0x9001AC85AA80C32Dull}, + {0x06AABEF9149155FAull, 0xDF864F4144E71C3Dull, 0xFDBABCE860BC64DAull, + 0xDE2BA54792491CB6ull}, + {0xADFC6B4035079FDBull, 0xA087B7328E486E65ull, 0x46D1A9935A4623EAull, + 0xE3895C440D3CEE44ull}, + {0xB5F9D31DEEA3B3DFull, 0x8F3024E20A06E133ull, 0xF24C38C8288FE120ull, + 0x703F1DCF9BD69749ull}, + {0x2B3C0B854794EFE3ull, 0x1C5D3F969BDACEA0ull, 0x81F16AAFA563AC2Eull, + 0x23441C5A79D03075ull}, + {0x418AF8C793FD3762ull, 0xBC6B8E9461D7F924ull, 0x776FF26A2A1A9E78ull, + 0x3AA0B7BFD417CA6Eull}, + {0xCD03EA2AD255A3C1ull, 0x0185FEE5B59C1B2Aull, 0xD1F438D44F9773E4ull, + 0xBE69DD67F83B76E4ull}, + {0xF951A8873887A0FBull, 0x2C7B31D2A548E0AEull, 0x44803838B6186EFAull, + 0xA3C78EC7BE219F72ull}, + {0x958FF151EA0D8C08ull, 0x4B7E8997B4F63488ull, 0xC78E074351C5386Dull, + 0xD95577556F20EEFAull}, + {0x29A917807FB05406ull, 0x3318F884351F578Cull, 0xDD24EA6EF6F6A7FAull, + 0xE74393465E97AEFFull}, + {0x98240880935E6CCBull, 0x1FD0D271B09F97DAull, 0x56E786472700B183ull, + 0x291649F99F747817ull}, + {0x1BD4954F7054C556ull, 0xFFDB2EFF7C596CEBull, 0x7C6AC69A1BAB6B5Bull, + 0x0F037670537FC153ull}, + {0x8825E38897597498ull, 0x647CF6EBAF6332C1ull, 0x552BD903DC28C917ull, + 0x72D7632C00BFC5ABull}, + {0x6880E276601A644Dull, 0xB3728B20B10FB7DAull, 0xD0BD12060610D16Eull, + 0x8AEF14EF33452EF2ull}, + {0xBCE38C9039A1C3FEull, 0x42D56326A3C11289ull, 0xE35595F764FCAEA9ull, + 0xC9B03C6BC9475A99ull}, + {0xF60115CBF034A6E5ull, 0x6C36EA75BFCE46D0ull, 0x3B17C8D382725990ull, + 0x7EDAA2ED11007A35ull}, + {0x1326E959EDF9DEA2ull, 0xC4776801739F720Cull, 0x5169500FD762F62Full, + 0x8A0DD0D90A2529ABull}, + {0x935149D503D442D4ull, 0xFF6BB41302DAD144ull, 0x339CB012CD9D36ECull, + 
0xE61D53619ECC2230ull}, + {0x528BC888AA50B696ull, 0xB8AEECA36084E1FCull, 0xA158151EC0243476ull, + 0x02C14AAD097CEC44ull}, + {0xBED688A72217C327ull, 0x1EE65114F760873Full, 0x3F5C26B37D3002A6ull, + 0xDDF2E895631597B9ull}, + {0xE7DB21CF2B0B51ADull, 0xFAFC6324F4B0AB6Cull, 0xB0857244C22D9C5Bull, + 0xF0AD888D1E05849Cull}, + {0x05519793CD4DCB00ull, 0x3C594A3163067DEBull, 0xAC75081ACF119E34ull, + 0x5AC86297805CB094ull}, + {0x09228D8C22B5779Eull, 0x19644DB2516B7E84ull, 0x2B92C8ABF83141A0ull, + 0x7F785AD725E19391ull}, + {0x59C42E5D46D0A74Bull, 0x5EA53C65CA036064ull, 0x48A9916BB635AEB4ull, + 0xBAE6DF143F54E9D4ull}, + {0x5EB623696D03D0E3ull, 0xD53D78BCB41DA092ull, 0xFE2348DC52F6B10Dull, + 0x64802457632C8C11ull}, + {0x43B61BB2C4B85481ull, 0xC6318C25717E80A1ull, 0x8C4A7F4D6F9C687Dull, + 0xBD0217E035401D7Cull}, + {0x7F51CA5743824C37ull, 0xB04C4D5EB11D703Aull, 0x4D511E1ECBF6F369ull, + 0xD66775EA215456E2ull}, + {0x39B409EEF87E45CCull, 0x52B8E8C459FC79B3ull, 0x44920918D1858C24ull, + 0x80F07B645EEE0149ull}, + {0xCE8694D1BE9AD514ull, 0xBFA19026526836E7ull, 0x1EA4FDF6E4902A7Dull, + 0x380C4458D696E1FEull}, + {0xD189E18BF823A0A4ull, 0x1F3B353BE501A7D7ull, 0xA24F77B4E02E2884ull, + 0x7E94646F74F9180Cull}, + {0xAFF8C635D325EC48ull, 0x2C2E0AA414038D0Bull, 0x4ED37F611A447467ull, + 0x39EC38E33B501489ull}, + {0x2A2BFDAD5F83F197ull, 0x013D3E6EBEF274CCull, 0xE1563C0477726155ull, + 0xF15A8A5DE932037Eull}, + {0xD5D1F91EC8126332ull, 0x10110B9BF9B1FF11ull, 0xA175AB26541C6032ull, + 0x87BADC5728701552ull}, + {0xC7B5A92CD8082884ull, 0xDDA62AB61B2EEEFBull, 0x8F9882ECFEAE732Full, + 0x6B38BD5CC01F4FFBull}, + {0xCF6EF275733D32F0ull, 0xA3F0822DA2BF7D8Bull, 0x304E7435F512406Aull, + 0x0B28E3EFEBB3172Dull}, + {0xE698F80701B2E9DBull, 0x66AE2A819A8A8828ull, 0x14EA9024C9B8F2C9ull, + 0xA7416170523EB5A4ull}, + {0x3A917E87E307EDB7ull, 0x17B4DEDAE34452C1ull, 0xF689F162E711CC70ull, + 0x29CE6BFE789CDD0Eull}, + {0x0EFF3AD8CB155D8Eull, 0x47CD9EAD4C0844A2ull, 0x46C8E40EE6FE21EBull, + 0xDEF3C25DF0340A51ull}, + {0x03FD86E62B82D04Dull, 0x32AB0D600717136Dull, 0x682B0E832B857A89ull, + 0x138CE3F1443739B1ull}, + {0x2F77C754C4D7F902ull, 0x1053E0A9D9ADBFEAull, 0x58E66368544AE70Aull, + 0xC48A829C72DD83CAull}, + {0xF900EB19E466A09Full, 0x31BE9E01A8C7D314ull, 0x3AFEC6B8CA08F471ull, + 0xB8C0EB0F87FFE7FBull}, + {0xDB277D8FBE3C8EFBull, 0x53CE6877E11AA57Bull, 0x719C94D20D9A7E7Dull, + 0xB345B56392453CC9ull}, + {0x37639C3BDBA4F2C9ull, 0x6095E7B336466DC8ull, 0x3A8049791E65B88Aull, + 0x82C988CDE5927CD5ull}, + {0x6B1FB1A714234AE4ull, 0x20562E255BA6467Eull, 0x3E2B892D40F3D675ull, + 0xF40CE3FBE41ED768ull}, + {0x8EE11CB1B287C92Aull, 0x8FC2AAEFF63D266Dull, 0x66643487E6EB9F03ull, + 0x578AA91DE8D56873ull}, + {0xF5B1F8266A3AEB67ull, 0x83B040BE4DEC1ADDull, 0x7FE1C8635B26FBAEull, + 0xF4A3A447DEFED79Full}, + {0x90D8E6FF6AC12475ull, 0x1A422A196EDAC1F2ull, 0x9E3765FE1F8EB002ull, + 0xC1BDD7C4C351CFBEull}}; + +void RunTests() { + // TODO(janwas): detect number of cores. + ThreadPool pool(4); + + TargetBits tested = ~0U; + tested &= VerifyImplementations(kExpected64); + tested &= VerifyImplementations(kExpected128); + tested &= VerifyImplementations(kExpected256); + // Any failure causes immediate exit, so apparently all succeeded. 
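+  // Print one "OK" line per target that was run and verified.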
+  HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) {
+    printf("%10s: OK\n", TargetName(target));
+  });
+
+  tested = ~0U;
+  tested &= VerifyCat<HHResult64>(&pool);
+  tested &= VerifyCat<HHResult128>(&pool);
+  tested &= VerifyCat<HHResult256>(&pool);
+  HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) {
+    printf("%10sCat: OK\n", TargetName(target));
+  });
+}
+
+#ifdef HH_GOOGLETEST
+TEST(HighwayhashTest, OutputMatchesExpectations) { RunTests(); }
+#endif
+
+}  // namespace
+}  // namespace highwayhash
+
+#ifndef HH_GOOGLETEST
+int main(int argc, char* argv[]) {
+  highwayhash::RunTests();
+  return 0;
+}
+#endif
diff --git a/highwayhash/highwayhash/highwayhash_test_avx2.cc b/highwayhash/highwayhash/highwayhash_test_avx2.cc
new file mode 100644
index 000000000..f1efe0b5f
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_test_avx2.cc
@@ -0,0 +1,19 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME AVX2
+#include "highwayhash/highwayhash_test_target.cc"
diff --git a/highwayhash/highwayhash/highwayhash_test_neon.cc b/highwayhash/highwayhash/highwayhash_test_neon.cc
new file mode 100644
index 000000000..df5058829
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_test_neon.cc
@@ -0,0 +1,22 @@
+// Copyright 2017-2019 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME NEON
+// GCC 4.5.4 only defines the former; 5.4 defines both.
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include "highwayhash/highwayhash_test_target.cc"
+#endif
diff --git a/highwayhash/highwayhash/highwayhash_test_portable.cc b/highwayhash/highwayhash/highwayhash_test_portable.cc
new file mode 100644
index 000000000..04930a7e1
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_test_portable.cc
@@ -0,0 +1,19 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#define HH_TARGET_NAME Portable +#include "highwayhash/highwayhash_test_target.cc" diff --git a/highwayhash/highwayhash/highwayhash_test_sse41.cc b/highwayhash/highwayhash/highwayhash_test_sse41.cc new file mode 100644 index 000000000..2d6e83d66 --- /dev/null +++ b/highwayhash/highwayhash/highwayhash_test_sse41.cc @@ -0,0 +1,19 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#define HH_TARGET_NAME SSE41 +#include "highwayhash/highwayhash_test_target.cc" diff --git a/highwayhash/highwayhash/highwayhash_test_target.cc b/highwayhash/highwayhash/highwayhash_test_target.cc new file mode 100644 index 000000000..65afd4e91 --- /dev/null +++ b/highwayhash/highwayhash/highwayhash_test_target.cc @@ -0,0 +1,220 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#include "highwayhash/highwayhash_test_target.h" + +#include "highwayhash/highwayhash.h" + +#ifndef HH_DISABLE_TARGET_SPECIFIC +namespace highwayhash { +namespace { + +void NotifyIfUnequal(const size_t size, const HHResult64& expected, + const HHResult64& actual, const HHNotify notify) { + if (expected != actual) { + (*notify)(TargetName(HH_TARGET), size); + } +} + +// Overload for HHResult128 or HHResult256 (arrays). 
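+// Notifies at most once per call, even if several lanes mismatch.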
+template <size_t kNumLanes>
+void NotifyIfUnequal(const size_t size, const uint64_t (&expected)[kNumLanes],
+                     const uint64_t (&actual)[kNumLanes],
+                     const HHNotify notify) {
+  for (size_t i = 0; i < kNumLanes; ++i) {
+    if (expected[i] != actual[i]) {
+      (*notify)(TargetName(HH_TARGET), size);
+      return;
+    }
+  }
+}
+
+// Shared logic for all HighwayHashTest::operator() overloads.
+template <TargetBits Target, typename Result>
+void TestHighwayHash(HHStateT<Target>* HH_RESTRICT state,
+                     const char* HH_RESTRICT bytes, const size_t size,
+                     const Result* expected, const HHNotify notify) {
+  // TODO(janwas): investigate (length=33)
+#if HH_TARGET == HH_TARGET_Portable && HH_GCC_VERSION && !HH_CLANG_VERSION
+  return;
+#endif
+  Result actual;
+  HighwayHashT(state, bytes, size, &actual);
+  NotifyIfUnequal(size, *expected, actual, notify);
+}
+
+// Shared logic for all HighwayHashCatTest::operator() overloads.
+template <TargetBits Target, typename Result>
+void TestHighwayHashCat(const HHKey& key, const char* HH_RESTRICT bytes,
+                        const size_t size, const Result* expected,
+                        const HHNotify notify) {
+  // TODO(janwas): investigate (length=33)
+#if HH_TARGET == HH_TARGET_Portable && HH_GCC_VERSION && !HH_CLANG_VERSION
+  return;
+#endif
+
+  // Slightly faster to compute the expected prefix hashes only once.
+  // Use new instead of vector to avoid headers with inline functions.
+  Result* results = new Result[size + 1];
+  for (size_t i = 0; i <= size; ++i) {
+    HHStateT<Target> state_flat(key);
+    HighwayHashT(&state_flat, bytes, i, &results[i]);
+  }
+
+  // Splitting into three fragments/Append should cover all codepaths.
+  const size_t max_fragment_size = size / 3;
+  for (size_t size1 = 0; size1 < max_fragment_size; ++size1) {
+    for (size_t size2 = 0; size2 < max_fragment_size; ++size2) {
+      for (size_t size3 = 0; size3 < max_fragment_size; ++size3) {
+        HighwayHashCatT<Target> cat(key);
+        const char* pos = bytes;
+        cat.Append(pos, size1);
+        pos += size1;
+        cat.Append(pos, size2);
+        pos += size2;
+        cat.Append(pos, size3);
+        pos += size3;
+
+        Result result_cat;
+        cat.Finalize(&result_cat);
+
+        const size_t total_size = pos - bytes;
+        NotifyIfUnequal(total_size, results[total_size], result_cat, notify);
+      }
+    }
+  }
+
+  delete[] results;
+}
+
+}  // namespace
+
+template <TargetBits Target>
+void HighwayHashTest<Target>::operator()(const HHKey& key,
+                                         const char* HH_RESTRICT bytes,
+                                         const size_t size,
+                                         const HHResult64* expected,
+                                         const HHNotify notify) const {
+  HHStateT<Target> state(key);
+  TestHighwayHash(&state, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashTest<Target>::operator()(const HHKey& key,
+                                         const char* HH_RESTRICT bytes,
+                                         const size_t size,
+                                         const HHResult128* expected,
+                                         const HHNotify notify) const {
+  HHStateT<Target> state(key);
+  TestHighwayHash(&state, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashTest<Target>::operator()(const HHKey& key,
+                                         const char* HH_RESTRICT bytes,
+                                         const size_t size,
+                                         const HHResult256* expected,
+                                         const HHNotify notify) const {
+  HHStateT<Target> state(key);
+  TestHighwayHash(&state, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashCatTest<Target>::operator()(const HHKey& key,
+                                            const char* HH_RESTRICT bytes,
+                                            const uint64_t size,
+                                            const HHResult64* expected,
+                                            const HHNotify notify) const {
+  TestHighwayHashCat<Target>(key, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashCatTest<Target>::operator()(const HHKey& key,
+                                            const char* HH_RESTRICT bytes,
+                                            const uint64_t size,
+                                            const HHResult128* expected,
+                                            const HHNotify notify) const {
+  TestHighwayHashCat<Target>(key, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashCatTest<Target>::operator()(const HHKey& key,
+                                            const char* HH_RESTRICT bytes,
+                                            const uint64_t size,
+                                            const HHResult256* expected,
+                                            const HHNotify notify) const {
+  TestHighwayHashCat<Target>(key, bytes, size, expected, notify);
+}
+
+// Instantiate for the current target.
+template struct HighwayHashTest<HH_TARGET>;
+template struct HighwayHashCatTest<HH_TARGET>;
+
+//-----------------------------------------------------------------------------
+// benchmark
+
+namespace {
+
+template <TargetBits Target>
+uint64_t RunHighway(const void*, const size_t size) {
+  HH_ALIGNAS(32) static const HHKey key = {0, 1, 2, 3};
+  char in[kMaxBenchmarkInputSize];
+  in[0] = static_cast<char>(size & 0xFF);
+  HHResult64 result;
+  HHStateT<Target> state(key);
+  HighwayHashT(&state, in, size, &result);
+  return result;
+}
+
+template <TargetBits Target>
+uint64_t RunHighwayCat(const void*, const size_t size) {
+  HH_ALIGNAS(32) static const HHKey key = {0, 1, 2, 3};
+  HH_ALIGNAS(64) HighwayHashCatT<Target> cat(key);
+  char in[kMaxBenchmarkInputSize];
+  in[0] = static_cast<char>(size & 0xFF);
+  const size_t half_size = size / 2;
+  cat.Append(in, half_size);
+  cat.Append(in + half_size, size - half_size);
+  HHResult64 result;
+  cat.Finalize(&result);
+  return result;
+}
+
+}  // namespace
+
+template <TargetBits Target>
+void HighwayHashBenchmark<Target>::operator()(DurationsForInputs* input_map,
+                                              NotifyBenchmark notify,
+                                              void* context) const {
+  MeasureDurations(&RunHighway<Target>, input_map);
+  notify("HighwayHash", TargetName(Target), input_map, context);
+}
+
+template <TargetBits Target>
+void HighwayHashCatBenchmark<Target>::operator()(DurationsForInputs* input_map,
+                                                 NotifyBenchmark notify,
+                                                 void* context) const {
+  MeasureDurations(&RunHighwayCat<Target>, input_map);
+  notify("HighwayHashCat", TargetName(Target), input_map, context);
+}
+
+// Instantiate for the current target.
+template struct HighwayHashBenchmark<HH_TARGET>;
+template struct HighwayHashCatBenchmark<HH_TARGET>;
+
+}  // namespace highwayhash
+#endif  // HH_DISABLE_TARGET_SPECIFIC
diff --git a/highwayhash/highwayhash/highwayhash_test_target.h b/highwayhash/highwayhash/highwayhash_test_target.h
new file mode 100644
index 000000000..56ae960ba
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_test_target.h
@@ -0,0 +1,90 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HIGHWAYHASH_TEST_TARGET_H_
+#define HIGHWAYHASH_HIGHWAYHASH_TEST_TARGET_H_
+
+// Tests called by InstructionSets::RunAll, so we can verify all
+// implementations supported by the current CPU.
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stddef.h>
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/highwayhash.h"
+#include "highwayhash/nanobenchmark.h"
+
+namespace highwayhash {
+
+// Verifies the hash result matches "expected" and calls "notify" if not.
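+// Explicitly instantiated in highwayhash_test_target.cc, which is compiled
+// once per target via the highwayhash_test_*.cc wrappers.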
+template <TargetBits Target>
+struct HighwayHashTest {
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const size_t size, const HHResult64* expected,
+                  const HHNotify notify) const;
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const size_t size, const HHResult128* expected,
+                  const HHNotify notify) const;
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const size_t size, const HHResult256* expected,
+                  const HHNotify notify) const;
+};
+
+// For every possible partition of "bytes" into zero to three fragments,
+// verifies HighwayHashCat returns the same result as HighwayHashT of the
+// concatenated fragments, and calls "notify" if not. The value of "expected"
+// is ignored; it is only used for overloading.
+template <TargetBits Target>
+struct HighwayHashCatTest {
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const uint64_t size, const HHResult64* expected,
+                  const HHNotify notify) const;
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const uint64_t size, const HHResult128* expected,
+                  const HHNotify notify) const;
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const uint64_t size, const HHResult256* expected,
+                  const HHNotify notify) const;
+};
+
+// Called by benchmark with prefix, target_name, input_map, context.
+// This function must set input_map->num_items to 0.
+using NotifyBenchmark = void (*)(const char*, const char*, DurationsForInputs*,
+                                 void*);
+
+constexpr size_t kMaxBenchmarkInputSize = 1024;
+
+// Calls "notify" with benchmark results for the input sizes specified by
+// "input_map" (<= kMaxBenchmarkInputSize) plus a "context" parameter.
+template <TargetBits Target>
+struct HighwayHashBenchmark {
+  void operator()(DurationsForInputs* input_map, NotifyBenchmark notify,
+                  void* context) const;
+};
+
+template <TargetBits Target>
+struct HighwayHashCatBenchmark {
+  void operator()(DurationsForInputs* input_map, NotifyBenchmark notify,
+                  void* context) const;
+};
+
+}  // namespace highwayhash
+
+#endif  // HIGHWAYHASH_HIGHWAYHASH_TEST_TARGET_H_
diff --git a/highwayhash/highwayhash/highwayhash_test_vsx.cc b/highwayhash/highwayhash/highwayhash_test_vsx.cc
new file mode 100644
index 000000000..224a65efe
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_test_vsx.cc
@@ -0,0 +1,22 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME VSX
+
+#ifdef __VSX__
+#include "highwayhash/highwayhash_test_target.cc"
+#endif
diff --git a/highwayhash/highwayhash/iaca.h b/highwayhash/highwayhash/iaca.h
new file mode 100644
index 000000000..80e1013ae
--- /dev/null
+++ b/highwayhash/highwayhash/iaca.h
@@ -0,0 +1,63 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_IACA_H_
+#define HIGHWAYHASH_IACA_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/compiler_specific.h"
+
+// IACA (Intel's Code Analyzer, go/intel-iaca) analyzes instruction latencies,
+// but only for code between special markers. These functions embed such
+// markers in an executable, but only for reading via IACA - they deliberately
+// trigger a crash if executed to ensure they are removed in normal builds.
+
+// Default off; callers must `#define HH_ENABLE_IACA 1` before including this.
+#ifndef HH_ENABLE_IACA
+#define HH_ENABLE_IACA 0
+#endif
+
+namespace highwayhash {
+
+#if HH_ENABLE_IACA && (HH_GCC_VERSION || HH_CLANG_VERSION)
+
+// Call before the region of interest. Fences hopefully prevent reordering.
+static HH_INLINE void BeginIACA() {
+  HH_COMPILER_FENCE;
+  asm volatile(
+      ".byte 0x0F, 0x0B\n\t"  // UD2
+      "movl $111, %ebx\n\t"
+      ".byte 0x64, 0x67, 0x90\n\t");
+  HH_COMPILER_FENCE;
+}
+
+// Call after the region of interest. Fences hopefully prevent reordering.
+static HH_INLINE void EndIACA() {
+  HH_COMPILER_FENCE;
+  asm volatile(
+      "movl $222, %ebx\n\t"
+      ".byte 0x64, 0x67, 0x90\n\t"
+      ".byte 0x0F, 0x0B\n\t");  // UD2
+  HH_COMPILER_FENCE;
+}
+
+#endif
+
+}  // namespace highwayhash
+
+#endif  // HIGHWAYHASH_IACA_H_
diff --git a/highwayhash/highwayhash/instruction_sets.cc b/highwayhash/highwayhash/instruction_sets.cc
new file mode 100644
index 000000000..ab6775b10
--- /dev/null
+++ b/highwayhash/highwayhash/instruction_sets.cc
@@ -0,0 +1,144 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "highwayhash/instruction_sets.h"
+#include "highwayhash/arch_specific.h"
+
+// Currently there are only specialized targets for X64; other architectures
+// only use HH_TARGET_Portable, in which case Supported() just returns that.
+#if HH_ARCH_X64
+
+#include <atomic>
+
+namespace highwayhash {
+
+namespace {
+
+bool IsBitSet(const uint32_t reg, const int index) {
+  return (reg & (1U << index)) != 0;
+}
+
+// Returns the lower 32 bits of extended control register 0.
+// Requires CPU support for "OSXSAVE" (see below).
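+// (The raw .byte sequence below encodes XGETBV, for assemblers that lack the
+// mnemonic.)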
+uint32_t ReadXCR0() {
+#if HH_MSC_VERSION
+  return static_cast<uint32_t>(_xgetbv(0));
+#else
+  uint32_t xcr0, xcr0_high;
+  const uint32_t index = 0;
+  asm volatile(".byte 0x0F, 0x01, 0xD0"
+               : "=a"(xcr0), "=d"(xcr0_high)
+               : "c"(index));
+  return xcr0;
+#endif
+}
+
+// 0 iff not yet initialized by Supported().
+// Not function-local => no compiler-generated locking.
+std::atomic<TargetBits> supported_{0};
+
+// Bits indicating which instruction set extensions are supported.
+enum {
+  kBitSSE = 1 << 0,
+  kBitSSE2 = 1 << 1,
+  kBitSSE3 = 1 << 2,
+  kBitSSSE3 = 1 << 3,
+  kBitSSE41 = 1 << 4,
+  kBitSSE42 = 1 << 5,
+  kBitAVX = 1 << 6,
+  kBitAVX2 = 1 << 7,
+  kBitFMA = 1 << 8,
+  kBitLZCNT = 1 << 9,
+  kBitBMI = 1 << 10,
+  kBitBMI2 = 1 << 11,
+
+  kGroupAVX2 = kBitAVX | kBitAVX2 | kBitFMA | kBitLZCNT | kBitBMI | kBitBMI2,
+  kGroupSSE41 = kBitSSE | kBitSSE2 | kBitSSE3 | kBitSSSE3 | kBitSSE41
+};
+
+}  // namespace
+
+TargetBits InstructionSets::Supported() {
+  TargetBits supported = supported_.load(std::memory_order_acquire);
+  // Already initialized, return that.
+  if (HH_LIKELY(supported)) {
+    return supported;
+  }
+
+  uint32_t flags = 0;
+  uint32_t abcd[4];
+
+  Cpuid(0, 0, abcd);
+  const uint32_t max_level = abcd[0];
+
+  // Standard feature flags
+  Cpuid(1, 0, abcd);
+  flags |= IsBitSet(abcd[3], 25) ? kBitSSE : 0;
+  flags |= IsBitSet(abcd[3], 26) ? kBitSSE2 : 0;
+  flags |= IsBitSet(abcd[2], 0) ? kBitSSE3 : 0;
+  flags |= IsBitSet(abcd[2], 9) ? kBitSSSE3 : 0;
+  flags |= IsBitSet(abcd[2], 19) ? kBitSSE41 : 0;
+  flags |= IsBitSet(abcd[2], 20) ? kBitSSE42 : 0;
+  flags |= IsBitSet(abcd[2], 12) ? kBitFMA : 0;
+  flags |= IsBitSet(abcd[2], 28) ? kBitAVX : 0;
+  const bool has_xsave = IsBitSet(abcd[2], 26);
+  const bool has_osxsave = IsBitSet(abcd[2], 27);
+
+  // Extended feature flags
+  Cpuid(0x80000001U, 0, abcd);
+  flags |= IsBitSet(abcd[2], 5) ? kBitLZCNT : 0;
+
+  // Extended features
+  if (max_level >= 7) {
+    Cpuid(7, 0, abcd);
+    flags |= IsBitSet(abcd[1], 3) ? kBitBMI : 0;
+    flags |= IsBitSet(abcd[1], 5) ? kBitAVX2 : 0;
+    flags |= IsBitSet(abcd[1], 8) ? kBitBMI2 : 0;
+  }
+
+  // Verify OS support for XSAVE, without which XMM/YMM registers are not
+  // preserved across context switches and are not safe to use.
+  if (has_xsave && has_osxsave) {
+    const uint32_t xcr0 = ReadXCR0();
+    // XMM/YMM
+    if ((xcr0 & 2) == 0 || (xcr0 & 4) == 0) {
+      flags &= ~(kBitAVX | kBitAVX2);
+    }
+  } else {
+    // Clear the AVX/AVX2 bits if the CPU or OS does not support XSAVE.
+    //
+    // The lower 128 bits of XMM0-XMM15 are guaranteed to be preserved across
+    // context switches on x86_64 and any modern 32-bit system, so only AVX2
+    // needs to be disabled.
+    flags &= ~(kBitAVX | kBitAVX2);
+  }
+
+  // Also indicates "supported" has been initialized.
+  supported = HH_TARGET_Portable;
+
+  // Set target bit(s) if all of the group's flags are set.
+  if ((flags & kGroupAVX2) == kGroupAVX2) {
+    supported |= HH_TARGET_AVX2;
+  }
+  if ((flags & kGroupSSE41) == kGroupSSE41) {
+    supported |= HH_TARGET_SSE41;
+  }
+
+  supported_.store(supported, std::memory_order_release);
+  return supported;
+}
+
+}  // namespace highwayhash
+
+#endif  // HH_ARCH_X64
diff --git a/highwayhash/highwayhash/instruction_sets.h b/highwayhash/highwayhash/instruction_sets.h
new file mode 100644
index 000000000..aa7bd6b3f
--- /dev/null
+++ b/highwayhash/highwayhash/instruction_sets.h
@@ -0,0 +1,118 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_INSTRUCTION_SETS_H_
+#define HIGHWAYHASH_INSTRUCTION_SETS_H_
+
+// Calls the best specialization of a template supported by the current CPU.
+//
+// Usage: for each dispatch site, declare a Functor template with a 'Target'
+// argument, add a source file defining its operator() and instantiating
+// Functor<HH_TARGET>, add a cc_library_for_targets rule for that source file,
+// and call InstructionSets::Run<Functor>(/*args*/).
+
+#include <utility>  // std::forward
+
+#include "highwayhash/arch_specific.h"  // HH_TARGET_*
+#include "highwayhash/compiler_specific.h"
+
+namespace highwayhash {
+
+// Detects TargetBits and calls specializations of a user-defined functor.
+class InstructionSets {
+ public:
+// Returns bit array of HH_TARGET_* supported by the current CPU.
+// The HH_TARGET_Portable bit is guaranteed to be set.
+#if HH_ARCH_X64
+  static TargetBits Supported();
+#elif HH_ARCH_PPC
+  static HH_INLINE TargetBits Supported() {
+    return HH_TARGET_VSX | HH_TARGET_Portable;
+  }
+#elif HH_ARCH_NEON
+  static HH_INLINE TargetBits Supported() {
+    return HH_TARGET_NEON | HH_TARGET_Portable;
+  }
+#else
+  static HH_INLINE TargetBits Supported() { return HH_TARGET_Portable; }
+#endif
+
+  // Chooses the best available "Target" for the current CPU, runs the
+  // corresponding Func<Target>::operator()(args) and returns that Target
+  // (a single bit). The overhead of dispatching is low, about 4 cycles, but
+  // this should only be called infrequently (e.g. hoisting it out of loops).
+  template <template <TargetBits> class Func, typename... Args>