diff --git a/.gitignore b/.gitignore index d15cb8096..75eb4b16a 100644 --- a/.gitignore +++ b/.gitignore @@ -57,7 +57,9 @@ history /config.status /configure /doc/asy-latex.i* +/doc/asy-latex.hd /doc/asy.1 +/doc/asymptote_html/ /glrender.d.54461 /gsl.symbols.h /keywords.h @@ -66,6 +68,10 @@ history /types.symbols.h *.dSYM .DS_Store +/errors.temp +/base/webgl/asygl.js +/v3dheadertypes.py +/v3dtypes.py ### TeX-related ## Core latex/pdflatex auxiliary files: @@ -95,6 +101,9 @@ history /doc/**/asymptote.* !/doc/asymptote.texi /doc/options +/doc/latexusage-?.asy +/doc/latexusage-?.tex +/doc/latexusage-*.pbsdat .asy_* ## Bibliography auxiliary files (bibtex/biblatex/biber): diff --git a/CMakeLists.txt b/CMakeLists.txt index 0cee80ae5..ddc3fd6d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,21 @@ cmake_minimum_required(VERSION 3.27) +if (NOT CMAKE_HOST_SYSTEM_PROCESSOR) + if (WIN32) + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64") + else() + set(CMAKE_HOST_SYSTEM_PROCESSOR "i386") + endif() + else() + execute_process(COMMAND uname -m OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_PROCESSOR OUTPUT_STRIP_TRAILING_WHITESPACE) + endif() +endif() +message(STATUS "Host processor: ${CMAKE_HOST_SYSTEM_PROCESSOR}") +if (NOT CMAKE_SYSTEM_PROCESSOR) + set(CMAKE_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}) +endif() + include(cmake-scripts/common.cmake) include(asy-pkg-version-suffix.cmake OPTIONAL RESULT_VARIABLE ASY_ADDR_VERSION_SUFFIX_FILE) include(cmake-scripts/options.cmake) diff --git a/Makefile.in b/Makefile.in index b572a0ec5..f1e56def5 100644 --- a/Makefile.in +++ b/Makefile.in @@ -11,8 +11,8 @@ GCOPTIONS = @GCOPTIONS@ GCLIB = @GCLIB@ GCPPLIB = @GCPPLIB@ GCLIBS = $(GCPPLIB) $(GCLIB) -LFLAGS = @LDFLAGS@ -LIBS = $(LFLAGS) @PTHREAD_LIBS@ @GLEW@ @LIBS@ $(GCLIBS) @LSPLIBS@ +LFLAGS = @LDFLAGS@ -Lhighwayhash/lib +LIBS = $(LFLAGS) @PTHREAD_LIBS@ @GLEW@ @LIBS@ $(GCLIBS) @LSPLIBS@ -lhighwayhash CXX_STANDARD = @CXX_STANDARD@ DOSLIBS = $(LIBS) -ltermcap -lwolfssl -lgdi32 -lwinmm -s -static LSP_BUILD_ROOT=@LSP_BUILD_ROOT@ @@ -29,7 +29,7 @@ PYUIC ?= pyuic5 # We have to remove OpenGL, threading, GC, etc from this. SHAREDLIBS = $(filter-out -lglut -GL -pthread $(GCLIBS), $(LIBS)) -vpath %.cc prc +vpath %.cc prc highwayhash vpath %.cc thirdparty_impl/tinyexr_impl/src vpath %.ui GUI/windows vpath %.py GUI/pyUIClass @@ -59,7 +59,8 @@ COREFILES = $(CAMP) $(SYMBOL_FILES) $(PRC) $(TINYEXR_FILES) \ Delaunay predicates glrender tr shaders jsfile v3dfile \ EXRFiles GLTextures lspserv symbolmaps win32helpers win32pipestream \ win32xdr xstream \ - lspdec lspexp lspfundec lspstm + lspdec lspexp lspfundec lspstm \ + hashing FILES = $(COREFILES) main @@ -93,7 +94,7 @@ DEFS = @DEFS@ @OPTIONS@ @PTHREAD_CFLAGS@ -DFFTWPP_SINGLE_THREAD -Wall -I. CFLAGS = @CFLAGS@ OPTS = $(DEFS) @CPPFLAGS@ @CXXFLAGS@ $(CFLAGS) \ -Ibackports/optional/include \ - -Iprc/include -I$(LSP_ROOT)/include + -Iprc/include -I$(LSP_ROOT)/include -Ihighwayhash GLEWOPTS = $(DEFS) @CPPFLAGS@ $(CFLAGS) -DGLEW_NO_GLU -DGLEW_BUILD -O1 -fPIC # Options for compiling the object files for the shared library. 
@@ -136,7 +137,7 @@ endif export prefix docdir exampledir mandir infodir INSTALL MAKE DESTDIR TEXI2DVI -asy: base/version.asy $(FILES:=.o) $(XNAME) revision.o @LSPLIB@ @GLEW@ +asy: base/version.asy $(FILES:=.o) $(XNAME) revision.o @LSPLIB@ @GLEW@ libhighwayhash.a $(CXX) $(OPTS) -o $(NAME) $(FILES:=.o) revision.o $(LIBS) $(XNAME): $(PYFILES) @@ -175,6 +176,9 @@ $(LSP_BUILD_ROOT)/liblspcpp.a: @LSP_CMAKE_OPTIONS@ $(MAKE) -C $(LSP_BUILD_ROOT) +libhighwayhash.a: + cd highwayhash && $(MAKE) + all: asy sty man faq asy-keywords.el $(GCLIB): @@ -372,6 +376,7 @@ clean: FORCE -cd LspCpp && $(MAKE) distclean -cd LspCpp && rm -rf liblspcpp.a Makefile CMakeFiles third_party/uri/src/CMakeFiles -cd tinyexr && $(MAKE) clean + -cd highwayhash && $(MAKE) clean gc-clean: FORCE clean -$(MAKE) -C $(GC) clean @@ -394,6 +399,8 @@ cleaner: FORCE clean distclean: FORCE cleaner cd doc && $(MAKE) distclean + cd highwayhash && $(MAKE) distclean + cleanest: FORCE maintainer-clean maintainer-clean: FORCE distclean diff --git a/base/collections/enumerate.asy b/base/collections/enumerate.asy new file mode 100644 index 000000000..d73f33465 --- /dev/null +++ b/base/collections/enumerate.asy @@ -0,0 +1,46 @@ +typedef import(T); + +from collections.iter(T=T) access Iter_T, Iterable_T; +from collections.genericpair(K=int, V=T) access + Pair_K_V as Pair_int_T, + makePair; +from collections.iter(T=Pair_int_T) access + Iter_T as Iter_Pair_int_T, + Iterable_T as Iterable_Pair_int_T, + Iterable; + +Iterable_Pair_int_T enumerate(Iterable_T iterable) { + Iter_Pair_int_T iter() { + int i = 0; + Iter_T it = iterable.operator iter(); + Iter_Pair_int_T result; + result.valid = it.valid; + result.get = new Pair_int_T() { + return makePair(i, it.get()); + }; + result.advance = new void() { + ++i; + it.advance(); + }; + return result; + } + return Iterable(iter); +} + +Iterable_Pair_int_T enumerate(T[] array) { + Iter_Pair_int_T iter() { + int i = 0; + Iter_Pair_int_T result; + result.valid = new bool() { + return i < array.length; + }; + result.get = new Pair_int_T() { + return makePair(i, array[i]); + }; + result.advance = new void() { + ++i; + }; + return result; + } + return Iterable(iter); +} \ No newline at end of file diff --git a/base/collections/genericpair.asy b/base/collections/genericpair.asy new file mode 100644 index 000000000..55e861d05 --- /dev/null +++ b/base/collections/genericpair.asy @@ -0,0 +1,20 @@ +typedef import(K, V); + +struct Pair_K_V { + restricted K k; + restricted V v; + void operator init(K k, V v) { + this.k = k; + this.v = v; + } + autounravel bool operator ==(Pair_K_V a, Pair_K_V b) { + // NOTE: This won't compile if K or V is an array type since == is + // vectorized for arrays. We could locally define a cast operator from + // bool[] to bool, but that would not behave as expected if comparing two + // arrays of different lengths. (We would get an error instead of false.) + return a.k == b.k && a.v == b.v; + } + int hash(); // To be overridden by the user. 
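+  // For example (as collections/hashmap.asy does), a client that keys on K
+  // can simply forward the key's own hash after construction:
+  //   Pair_K_V pair = makePair(k, v);
+  //   pair.hash = k.hash;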
+}
+
+Pair_K_V makePair(K k, V v) = Pair_K_V;
\ No newline at end of file
diff --git a/base/collections/hashmap.asy b/base/collections/hashmap.asy
new file mode 100644
index 000000000..989ca0a79
--- /dev/null
+++ b/base/collections/hashmap.asy
@@ -0,0 +1,90 @@
+typedef import(K, V);
+
+from collections.map(K=K, V=V) access Map_K_V, Iter_K, Iter_K_V, Iterable_K;
+from collections.genericpair(K=K, V=V) access Pair_K_V, makePair;
+from collections.hashrepset(T=Pair_K_V) access
+    HashRepSet_T as HashRepSet_K_V;
+
+private Pair_K_V operator tuple(K k, V v) {
+  Pair_K_V pair = makePair(k, v);
+  pair.hash = k.hash;
+  return pair;
+}
+
+struct HashMap_K_V {
+  struct _ { autounravel restricted Map_K_V map; }
+
+  private HashRepSet_K_V pairs = HashRepSet_K_V(
+      nullT=null,
+      equiv = new bool(Pair_K_V a, Pair_K_V b) {
+        // NOTE: This should never be called on a null pair.
+        return a.k == b.k;
+      },
+      isNullT = new bool(Pair_K_V kv) { return alias(kv, null); }
+  );
+
+  void operator init() {
+    using F = void();
+    ((F)map.operator init)();
+  }
+
+  // TODO: Unify the "isNullValue"/"nullValue" naming here with
+  // "isNullT"/"nullT" in repset, and similar.
+
+  void operator init(V nullValue, bool isNullValue(V) = null) {
+    using F = void(V, bool isNullValue(V)=null);  // The default value here is ignored.
+    if (isNullValue == null) {
+      ((F)map.operator init)(nullValue);  // Let operator init supply its own default.
+    } else {
+      ((F)map.operator init)(nullValue, isNullValue);
+    }
+  }
+
+  map.size = pairs.size;
+
+  map.contains = new bool(K key) {
+    return pairs.contains((key, map.nullValue));
+  };
+
+  map.operator[] = new V(K key) {
+    Pair_K_V pair = pairs.get((key, map.nullValue));
+    if (!alias(pair, null)) {
+      return pair.v;
+    }
+    assert(map.isNullValue != null, 'Key not found in map');
+    return map.nullValue;
+  };
+
+  map.operator [=] = new void(K key, V value) {
+    if (map.isNullValue != null && map.isNullValue(value)) {
+      pairs.delete((key, value));
+    } else {
+      pairs.update((key, value));
+    }
+  };
+
+  map.delete = new void(K key) {
+    Pair_K_V removed = pairs.delete((key, map.nullValue));
+    assert(!alias(removed, null), 'Nonexistent key cannot be deleted');
+  };
+
+  map.operator iter = new Iter_K() {
+    Iter_K_V it = pairs.operator iter();
+    Iter_K result;
+    result.valid = it.valid;
+    result.advance = it.advance;
+    result.get = new K() { return it.get().k; };
+    return result;
+  };
+
+  autounravel Iterable_K operator cast(HashMap_K_V map) {
+    return Iterable_K(map.map.operator iter);
+  }
+  autounravel K[] operator ecast(HashMap_K_V map) {
+    return (K[])(Iterable_K)map;
+  }
+  autounravel Map_K_V operator cast(HashMap_K_V map) {
+    return map.map;
+  }
+
+  unravel map;
+}
\ No newline at end of file
diff --git a/base/collections/hashrepset.asy b/base/collections/hashrepset.asy
new file mode 100644
index 000000000..949b8d53d
--- /dev/null
+++ b/base/collections/hashrepset.asy
@@ -0,0 +1,259 @@
+typedef import(T);
+
+from collections.repset(T=T) access Iter_T, Iterable_T, RepSet_T;
+
+private struct HashEntry {
+  T item;
+  int hash = -1;
+  HashEntry newer = null;
+  HashEntry older = null;
+}
+
+struct HashRepSet_T {
+  struct _ { autounravel restricted RepSet_T super; }
+  from super unravel nullT, equiv, isNullT;
+
+  // These fields are mutable.
+  private HashEntry[] buckets = array(16, (HashEntry)null);
+  buckets.cyclic = true;
+  private int size = 0;
+  private int zombies = 0;
+  private int numChanges = 0;  // Detect concurrent modification.
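+  // Entries are also threaded into a doubly linked list (newest/oldest below)
+  // in insertion order; operator iter walks this list from oldest to newest.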
+  HashEntry newest = null;
+  HashEntry oldest = null;
+
+  void operator init() {
+    using F = void();
+    ((F)super.operator init)();
+  }
+  void operator init(T nullT,
+                     bool equiv(T a, T b) = operator ==,
+                     bool isNullT(T) = new bool(T t) { return equiv(t, nullT); }) {
+    typedef void F(T, bool equiv(T, T), bool isNullT(T));
+    ((F)super.operator init)(nullT, equiv, isNullT);
+  }
+
+  RepSet_T newEmpty() {
+    return HashRepSet_T(nullT, equiv, isNullT).super;
+  }
+
+  super.size = new int() {
+    return size;
+  };
+
+  super.contains = new bool(T item) {
+    int bucket = item.hash();
+    for (int i = 0; i < buckets.length; ++i) {
+      HashEntry entry = buckets[bucket + i];
+      if (entry == null) {
+        return false;
+      }
+      if (entry.hash == bucket && equiv(entry.item, item)) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  super.get = new T(T item) {
+    int bucket = item.hash();
+    for (int i = 0; i < buckets.length; ++i) {
+      HashEntry entry = buckets[bucket + i];
+      if (entry == null) {
+        return super.nullT;
+      }
+      if (entry.hash == bucket && equiv(entry.item, item)) {
+        return entry.item;
+      }
+    }
+    assert(isNullT != null, 'Item is not present.');
+    return super.nullT;
+  };
+
+  super.operator iter = new Iter_T() {
+    Iter_T result = new Iter_T;
+    HashEntry current = oldest;
+    int expectedChanges = numChanges;
+    result.valid = new bool() {
+      assert(numChanges == expectedChanges, 'Concurrent modification');
+      return current != null;
+    };
+    result.get = new T() {
+      assert(numChanges == expectedChanges, 'Concurrent modification');
+      assert(result.valid());
+      return current.item;
+    };
+    result.advance = new void() {
+      assert(numChanges == expectedChanges, 'Concurrent modification');
+      assert(result.valid());
+      current = current.newer;
+    };
+    return result;
+  };
+
+  private void changeCapacity() {
+    ++numChanges;
+    int newCapacity = (zombies > size ? buckets.length : 2 * buckets.length);
+    zombies = 0;
+    buckets = array(newCapacity, (HashEntry)null);
+    buckets.cyclic = true;
+    for (HashEntry current = oldest; current != null; current = current.newer) {
+      int bucket = current.hash;
+      for (int i = 0; i < buckets.length; ++i) {
+        if (buckets[bucket + i] == null) {
+          buckets[bucket + i] = current;
+          break;
+        }
+        assert(i < buckets.length - 1, 'No space in hash table; ' +
+               'is the linked list circular?');
+      }
+    }
+  }
+
+  // Returns an int as follows (note: "index" is modulo buckets.length):
+  // * If an equivalent item is in the set, returns its index.
+  // * Otherwise, if at least one bucket is empty, returns the index of the
+  //   empty bucket in which the item should be placed if added.
+  // * Otherwise, returns -1.
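+  // For example, with buckets.length == 16 and hash == 13, the probe visits
+  // cyclic indices 13, 14, 15, 0, 1, ... and stops at the first equivalent
+  // item or empty bucket, returning -1 only if it wraps all the way around.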
+ private int find(T item, int hash) { + for (int i = 0; i < buckets.length; ++i) { + int index = hash + i; + HashEntry entry = buckets[index]; + if (entry == null) { + return index; + } + if (entry.hash == hash && equiv(entry.item, item)) { + return index; + } + } + return -1; + } + + super.add = new bool(T item) { + ++numChanges; + if (isNullT != null && isNullT(item)) { + return false; + } + if (2 * (size + zombies) >= buckets.length) { + changeCapacity(); + } + int bucket = item.hash(); + int index = find(item, bucket); + if (index == -1) { + changeCapacity(); + index = find(item, bucket); + assert(index != -1, 'No space in hash table'); + } + HashEntry entry = buckets[index]; + if (entry != null) { + return false; + } + + ++numChanges; + if (2 * (size + zombies) >= buckets.length) { + changeCapacity(); + index = find(item, bucket); + assert(index != -1); + assert(buckets[index] == null); + } + entry = buckets[index] = new HashEntry; + entry.item = item; + entry.hash = bucket; + entry.older = newest; + if (newest != null) { + newest.newer = entry; + } + newest = entry; + if (oldest == null) { + oldest = entry; + } + ++size; + return true; + }; + + super.update = new T(T item) { + if (isNullT != null && isNullT(item)) { + return nullT; + } + int bucket = item.hash(); + int index = find(item, bucket); + if (index == -1) { + changeCapacity(); + index = find(item, bucket); + assert(index != -1, 'No space in hash table'); + } + HashEntry entry = buckets[index]; + if (entry != null) { + T result = entry.item; + entry.item = item; + return result; + } + ++numChanges; + if (2 * (size + zombies) >= buckets.length) { + changeCapacity(); + index = find(item, bucket); + assert(index != -1); + assert(buckets[index] == null); + } + entry = buckets[index] = new HashEntry; + assert(isNullT != null, + 'Adding item via update() without defining nullT.'); + entry.item = item; + entry.hash = bucket; + entry.older = newest; + if (newest != null) { + newest.newer = entry; + } + newest = entry; + if (oldest == null) { + oldest = entry; + } + ++size; + return nullT; + }; + + super.delete = new T(T item) { + int bucket = item.hash(); + int index = find(item, bucket); + HashEntry entry = buckets[index]; + if (index == -1) { + assert(false, 'Overcrowded hash table; zombies: ' + string(zombies) + + '; size: ' + string(size) + + '; buckets.length: ' + string(buckets.length)); + return nullT; + } + if (entry == null) { + assert(isNullT != null, 'Item is not present.'); + return nullT; + } + ++numChanges; + T result = entry.item; + entry.hash = -1; + ++zombies; + if (entry.older != null) { + entry.older.newer = entry.newer; + } else { + oldest = entry.newer; + } + if (entry.newer != null) { + entry.newer.older = entry.older; + } else { + newest = entry.older; + } + --size; + if (2 * (size + zombies) > buckets.length) { + changeCapacity(); + } + return result; + }; + + autounravel RepSet_T operator cast(HashRepSet_T set) { + return set.super; + } + + autounravel Iterable_T operator cast(HashRepSet_T set) { + return Iterable_T(set.super.operator iter); + } + unravel super; +} + \ No newline at end of file diff --git a/base/collections/iter.asy b/base/collections/iter.asy new file mode 100644 index 000000000..75b9b0ff1 --- /dev/null +++ b/base/collections/iter.asy @@ -0,0 +1,49 @@ +typedef import(T); + +struct Iter_T { + // Returns the current item. Error if the iterator is not valid. + T get(); + // Advances the iterator to the next item. Error if the iterator is not valid. 
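+  // (A typical manual traversal, essentially the pattern a for-each statement
+  // expands to, given some Iterable_T iterable and a caller-supplied process:
+  //   for (Iter_T it = iterable.operator iter(); it.valid(); it.advance())
+  //     process(it.get());
+  // )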
+  void advance();
+  // Returns true if the iterator is valid. If the iterator is used without
+  // modifying the data structure, it will be valid as long as there is a next
+  // item.
+  //
+  // QUESTION: Do we want best-effort fail-fast iterators that set valid to
+  // false if the data structure is modified, or do we want to leave the
+  // behavior undefined in this case?
+  bool valid();
+}
+
+Iter_T Iter_T(T[] items) {
+  int index = 0;
+  Iter_T retv;
+  unravel retv;
+  advance = new void() { ++index; };
+  get = new T() { return items[index]; };
+  valid = new bool() { return index < items.length; };
+  return retv;
+}
+
+struct Iterable_T {
+  // Returns an iterator over the collection.
+  Iter_T operator iter();
+  void operator init(Iter_T iter()) {
+    this.operator iter = iter;
+  }
+  void operator init(T[] items) {
+    this.operator iter = new Iter_T() {
+      return Iter_T(items);
+    };
+  }
+  autounravel T[] operator ecast(Iterable_T iterable) {
+    T[] result;
+    for (T item : iterable) {
+      result.push(item);
+    }
+    return result;
+  }
+}
+
+Iterable_T Iterable(Iter_T iter()) = Iterable_T;
+Iterable_T Iterable(T[] items) = Iterable_T;
diff --git a/base/collections/map.asy b/base/collections/map.asy
new file mode 100644
index 000000000..49ce57cce
--- /dev/null
+++ b/base/collections/map.asy
@@ -0,0 +1,169 @@
+typedef import(K, V);
+
+from collections.genericpair(K=K, V=V) access Pair_K_V;
+from collections.iter(T=K) access Iter_T as Iter_K, Iterable_T as Iterable_K;
+from collections.iter(T=Pair_K_V) access
+    Iter_T as Iter_K_V,
+    Iterable_T as Iterable_K_V;
+
+struct Map_K_V {
+  restricted V nullValue;
+  restricted bool isNullValue(V) = null;
+  void operator init() {}
+  void operator init(V nullValue,
+                     bool isNullValue(V) = new bool(V v) { return v == nullValue; }
+  ) {
+    this.nullValue = nullValue;
+    this.isNullValue = isNullValue;
+    assert(isNullValue(nullValue), 'nullValue must satisfy isNullValue');
+  }
+  // Remaining methods are not implemented here.
+  int size();
+  bool empty() { return size() == 0; }
+  bool contains(K key);
+  // If the key is not present, returns nullValue, or throws an error
+  // if nullValue was never set.
+  V operator [] (K key);
+  // Adds the key-value pair, replacing both the key and value if the key was
+  // already present.
+  void operator [=] (K key, V value);
+  // Removes the entry with the given key, if it exists.
+  // QUESTION: Should we throw an error if the key was not present? (Current
+  // implementation: yes, unless there is a nullValue to return.)
+  void delete(K key);
+
+  Iter_K operator iter();
+
+  autounravel Iterable_K operator cast(Map_K_V map) {
+    return Iterable_K(map.operator iter);
+  }
+
+  // Makes the notation `for (K key: (K[])map)` work for now, albeit
+  // inefficiently.
+  autounravel K[] operator ecast(Map_K_V map) {
+    return (K[])(Iterable_K)map;
+  }
+
+  void addAll(Iterable_K_V other) {
+    for (Pair_K_V kv : other) {
+      this[kv.k] = kv.v;
+    }
+  }
+  void removeAll(Iterable_K other) {
+    for (K key : other) {
+      delete(key);
+    }
+  }
+}
+
+// Reference implementation for testing purposes.
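+// A minimal usage sketch (int/string chosen only for illustration):
+//   from collections.map(K=int, V=string) access Map_K_V;
+//   Map_K_V m = NaiveMap_K_V('');  // '' serves as the nullValue
+//   m[3] = 'three';
+//   assert(m.contains(3) && m[3] == 'three');
+//   m.delete(3);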
+struct NaiveMap_K_V { + private K[] keys; + private V[] values; + private int size; + private int numChanges = 0; + restricted Map_K_V map; + void operator init() { + keys = new K[0]; + values = new V[0]; + size = 0; + using F = void(); + ((F)map.operator init)(); + } + void operator init(V nullValue, bool isNullValue(V) = null) { + keys = new K[0]; + values = new V[0]; + size = 0; + if (isNullValue == null) { + map.operator init(nullValue); // Let operator init supply its own default. + } else { + map.operator init(nullValue, isNullValue); + } + } + map.size = new int() { return size; }; + map.contains = new bool(K key) { + for (int i = 0; i < size; ++i) { + if (keys[i] == key) { + return true; + } + } + return false; + }; + map.operator[] = new V(K key) { + for (int i = 0; i < size; ++i) { + if (keys[i] == key) { + return values[i]; + } + } + assert(map.isNullValue != null, 'Key not found in map'); + return map.nullValue; + }; + map.operator[=] = new void(K key, V value) { + bool delete = false; + if (map.isNullValue != null && map.isNullValue(value)) { + delete = true; + } + for (int i = 0; i < size; ++i) { + if (keys[i] == key) { + if (delete) { + keys.delete(i); + values.delete(i); + ++numChanges; + --size; + } else { + keys[i] = key; + values[i] = value; + } + return; + } + } + if (!delete) { + keys.push(key); + values.push(value); + ++numChanges; + ++size; + } + }; + map.delete = new void(K key) { + ++numChanges; + for (int i = 0; i < size; ++i) { + if (keys[i] == key) { + keys.delete(i); + values.delete(i); + --size; + return; + } + } + assert(false, 'Nonexistent key cannot be deleted'); + }; + map.operator iter = new Iter_K() { + int numChangesAtStart = numChanges; + int i = 0; + Iter_K result; + result.valid = new bool() { + assert(numChanges == numChangesAtStart, + 'Map keys changed during iteration'); + return i < size; + }; + result.advance = new void() { + assert(numChanges == numChangesAtStart, + 'Map keys changed during iteration'); + ++i; + }; + result.get = new K() { + assert(numChanges == numChangesAtStart, + 'Map keys changed during iteration'); + return keys[i]; + }; + return result; + }; + autounravel Iterable_K operator cast(NaiveMap_K_V map) { + return Iterable_K(map.map.operator iter); + } + autounravel K[] operator ecast(NaiveMap_K_V map) { + return copy(map.keys); + } + autounravel Map_K_V operator cast(NaiveMap_K_V map) { + return map.map; + } + from map unravel *; +} \ No newline at end of file diff --git a/base/collections/queue.asy b/base/collections/queue.asy new file mode 100644 index 000000000..17c9e5d82 --- /dev/null +++ b/base/collections/queue.asy @@ -0,0 +1,207 @@ +typedef import(T); + +from collections.iter(T=T) access Iter_T, Iterable_T; + +struct Queue_T { + void push(T value); + T peek(); + T pop(); + int size(); + Iter_T operator iter(); + autounravel Iterable_T operator cast(Queue_T queue) { + return Iterable_T(queue.operator iter); + } +} + +Queue_T makeNaiveQueue(T[] initialData) { + Queue_T queue = new Queue_T; + T[] data = new T[0]; + data.append(initialData); + queue.push = new void(T value) { + data.push(value); + }; + queue.peek = new T() { + return data[0]; + }; + queue.pop = new T() { + T retv = data[0]; + data.delete(0); + return retv; + }; + queue.size = new int() { + return data.length; + }; + queue.operator iter = new Iter_T() { + return Iter_T(data); + }; + return queue; +} + +struct ArrayQueue_T { + T[] data = new T[8]; + data.cyclic = true; + int start = 0; + int size = 0; + + private void resize() { + T[] newData = new 
T[data.length * 2];
+    newData.cyclic = true;
+    newData[:size] = data[start : start+size];
+    data = newData;
+    start = 0;
+  }
+
+  Iter_T operator iter() {
+    int i = 0;
+    Iter_T result;
+    result.advance = new void() {
+      ++i;
+    };
+    result.get = new T() {
+      return data[start+i];
+    };
+    result.valid = new bool() {
+      return i < size;
+    };
+    return result;
+  }
+
+  void operator init(T[] initialData) {
+    // Check for a null array before touching its length.
+    if (alias(initialData, null) || initialData.length == 0) {
+      return;
+    }
+    int desiredLength = data.length;
+    // TODO: Do this computation using CLZ.
+    while (desiredLength < initialData.length) {
+      desiredLength *= 2;
+    }
+    if (desiredLength != data.length) {
+      data = new T[desiredLength];
+      data.cyclic = true;
+    }
+    size = initialData.length;
+    data[:size] = initialData;
+  }
+
+  void push(T value) {
+    if (size == data.length) {
+      resize();
+    }
+    data[start+size] = value;
+    ++size;
+  }
+
+  T peek() {
+    return data[start];
+  }
+
+  T pop() {
+    T retv = data[start];
+    ++start;
+    --size;
+    return retv;
+  }
+
+  int size() {
+    return size;
+  }
+
+  autounravel Iterable_T operator cast(ArrayQueue_T queue) {
+    return Iterable_T(queue.operator iter);
+  }
+
+  autounravel Queue_T operator cast(ArrayQueue_T queue) {
+    Queue_T queue_ = new Queue_T;
+    queue_.push = queue.push;
+    queue_.peek = queue.peek;
+    queue_.pop = queue.pop;
+    queue_.size = queue.size;
+    queue_.operator iter = queue.operator iter;
+    return queue_;
+  }
+}
+
+Queue_T makeArrayQueue(T[] initialData /*specify type for overloading*/) {
+  return ArrayQueue_T(initialData);
+}
+
+struct LinkedQueue_T {
+  struct Node {
+    T value;
+    Node next;
+  }
+  Node head = null;
+  Node tail = null;
+  int size = 0;
+
+  Iter_T operator iter() {
+    Node node = head;
+    Iter_T result;
+    result.advance = new void() {
+      node = node.next;
+    };
+    result.get = new T() {
+      return node.value;
+    };
+    result.valid = new bool() {
+      return node != null;
+    };
+    return result;
+  }
+
+  void push(T value) {
+    Node node = new Node;
+    node.value = value;
+    if (size == 0) {
+      head = node;
+      tail = node;
+    } else {
+      tail.next = node;
+      tail = node;
+    }
+    ++size;
+  }
+
+  T peek() {
+    return head.value;
+  }
+
+  T pop() {
+    T retv = head.value;
+    head = head.next;
+    --size;
+    return retv;
+  }
+
+  int size() {
+    return size;
+  }
+
+  autounravel Queue_T operator cast(LinkedQueue_T queue) {
+    Queue_T queue_ = new Queue_T;
+    queue_.push = queue.push;
+    queue_.peek = queue.peek;
+    queue_.pop = queue.pop;
+    queue_.size = queue.size;
+    queue_.operator iter = queue.operator iter;
+    return queue_;
+  }
+
+  autounravel Iterable_T operator cast(LinkedQueue_T queue) {
+    return Iterable_T(queue.operator iter);
+  }
+}
+
+Queue_T makeLinkedQueue(T[] initialData) {
+  var queue = new LinkedQueue_T;
+  for (T value : initialData) {
+    queue.push(value);
+  }
+  return queue;
+}
+
+// Specify a "default" queue implementation.
+Queue_T makeQueue(T[]) = makeArrayQueue;
\ No newline at end of file
diff --git a/base/collections/repset.asy b/base/collections/repset.asy
new file mode 100644
index 000000000..b788061a0
--- /dev/null
+++ b/base/collections/repset.asy
@@ -0,0 +1,225 @@
+typedef import(T);
+from collections.iter(T=T) access Iter_T, Iterable_T;
+
+// RepSet: set of representatives of equivalence classes. Contains at most one
+// element from each equivalence class.
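+// A usage sketch (case-insensitive strings chosen only for illustration,
+// using the NaiveRepSet_T implementation defined below):
+//   var set = NaiveRepSet_T('', new bool(string a, string b) {
+//     return downcase(a) == downcase(b);
+//   });
+//   set.add('Foo');
+//   assert(set.contains('FOO'));      // same equivalence class
+//   assert(set.get('FOO') == 'Foo');  // the stored representative is returned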
+
+
+struct RepSet_T {
+  restricted T nullT;
+  restricted bool equiv(T, T) = operator ==;
+  restricted bool isNullT(T) = null;
+  restricted void operator init() {}
+  restricted void operator init(T nullT,
+      bool equiv(T a, T b) = operator ==,
+      bool isNullT(T) = new bool(T t) { return equiv(t, nullT); }) {
+    this.nullT = nullT;
+    this.equiv = equiv;
+    this.isNullT = isNullT;
+  }
+
+  // Creates a new, empty RepSet with the same implementation, nullT,
+  // isNullT, and equiv as this one.
+  RepSet_T newEmpty();
+
+  int size();
+  bool empty() {
+    return size() == 0;
+  }
+  bool contains(T item);
+  // Returns the equivalent item in the set, or nullT if the set
+  // contains no equivalent item. Throws an error if nullT was never set.
+  T get(T item);
+  // Returns an iterator over the items in the set.
+  Iter_T operator iter();
+  // If an equivalent item was already present, returns false. Otherwise, adds
+  // the item and returns true. No-op if isNullT is defined and isNullT(item)
+  // is true.
+  bool add(T item);
+  // Inserts item, and returns the item that was replaced, or nullT if
+  // no item was replaced. Throws an error if nullT was never set.
+  // No-op if isNullT is defined and isNullT(item) is true.
+  // QUESTION: Should we throw an error even if nullT was not needed,
+  // i.e., if there was already an equivalent item in the collection?
+  T update(T item);
+  // Removes the equivalent item from the set, and returns it. Returns
+  // nullT if there is no equivalent item. Throws an error if
+  // there is no equivalent item and nullT was never set.
+  T delete(T item);
+
+  autounravel Iterable_T operator cast(RepSet_T set) {
+    return Iterable_T(set.operator iter);
+  }
+
+  void addAll(Iterable_T other) {
+    for (T item : other) {
+      add(item);
+    }
+  }
+  void removeAll(Iterable_T other) {
+    for (T item : other) {
+      delete(item);
+    }
+  }
+
+  autounravel bool operator <=(RepSet_T a, RepSet_T b) {
+    for (var item : a) {
+      if (!b.contains(item)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  autounravel bool operator >=(RepSet_T a, RepSet_T b) {
+    return b <= a;
+  }
+
+  autounravel bool operator ==(RepSet_T a, RepSet_T b) {
+    return a <= b && a >= b;
+  }
+
+  autounravel bool operator !=(RepSet_T a, RepSet_T b) {
+    return !(a == b);
+  }
+
+  autounravel bool sameElementsInOrder(RepSet_T a, RepSet_T b) {
+    bool equiv(T ai, T bi) {
+      return a.equiv(ai, bi) && b.equiv(ai, bi);
+    }
+    var iterA = a.operator iter();
+    var iterB = b.operator iter();
+    while (iterA.valid() && iterB.valid()) {
+      if (!equiv(iterA.get(), iterB.get())) {
+        return false;
+      }
+      iterA.advance();
+      iterB.advance();
+    }
+    return iterA.valid() == iterB.valid();
+  }
+
+  autounravel RepSet_T operator +(RepSet_T a, Iterable_T b) {
+    RepSet_T result = a.newEmpty();
+    for (T item : a) {
+      result.add(item);
+    }
+    for (T item : b) {
+      result.add(item);
+    }
+    return result;
+  }
+
+  autounravel RepSet_T operator -(RepSet_T a, RepSet_T b) {
+    RepSet_T result = a.newEmpty();
+    for (T item : a) {
+      if (!b.contains(item)) {
+        result.add(item);
+      }
+    }
+    return result;
+  }
+
+}
+
+
+// A reference implementation, inefficient but suitable for testing.
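+// Every operation below is a linear scan of the backing array, so expect O(n)
+// per call; it exists to cross-check HashRepSet_T, not for production use.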
+struct NaiveRepSet_T { + RepSet_T super; + unravel super; + private T[] items; + restricted void operator init() { + typedef void F(); + ((F)super.operator init)(); + } + restricted void operator init(T nullT, + bool equiv(T a, T b) = operator ==, + bool isNullT(T) = new bool(T t) { return equiv(t, nullT); }) { + typedef void F(T, bool equiv(T, T), bool isNullT(T)); + ((F)super.operator init)(nullT, equiv, isNullT); + } + + super.size = new int() { + return items.length; + }; + + super.contains = new bool(T item) { + for (T i : items) { + if (equiv(i, item)) { + return true; + } + } + return false; + }; + + super.get = new T(T item) { + for (T i : items) { + if (equiv(i, item)) { + return i; + } + } + return nullT; + }; + + super.operator iter = new Iter_T() { + return Iter_T(items); + }; + + super.add = new bool(T item) { + if (isNullT != null && isNullT(item)) { + return false; + } + if (contains(item)) { + return false; + } + items.push(item); + return true; + }; + + super.update = new T(T item) { + if (isNullT != null && isNullT(item)) { + return nullT; + } + for (int i = 0; i < items.length; ++i) { + if (equiv(items[i], item)) { + T result = items[i]; + items[i] = item; + return result; + } + } + items.push(item); + assert(isNullT != null, 'item not found'); + return nullT; + }; + + super.delete = new T(T item) { + for (int i = 0; i < items.length; ++i) { + if (equiv(items[i], item)) { + T result = items[i]; + items.delete(i); + return result; + } + } + assert(isNullT != null, 'item not found'); + return nullT; + }; + + autounravel Iterable_T operator cast(NaiveRepSet_T set) { + return Iterable_T(set.operator iter); + } + + autounravel RepSet_T operator cast(NaiveRepSet_T set) { + return set.super; + } + + super.newEmpty = new RepSet_T() { + return NaiveRepSet_T(nullT, equiv, isNullT); + }; + + autounravel T[] operator ecast(NaiveRepSet_T set) { + T[] result; + for (T item : set.items) { + result.push(item); + } + return result; + } +} diff --git a/tests/template/imports/sortedset.asy b/base/collections/sortedset.asy similarity index 74% rename from tests/template/imports/sortedset.asy rename to base/collections/sortedset.asy index bcef6889e..70799c7d6 100644 --- a/tests/template/imports/sortedset.asy +++ b/base/collections/sortedset.asy @@ -1,6 +1,6 @@ typedef import(T); -from "template/imports/pureset"(T=T) access Set_T, operator cast, makeNaiveSet; +from pureset(T=T) access Set_T, makeNaiveSet; struct SortedSet_T { int size(); @@ -21,59 +21,66 @@ struct SortedSet_T { T popMin(); // Returns emptyresponse if collection is empty. T max(); // Returns emptyresponse if collection is empty. T popMax(); // Returns emptyresponse if collection is empty. - bool insert(T item); // Returns true iff the collection is modified. - T replace(T item); // Inserts item, and returns the item that was + bool add(T item); // Returns true iff the collection is modified. + T update(T item); // Inserts item, and returns the item that was // replaced, or emptyresponse if no item was replaced. - bool delete(T item); // Returns true iff the collection is modified. + T delete(T item); // Returns the removed item, or emptyresponse if no + // such item was found. // Calls process on each item in the collection, in ascending order, // until process returns false. 
void forEach(bool process(T item)); -} -T[] operator cast(SortedSet_T set) { - T[] result; - set.forEach(new bool(T item) { - result.push(item); - return true; - }); - return result; -} + autounravel T[] operator cast(SortedSet_T set) { + T[] result; + set.forEach(new bool(T item) { + result.push(item); + return true; + }); + return result; + } + + autounravel Set_T operator cast(SortedSet_T sorted_set) { + Set_T set = new Set_T; + set.size = sorted_set.size; + set.empty = sorted_set.empty; + set.contains = sorted_set.contains; + set.add = sorted_set.add; + set.update = sorted_set.update; + set.get = sorted_set.get; + set.delete = sorted_set.delete; + set.forEach = sorted_set.forEach; + return set; + } -Set_T unSort(SortedSet_T sorted_set) { - Set_T set = new Set_T; - set.size = sorted_set.size; - set.empty = sorted_set.empty; - set.contains = sorted_set.contains; - set.insert = sorted_set.insert; - set.replace = sorted_set.replace; - set.get = sorted_set.get; - set.delete = sorted_set.delete; - set.forEach = sorted_set.forEach; - return set; } -Set_T operator cast(SortedSet_T) = unSort; +Set_T unSort(SortedSet_T sorted_set) = new Set_T(SortedSet_T sorted_set) { return sorted_set; }; // For testing purposes, we provide a naive implementation of SortedSet_T. // This implementation is highly inefficient, but it is correct, and can be // used to test other implementations of SortedSet_T. struct NaiveSortedSet_T { - private bool lt(T a, T b); + private bool lt(T a, T b) = null; private T[] buffer = new T[0]; private T emptyresponse; - private bool leq(T a, T b) { + private bool leq(T, T), gt(T, T), geq(T, T), equiv(T, T); + + leq = new bool(T a, T b) { return !lt(b, a); - } - private bool gt(T a, T b) { + }; + + gt = new bool(T a, T b) { return lt(b, a); - } - private bool geq(T a, T b) { + }; + + geq = new bool(T a, T b) { return leq(b, a); - } - private bool equiv(T a, T b) { + }; + + equiv = new bool(T a, T b) { return leq(a, b) && leq(b, a); - } + }; void operator init(bool lessThan(T, T), T emptyresponse) { this.lt = lessThan; @@ -128,7 +135,7 @@ struct NaiveSortedSet_T { return buffer.pop(); } - bool insert(T item) { + bool add(T item) { for (int i = 0; i < buffer.length; ++i) { if (equiv(buffer[i], item)) return false; else if (gt(buffer[i], item)) { @@ -140,7 +147,7 @@ struct NaiveSortedSet_T { return true; } - T replace(T item) { + T update(T item) { for (int i = 0; i < buffer.length; ++i) { if (equiv(buffer[i], item)) { T toreturn = buffer[i]; @@ -163,14 +170,15 @@ struct NaiveSortedSet_T { return emptyresponse; } - bool delete(T item) { + T delete(T item) { for (int i = 0; i < buffer.length; ++i) { - if (equiv(buffer[i], item)) { + T candidate = buffer[i]; + if (equiv(candidate, item)) { buffer.delete(i); - return true; + return candidate; } } - return false; + return emptyresponse; } void forEach(bool process(T item)) { @@ -190,8 +198,8 @@ SortedSet_T operator cast(NaiveSortedSet_T naive) { toreturn.popMin = naive.popMin; toreturn.max = naive.max; toreturn.popMax = naive.popMax; - toreturn.insert = naive.insert; - toreturn.replace = naive.replace; + toreturn.add = naive.add; + toreturn.update = naive.update; toreturn.get = naive.get; toreturn.delete = naive.delete; toreturn.forEach = naive.forEach; diff --git a/tests/template/imports/splaytree.asy b/base/collections/splaytree.asy similarity index 91% rename from tests/template/imports/splaytree.asy rename to base/collections/splaytree.asy index d5850482f..cae7eb111 100644 --- a/tests/template/imports/splaytree.asy +++ 
b/base/collections/splaytree.asy @@ -1,9 +1,6 @@ typedef import(T); -from "template/imports/sortedset"(T=T) access - Set_T, - SortedSet_T, - operator cast; +from sortedset(T=T) access Set_T, SortedSet_T; private struct treenode { treenode leftchild; @@ -152,9 +149,10 @@ private treenode splay(treenode[] ancestors, bool lessthan(T a, T b)) { struct SplayTree_T { private treenode root = null; restricted int size = 0; - private bool operator < (T a, T b); + private T emptyresponse; + private bool operator < (T a, T b); void operator init(bool lessthan(T,T), T emptyresponse) { operator< = lessthan; this.emptyresponse = emptyresponse; @@ -355,7 +353,7 @@ struct SplayTree_T { /* * returns true iff the tree was modified */ - bool insert(T value) { + bool add(T value) { if (root == null) { root = treenode(value); ++size; @@ -392,9 +390,9 @@ struct SplayTree_T { return true; } - T replace(T item) { + T update(T item) { if (root == null) { - insert(item); + add(item); return emptyresponse; } treenode[] ancestors = new treenode[0]; @@ -456,9 +454,9 @@ struct SplayTree_T { } /* - * returns true iff the tree was modified + * returns the removed item, or emptyresponse if the item was not found */ - bool delete(T value) { + T delete(T value) { treenode[] ancestors = new treenode[0]; ancestors.cyclic = true; // Makes ancestors[-1] refer to the last entry. ancestors.push(root); @@ -468,7 +466,7 @@ struct SplayTree_T { if (current == null) { ancestors.pop(); root = splay(ancestors, operator<); - return false; + return emptyresponse; } if (value < current.value) ancestors.push(current.leftchild); @@ -478,6 +476,7 @@ struct SplayTree_T { } treenode toDelete = ancestors.pop(); + T retv = toDelete.value; treenode parent = null; if (ancestors.length > 0) parent = ancestors[-1]; @@ -510,40 +509,39 @@ struct SplayTree_T { if (parent != null) root = splay(ancestors, operator<); --size; - return true; + return retv; } void forEach(bool run(T)) { inOrderNonRecursive(root, run); } - -} -SortedSet_T operator cast(SplayTree_T splaytree) { - SortedSet_T result = new SortedSet_T; - result.size = splaytree.size; - result.empty = splaytree.empty; - result.contains = splaytree.contains; - result.after = splaytree.after; - result.before = splaytree.before; - result.firstGEQ = splaytree.firstGEQ; - result.firstLEQ = splaytree.firstLEQ; - result.min = splaytree.min; - result.popMin = splaytree.popMin; - result.max = splaytree.max; - result.popMax = splaytree.popMax; - result.insert = splaytree.insert; - result.replace = splaytree.replace; - result.get = splaytree.get; - result.delete = splaytree.delete; - result.forEach = splaytree.forEach; - return result; -} + autounravel SortedSet_T operator cast(SplayTree_T splaytree) { + SortedSet_T result = new SortedSet_T; + result.size = splaytree.size; + result.empty = splaytree.empty; + result.contains = splaytree.contains; + result.after = splaytree.after; + result.before = splaytree.before; + result.firstGEQ = splaytree.firstGEQ; + result.firstLEQ = splaytree.firstLEQ; + result.min = splaytree.min; + result.popMin = splaytree.popMin; + result.max = splaytree.max; + result.popMax = splaytree.popMax; + result.add = splaytree.add; + result.update = splaytree.update; + result.get = splaytree.get; + result.delete = splaytree.delete; + result.forEach = splaytree.forEach; + return result; + } -Set_T operator cast(SplayTree_T splaytree) { - return (SortedSet_T)splaytree; -} + autounravel Set_T operator cast(SplayTree_T splaytree) { + return (SortedSet_T)splaytree; + } -T[] operator 
cast(SplayTree_T splaytree) {
-  return (SortedSet_T)splaytree;
+  autounravel T[] operator cast(SplayTree_T splaytree) {
+    return (SortedSet_T)splaytree;
+  }
 }
\ No newline at end of file
diff --git a/base/collections/wrapper.asy b/base/collections/wrapper.asy
new file mode 100644
index 000000000..0ba93178d
--- /dev/null
+++ b/base/collections/wrapper.asy
@@ -0,0 +1,19 @@
+typedef import(T);
+
+struct Wrapped_T {
+  T t;
+  void operator init(T t) {
+    this.t = t;
+  }
+  autounravel bool operator ==(Wrapped_T a, Wrapped_T b) {
+    return a.t == b.t;
+  }
+  autounravel bool operator !=(Wrapped_T a, Wrapped_T b) {
+    // Let's not assume that != was overloaded.
+    return !(a.t == b.t);
+  }
+}
+
+Wrapped_T wrap(T t) {
+  return Wrapped_T(t);
+}
\ No newline at end of file
diff --git a/tests/template/imports/zip.asy b/base/collections/zip.asy
similarity index 100%
rename from tests/template/imports/zip.asy
rename to base/collections/zip.asy
diff --git a/base/set_smallpositiveint.asy b/base/set_smallpositiveint.asy
new file mode 100644
index 000000000..bbe17c470
--- /dev/null
+++ b/base/set_smallpositiveint.asy
@@ -0,0 +1,94 @@
+from pureset(T=int) access
+    Set_T as Set_int;
+
+struct Set_smallPositiveInt {
+  bool[] buffer = new bool[];
+
+  int size() {
+    return sum(buffer);
+  }
+
+  bool empty() {
+    return all(!buffer);
+  }
+
+  bool contains(int item) {
+    if (item < 0 || item >= buffer.length) {
+      return false;
+    }
+    return buffer[item];
+  }
+
+  bool insert(int item) {
+    if (item < 0) {
+      return false;
+    }
+    while (item >= buffer.length) {
+      buffer.push(false);
+    }
+    if (buffer[item]) {
+      return false;
+    }
+    buffer[item] = true;
+    return true;
+  }
+
+  int replace(int item) {
+    if (item < 0) {
+      return -1;
+    }
+    while (item >= buffer.length) {
+      buffer.push(false);
+    }
+    if (buffer[item]) {
+      return item;
+    }
+    buffer[item] = true;
+    return -1;
+  }
+
+  int get(int item) {
+    if (item < 0 || item >= buffer.length) {
+      return -1;
+    }
+    if (buffer[item]) {
+      return item;
+    }
+    return -1;
+  }
+
+  bool delete(int item) {
+    if (item < 0 || item >= buffer.length) {
+      return false;
+    }
+    if (buffer[item]) {
+      buffer[item] = false;
+      return true;
+    }
+    return false;
+  }
+
+  void foreach(bool process(int item)) {
+    for (int i = 0; i < buffer.length; ++i) {
+      if (buffer[i]) {
+        if (!process(i)) {
+          return;
+        }
+      }
+    }
+  }
+
+}
+
+Set_int operator cast(Set_smallPositiveInt set) {
+  Set_int result = new Set_int;
+  result.size = set.size;
+  result.empty = set.empty;
+  result.contains = set.contains;
+  result.insert = set.insert;
+  result.replace = set.replace;
+  result.get = set.get;
+  result.delete = set.delete;
+  result.foreach = set.foreach;
+  return result;
+}
\ No newline at end of file
diff --git a/camp.l b/camp.l
index d6b456606..e6469ad5a 100644
--- a/camp.l
+++ b/camp.l
@@ -420,7 +420,7 @@ operator  {adjust(); BEGIN opname; }
 BEGIN INITIAL;
 return ID; }
-[-+*/#%^!<>]|==|!=|<=|>=|&|\||\^\^|\.\.|::|--|---|\+\+|{EXTRAOPS} {
+[-+*/#%^!<>]|==|!=|<=|>=|&|\||\^\^|\.\.|::|--|---|\+\+|\[\]|\[=\]|{EXTRAOPS} {
 makeopsymbol();
 BEGIN INITIAL;
 return ID;}
diff --git a/camp.y b/camp.y
index b8831bc9f..0f1dd50fb 100644
--- a/camp.y
+++ b/camp.y
@@ -50,6 +50,20 @@ bool checkKeyword(position pos, symbol sym)
   return true;
 }
 
+// Checks whether the given symbol is "as". Returns true if it is; otherwise
+// reports an error and returns false.
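+// Presumably invoked by grammar rules that expect the "as" keyword, e.g. in
+// import clauses such as
+//   from pureset(T=int) access Set_T as Set_int;
+// so that a stray identifier in place of "as" is reported at the right spot.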
+bool checkAs(position pos, symbol sym) +{ + if (sym != symbol::trans("as")) { + em.error(pos); + em << "expected 'as' here"; + + return false; + } + return true; +} + + namespace absyntax { file *root; } using namespace absyntax; @@ -317,7 +331,8 @@ idpairlist: ; strid: - ID { $$ = $1; } + name { $$.pos = $1->getPos(); + $$.sym = $1->asPath(); } | STRING { $$.pos = $1->getPos(); $$.sym = symbol::literalTrans($1->getString()); } ; diff --git a/cmake-scripts/asy-base-files.cmake b/cmake-scripts/asy-base-files.cmake index 0622c21cb..21a927ec2 100644 --- a/cmake-scripts/asy-base-files.cmake +++ b/cmake-scripts/asy-base-files.cmake @@ -16,6 +16,11 @@ set(ASY_STATIC_SHADER_FILES sum3 vertex zero ) +set(ASY_STATIC_BASE_COLLECTIONS_FILES + enumerate genericpair hashmap hashrepset iter map queue repset sortedset splaytree + wrapper zip +) + set(OTHER_STATIC_BASE_FILES nopapersize.ps) # base dir @@ -49,6 +54,11 @@ foreach(ASY_STATIC_BASE_FILE ${ASY_STATIC_BASE_FILES}) copy_base_file(${ASY_STATIC_BASE_FILE}.asy) endforeach () +file(MAKE_DIRECTORY ${ASY_BUILD_BASE_DIR}/collections) +foreach (ASY_COLLECTION_BASE_FILE ${ASY_STATIC_BASE_COLLECTIONS_FILES}) + copy_base_file(collections/${ASY_COLLECTION_BASE_FILE}.asy) +endforeach() + foreach(OTHER_STATIC_BASE_FILE ${OTHER_STATIC_BASE_FILES}) copy_base_file(${OTHER_STATIC_BASE_FILE}) endforeach () diff --git a/cmake-scripts/asy-files.cmake b/cmake-scripts/asy-files.cmake index efd15e46d..66c341329 100644 --- a/cmake-scripts/asy-files.cmake +++ b/cmake-scripts/asy-files.cmake @@ -22,7 +22,7 @@ set(CORE_BUILD_FILES ${CAMP_BUILD_FILES} ${SYMBOL_STATIC_BUILD_FILES} env genv stm dec errormsg callable name symbol entry exp newexp stack exithandlers - access virtualfieldaccess absyn record interact fileio + access virtualfieldaccess absyn record interact fileio hashing fftw++asy parallel simpson coder coenv impdatum locate asyparser program application varinit fundec refaccess envcompleter asyprocess constructor array memory Delaunay predicates glrender tr shaders jsfile v3dfile diff --git a/cmake-scripts/subrepo-projects.cmake b/cmake-scripts/subrepo-projects.cmake index a93c0367f..27b77ddd7 100644 --- a/cmake-scripts/subrepo-projects.cmake +++ b/cmake-scripts/subrepo-projects.cmake @@ -4,6 +4,15 @@ set(LSP_REPO_ROOT ${ASY_SUBREPO_CLONE_ROOT}/LspCpp) set(TINYEXR_SUBREPO_ROOT ${ASY_SUBREPO_CLONE_ROOT}/tinyexr) set(BOEHM_GC_ROOT ${ASY_SUBREPO_CLONE_ROOT}/gc) set(LIBATOMIC_OPS_ROOT ${ASY_SUBREPO_CLONE_ROOT}/libatomic_ops) +set(HIGHWAYHASH_ROOT ${ASY_SUBREPO_CLONE_ROOT}/highwayhash) + +# highwayhash +set(OLD_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) +set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "highwayhash shared libs flag") +add_subdirectory(${HIGHWAYHASH_ROOT}) +unset(BUILD_SHARED_LIBS CACHE) +set(BUILD_SHARED_LIBS ${OLD_BUILD_SHARED_LIBS}) +list(APPEND ASY_STATIC_LIBARIES highwayhash) # boehm gc if (ENABLE_GC) diff --git a/cmake-scripts/tests-asy.cmake b/cmake-scripts/tests-asy.cmake index 7e990cfec..fe7876a94 100644 --- a/cmake-scripts/tests-asy.cmake +++ b/cmake-scripts/tests-asy.cmake @@ -84,7 +84,7 @@ add_asy_tests( if (ENABLE_GC) add_asy_tests( TEST_DIR gc - TESTS array funcall guide label path shipout string struct transform + TESTS array file funcall guide label path pen shipout string struct transform TEST_ARTIFACTS .eps TEST_NOT_PART_OF_CHECK_TEST true ) @@ -98,7 +98,7 @@ if (ENABLE_GSL) endif() add_asy_tests(TEST_DIR imp TESTS unravel) -add_asy_tests(TEST_DIR io TESTS csv) +add_asy_tests(TEST_DIR io TESTS csv read) add_asy_tests(TEST_DIR output TESTS circle 
line TEST_ARTIFACTS circle.eps line.eps)
 add_asy_tests(TEST_DIR pic TESTS trans)
 add_asy_tests(
@@ -109,14 +109,21 @@ add_asy_tests(
   TEST_DIR types
   TESTS
   autounravel builtinOps cast constructor ecast guide
-  init keyword order resolve shadow spec var
+  init keyword order overrideEquals resolve shadow spec var
 )
 
 add_asy_tests(
   TEST_DIR template
   TESTS
   initTest functionTest mapArrayTest multiImport nestedImport
-  singletype sortedsetTest splaytreeTest structTest
+  singletype structTest
+)
+
+add_asy_tests(
+  TEST_DIR datastructures
+  TESTS
+  bracketsTest changeWhileIterTest enumerateTest hashmapTest
+  hashrepsetTest hashTest iterTest queueTest
 )
 add_asy_tests(
diff --git a/common.h b/common.h
index d968d51be..1a4f427bd 100644
--- a/common.h
+++ b/common.h
@@ -53,6 +53,15 @@ using nonstd::optional;
 using nonstd::nullopt;
 using nonstd::make_optional;
+
+#if __cplusplus < 202002L
+# include "span.hpp"
+using nonstd::span;
+#else
+# include <span>
+using std::span;
+#endif
+
 using std::make_pair;
 
 #if !defined(FOR_SHARED) && \
diff --git a/dec.cc b/dec.cc
index f8673f9c8..70acc9f9a 100644
--- a/dec.cc
+++ b/dec.cc
@@ -71,9 +71,11 @@ bool usableInTemplate(ty *t) {
 }
 
-trans::tyEntry *astType::transAsTyEntry(coenv &e, record *where)
+trans::tyEntry* astType::transAsTyEntry(coenv& e, record* where)
 {
-  return new trans::tyEntry(trans(e, false), nullptr, where, getPos());
+  return new trans::tyEntry(
+      trans(e, ErrorMode::NORMAL), nullptr, where, getPos()
+  );
 }
 
 
@@ -113,17 +115,16 @@ void addNameOps(coenv &e, record *r, record *qt, varEntry *qv, position pos)
 {
 void nameTy::addOps(coenv &e, record *r, AutounravelOption opt)
 {
-  if (opt == AutounravelOption::Apply)
-  {
-    if (record* qt= dynamic_cast<record*>(id->getType(e, true)); qt)
-    {
+  if (opt == AutounravelOption::Apply) {
+    if (record* qt= dynamic_cast<record*>(id->getType(e, ErrorMode::SUPPRESS));
+        qt) {
       varEntry* qv= id->getVarEntry(e);
       addNameOps(e, r, qt, qv, getPos());
     }
   }
 }
 
-types::ty *nameTy::trans(coenv &e, bool tacit)
+types::ty *nameTy::trans(coenv &e, ErrorMode tacit)
 {
   return id->typeTrans(e, tacit);
 }
@@ -144,9 +145,9 @@ void dimensions::prettyprint(ostream &out, Int indent)
   out << "dimensions (" << depth << ")\n";
 }
 
-types::array *dimensions::truetype(types::ty *base, bool tacit)
+types::array *dimensions::truetype(types::ty *base, ErrorMode tacit)
 {
-  if (!tacit && base->kind == ty_void) {
+  if (tacit==ErrorMode::NORMAL && base->kind == ty_void) {
     em.error(getPos());
     em << "cannot declare array of type void";
   }
@@ -172,7 +173,7 @@ void arrayTy::prettyprint(ostream &out, Int indent)
 
 // NOTE: Can this be merged with trans somehow?
 void arrayTy::addOps(coenv &e, record *r, AutounravelOption)
 {
-  types::ty *t=trans(e, true);
+  types::ty *t=trans(e, ErrorMode::SUPPRESS);
 
   // Only add ops if it is an array (and not, say, an error)
   if (t->kind == types::ty_array) {
@@ -184,7 +185,7 @@
   }
 }
 
-types::ty *arrayTy::trans(coenv &e, bool tacit)
+types::ty *arrayTy::trans(coenv &e, ErrorMode tacit)
 {
   types::ty *ct = cell->trans(e, tacit);
   assert(ct);
@@ -220,7 +221,7 @@ void tyEntryTy::prettyprint(ostream &out, Int indent)
   out << "tyEntryTy: " << *(ent->t) << "\n";
 }
 
-types::ty *tyEntryTy::trans(coenv &, bool) {
+types::ty *tyEntryTy::trans(coenv &, ErrorMode) {
   return ent->t;
 }
@@ -566,7 +567,7 @@ void decidstart::prettyprint(ostream &out, Int indent)
     dims->prettyprint(out, indent+1);
 }
 
-types::ty *decidstart::getType(types::ty *base, coenv &, bool)
+types::ty *decidstart::getType(types::ty *base, coenv &, ErrorMode)
 {
   return dims ? dims->truetype(base) : base;
 }
@@ -575,7 +576,8 @@ trans::tyEntry *decidstart::getTyEntry(trans::tyEntry *base, coenv &e,
                                        record *where)
 {
   return dims ? new trans::tyEntry(
-                    getType(base->t, e, false), nullptr, where, getPos()
+                    getType(base->t, e, ErrorMode::NORMAL), nullptr, where,
+                    getPos()
                 ) : base;
 }
@@ -604,7 +606,7 @@ void decidstart::addOps(types::ty *base, coenv &e, record *r)
     params->prettyprint(out, indent+1);
 }
 
-types::ty *fundecidstart::getType(types::ty *base, coenv &e, bool tacit)
+types::ty *fundecidstart::getType(types::ty *base, coenv &e, ErrorMode tacit)
 {
   types::ty *result = decidstart::getType(base, e, tacit);
 
@@ -617,21 +619,23 @@
   }
 }
 
-trans::tyEntry *fundecidstart::getTyEntry(trans::tyEntry *base, coenv &e,
-                                          record *where)
+trans::tyEntry*
+fundecidstart::getTyEntry(trans::tyEntry* base, coenv& e, record* where)
 {
-  return new trans::tyEntry(getType(base->t,e,false), nullptr, where, getPos());
+  return new trans::tyEntry(
+      getType(base->t, e, ErrorMode::NORMAL), nullptr, where, getPos()
+  );
 }
 
-void fundecidstart::addOps(types::ty *base, coenv &e, record *r)
+void fundecidstart::addOps(types::ty* base, coenv& e, record* r)
 {
   decidstart::addOps(base, e, r);
   params->addOps(e, r);
 
-  types::function *ft=dynamic_cast<types::function *>(getType(base, e, true));
+  types::function* ft=
+      dynamic_cast<types::function*>(getType(base, e, ErrorMode::SUPPRESS));
   assert(ft);
-
 }
@@ -1113,7 +1117,7 @@ void recordInitializer(coenv &e, symbol id, record *r, position here)
   assert(r);
   {
     e.c.pushModifier(AUTOUNRAVEL);
-    function *ft = fun.transType(e, false);
+    function *ft = fun.transType(e, ErrorMode::NORMAL);
     assert(ft);
 
     symbol initSym=symbol::opTrans("init");
@@ -1200,7 +1204,9 @@ class PermissionSetter {
   coder &c;
   permission oldPerm;
 public:
-  PermissionSetter(coder &c, permission newPerm) : c(c), oldPerm(c.getPermission()) {
+  PermissionSetter(coder& c, permission newPerm)
+      : c(c), oldPerm(c.getPermission())
+  {
     c.setPermission(newPerm);
   }
   ~PermissionSetter() {
@@ -1295,7 +1301,7 @@ void unraveldec::prettyprint(ostream &out, Int indent)
 fromdec::qualifier unraveldec::getQualifier(coenv &e, record *)
 {
   // getType is where errors in the qualifier are reported.
-  record *qt=dynamic_cast<record *>(id->getType(e, false));
+  record *qt=dynamic_cast<record *>(id->getType(e, ErrorMode::NORMAL));
   if (!qt) {
     em.error(getPos());
     em << "qualifier is not a record";
@@ -1442,6 +1448,7 @@ void recorddec::transAsField(coenv &e, record *parent)
   // the default initializer first.
re.c.closeRecord(); + r->computeKVTypes(getPos()); // Add types and variables defined during the record that should be added to // the enclosing environment. These are the implicit constructors defined by diff --git a/dec.h b/dec.h index 95b1aef83..c34118617 100644 --- a/dec.h +++ b/dec.h @@ -66,7 +66,7 @@ class astType : public absyn { // Returns the internal representation of the type. This method can // be called by exp::getType which does not report errors, so tacit is // needed to silence errors in this case. - virtual types::ty *trans(coenv &e, bool tacit = false) = 0; + virtual types::ty *trans(coenv &e, ErrorMode tacit=ErrorMode::NORMAL) = 0; virtual trans::tyEntry *transAsTyEntry(coenv &e, record *where); @@ -93,7 +93,7 @@ class nameTy : public astType { void addOps(coenv& e, record* r, AutounravelOption opt= AutounravelOption::Apply) override; - types::ty *trans(coenv &e, bool tacit = false) override; + types::ty *trans(coenv &e, ErrorMode tacit=ErrorMode::NORMAL) override; trans::tyEntry *transAsTyEntry(coenv &e, record *where) override; virtual operator string() const override; @@ -114,7 +114,7 @@ class dimensions : public absyn { return depth; } - types::array *truetype(types::ty *base, bool tacit=false); + types::array *truetype(types::ty *base, ErrorMode tacit=ErrorMode::NORMAL); }; class arrayTy : public astType { @@ -134,7 +134,7 @@ class arrayTy : public astType { addOps(coenv& e, record* r, AutounravelOption opt= AutounravelOption::Apply) override; - types::ty *trans(coenv &e, bool tacit = false) override; + types::ty *trans(coenv &e, ErrorMode tacit=ErrorMode::NORMAL) override; operator string() const override; }; @@ -151,7 +151,7 @@ class tyEntryTy : public astType { void prettyprint(ostream &out, Int indent) override; - types::ty *trans(coenv &e, bool tacit = false) override; + types::ty *trans(coenv &e, ErrorMode tacit=ErrorMode::NORMAL) override; trans::tyEntry *transAsTyEntry(coenv &, record *) override { return ent; } @@ -362,7 +362,8 @@ class decidstart : public absyn { virtual void prettyprint(ostream &out, Int indent) override; - virtual types::ty *getType(types::ty *base, coenv &, bool = false); + virtual types::ty* + getType(types::ty* base, coenv&, ErrorMode tacit= ErrorMode::NORMAL); virtual trans::tyEntry *getTyEntry(trans::tyEntry *base, coenv &e, record *where); @@ -391,9 +392,10 @@ class fundecidstart : public decidstart { void prettyprint(ostream &out, Int indent); - types::ty *getType(types::ty *base, coenv &e, bool tacit = false); - trans::tyEntry *getTyEntry(trans::tyEntry *base, coenv &e, record *where); - void addOps(types::ty *base, coenv &e, record *r); + types::ty* + getType(types::ty* base, coenv& e, ErrorMode tacit= ErrorMode::NORMAL); + trans::tyEntry* getTyEntry(trans::tyEntry* base, coenv& e, record* where); + void addOps(types::ty* base, coenv& e, record* r); }; class decid : public absyn { diff --git a/errormsg.cc b/errormsg.cc index 61992d963..856635bf1 100644 --- a/errormsg.cc +++ b/errormsg.cc @@ -63,6 +63,8 @@ void errorstream::clear() void errorstream::message(position pos, const string& s) { + if (mode == ErrorMode::SUPPRESS) + return; if (floating) out << endl; out << pos << ": " << s; floating = true; @@ -70,6 +72,7 @@ void errorstream::message(position pos, const string& s) void errorstream::compiler(position pos) { + mode = ErrorMode::FORCE; message(pos,"Compiler bug; report to https://github.com/vectorgraphics/asymptote/issues:\n"); anyErrors = true; } @@ -81,36 +84,47 @@ void errorstream::compiler() void 
errorstream::runtime(position pos) { + if (mode == ErrorMode::SUPPRESS) + return; message(pos,"runtime: "); anyErrors = true; } void errorstream::error(position pos) { + if (mode == ErrorMode::SUPPRESS) + return; message(pos,""); anyErrors = true; } void errorstream::warning(position pos, string s) { + if (mode == ErrorMode::SUPPRESS) + return; message(pos,"warning ["+s+"]: "); anyWarnings = true; } void errorstream::warning(position pos) { + if (mode == ErrorMode::SUPPRESS) + return; message(pos,"warning: "); anyWarnings = true; } void errorstream::fatal(position pos) { + mode = ErrorMode::FORCE; message(pos,"abort: "); anyErrors = true; } void errorstream::trace(position pos) { + if (mode == ErrorMode::SUPPRESS) + return; static position lastpos; if(!pos || (pos.match(lastpos.filename()) && pos.match(lastpos.Line()))) return; diff --git a/errormsg.h b/errormsg.h index df6398a2c..b00a9c3f8 100644 --- a/errormsg.h +++ b/errormsg.h @@ -162,6 +162,13 @@ inline bool operator == (const position& a, const position& b) string warning(string s); +enum class ErrorMode +{ + SUPPRESS,// Suppress warnings and errors. + NORMAL, + FORCE,// Like normal mode, but ignores attempts to change the mode. +}; + class errorstream { ostream& out; bool anyErrors; @@ -171,6 +178,13 @@ class errorstream { // Is there an error that warrants the asy process to return 1 instead of 0? bool anyStatusErrors; + ErrorMode mode; + void setMode(ErrorMode newMode) + { + if (mode != ErrorMode::FORCE) + mode= newMode; + } + public: static bool interrupt; // Is there a pending interrupt? @@ -179,7 +193,7 @@ class errorstream { errorstream(ostream& out = cerr) : out(out), anyErrors(false), anyWarnings(false), floating(false), - anyStatusErrors(false) {} + anyStatusErrors(false), mode(ErrorMode::NORMAL) {} void clear(); @@ -218,8 +232,10 @@ class errorstream { // NOTE: May later make it do automatic line breaking for long messages. 
template<typename T> errorstream& operator << (const T& x) { - flush(out); - out << x; + if (mode != ErrorMode::SUPPRESS) { + flush(out); + out << x; + } return *this; } @@ -246,6 +262,21 @@ class errorstream { bool processStatus() const { return !anyStatusErrors; } + + class ModeGuard + { + errorstream& es; + ErrorMode oldMode; + + public: + ModeGuard(errorstream& es, ErrorMode newMode) : es(es), oldMode(es.mode) + { + es.setMode(newMode); + } + ~ModeGuard() { es.setMode(oldMode); } + }; + + ModeGuard modeGuard(ErrorMode newMode) { return ModeGuard(*this, newMode); } }; extern errorstream em; diff --git a/errors b/errors index a679ea016..33c2e9986 100644 --- a/errors +++ b/errors @@ -151,9 +151,8 @@ errortest.asy: 438.17: cannot cast 'int' to 'var' errortest.asy: 442.7: could not infer type of initializer errortest.asy: 446.7: could not infer type of initializer errortest.asy: 448.7: could not infer type of initializer -errortest.asy: 452.16: expression is not an array of inferable type -errortest.asy: 457.16: expression is not an array of inferable type -errortest.asy: 463.16: expression is not an array of inferable type +errortest.asy: 452.16: cannot iterate over expression of type 'int' +errortest.asy: 457.16: cannot resolve type for iteration errortest.asy: 470.7: array expression cannot be used as an address errortest.asy: 519.29: expected 'as' errortest.asy: 521.30: expected 'as' @@ -204,3 +203,12 @@ errortest.asy: 626.9: accessing private field outside of structure errortest.asy: 627.4: accessing private field outside of structure errortest.asy: 628.4: accessing private field outside of structure errortest.asy: 639.4: accessing private field outside of structure +errortest.asy: 643.3: multiple operator[] definitions in one struct +errortest.asy: 650.3: operator[=] defined without operator[] +errortest.asy: 656.3: operator[=] must return void +errortest.asy: 667.3: no matching variable '.valid' +errortest.asy: 667.3: no matching variable '.get' +errortest.asy: 667.3: no matching variable '.advance' +errortest.asy: 677.16: cannot iterate over expression of type 'int(int i)' +errortest.asy: 684.17: cannot call 'int f(int i)' with parameter 'string' +errortest.asy: 687.17: cannot call 'int f(int i)' with parameter 'string' diff --git a/errortest.asy b/errortest.asy index e5ffbb59b..50dcf3c7b 100644 --- a/errortest.asy +++ b/errortest.asy @@ -638,3 +638,52 @@ } T.x; // incorrectly accessing private field } +{ + // multiple signatures for operator[] + struct A { + int operator[](string); + int operator[](int); + } +} +{ + // operator[=] without operator[] + struct A { + void operator[=](int); + } +} +{ + // non-void operator[=] + struct A { + int operator[](string); + int operator[=](string, int); + } +} +{ + // operator iter returns a non-iterable type + struct A { + int operator iter() { return 0; } + } + A a; + for (var i : a) + ; +} +{ + // Implicitly cast a function to an array + using Function = int(int); + int[] operator cast(Function f) { + return sequence(f, 10); + } + int f(int i) { return i + 17; } + for (var i : f) // This would work if we used `int` rather than `var`.
+ ; +} +{ + // Iterate over an ill-formed expression + int f(int i) { return 7; } + // cannot call 'int f(int i)' with parameter 'string' + for (int i : f('asdf')) + ; + // cannot call 'int f(int i)' with parameter 'string' + for (var i : f('asdf')) + ; +} \ No newline at end of file diff --git a/exp.cc b/exp.cc index 28f16c477..1efcc748c 100644 --- a/exp.cc +++ b/exp.cc @@ -141,6 +141,15 @@ types::ty *tempExp::trans(coenv &e) { return t; } +exp *tempExp::evaluate(coenv &e, types::ty *target) { + if (equivalent(target, t)) { + // A tempExp, by design, has no side effects. + return this; + } + // Apply implicit cast. + return new tempExp(e, this, target); +} + varEntryExp::varEntryExp(position pos, types::ty *t, access *a) : exp(pos), v(new trans::varEntry(t, a, 0, nullPos)) {} @@ -186,6 +195,14 @@ void nameExp::prettyprint(ostream &out, Int indent) value->prettyprint(out, indent+1); } +exp *nameExp::evaluate(coenv &e, types::ty *target) { + // Names have no side effects unless an implicit cast is needed. + if (equivalent(target, cgetType(e))) { + // No side effects. + return this; + } + return new tempExp(e, this, target); +} void fieldExp::pseudoName::prettyprint(ostream &out, Int indent) { @@ -214,14 +231,30 @@ types::ty *fieldExp::getObject(coenv& e) return t; }
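The evaluate methods introduced in this file share one contract: run an expression's side effects at most once, and return an expression (often a tempExp) that can be re-translated cheaply and safely. A rough standalone analogy in plain C++ (not Asymptote internals):

```
#include <functional>
#include <iostream>

// "Evaluating" runs side effects once and caches the value, so code that
// must reference the expression twice (as transWrite does with `value`)
// stays correct.
template <typename T>
std::function<T()> evaluateOnce(const std::function<T()>& expr) {
  T cached = expr();  // side effects happen exactly once, here
  return [cached] { return cached; };
}

int main() {
  int calls = 0;
  auto v = evaluateOnce<int>([&] { ++calls; return 21; });
  std::cout << v() + v() << " calls=" << calls << "\n";  // prints "42 calls=1"
  return 0;
}
```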
+exp *fieldExp::evaluate(coenv &e, types::ty *t) { + if (equivalent(cgetType(e), t)) { + // Evaluate the object. + return new fieldExp(getPos(), + object->evaluate(e, getObject(e)), + field); + } + // Evaluate `this` and cast it to the correct type. + return new tempExp(e, this, t); +} -array *arrayExp::getArrayType(coenv &e) +types::ty *bracketsExp::getObjectType(coenv &e) { + types::ty *t = object->cgetType(e); + if (t->kind == ty_overloaded) { + t = ((overloaded *)t)->signatureless(); + } + return t; +} + +array *bracketsExp::getArrayType(coenv &e) { - types::ty *a = set->cgetType(e); - if (a->kind == ty_overloaded) { - a = ((overloaded *)a)->signatureless(); - if (!a) - return 0; + types::ty *a = getObjectType(e); + if (a == nullptr) { + return nullptr; } switch (a->kind) { @@ -234,19 +267,19 @@ array *arrayExp::getArrayType(coenv &e) } } -array *arrayExp::transArray(coenv &e) +array *bracketsExp::transArray(coenv &e) { - types::ty *a = set->cgetType(e); + types::ty *a = object->cgetType(e); if (a->kind == ty_overloaded) { a = ((overloaded *)a)->signatureless(); if (!a) { - em.error(set->getPos()); + em.error(object->getPos()); em << "expression is not an array"; return 0; } } - set->transAsType(e, a); + object->transAsType(e, a); switch (a->kind) { case ty_array: @@ -254,7 +287,7 @@ array *arrayExp::transArray(coenv &e) case ty_error: return 0; default: - em.error(set->getPos()); + em.error(object->getPos()); em << "expression is not an array"; return 0; } @@ -275,12 +308,38 @@ void subscriptExp::prettyprint(ostream &out, Int indent) prettyindent(out, indent); out << "subscriptExp\n"; - set->prettyprint(out, indent+1); + object->prettyprint(out, indent+1); index->prettyprint(out, indent+1); } +callExp *buildSubscriptReadCall(exp *object, exp *index) { + // Convert object[index] into + // object.operator[](index) + const static symbol SYM_BRACKETS = symbol::trans("[]"); + position pos = object->getPos(); + return new callExp( + pos, new fieldExp(pos, object, SYM_BRACKETS), index + ); +} + +callExp *buildSubscriptWriteCall(exp *object, exp *index, exp *value) { + // Convert object[index] = value into + // object.operator[=](index, value) + const static symbol SYM_BRACKETS_ASSIGN = symbol::trans("[=]"); + position pos = object->getPos(); + return new callExp( + pos, new fieldExp(pos, object, SYM_BRACKETS_ASSIGN), index, value + ); +} + types::ty *subscriptExp::trans(coenv &e) { + // EXPERIMENTAL + if (!isAnArray(e, object)) { + callExp *call = buildSubscriptReadCall(object, index); + return call->trans(e); + } + array *a = transArray(e); if (!a) return primError(); @@ -301,6 +360,20 @@ types::ty *subscriptExp::trans(coenv &e) types::ty *subscriptExp::getType(coenv &e) { + // EXPERIMENTAL + if (!isAnArray(e, object)) { + ty *t = object->cgetType(e); + if (t->kind == ty_overloaded) { + t = ((overloaded *)t)->signatureless(); + if (!t) + return primError(); + } + if (t->kind != ty_record) { + return primError(); + } + return static_cast<record *>(t)->valType(); + } + array *a = getArrayType(e); return a ? (isAnArray(e, index) ? a : a->celltype) : primError();
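buildSubscriptReadCall and buildSubscriptWriteCall desugar `object[index]` into `object.operator[](index)` and `object[index] = value` into `object.operator[=](index, value)`, with the assignment expression still yielding `value`. A standalone C++ analogy of the protocol (all names hypothetical):

```
#include <cassert>
#include <map>

// A type opts in by providing a read accessor (Asymptote: operator[]) and a
// void-returning write accessor (Asymptote: operator[=]).
struct Table {
  std::map<int, int> data;
  int read(int i) const { return data.at(i); }  // plays operator[]
  void write(int i, int v) { data[i] = v; }     // plays operator[=]
};

// What `t[i] = v` compiles to: call the write accessor, then produce v as
// the value of the whole assignment (so chaining works).
int assignThrough(Table& t, int i, int v) {
  t.write(i, v);
  return v;
}

int main() {
  Table t;
  assert(assignThrough(t, 3, 7) == 7);
  assert(t.read(3) == 7);
  return 0;
}
```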
@@ -308,6 +381,34 @@ void subscriptExp::transWrite(coenv &e, types::ty *t, exp *value) { + // EXPERIMENTAL + if (!isAnArray(e, object)) { + // Find the types of object and index. + types::ty *objectType = getObjectType(e); + assert(objectType); + types::ty *indexType = objectType->keyType(); + if (!indexType || indexType->kind == ty_error) { + em.error(object->getPos()); + em << "object does not have operator[]"; + return; + } + // Evaluate them to control the order in which side effects occur. + // We need value evaluated because we use it twice. We need the other two + // because any side effects from their translation should occur before the + // side effects from translating value. + exp *objectEvaluated = object->evaluate(e, objectType); + exp *indexEvaluated = index->evaluate(e, indexType); + exp *valueEvaluated = value->evaluate(e, t); + // Call object.operator[=](index, value). + callExp* call= buildSubscriptWriteCall( + objectEvaluated, indexEvaluated, valueEvaluated + ); + call->trans(e); + // Push the value back on the stack as the result of the assignment. + valueEvaluated->transAsType(e, t); + return; + } + // Put array, index, and value on the stack in that order, then call // arrayWrite. array *a = transArray(e); @@ -331,6 +432,31 @@ void subscriptExp::transWrite(coenv &e, types::ty *t, exp *value) e.c.encode(inst::builtin, run::arrayWrite); } +exp *subscriptExp::evaluate(coenv &e, types::ty *) +{ + types::ty *base = object->cgetType(e); + if (base->kind == ty_overloaded) { + base = ((overloaded *)base)->signatureless(); + } + if (!base) { + em.error(object->getPos()); + em << "object to index cannot be resolved"; + return nullptr; + } + types::ty *indexType = base->keyType(); + if (indexType->kind == ty_error) { + em.error(object->getPos()); + em << "object does not have operator[=] set up correctly"; + return nullptr; + } + // Force object and index to be evaluated in the correct order. + // (Note that in C++, the order of evaluation of function arguments is + // unspecified.) + exp *a = object->evaluate(e, base); + exp *b = index->evaluate(e, indexType); + return new subscriptExp(getPos(), a, b); +} + void slice::prettyprint(ostream &out, Int indent) { @@ -361,7 +487,7 @@ void slice::trans(coenv &e) void sliceExp::prettyprint(ostream &out, Int indent) { prettyname(out, "sliceExp", indent, getPos()); - set->prettyprint(out, indent+1); + object->prettyprint(out, indent+1); index->prettyprint(out, indent+1); } @@ -1157,7 +1283,7 @@ types::ty *castExp::trans(coenv &e) types::ty *castExp::getType(coenv &e) { - return target->trans(e, true); + return target->trans(e, ErrorMode::SUPPRESS); } diff --git a/exp.h b/exp.h index 152df76de..4a6e572de 100644 --- a/exp.h +++ b/exp.h @@ -193,13 +193,15 @@ class tempExp : public exp { public: tempExp(coenv &e, varinit *v, types::ty *t); - void prettyprint(ostream &out, Int indent); + void prettyprint(ostream &out, Int indent) override; - types::ty *trans(coenv &e); + types::ty *trans(coenv &e) override; - types::ty *getType(coenv &) { + types::ty *getType(coenv &) override { return t; } + + exp *evaluate(coenv &e, types::ty *target) override; }; // Wrap a varEntry so that it can be used as an expression. @@ -267,10 +269,8 @@ class nameExp : public exp { em << "use of variable \'" << *value << "\' is ambiguous"; return types::primError(); } - else { - transAsType(e, t); - return t; - } + transAsType(e, t); + return t; } types::ty *getType(coenv &e) override { @@ -298,10 +298,7 @@ class nameExp : public exp { ct=0; // See note in transAsType. } - exp *evaluate(coenv &, types::ty *) override { - // Names have no side-effects.
- return this; - } + exp *evaluate(coenv &, types::ty *) override; }; // Most fields accessed are handled as parts of qualified names, but in cases @@ -336,8 +333,8 @@ } // As a type: - types::ty *typeTrans(coenv &, bool tacit = false) { - if (!tacit) { + types::ty *typeTrans(coenv &, ErrorMode tacit = ErrorMode::NORMAL) { + if (tacit == ErrorMode::NORMAL) { em.error(getPos()); em << "expression is not a type"; } @@ -364,6 +361,10 @@ void print(ostream& out) const { out << ""; } + void printPath(ostream& out) const { + em.compiler(getPos()); + em << "expression cannot be used as a path"; + } symbol getName() const { return object->getName(); @@ -371,7 +372,8 @@ AsymptoteLsp::SymbolLit getLit() const { - return AsymptoteLsp::SymbolLit(static_cast<std::string>(object->getName())); + return AsymptoteLsp::SymbolLit(static_cast<std::string>(object->getName()) + ); } }; @@ -392,33 +394,30 @@ class fieldExp : public nameExp { return field; } - exp *evaluate(coenv &e, types::ty *) { - // Evaluate the object. - return new fieldExp(getPos(), - new tempExp(e, object, getObject(e)), - field); - } + exp *evaluate(coenv &e, types::ty *); }; -class arrayExp : public exp { +// Common functionality for subscriptExp and sliceExp. +class bracketsExp : public exp { protected: - exp *set; + exp *object; + types::ty *getObjectType(coenv &e); array *getArrayType(coenv &e); array *transArray(coenv &e); public: - arrayExp(position pos, exp *set) - : exp(pos), set(set) {} + bracketsExp(position pos, exp *set) + : exp(pos), object(set) {} }; -class subscriptExp : public arrayExp { +class subscriptExp : public bracketsExp { exp *index; public: subscriptExp(position pos, exp *set, exp *index) - : arrayExp(pos, set), index(index) {} + : bracketsExp(pos, set), index(index) {} void prettyprint(ostream &out, Int indent); @@ -426,11 +425,7 @@ class subscriptExp : public bracketsExp { types::ty *getType(coenv &e); void transWrite(coenv &e, types::ty *t, exp *value); - exp *evaluate(coenv &e, types::ty *) { - return new subscriptExp(getPos(), - new tempExp(e, set, getArrayType(e)), - new tempExp(e, index, types::primInt())); - } + exp *evaluate(coenv &e, types::ty *); }; class slice : public absyn { @@ -458,12 +453,12 @@ class slice : public absyn { } }; -class sliceExp : public arrayExp { +class sliceExp : public bracketsExp { slice *index; public: sliceExp(position pos, exp *set, slice *index) - : arrayExp(pos, set), index(index) {} + : bracketsExp(pos, set), index(index) {} void prettyprint(ostream &out, Int indent); @@ -473,7 +468,7 @@ class sliceExp : public bracketsExp { exp *evaluate(coenv &e, types::ty *) { return new sliceExp(getPos(), - new tempExp(e, set, getArrayType(e)), + new tempExp(e, object, getArrayType(e)), index->evaluate(e)); } }; @@ -826,8 +821,8 @@ class callExp : public exp { using colorInfo = std::tuple; /** - * @return nullopt if callExp is not a color, pair if color is RGB, - * and pair if color is RGBA. + * @return nullopt if callExp is not a color, pair if color is + * RGB, and pair if color is RGBA.
*/ optional<std::tuple<colorInfo, AsymptoteLsp::posInFile, AsymptoteLsp::posInFile>> getColorInformation(); diff --git a/fundec.cc b/fundec.cc index 99e8f436c..8b13cc4a1 100644 --- a/fundec.cc +++ b/fundec.cc @@ -31,17 +31,17 @@ void formal::prettyprint(ostream &out, Int indent) { if (defval) defval->prettyprint(out, indent+1); } -types::formal formal::trans(coenv &e, bool encodeDefVal, bool tacit) { +types::formal formal::trans(coenv &e, bool encodeDefVal, ErrorMode tacit) { return types::formal(getType(e,tacit), getName(), encodeDefVal ? (bool) getDefaultValue() : 0, getExplicit()); } -types::ty *formal::getType(coenv &e, bool tacit) { +types::ty *formal::getType(coenv &e, ErrorMode tacit) { types::ty *bt = base->trans(e, tacit); types::ty *t = start ? start->getType(bt, e, tacit) : bt; - if (t->kind == ty_void && !tacit) { + if (t->kind == ty_void && tacit != ErrorMode::SUPPRESS) { em.error(getPos()); em << "cannot declare parameters of type void"; return primError(); @@ -53,7 +53,7 @@ types::ty *formal::getType(coenv &e, bool tacit) { void formal::addOps(coenv &e, record *r) { base->addOps(e, r); if (start) - start->addOps(base->trans(e, true), e, r); + start->addOps(base->trans(e, ErrorMode::SUPPRESS), e, r); } void formals::prettyprint(ostream &out, Int indent) @@ -65,7 +65,7 @@ void formals::prettyprint(ostream &out, Int indent) } void formals::addToSignature(signature& sig, - coenv &e, bool encodeDefVal, bool tacit) + coenv &e, bool encodeDefVal, ErrorMode tacit) { for (list<formal *>::iterator p = fields.begin(); p != fields.end(); ++p) { formal& f=**p; @@ -78,7 +78,7 @@ void formals::addToSignature(signature& sig, } if (rest) { - if (!tacit && rest->getDefaultValue()) { + if (tacit!=ErrorMode::SUPPRESS && rest->getDefaultValue()) { em.error(rest->getPos()); em << "rest parameters cannot have default values"; } @@ -89,7 +89,7 @@ void formals::addToSignature(signature& sig, // Returns the types of each parameter as a signature. // encodeDefVal means that it will also encode information regarding // the default values into the signature -signature *formals::getSignature(coenv &e, bool encodeDefVal, bool tacit) +signature *formals::getSignature(coenv &e, bool encodeDefVal, ErrorMode tacit) { signature *sig = new signature; addToSignature(*sig,e,encodeDefVal,tacit); @@ -101,7 +101,7 @@ signature *formals::getSignature(coenv &e, bool encodeDefVal, bool tacit) // value of types::ty *result. function *formals::getType(types::ty *result, coenv &e, bool encodeDefVal, - bool tacit) + ErrorMode tacit) { function *ft = new function(result); addToSignature(ft->sig,e,encodeDefVal,tacit); @@ -177,7 +177,7 @@ void formal::transAsVar(coenv &e, Int index) { // Suppress error messages because they will already be reported // when the formals are translated to yield the type earlier.
- types::ty *t = getType(e, true); + types::ty *t = getType(e, ErrorMode::SUPPRESS); varEntry *v = new varEntry(t, a, 0, getPos()); // Translate the default argument before adding the formal to the @@ -212,12 +212,12 @@ void fundef::prettyprint(ostream &out, Int indent) body->prettyprint(out, indent+1); } -function *fundef::transType(coenv &e, bool tacit) { +function *fundef::transType(coenv &e, ErrorMode tacit) { bool encodeDefVal=true; return params->getType(result->trans(e, tacit), e, encodeDefVal, tacit); } -function *fundef::transTypeAndAddOps(coenv &e, record *r, bool tacit) { +function *fundef::transTypeAndAddOps(coenv &e, record *r, ErrorMode tacit) { result->addOps(e,r); params->addOps(e,r); @@ -284,7 +284,7 @@ types::ty *fundef::trans(coenv &e) { // new guide[] (guide f(int)) { // return sequence(f, 10); // }; - function *ft=transTypeAndAddOps(e, (record *)0, false); + function *ft=transTypeAndAddOps(e, (record *)0, ErrorMode::NORMAL); assert(ft); baseTrans(e, ft); @@ -307,7 +307,7 @@ void fundec::trans(coenv &e) void fundec::transAsField(coenv &e, record *r) { - function *ft = fun.transTypeAndAddOps(e, r, false); + function *ft = fun.transTypeAndAddOps(e, r, ErrorMode::NORMAL); assert(ft); createVar(getPos(), e, r, id, ft, fun.makeVarInit(ft)); diff --git a/fundec.h b/fundec.h index f90d2c3f2..3e499311b 100644 --- a/fundec.h +++ b/fundec.h @@ -30,13 +30,14 @@ class formal : public absyn { virtual void prettyprint(ostream &out, Int indent) override; // Build the corresponding types::formal to put into a signature. - types::formal trans(coenv &e, bool encodeDefVal, bool tacit=false); + types::formal + trans(coenv& e, bool encodeDefVal, ErrorMode tacit= ErrorMode::NORMAL); // Add the formal parameter to the environment to prepare for the // function body's translation. virtual void transAsVar(coenv &e, Int index); - types::ty *getType(coenv &e, bool tacit=false); + types::ty *getType(coenv &e, ErrorMode tacit=ErrorMode::NORMAL); absyntax::astType *getAbsyntaxType() { return base; } @@ -84,7 +85,7 @@ class formals : public absyn { bool keywordOnly; void addToSignature(types::signature& sig, - coenv &e, bool encodeDefVal, bool tacit); + coenv &e, bool encodeDefVal, ErrorMode tacit); public: formals(position pos) : absyn(pos), rest(0), keywordOnly(false) {} @@ -126,13 +127,13 @@ class formals : public absyn { // the default values into the signature types::signature *getSignature(coenv &e, bool encodeDefVal = false, - bool tacit = false); + ErrorMode tacit = ErrorMode::NORMAL); // Returns the corresponding function type, assuming it has a return // value of "result." 
types::function *getType(types::ty *result, coenv &e, bool encodeDefVal = false, - bool tacit = false); + ErrorMode tacit = ErrorMode::NORMAL); mem::vector *getFields(); @@ -167,10 +168,11 @@ class fundef : public exp { virtual void baseTrans(coenv &e, types::function *ft); virtual types::ty *trans(coenv &e) override; - virtual types::function *transType(coenv &e, bool tacit); - virtual types::function *transTypeAndAddOps(coenv &e, record *r, bool tacit); + virtual types::function *transType(coenv &e, ErrorMode tacit); + virtual types::function* + transTypeAndAddOps(coenv& e, record* r, ErrorMode tacit); virtual types::ty *getType(coenv &e) override { - return transType(e, true); + return transType(e, ErrorMode::SUPPRESS); } void createSymMap(AsymptoteLsp::SymbolContext* symContext) override; diff --git a/hashing.cc b/hashing.cc new file mode 100644 index 000000000..7e2f45b3f --- /dev/null +++ b/hashing.cc @@ -0,0 +1,64 @@ +#include "hashing.h" + +#include <iostream> // For Debugging ONLY +#include <algorithm> +#include <array> +#include <random> + +#include <highwayhash/highwayhash_target.h> +#include <highwayhash/instruction_sets.h> + +namespace hashing { +using namespace highwayhash; + +uint64_t constexpr shiftLeftDefined(uint64_t x, int8_t shift) { + return shift >= 64 ? 0 : x << shift; +} + +uint64_t random_bits(int8_t bits) { + static std::random_device *rd = new std::random_device(); + static auto *gen = new std::mt19937_64((*rd)()); + std::uniform_int_distribution<uint64_t> dist( + 0, shiftLeftDefined(1, bits) - 1); + return dist(*gen); +} + +uint64_t hashSpan(span<const char> s) { + HH_ALIGNAS(32) static const HHKey key = {random_bits(64), random_bits(64), + random_bits(64), random_bits(64)}; + HHResult64 result; + InstructionSets::Run<HighwayHash>(key, s.data(), s.size(), &result); + return result & (shiftLeftDefined(1, 62) - 1); +} + +uint64_t hashSpan(span<const uint64_t> s) { + span<const char> sChar = {reinterpret_cast<const char *>(s.data()), + s.size() * (sizeof(uint64_t) / sizeof(char))}; + return hashSpan(sChar); +}
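The split between the two key strategies matters: hashSpan and hashInt are keyed from std::random_device once per process, so their output changes between runs (fine for in-memory hash tables, wrong for anything persisted), while fingerprint below uses a fixed public key and is stable across runs. A hypothetical caller, assuming `span` is the (pointer, length) view type supplied via common.h:

```
#include <array>
#include <cstdint>
#include "hashing.h"

// Per-process hash: suitable as a hash-table hash, varies between runs.
uint64_t hashBytes(const char *data, size_t n) {
  return hashing::hashSpan(span<const char>{data, n});
}

// Stable 256-bit id: same input gives the same result on every run, but it
// is not secret, since the fingerprint key is public.
std::array<uint64_t, 4> stableId(const char *data, size_t n) {
  return hashing::fingerprint(span<const char>{data, n});
}
```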
+ +std::array<uint64_t, 4> fingerprint(span<const char> s) { + // The following key was generated using the Python `secrets` module. + // However, since the key is public, the resulting hash is not secure. + // (While HighwayHash makes cryptographic claims, those claims rely on + // the secrecy of the key.) + HH_ALIGNAS(32) static constexpr HHKey key= { + UINT64_C(0x6e1b31ab5e83c15a), + UINT64_C(0x6648d2208b67c4af), + UINT64_C(0xcddc6e8f557f7103), + UINT64_C(0x0729a6dd6e86d99a) + }; + HHResult256 result; + InstructionSets::Run<HighwayHash>(key, s.data(), s.size(), &result); + std::array<uint64_t, 4> fingerprint; + std::copy_n(result, 4, fingerprint.begin()); + return fingerprint; +} + +uint64_t hashInt(uint64_t i) { + span<const uint64_t> s = {&i, 1}; + return hashSpan(s); +} + + +} // namespace hashing diff --git a/hashing.h b/hashing.h new file mode 100644 index 000000000..724c9d392 --- /dev/null +++ b/hashing.h @@ -0,0 +1,13 @@ +#include <array> +#include <cstdint> + +#include "common.h" + +namespace hashing { + +uint64_t hashSpan(span<const char> s); +uint64_t hashSpan(span<const uint64_t> s); +uint64_t hashInt(uint64_t i); +std::array<uint64_t, 4> fingerprint(span<const char> s); + +} // namespace hashing \ No newline at end of file diff --git a/highwayhash/.gitignore b/highwayhash/.gitignore new file mode 100644 index 000000000..1da3cef1b --- /dev/null +++ b/highwayhash/.gitignore @@ -0,0 +1,13 @@ +bin +lib +obj +deps.mk +OWNERS +*.a +*.o +benchmark +nanobenchmark_example +profiler_example +sip_hash_test +vector_test +highwayhash_test diff --git a/highwayhash/.gitrepo b/highwayhash/.gitrepo new file mode 100644 index 000000000..58fee452f --- /dev/null +++ b/highwayhash/.gitrepo @@ -0,0 +1,12 @@ +; DO NOT EDIT (unless you know what you are doing) +; +; This subdirectory is a git "subrepo", and this file is maintained by the +; git-subrepo command. See https://github.com/ingydotnet/git-subrepo#readme +; +[subrepo] + remote = https://github.com/google/highwayhash.git + branch = master + commit = 5ad3bf8444cfc663b11bf367baaa31f36e7ff7c8 + parent = 4c385fa31d75be1faebf40a246d57a10b944c6fb + method = merge + cmdver = 0.4.6 diff --git a/highwayhash/.travis.yml b/highwayhash/.travis.yml new file mode 100644 index 000000000..e05097581 --- /dev/null +++ b/highwayhash/.travis.yml @@ -0,0 +1,10 @@ +language: cpp + +dist: trusty + +compiler: + - clang + - gcc + +script: + - make diff --git a/highwayhash/CMakeLists.txt b/highwayhash/CMakeLists.txt new file mode 100644 index 000000000..1af921225 --- /dev/null +++ b/highwayhash/CMakeLists.txt @@ -0,0 +1,251 @@ + +project(highwayhash C CXX) + +cmake_minimum_required(VERSION 3.18) + +# BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to make +# it prominent in the GUI. +option(BUILD_SHARED_LIBS "Build library as shared." OFF) + +# Force PIC on unix when building shared libs +# see: https://en.wikipedia.org/wiki/Position-independent_code +if(BUILD_SHARED_LIBS AND UNIX) + option(CMAKE_POSITION_INDEPENDENT_CODE "Build with Position Independent Code."
ON) +endif() + + +set(PROCESSOR_IS_ARM FALSE) +set(PROCESSOR_IS_AARCH64 FALSE) +set(PROCESSOR_IS_X86 FALSE) +set(PROCESSOR_IS_POWER FALSE) + +message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64)") + set(PROCESSOR_IS_AARCH64 TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm") + set(PROCESSOR_IS_ARM TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") + set(PROCESSOR_IS_X86 TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)") + set(PROCESSOR_IS_POWER TRUE) +endif() + +message(STATUS "Processor: ARM=${PROCESSOR_IS_ARM}, AARCH64=${PROCESSOR_IS_AARCH64}, X86=${PROCESSOR_IS_X86}, POWER=${PROCESSOR_IS_POWER}") + + +if(CMAKE_COMPILER_IS_GNUCXX OR CLANG) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -O3 -fPIC -pthread -Wno-maybe-uninitialized") + if(PROCESSOR_IS_ARM) + # aarch64 and ARM use the same code, although ARM usually needs an extra flag for NEON. + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard -march=armv7-a -mfpu=neon") + endif() +endif() + + +# +# library : highwayhash +# + +set(HH_INCLUDES + ${PROJECT_SOURCE_DIR}/highwayhash/c_bindings.h + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash.h +) + +set(HH_SOURCES + ${PROJECT_SOURCE_DIR}/highwayhash/c_bindings.cc + ${PROJECT_SOURCE_DIR}/highwayhash/hh_portable.cc + ${PROJECT_SOURCE_DIR}/highwayhash/arch_specific.cc + + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_target.cc + ${PROJECT_SOURCE_DIR}/highwayhash/instruction_sets.cc + + ${PROJECT_SOURCE_DIR}/highwayhash/scalar_sip_tree_hash.cc + ${PROJECT_SOURCE_DIR}/highwayhash/sip_hash.cc + ${PROJECT_SOURCE_DIR}/highwayhash/sip_tree_hash.cc + + ${PROJECT_SOURCE_DIR}/highwayhash/hh_portable.h + ${PROJECT_SOURCE_DIR}/highwayhash/state_helpers.h + + ${PROJECT_SOURCE_DIR}/highwayhash/arch_specific.h + ${PROJECT_SOURCE_DIR}/highwayhash/compiler_specific.h + ${PROJECT_SOURCE_DIR}/highwayhash/load3.h + ${PROJECT_SOURCE_DIR}/highwayhash/vector128.h + ${PROJECT_SOURCE_DIR}/highwayhash/vector256.h + ${PROJECT_SOURCE_DIR}/highwayhash/endianess.h + ${PROJECT_SOURCE_DIR}/highwayhash/iaca.h + ${PROJECT_SOURCE_DIR}/highwayhash/hh_types.h + ${PROJECT_SOURCE_DIR}/highwayhash/hh_buffer.h + + ${PROJECT_SOURCE_DIR}/highwayhash/scalar_sip_tree_hash.h + ${PROJECT_SOURCE_DIR}/highwayhash/sip_hash.h + ${PROJECT_SOURCE_DIR}/highwayhash/sip_tree_hash.h +) + +if(PROCESSOR_IS_ARM OR PROCESSOR_IS_AARCH64) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_neon.cc) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_neon.h) + +elseif(PROCESSOR_IS_POWER) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_vsx.cc) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_vsx.h) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/benchmark.cc + PROPERTIES COMPILE_FLAGS -mvsx) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/hh_vsx.cc + PROPERTIES COMPILE_FLAGS -mvsx) + +elseif(PROCESSOR_IS_X86) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_avx2.cc) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_sse41.cc) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_avx2.h) + list(APPEND HH_SOURCES ${PROJECT_SOURCE_DIR}/highwayhash/hh_sse41.h) + + # TODO: Portability: Have AVX2 be optional so benchmarking can be done on older machines. 
+ set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/benchmark.cc + PROPERTIES COMPILE_FLAGS -mavx2) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/sip_tree_hash.cc + PROPERTIES COMPILE_FLAGS -mavx2) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/hh_avx2.cc + PROPERTIES COMPILE_FLAGS -mavx2) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/hh_sse41.cc + PROPERTIES COMPILE_FLAGS -msse4.1) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/hh_portable.cc + PROPERTIES COMPILE_FLAGS -DHH_TARGET_NAME=Portable) + +else() + # Unknown architecture. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHH_DISABLE_TARGET_SPECIFIC") +endif() + + +add_library(highwayhash ${HH_INCLUDES} ${HH_SOURCES}) +set_target_properties(highwayhash PROPERTIES PUBLIC_HEADER "${HH_INCLUDES}") + +target_include_directories(highwayhash + PUBLIC $ +) +target_include_directories(highwayhash + PUBLIC $ +) + +if(NOT WIN32 AND NOT ANDROID) + target_link_libraries(highwayhash pthread) +endif() + +add_library(highwayhash::highwayhash ALIAS highwayhash) + + +# +# Tests & Similar +# + +add_library(nanobenchmark OBJECT + ${PROJECT_SOURCE_DIR}/highwayhash/nanobenchmark.h + ${PROJECT_SOURCE_DIR}/highwayhash/nanobenchmark.cc + + ${PROJECT_SOURCE_DIR}/highwayhash/instruction_sets.h + ${PROJECT_SOURCE_DIR}/highwayhash/os_specific.h + ${PROJECT_SOURCE_DIR}/highwayhash/profiler.h + ${PROJECT_SOURCE_DIR}/highwayhash/tsc_timer.h + + ${PROJECT_SOURCE_DIR}/highwayhash/instruction_sets.cc + ${PROJECT_SOURCE_DIR}/highwayhash/os_specific.cc +) +target_include_directories(nanobenchmark PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + + +add_executable(highwayhash_test) +target_sources(highwayhash_test PRIVATE + + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test.cc + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_portable.cc + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_target.h +) +target_link_libraries(highwayhash_test highwayhash nanobenchmark) + + +add_executable(vector_test) +target_sources(vector_test PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test.cc + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_portable.cc + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_target.h +) +target_link_libraries(vector_test highwayhash nanobenchmark) + + +if(PROCESSOR_IS_ARM OR PROCESSOR_IS_AARCH64) + target_sources(highwayhash_test PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_neon.cc + ) + target_sources(vector_test PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_neon.cc + ) + +elseif(PROCESSOR_IS_X86) + target_sources(highwayhash_test PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_avx2.cc + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_sse41.cc + ) + target_sources(vector_test PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_avx2.cc + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_sse41.cc + ) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_avx2.cc + PROPERTIES COMPILE_FLAGS -mavx2) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_sse41.cc + PROPERTIES COMPILE_FLAGS -msse4.1) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_avx2.cc + PROPERTIES COMPILE_FLAGS -mavx2) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test_sse41.cc + PROPERTIES COMPILE_FLAGS -msse4.1) + +elseif(PROCESSOR_IS_POWER) + target_sources(highwayhash_test PRIVATE + 
${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_vsx.cc + ) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/highwayhash_test_vsx.cc + PROPERTIES COMPILE_FLAGS -mvsx) + + set_source_files_properties( + ${PROJECT_SOURCE_DIR}/highwayhash/vector_test.cc + PROPERTIES COMPILE_FLAGS -DHH_DISABLE_TARGET_SPECIFIC) + +endif() + + +add_executable(sip_hash_test) +target_sources(sip_hash_test PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/sip_hash_test.cc +) +target_link_libraries(sip_hash_test highwayhash) + + +add_executable(example) +target_sources(example PRIVATE + ${PROJECT_SOURCE_DIR}/highwayhash/example.cc + ) +target_link_libraries(example highwayhash) + diff --git a/highwayhash/CONTRIBUTING b/highwayhash/CONTRIBUTING new file mode 100644 index 000000000..bd6072591 --- /dev/null +++ b/highwayhash/CONTRIBUTING @@ -0,0 +1,27 @@ +Want to contribute? Great! First, read this page (including the small print at the end). + +### Before you contribute +Before we can use your code, you must sign the +[Google Individual Contributor License Agreement] +(https://cla.developers.google.com/about/google-individual) +(CLA), which you can do online. The CLA is necessary mainly because you own the +copyright to your changes, even after your contribution becomes part of our +codebase, so we need your permission to use and distribute your code. We also +need to be sure of various other things-for instance that you'll tell us if you +know that your code infringes on other people's patents. You don't have to sign +the CLA until after you've submitted your code for review and a member has +approved it, but you must do it before we can put your code into our codebase. +Before you start working on a larger contribution, you should get in touch with +us first through the issue tracker with your idea so that we can help out and +possibly guide you. Coordinating up front makes it much easier to avoid +frustration later on. + +### Code reviews +All submissions, including submissions by project members, require review. We +use Github pull requests for this purpose. + +### The small print +Contributions made by corporations are covered by a different agreement than +the one above, the +[Software Grant and Corporate Contributor License Agreement] +(https://cla.developers.google.com/about/google-corporate). diff --git a/highwayhash/LICENSE b/highwayhash/LICENSE new file mode 100644 index 000000000..6b0b1270f --- /dev/null +++ b/highwayhash/LICENSE @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/highwayhash/Makefile b/highwayhash/Makefile new file mode 100644 index 000000000..a312bc263 --- /dev/null +++ b/highwayhash/Makefile @@ -0,0 +1,140 @@ +# We assume X64 unless HH_POWER, HH_ARM, or HH_AARCH64 are defined. + +override CPPFLAGS += -I. 
+override CXXFLAGS += -std=c++11 -Wall -O3 -fPIC -pthread +override LDFLAGS += -pthread + +PREFIX ?= /usr/local +INCDIR ?= $(PREFIX)/include +LIBDIR ?= $(PREFIX)/lib + +SIP_OBJS := $(addprefix obj/, \ + sip_hash.o \ + sip_tree_hash.o \ + scalar_sip_tree_hash.o \ +) + +DISPATCHER_OBJS := $(addprefix obj/, \ + arch_specific.o \ + instruction_sets.o \ + nanobenchmark.o \ + os_specific.o \ +) + +HIGHWAYHASH_OBJS := $(DISPATCHER_OBJS) obj/hh_portable.o +HIGHWAYHASH_TEST_OBJS := $(DISPATCHER_OBJS) obj/highwayhash_test_portable.o +VECTOR_TEST_OBJS := $(DISPATCHER_OBJS) obj/vector_test_portable.o + +# aarch64 and ARM use the same code, although ARM usually needs an extra flag for NEON. +ifdef HH_ARM +CXXFLAGS += -mfloat-abi=hard -march=armv7-a -mfpu=neon +HH_AARCH64 = 1 +endif + +ifdef HH_AARCH64 +HH_X64 = +HIGHWAYHASH_OBJS += obj/hh_neon.o +HIGHWAYHASH_TEST_OBJS += obj/highwayhash_test_neon.o +VECTOR_TEST_OBJS += obj/vector_test_neon.o +else +ifdef HH_POWER +HH_X64 = +HIGHWAYHASH_OBJS += obj/hh_vsx.o +HIGHWAYHASH_TEST_OBJS += obj/highwayhash_test_vsx.o +else +HH_X64 = 1 +HIGHWAYHASH_OBJS += obj/hh_avx2.o obj/hh_sse41.o +HIGHWAYHASH_TEST_OBJS += obj/highwayhash_test_avx2.o obj/highwayhash_test_sse41.o +VECTOR_TEST_OBJS += obj/vector_test_avx2.o obj/vector_test_sse41.o +endif +endif + +# In case highwayhash_test defines PRINT_RESULTS. +HIGHWAYHASH_TEST_OBJS += $(HIGHWAYHASH_OBJS) + +all: $(addprefix bin/, \ + profiler_example nanobenchmark_example vector_test sip_hash_test \ + highwayhash_test benchmark) lib/libhighwayhash.a + +obj/%.o: highwayhash/%.cc + @mkdir -p -- $(dir $@) + $(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@ + +bin/%: obj/%.o + @mkdir -p -- $(dir $@) + $(CXX) $(LDFLAGS) $^ -o $@ + +.DELETE_ON_ERROR: +deps.mk: $(wildcard highwayhash/*.cc) $(wildcard highwayhash/*.h) Makefile + set -eu; for file in highwayhash/*.cc; do \ + target=obj/$${file##*/}; target=$${target%.*}.o; \ + [ "$$target" = "obj/highwayhash_target.o" ] || \ + [ "$$target" = "obj/data_parallel_benchmark.o" ] || \ + [ "$$target" = "obj/data_parallel_test.o" ] || \ + $(CXX) -c $(CPPFLAGS) $(CXXFLAGS) -DHH_DISABLE_TARGET_SPECIFIC -MM -MT \ + "$$target" "$$file"; \ + done | sed -e ':b' -e 's-../[^./]*/--' -e 'tb' >$@ +-include deps.mk + +bin/profiler_example: $(DISPATCHER_OBJS) + +bin/nanobenchmark_example: $(DISPATCHER_OBJS) obj/nanobenchmark.o + +ifdef HH_X64 +# TODO: Portability: Have AVX2 be optional so benchmarking can be done on older machines. 
+obj/sip_tree_hash.o: CXXFLAGS+=-mavx2 +# (Compiled from same source file with different compiler flags) +obj/highwayhash_test_avx2.o: CXXFLAGS+=-mavx2 +obj/highwayhash_test_sse41.o: CXXFLAGS+=-msse4.1 +obj/hh_avx2.o: CXXFLAGS+=-mavx2 +obj/hh_sse41.o: CXXFLAGS+=-msse4.1 +obj/vector_test_avx2.o: CXXFLAGS+=-mavx2 +obj/vector_test_sse41.o: CXXFLAGS+=-msse4.1 + +obj/benchmark.o: CXXFLAGS+=-mavx2 +endif + +ifdef HH_POWER +obj/highwayhash_test_vsx.o: CXXFLAGS+=-mvsx +obj/hh_vsx.o: CXXFLAGS+=-mvsx +obj/benchmark.o: CXXFLAGS+=-mvsx +# Skip file - vector library/test not supported on PPC +obj/vector_test_target.o: CXXFLAGS+=-DHH_DISABLE_TARGET_SPECIFIC +obj/vector_test.o: CXXFLAGS+=-DHH_DISABLE_TARGET_SPECIFIC +endif + +lib/libhighwayhash.a: $(SIP_OBJS) $(HIGHWAYHASH_OBJS) obj/c_bindings.o + @mkdir -p -- $(dir $@) + $(AR) rcs $@ $^ + +lib/libhighwayhash.so: $(SIP_OBJS) $(HIGHWAYHASH_OBJS) obj/c_bindings.o + @mkdir -p -- $(dir $@) + $(CXX) $(CXXFLAGS) $(LDFLAGS) -shared $^ -o $@.0 -Wl,-soname,libhighwayhash.so.0 + @cd $(dir $@); ln -s libhighwayhash.so.0 libhighwayhash.so + +bin/highwayhash_test: $(HIGHWAYHASH_TEST_OBJS) + +bin/benchmark: obj/benchmark.o $(HIGHWAYHASH_TEST_OBJS) +bin/benchmark: $(SIP_OBJS) $(HIGHWAYHASH_OBJS) +bin/vector_test: $(VECTOR_TEST_OBJS) + +clean: + [ ! -d obj ] || $(RM) -r -- obj/ + +distclean: clean + [ ! -d bin ] || $(RM) -r -- bin/ + [ ! -d lib ] || $(RM) -r -- lib/ + +# Mode bits are from issue #58, thanks to yurivict for suggesting. +# Also added owner-write for stripping the .so in post-install. +install: lib/libhighwayhash.a lib/libhighwayhash.so + mkdir -p $(DESTDIR)/$(LIBDIR) + mkdir -p $(DESTDIR)/$(INCDIR)/highwayhash + install -m0444 lib/libhighwayhash.a $(DESTDIR)/$(LIBDIR) + install -m0755 lib/libhighwayhash.so $(DESTDIR)/$(LIBDIR) + install -m0444 highwayhash/*.h $(DESTDIR)/$(INCDIR)/highwayhash/ + +post-install: + ${STRIP_CMD} $(DESTDIR)/$(LIBDIR)/libhighwayhash.so + +.PHONY: clean distclean all install post-install diff --git a/highwayhash/README.md b/highwayhash/README.md new file mode 100644 index 000000000..d59f7ab6d --- /dev/null +++ b/highwayhash/README.md @@ -0,0 +1,404 @@ +Strong (well-distributed and unpredictable) hashes: + +* Portable implementation of + [SipHash](https://www.131002.net/siphash/siphash.pdf) +* HighwayHash, a 5x faster SIMD hash with [security + claims](https://arxiv.org/abs/1612.06257) + +## Quick Start + +To build on a Linux or Mac platform, simply run `make`. For Windows, we provide +a Visual Studio 2015 project in the `msvc` subdirectory. + +Run `benchmark` for speed measurements. `sip_hash_test` and `highwayhash_test` +ensure the implementations return known-good values for a given set of inputs. 
+ +64-bit SipHash for any CPU: + +``` + #include "highwayhash/sip_hash.h" + using namespace highwayhash; + HH_ALIGNAS(16) const HH_U64 key2[2] = {1234, 5678}; + char in[8] = {1}; + return SipHash(key2, in, 8); +``` + +64, 128 or 256 bit HighwayHash for the CPU determined by compiler flags: + +``` + #include "highwayhash/highwayhash.h" + using namespace highwayhash; + HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4}; + char in[8] = {1}; + HHResult64 result; // or HHResult128 or HHResult256 + HHStateT<HH_TARGET> state(key); + HighwayHashT(&state, in, 8, &result); +``` + +64, 128 or 256 bit HighwayHash for the CPU on which we're currently running: + +``` + #include "highwayhash/highwayhash_target.h" + #include "highwayhash/instruction_sets.h" + using namespace highwayhash; + HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4}; + char in[8] = {1}; + HHResult64 result; // or HHResult128 or HHResult256 + InstructionSets::Run<HighwayHash>(key, in, 8, &result); +``` + +C-callable 64-bit HighwayHash for the CPU on which we're currently running: + + #include "highwayhash/c_bindings.h" + const uint64_t key[4] = {1, 2, 3, 4}; + char in[8] = {1}; + return HighwayHash64(key, in, 8); + +Printing a 256-bit result in a hexadecimal format similar to sha1sum: + + HHResult256 result; + printf("%016"PRIx64"%016"PRIx64"%016"PRIx64"%016"PRIx64"\n", + result[3], result[2], result[1], result[0]); + +## Introduction + +Hash functions are widely used, so it is desirable to increase their speed and +security. This package provides two 'strong' (well-distributed and +unpredictable) hash functions: a faster version of SipHash, and an even faster +algorithm we call HighwayHash. + +SipHash is a fast but 'cryptographically strong' pseudo-random function by +Aumasson and Bernstein [https://www.131002.net/siphash/siphash.pdf]. + +HighwayHash is a new way of mixing inputs which may inspire new +cryptographically strong hashes. Large inputs are processed at a rate of 0.24 +cycles per byte, and latency remains low even for small inputs. HighwayHash is +faster than SipHash for all input sizes, with 5 times higher throughput at 1 +KiB. We discuss design choices and provide statistical analysis and preliminary +cryptanalysis in https://arxiv.org/abs/1612.06257. + +## Applications + +Unlike prior strong hashes, these functions are fast enough to be recommended +as safer replacements for weak hashes in many applications. The additional CPU +cost appears affordable, based on profiling data indicating C++ hash functions +account for less than 0.25% of CPU usage. + +Hash-based selection of random subsets is useful for A/B experiments and similar +applications. Such random generators are idempotent (repeatable and +deterministic), which is helpful for parallel algorithms and testing. To avoid +bias, it is important that the hash function be unpredictable and +indistinguishable from a uniform random generator. We have verified the bit +distribution and avalanche properties of SipHash and HighwayHash. + +64-bit hashes are also useful for authenticating short-lived messages such as +network/RPC packets. This requires that the hash function withstand +differential, length extension and other attacks. We have published a formal +security analysis for HighwayHash. New cryptanalysis tools may still need to be +developed for further analysis. + +Strong hashes are also important parts of methods for protecting hash tables +against unacceptable worst-case behavior and denial of service attacks +(see "hash flooding" below).
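A sketch of the 'random subset' use mentioned above (hypothetical helper; a real deployment would keep the key secret, and 2^64 % 100 != 0 introduces only negligible bias):

```
#include <cstdint>
#include "highwayhash/sip_hash.h"
using namespace highwayhash;

// Deterministically place ~10% of ids into an experiment arm.
bool InExperiment(uint64_t id) {
  HH_ALIGNAS(16) static const HH_U64 key2[2] = {1234, 5678};
  const uint64_t h =
      SipHash(key2, reinterpret_cast<const char*>(&id), sizeof(id));
  return h % 100 < 10;  // idempotent: same id, same answer
}
```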
+ +128 and 256-bit hashes can be useful for verifying data integrity (checksums). + +## SipHash + +Our SipHash implementation is a fast and portable drop-in replacement for +the reference C code. Outputs are identical for the given test cases (messages +between 0 and 63 bytes). + +Interestingly, it is about twice as fast as a SIMD implementation using SSE4.1 +(https://goo.gl/80GBSD). This is presumably due to the lack of SIMD bit rotate +instructions prior to AVX-512. + +SipHash13 is a faster but weaker variant with one mixing round per update and +three during finalization. + +We also provide a data-parallel 'tree hash' variant that enables efficient SIMD +while retaining safety guarantees. This is about twice as fast as SipHash, but +does not return the same results. + +## HighwayHash + +We have devised a new way of mixing inputs with SIMD multiply and permute +instructions. The multiplications are 32x32 -> 64 bits and therefore infeasible +to reverse. Permuting equalizes the distribution of the resulting bytes. + +The internal state is quite large (1024 bits) but fits within SIMD registers. +Due to limitations of the AVX2 instruction set, the registers are partitioned +into two 512-bit halves that remain independent until the reduce phase. The +algorithm outputs 64 bit digests or up to 256 bits at no extra cost. + +In addition to high throughput, the algorithm is designed for low finalization +cost. The result is more than twice as fast as SipTreeHash. + +We also provide an SSE4.1 version (80% as fast for large inputs and 95% as fast +for short inputs), an implementation for VSX on POWER and a portable version +(10% as fast). A third-party ARM implementation is referenced below. + +Statistical analyses and preliminary cryptanalysis are given in +https://arxiv.org/abs/1612.06257. + +## Versioning and stability + +Now that 21 months have elapsed since their initial release, we have declared +all (64/128/256 bit) variants of HighwayHash frozen, i.e. unchanging forever. + +SipHash and HighwayHash are 'fingerprint functions' whose input -> hash +mapping will not change. This is important for applications that write hashes to +persistent storage. + +## Speed measurements + +To measure the CPU cost of a hash function, we can either create an artificial +'microbenchmark' (easier to control, but probably not representative of the +actual runtime), or insert instrumentation directly into an application (risks +influencing the results through observer overhead). We provide novel variants of +both approaches that mitigate their respective disadvantages. + +profiler.h uses software write-combining to stream program traces to memory +with minimal overhead. These can be analyzed offline, or when memory is full, +to learn how much time was spent in each (possibly nested) zone. + +nanobenchmark.h enables cycle-accurate measurements of very short functions. +It uses CPU fences and robust statistics to minimize variability, and also +avoids unrealistic branch prediction effects. + +We compile the 64-bit C++ implementations with a patched GCC 4.9 and run on a +single idle core of a Xeon E5-2690 v3 clocked at 2.6 GHz. 
CPU cost is measured +as cycles per byte for various input sizes: + +Algorithm | 8 | 31 | 32 | 63 | 64 | 1024 +---------------- | ----- | ---- | ---- | ---- | ---- | ---- +HighwayHashAVX2 | 7.34 | 1.81 | 1.71 | 1.04 | 0.95 | 0.24 +HighwayHashSSE41 | 8.00 | 2.11 | 1.75 | 1.13 | 0.96 | 0.30 +SipTreeHash | 16.51 | 4.57 | 4.09 | 2.22 | 2.29 | 0.57 +SipTreeHash13 | 12.33 | 3.47 | 3.06 | 1.68 | 1.63 | 0.33 +SipHash | 8.13 | 2.58 | 2.73 | 1.87 | 1.93 | 1.26 +SipHash13 | 6.96 | 2.09 | 2.12 | 1.32 | 1.33 | 0.68 + +SipTreeHash is slower than SipHash for small inputs because it processes blocks +of 32 bytes. AVX2 and SSE4.1 HighwayHash are faster than SipHash for all input +sizes due to their highly optimized handling of partial vectors. + +Note that previous measurements included the initialization of their input, +which dramatically increased timings especially for small inputs. + +## CPU requirements + +SipTreeHash(13) requires an AVX2-capable CPU (e.g. Haswell). HighwayHash +includes a dispatcher that chooses the implementation (AVX2, SSE4.1, VSX or +portable) at runtime, as well as a directly callable function template that can +only run on the CPU for which it was built. SipHash(13) and +ScalarSipTreeHash(13) have no particular CPU requirements. + +### AVX2 vs SSE4 + +When both AVX2 and SSE4 are available, the decision whether to use AVX2 is +non-obvious. AVX2 vectors are twice as wide, but require a higher power license +(integer multiplications count as 'heavy' instructions) and can thus reduce the +clock frequency of the core or entire socket(!) on Haswell systems. This +partially explains the observed 1.25x (not 2x) speedup over SSE4. Moreover, it +is inadvisable to only sporadically use AVX2 instructions because there is also +a ~56K cycle warmup period during which AVX2 operations are slower, and Haswell +can even stall during this period. Thus, we recommend avoiding AVX2 for +infrequent hashing if the rest of the application is also not using AVX2. For +any input larger than 1 MiB, it is probably worthwhile to enable AVX2. + +### SIMD implementations + +Our x86 implementations use custom vector classes with overloaded operators +(e.g. `const V4x64U a = b + c`) for type-safety and improved readability vs. +compiler intrinsics (e.g. `const __m256i a = _mm256_add_epi64(b, c)`). +The VSX implementation uses built-in vector types alongside Altivec intrinsics. +A high-performance third-party ARM implementation is mentioned below. + +### Dispatch + +Our instruction_sets dispatcher avoids running newer instructions on older CPUs +that do not support them. However, intrinsics, and therefore also any vector +classes that use them, require (on GCC < 4.9 or Clang < 3.9) a compiler flag +that also allows the compiler to generate code for that CPU. This means the +intrinsics must be placed in separate translation units that are compiled with +the required flags. It is important that these source files and their headers +not define any inline functions, because that might break the one definition +rule and cause crashes. + +To minimize dispatch overhead when hashes are computed often (e.g. in a loop), +we can inline the hash function into its caller using templates. The dispatch +overhead will only be paid once (e.g. before the loop). The template mechanism +also avoids duplicating code in each CPU-specific implementation. + +## Defending against hash flooding + +To mitigate hash flooding attacks, we need to take both the hash function and +the data structure into account. 
+ +We wish to defend (web) services that utilize hash sets/maps against +denial-of-service attacks. Such data structures assign attacker-controlled +input messages `m` to a hash table bin `b` by computing the hash `H(s, m)` +using a hash function `H` seeded by `s`, and mapping it to a bin with some +narrowing function `b = R(h)`, discussed below. + +Attackers may attempt to trigger 'flooding' (excessive work in insertions or +lookups) by finding multiple `m` that map to the same bin. If the attacker has +local access, they can do far worse, so we assume the attacker can only issue +remote requests. If the attacker is able to send large numbers of requests, +they can already deny service, so we need only ensure the attacker's cost is +sufficiently large compared to the service's provisioning. + +If the hash function is 'weak', attackers can easily generate 'hash collisions' +(inputs mapping to the same hash values) that are independent of the seed. In +other words, certain input messages will cause collisions regardless of the seed +value. The author of SipHash has published C++ programs to generate such +'universal (key-independent) multicollisions' for CityHash and Murmur. Similar +'differential' attacks are likely possible for any hash function consisting only +of reversible operations (e.g. addition/multiplication/rotation) with a constant +operand. `n` requests with such inputs cause `n^2` work for an unprotected hash +table, which is unacceptable. + +By contrast, 'strong' hashes such as SipHash or HighwayHash require infeasible +attacker effort to find a hash collision (an expected 2^32 guesses of `m` per +the birthday paradox) or recover the seed (2^63 requests). These security claims +assume the seed is secret. It is reasonable to suppose `s` is initially unknown +to attackers, e.g. generated on startup or even per-connection. A timing attack +by Wool/Bar-Yosef recovers 13-bit seeds by testing all 8K possibilities using +millions of requests, which takes several days (even assuming unrealistic 150 us +round-trip times). It appears infeasible to recover 64-bit seeds in this way. + +However, attackers are only looking for multiple `m` mapping to the same bin +rather than identical hash values. We assume they know or are able to discover +the hash table size `p`. It is common to choose `p = 2^i` to enable an efficient +`R(h) := h & (p - 1)`, which simply retains the lower hash bits. It may be +easier for attackers to compute partial collisions where only the lower `i` bits +match. This can be prevented by choosing a prime `p` so that `R(h) := h % p` +incorporates all hash bits. The costly modulo operation can be avoided by +multiplying with the inverse (https://goo.gl/l7ASm8). An interesting alternative +suggested by Kyoung Jae Seo chooses a random subset of the `h` bits. Such an `R` +function can be computed in just 3 cycles using PEXT from the BMI2 instruction +set. This is expected to defend against SAT-solver attacks on the hash bits at a +slightly lower cost than the multiplicative inverse method, and still allows +power-of-two table sizes. + +Summary thus far: given a strong hash function and secret seed, it appears +infeasible for attackers to generate hash collisions because `s` and/or `R` are +unknown. However, they can still observe the timings of data structure +operations for various `m`. With typical table sizes of 2^10 to 2^17 entries, +attackers can detect some 'bin collisions' (inputs mapping to the same bin). 
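+
+For reference, the three narrowing functions `R(h)` discussed above might look
+as follows; this is our sketch, the `Narrow*` names are not part of the
+library, and the PEXT variant assumes a BMI2-capable x86 CPU:
+
+```
+#include <cstdint>
+#include <immintrin.h>  // _pext_u64; requires BMI2 (compile with -mbmi2)
+
+// Power-of-two table size p: keeps only the lower i bits of the hash.
+uint64_t NarrowMask(uint64_t h, uint64_t p) { return h & (p - 1); }
+
+// Prime table size p: every hash bit influences the bin index.
+uint64_t NarrowMod(uint64_t h, uint64_t p) { return h % p; }
+
+// Random bit subset: "subset" has i randomly chosen bits set; PEXT gathers
+// the corresponding hash bits into the low i bits of the result.
+uint64_t NarrowPext(uint64_t h, uint64_t subset) { return _pext_u64(h, subset); }
+```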
+
+Although detecting such bin collisions is costly for the attacker, they can
+then send many instances of such inputs, so we need to limit the resulting
+work for our data structure.
+
+Hash tables with separate chaining typically store bin entries in a linked
+list, so worst-case inputs lead to unacceptable linear-time lookup cost. We
+instead seek optimal asymptotic worst-case complexity for each operation
+(insertion, deletion and lookups), which is a constant factor times the
+logarithm of the data structure size. This naturally leads to a tree-like data
+structure for each bin. The Java 8 HashMap replaces a bin's linked list with a
+tree only when needed, which adds cost and complexity for deciding whether a
+bin is currently a list or a tree.
+
+Our first proposal (suggested by GitHub user funny-falcon) avoids this
+overhead by always storing one tree per bin. It may also be worthwhile to
+store the first entry directly in the bin, which avoids allocating any tree
+nodes in the common case where bins are sparsely populated. What kind of tree
+should be used?
+
+Given that SipHash and HighwayHash provide high-quality randomness, a simple
+non-balancing binary search tree could perform reasonably well, depending on
+the expected attack surface.
+[Wikipedia says](https://en.wikipedia.org/wiki/Binary_search_tree#Definition)
+> After a long intermixed sequence of random insertion and deletion, the
+> expected height of the tree approaches square root of the number of keys,
+> √n, which grows much faster than log n.
+
+While `O(√n)` is much larger than `O(log n)`, it is still far smaller than
+`O(n)`. It also complicates the timing attack, because the cost of operations
+on a colliding bin grows much more slowly.
+
+If stronger safety guarantees are needed, then a balanced tree should be used.
+Scapegoat and splay trees only offer amortized complexity guarantees, whereas
+treaps require an entropy source and have higher constant factors in practice.
+Self-balancing structures such as 2-3 or red-black trees require additional
+bookkeeping information. We can hope to reduce rebalancing cost by realizing
+that the output bits of strong `H` functions are uniformly distributed. When
+using them as keys instead of the original message `m`, recent relaxed
+balancing schemes such as left-leaning red-black or weak AVL trees may require
+fewer tree rotations to maintain their invariants. Note that `H` already
+determines the bin, so we should only use the remaining bits. 64-bit hashes
+are likely sufficient for this purpose, and HighwayHash generates up to 256
+bits. It seems unlikely that attackers can craft inputs resulting in worst
+cases for both the bin index and tree key without being able to generate hash
+collisions, which would contradict the security claims of strong hashes. Even
+if they succeed, the relaxed tree balancing still guarantees an upper bound on
+height and therefore the worst-case operation cost. For the AVL variant, the
+constant factors are slightly lower than for red-black trees.
+
+The second proposed approach uses augmented/de-amortized cuckoo hash tables
+(https://goo.gl/PFwwkx). These guarantee worst-case `log n` bounds for all
+operations, but only if the hash function is 'indistinguishable from random'
+(uniformly distributed regardless of the input distribution), which is claimed
+for SipHash and HighwayHash but certainly not for weak hashes.
+
+Both alternatives retain good average case performance and defend against
+flooding by limiting the amount of extra work an attacker can cause.
The first +approach guarantees an upper bound of `log n` additional work even if the hash +function is compromised. + +In summary, a strong hash function is not, by itself, sufficient to protect a +chained hash table from flooding attacks. However, strong hash functions are +important parts of two schemes for preventing denial of service. Using weak hash +functions can slightly accelerate the best-case and average-case performance of +a service, but at the risk of greatly reduced attack costs and worst-case +performance. + +## Third-party implementations / bindings + +Thanks to Damian Gryski and Frank Wessels for making us aware of these +third-party implementations or bindings. Please feel free to get in touch or +raise an issue and we'll add yours as well. + +By | Language | URL +--- | --- | --- +Damian Gryski | Go and x64 assembly | https://github.com/dgryski/go-highway/ +Simon Abdullah | NPM package | https://www.npmjs.com/package/highwayhash-nodejs +Lovell Fuller | node.js bindings | https://github.com/lovell/highwayhash +Andreas Sonnleitner | [WebAssembly](https://github.com/asonnleitner/highwayhash-wasm) and NPM package | https://www.npmjs.com/package/highwayhash-wasm +Nick Babcock | Rust port | https://github.com/nickbabcock/highway-rs +Caleb Zulawski | Rust portable SIMD | https://github.com/calebzulawski/autobahn-hash +Vinzent Steinberg | Rust bindings | https://github.com/vks/highwayhash-rs +Frank Wessels & Andreas Auernhammer | Go and ARM assembly | https://github.com/minio/highwayhash +Phil Demetriou | Python 3 bindings | https://github.com/kpdemetriou/highwayhash-cffi +Jonathan Beard | C++20 constexpr | https://gist.github.com/jonathan-beard/632017faa1d9d1936eb5948ac9186657 +James Cook | Ruby bindings | https://github.com/jamescook/highwayhash + +## Modules + +### Hashes + +* c_bindings.h declares C-callable versions of SipHash/HighwayHash. +* sip_hash.cc is the compatible implementation of SipHash, and also provides + the final reduction for sip_tree_hash. +* sip_tree_hash.cc is the faster but incompatible SIMD j-lanes tree hash. +* scalar_sip_tree_hash.cc is a non-SIMD version. +* state_helpers.h simplifies the implementation of the SipHash variants. +* highwayhash.h is our new, fast hash function. +* hh_{avx2,sse41,vsx,portable}.h are its various implementations. +* highwayhash_target.h chooses the best available implementation at runtime. + +### Infrastructure + +* arch_specific.h offers byte swapping and CPUID detection. +* compiler_specific.h defines some compiler-dependent language extensions. +* data_parallel.h provides a C++11 ThreadPool and PerThread (similar to + OpenMP). +* instruction_sets.h and targets.h enable efficient CPU-specific dispatching. +* nanobenchmark.h measures elapsed times with < 1 cycle variability. +* os_specific.h sets thread affinity and priority for benchmarking. +* profiler.h is a low-overhead, deterministic hierarchical profiler. +* tsc_timer.h obtains high-resolution timestamps without CPU reordering. +* vector256.h and vector128.h contain wrapper classes for AVX2 and SSE4.1. + +By Jan Wassenberg and Jyrki Alakuijala +, updated 2023-03-29 + +This is not an official Google product. diff --git a/highwayhash/c/highwayhash.c b/highwayhash/c/highwayhash.c new file mode 100644 index 000000000..bf4863ecd --- /dev/null +++ b/highwayhash/c/highwayhash.c @@ -0,0 +1,261 @@ +#include "c/highwayhash.h" + +#include +#include +#include + +/* +This code is compatible with C90 with the additional requirement of +supporting uint64_t. 
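+Byte order does not matter: Read64 below assembles each 64-bit lane from
+individual bytes in little-endian order, so results are identical on
+big-endian hosts.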
+*/ + +/*////////////////////////////////////////////////////////////////////////////*/ +/* Internal implementation */ +/*////////////////////////////////////////////////////////////////////////////*/ + +void HighwayHashReset(const uint64_t key[4], HighwayHashState* state) { + state->mul0[0] = 0xdbe6d5d5fe4cce2full; + state->mul0[1] = 0xa4093822299f31d0ull; + state->mul0[2] = 0x13198a2e03707344ull; + state->mul0[3] = 0x243f6a8885a308d3ull; + state->mul1[0] = 0x3bd39e10cb0ef593ull; + state->mul1[1] = 0xc0acf169b5f18a8cull; + state->mul1[2] = 0xbe5466cf34e90c6cull; + state->mul1[3] = 0x452821e638d01377ull; + state->v0[0] = state->mul0[0] ^ key[0]; + state->v0[1] = state->mul0[1] ^ key[1]; + state->v0[2] = state->mul0[2] ^ key[2]; + state->v0[3] = state->mul0[3] ^ key[3]; + state->v1[0] = state->mul1[0] ^ ((key[0] >> 32) | (key[0] << 32)); + state->v1[1] = state->mul1[1] ^ ((key[1] >> 32) | (key[1] << 32)); + state->v1[2] = state->mul1[2] ^ ((key[2] >> 32) | (key[2] << 32)); + state->v1[3] = state->mul1[3] ^ ((key[3] >> 32) | (key[3] << 32)); +} + +static void ZipperMergeAndAdd(const uint64_t v1, const uint64_t v0, + uint64_t* add1, uint64_t* add0) { + *add0 += (((v0 & 0xff000000ull) | (v1 & 0xff00000000ull)) >> 24) | + (((v0 & 0xff0000000000ull) | (v1 & 0xff000000000000ull)) >> 16) | + (v0 & 0xff0000ull) | ((v0 & 0xff00ull) << 32) | + ((v1 & 0xff00000000000000ull) >> 8) | (v0 << 56); + *add1 += (((v1 & 0xff000000ull) | (v0 & 0xff00000000ull)) >> 24) | + (v1 & 0xff0000ull) | ((v1 & 0xff0000000000ull) >> 16) | + ((v1 & 0xff00ull) << 24) | ((v0 & 0xff000000000000ull) >> 8) | + ((v1 & 0xffull) << 48) | (v0 & 0xff00000000000000ull); +} + +static void Update(const uint64_t lanes[4], HighwayHashState* state) { + int i; + for (i = 0; i < 4; ++i) { + state->v1[i] += state->mul0[i] + lanes[i]; + state->mul0[i] ^= (state->v1[i] & 0xffffffff) * (state->v0[i] >> 32); + state->v0[i] += state->mul1[i]; + state->mul1[i] ^= (state->v0[i] & 0xffffffff) * (state->v1[i] >> 32); + } + ZipperMergeAndAdd(state->v1[1], state->v1[0], &state->v0[1], &state->v0[0]); + ZipperMergeAndAdd(state->v1[3], state->v1[2], &state->v0[3], &state->v0[2]); + ZipperMergeAndAdd(state->v0[1], state->v0[0], &state->v1[1], &state->v1[0]); + ZipperMergeAndAdd(state->v0[3], state->v0[2], &state->v1[3], &state->v1[2]); +} + +static uint64_t Read64(const uint8_t* src) { + return (uint64_t)src[0] | ((uint64_t)src[1] << 8) | + ((uint64_t)src[2] << 16) | ((uint64_t)src[3] << 24) | + ((uint64_t)src[4] << 32) | ((uint64_t)src[5] << 40) | + ((uint64_t)src[6] << 48) | ((uint64_t)src[7] << 56); +} + +void HighwayHashUpdatePacket(const uint8_t* packet, HighwayHashState* state) { + uint64_t lanes[4]; + lanes[0] = Read64(packet + 0); + lanes[1] = Read64(packet + 8); + lanes[2] = Read64(packet + 16); + lanes[3] = Read64(packet + 24); + Update(lanes, state); +} + +static void Rotate32By(uint64_t count, uint64_t lanes[4]) { + int i; + for (i = 0; i < 4; ++i) { + uint32_t half0 = lanes[i] & 0xffffffff; + uint32_t half1 = (lanes[i] >> 32); + lanes[i] = (half0 << count) | (half0 >> (32 - count)); + lanes[i] |= (uint64_t)((half1 << count) | (half1 >> (32 - count))) << 32; + } +} + +void HighwayHashUpdateRemainder(const uint8_t* bytes, const size_t size_mod32, + HighwayHashState* state) { + int i; + const size_t size_mod4 = size_mod32 & 3; + const uint8_t* remainder = bytes + (size_mod32 & ~3); + uint8_t packet[32] = {0}; + for (i = 0; i < 4; ++i) { + state->v0[i] += ((uint64_t)size_mod32 << 32) + size_mod32; + } + Rotate32By(size_mod32, state->v1); + for (i 
= 0; i < remainder - bytes; i++) { + packet[i] = bytes[i]; + } + if (size_mod32 & 16) { + for (i = 0; i < 4; i++) { + packet[28 + i] = remainder[i + size_mod4 - 4]; + } + } else { + if (size_mod4) { + packet[16 + 0] = remainder[0]; + packet[16 + 1] = remainder[size_mod4 >> 1]; + packet[16 + 2] = remainder[size_mod4 - 1]; + } + } + HighwayHashUpdatePacket(packet, state); +} + +static void Permute(const uint64_t v[4], uint64_t* permuted) { + permuted[0] = (v[2] >> 32) | (v[2] << 32); + permuted[1] = (v[3] >> 32) | (v[3] << 32); + permuted[2] = (v[0] >> 32) | (v[0] << 32); + permuted[3] = (v[1] >> 32) | (v[1] << 32); +} + +void PermuteAndUpdate(HighwayHashState* state) { + uint64_t permuted[4]; + Permute(state->v0, permuted); + Update(permuted, state); +} + +static void ModularReduction(uint64_t a3_unmasked, uint64_t a2, uint64_t a1, + uint64_t a0, uint64_t* m1, uint64_t* m0) { + uint64_t a3 = a3_unmasked & 0x3FFFFFFFFFFFFFFFull; + *m1 = a1 ^ ((a3 << 1) | (a2 >> 63)) ^ ((a3 << 2) | (a2 >> 62)); + *m0 = a0 ^ (a2 << 1) ^ (a2 << 2); +} + +static uint64_t HighwayHashFinalize64(HighwayHashState* state) { + int i; + for (i = 0; i < 4; i++) { + PermuteAndUpdate(state); + } + return state->v0[0] + state->v1[0] + state->mul0[0] + state->mul1[0]; +} + +static void HighwayHashFinalize128(HighwayHashState* state, uint64_t hash[2]) { + int i; + for (i = 0; i < 6; i++) { + PermuteAndUpdate(state); + } + hash[0] = state->v0[0] + state->mul0[0] + state->v1[2] + state->mul1[2]; + hash[1] = state->v0[1] + state->mul0[1] + state->v1[3] + state->mul1[3]; +} + +static void HighwayHashFinalize256(HighwayHashState* state, uint64_t hash[4]) { + int i; + /* We anticipate that 256-bit hashing will be mostly used with long messages + because storing and using the 256-bit hash (in contrast to 128-bit) + carries a larger additional constant cost by itself. Doing extra rounds + here hardly increases the per-byte cost of long messages. 
*/ + for (i = 0; i < 10; i++) { + PermuteAndUpdate(state); + } + ModularReduction(state->v1[1] + state->mul1[1], state->v1[0] + state->mul1[0], + state->v0[1] + state->mul0[1], state->v0[0] + state->mul0[0], + &hash[1], &hash[0]); + ModularReduction(state->v1[3] + state->mul1[3], state->v1[2] + state->mul1[2], + state->v0[3] + state->mul0[3], state->v0[2] + state->mul0[2], + &hash[3], &hash[2]); +} + +/*////////////////////////////////////////////////////////////////////////////*/ +/* Non-cat API: single call on full data */ +/*////////////////////////////////////////////////////////////////////////////*/ + +static void ProcessAll(const uint8_t* data, size_t size, const uint64_t key[4], + HighwayHashState* state) { + size_t i; + HighwayHashReset(key, state); + for (i = 0; i + 32 <= size; i += 32) { + HighwayHashUpdatePacket(data + i, state); + } + if ((size & 31) != 0) HighwayHashUpdateRemainder(data + i, size & 31, state); +} + +uint64_t HighwayHash64(const uint8_t* data, size_t size, + const uint64_t key[4]) { + HighwayHashState state; + ProcessAll(data, size, key, &state); + return HighwayHashFinalize64(&state); +} + +void HighwayHash128(const uint8_t* data, size_t size, + const uint64_t key[4], uint64_t hash[2]) { + HighwayHashState state; + ProcessAll(data, size, key, &state); + HighwayHashFinalize128(&state, hash); +} + +void HighwayHash256(const uint8_t* data, size_t size, + const uint64_t key[4], uint64_t hash[4]) { + HighwayHashState state; + ProcessAll(data, size, key, &state); + HighwayHashFinalize256(&state, hash); +} + +/*////////////////////////////////////////////////////////////////////////////*/ +/* Cat API: allows appending with multiple calls */ +/*////////////////////////////////////////////////////////////////////////////*/ + +void HighwayHashCatStart(const uint64_t key[4], HighwayHashCat* state) { + HighwayHashReset(key, &state->state); + state->num = 0; +} + +void HighwayHashCatAppend(const uint8_t* bytes, size_t num, + HighwayHashCat* state) { + size_t i; + if (state->num != 0) { + size_t num_add = num > (32u - state->num) ? 
(32u - state->num) : num;
+    for (i = 0; i < num_add; i++) {
+      state->packet[state->num + i] = bytes[i];
+    }
+    state->num += num_add;
+    num -= num_add;
+    bytes += num_add;
+    if (state->num == 32) {
+      HighwayHashUpdatePacket(state->packet, &state->state);
+      state->num = 0;
+    }
+  }
+  while (num >= 32) {
+    HighwayHashUpdatePacket(bytes, &state->state);
+    num -= 32;
+    bytes += 32;
+  }
+  for (i = 0; i < num; i++) {
+    state->packet[state->num] = bytes[i];
+    state->num++;
+  }
+}
+
+uint64_t HighwayHashCatFinish64(const HighwayHashCat* state) {
+  HighwayHashState copy = state->state;
+  if (state->num) {
+    HighwayHashUpdateRemainder(state->packet, state->num, &copy);
+  }
+  return HighwayHashFinalize64(&copy);
+}
+
+void HighwayHashCatFinish128(const HighwayHashCat* state, uint64_t hash[2]) {
+  HighwayHashState copy = state->state;
+  if (state->num) {
+    HighwayHashUpdateRemainder(state->packet, state->num, &copy);
+  }
+  HighwayHashFinalize128(&copy, hash);
+}
+
+void HighwayHashCatFinish256(const HighwayHashCat* state, uint64_t hash[4]) {
+  HighwayHashState copy = state->state;
+  if (state->num) {
+    HighwayHashUpdateRemainder(state->packet, state->num, &copy);
+  }
+  HighwayHashFinalize256(&copy, hash);
+}
diff --git a/highwayhash/c/highwayhash.h b/highwayhash/c/highwayhash.h
new file mode 100644
index 000000000..10c877fdc
--- /dev/null
+++ b/highwayhash/c/highwayhash.h
@@ -0,0 +1,100 @@
+#ifndef C_HIGHWAYHASH_H_
+#define C_HIGHWAYHASH_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/*//////////////////////////////////////////////////////////////////////////*/
+/* Low-level API, use for implementing streams etc... */
+/*//////////////////////////////////////////////////////////////////////////*/
+
+typedef struct {
+  uint64_t v0[4];
+  uint64_t v1[4];
+  uint64_t mul0[4];
+  uint64_t mul1[4];
+} HighwayHashState;
+
+/* Initializes state with given key */
+static void HighwayHashReset(const uint64_t key[4], HighwayHashState* state);
+/* Takes a packet of 32 bytes */
+void HighwayHashUpdatePacket(const uint8_t* packet, HighwayHashState* state);
+/* Adds the final 1..31 bytes, do not use if 0 remain */
+void HighwayHashUpdateRemainder(const uint8_t* bytes, const size_t size_mod32,
+                                HighwayHashState* state);
+/* Compute final hash value. Makes state invalid.
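+   The 64-, 128- and 256-bit finalizers perform 4, 6 and 10 additional
+   permute-and-update rounds, respectively, before reducing the state to the
+   output (see HighwayHashFinalize* in the .c file).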
+*/
+static uint64_t HighwayHashFinalize64(HighwayHashState* state);
+static void HighwayHashFinalize128(HighwayHashState* state, uint64_t hash[2]);
+static void HighwayHashFinalize256(HighwayHashState* state, uint64_t hash[4]);
+
+/*//////////////////////////////////////////////////////////////////////////*/
+/* Non-cat API: single call on full data */
+/*//////////////////////////////////////////////////////////////////////////*/
+
+uint64_t HighwayHash64(const uint8_t* data, size_t size,
+                       const uint64_t key[4]);
+
+void HighwayHash128(const uint8_t* data, size_t size,
+                    const uint64_t key[4], uint64_t hash[2]);
+
+void HighwayHash256(const uint8_t* data, size_t size,
+                    const uint64_t key[4], uint64_t hash[4]);
+
+/*//////////////////////////////////////////////////////////////////////////*/
+/* Cat API: allows appending with multiple calls */
+/*//////////////////////////////////////////////////////////////////////////*/
+
+typedef struct {
+  HighwayHashState state;
+  uint8_t packet[32];
+  int num;
+} HighwayHashCat;
+
+/* Allocates new state for a new streaming hash computation */
+void HighwayHashCatStart(const uint64_t key[4], HighwayHashCat* state);
+
+void HighwayHashCatAppend(const uint8_t* bytes, size_t num,
+                          HighwayHashCat* state);
+
+/* Computes final hash value */
+uint64_t HighwayHashCatFinish64(const HighwayHashCat* state);
+void HighwayHashCatFinish128(const HighwayHashCat* state, uint64_t hash[2]);
+void HighwayHashCatFinish256(const HighwayHashCat* state, uint64_t hash[4]);
+
+/*
+Usage examples:
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+
+void Example64() {
+  uint64_t key[4] = {1, 2, 3, 4};
+  const char* text = "Hello world!";
+  size_t size = strlen(text);
+  uint64_t hash = HighwayHash64((const uint8_t*)text, size, key);
+  printf("%016"PRIx64"\n", hash);
+}
+
+void Example64Cat() {
+  uint64_t key[4] = {1, 2, 3, 4};
+  HighwayHashCat state;
+  uint64_t hash;
+
+  HighwayHashCatStart(key, &state);
+
+  HighwayHashCatAppend((const uint8_t*)"Hello", 5, &state);
+  HighwayHashCatAppend((const uint8_t*)" world!", 7, &state);
+
+  hash = HighwayHashCatFinish64(&state);
+  printf("%016"PRIx64"\n", hash);
+}
+*/
+
+#if defined(__cplusplus) || defined(c_plusplus)
+} /* extern "C" */
+#endif
+
+#endif /* C_HIGHWAYHASH_H_ */
diff --git a/highwayhash/c/highwayhash_test.c b/highwayhash/c/highwayhash_test.c
new file mode 100644
index 000000000..9f9ee3367
--- /dev/null
+++ b/highwayhash/c/highwayhash_test.c
@@ -0,0 +1,70 @@
+#include "c/highwayhash.h"
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define kMaxSize 64
+
+static const uint64_t kTestKey1[4] = {
+  0x0706050403020100ull, 0x0F0E0D0C0B0A0908ull,
+  0x1716151413121110ull, 0x1F1E1D1C1B1A1918ull
+};
+
+static const uint64_t kTestKey2[4] = {
+  1ull, 2ull, 3ull, 4ull
+};
+
+const uint64_t kExpected64[kMaxSize + 1] = {
+  0x907A56DE22C26E53ull, 0x7EAB43AAC7CDDD78ull, 0xB8D0569AB0B53D62ull,
+  0x5C6BEFAB8A463D80ull, 0xF205A46893007EDAull, 0x2B8A1668E4A94541ull,
+  0xBD4CCC325BEFCA6Full, 0x4D02AE1738F59482ull, 0xE1205108E55F3171ull,
+  0x32D2644EC77A1584ull, 0xF6E10ACDB103A90Bull, 0xC3BBF4615B415C15ull,
+  0x243CC2040063FA9Cull, 0xA89A58CE65E641FFull, 0x24B031A348455A23ull,
+  0x40793F86A449F33Bull, 0xCFAB3489F97EB832ull, 0x19FE67D2C8C5C0E2ull,
+  0x04DD90A69C565CC2ull, 0x75D9518E2371C504ull, 0x38AD9B1141D3DD16ull,
+  0x0264432CCD8A70E0ull, 0xA9DB5A6288683390ull, 0xD7B05492003F028Cull,
+  0x205F615AEA59E51Eull, 0xEEE0C89621052884ull, 0x1BFC1A93A7284F4Full,
+  0x512175B5B70DA91Dull, 0xF71F8976A0A2C639ull, 0xAE093FEF1F84E3E7ull,
+  0x22CA92B01161860Full, 0x9FC7007CCF035A68ull,
+  0xA0C964D9ECD580FCull, 0x2C90F73CA03181FCull, 0x185CF84E5691EB9Eull,
+  0x4FC1F5EF2752AA9Bull, 0xF5B7391A5E0A33EBull, 0xB9B84B83B4E96C9Cull,
+  0x5E42FE712A5CD9B4ull, 0xA150F2F90C3F97DCull, 0x7FA522D75E2D637Dull,
+  0x181AD0CC0DFFD32Bull, 0x3889ED981E854028ull, 0xFB4297E8C586EE2Dull,
+  0x6D064A45BB28059Cull, 0x90563609B3EC860Cull, 0x7AA4FCE94097C666ull,
+  0x1326BAC06B911E08ull, 0xB926168D2B154F34ull, 0x9919848945B1948Dull,
+  0xA2A98FC534825EBEull, 0xE9809095213EF0B6ull, 0x582E5483707BC0E9ull,
+  0x086E9414A88A6AF5ull, 0xEE86B98D20F6743Dull, 0xF89B7FF609B1C0A7ull,
+  0x4C7D9CC19E22C3E8ull, 0x9A97005024562A6Full, 0x5DD41CF423E6EBEFull,
+  0xDF13609C0468E227ull, 0x6E0DA4F64188155Aull, 0xB755BA4B50D7D4A1ull,
+  0x887A3484647479BDull, 0xAB8EEBE9BF2139A0ull, 0x75542C5D4CD2A6FFull};
+
+void TestHash64(uint64_t expected, const uint8_t* data, size_t size,
+                const uint64_t* key) {
+  uint64_t hash = HighwayHash64(data, size, key);
+  if (expected != hash) {
+    printf("Test failed: expected %016"PRIx64", got %016"PRIx64", size: %d\n",
+           expected, hash, (int) size);
+    exit(1);
+  }
+}
+
+int main() {
+  uint8_t data[kMaxSize + 1] = {0};
+  int i;
+  for (i = 0; i <= kMaxSize; i++) {
+    data[i] = i;
+    TestHash64(kExpected64[i], data, i, kTestKey1);
+  }
+
+  for (i = 0; i < 33; i++) {
+    data[i] = 128 + i;
+  }
+  TestHash64(0x53c516cce478cad7ull, data, 33, kTestKey2);
+
+  /* 128-bit and 256-bit tests to be added when they are declared frozen in the
+     C++ version */
+
+  printf("Test success\n");
+  return 0;
+}
diff --git a/highwayhash/google3/third_party/highwayhash/WORKSPACE b/highwayhash/google3/third_party/highwayhash/WORKSPACE
new file mode 100644
index 000000000..cca464c25
--- /dev/null
+++ b/highwayhash/google3/third_party/highwayhash/WORKSPACE
@@ -0,0 +1 @@
+workspace(name = "highwayhash")
diff --git a/highwayhash/highwayhash.3 b/highwayhash/highwayhash.3
new file mode 100644
index 000000000..54f3d1d93
--- /dev/null
+++ b/highwayhash/highwayhash.3
@@ -0,0 +1,107 @@
+.TH highwayhash 3 "April 25, 2017"
+
+.SH NAME
+highwayhash \- fast strong 64-bit hash functions
+
+.SH SYNOPSIS
+
+.B #include <highwayhash/c_bindings.h> /* C */
+
+  uint64_t SipHashC(const uint64_t* key, const char* bytes, const uint64_t size);
+
+  uint64_t SipHash13C(const uint64_t* key, const char* bytes, const uint64_t size);
+
+  uint64_t HighwayHash64(const HHKey key, const char* bytes, const uint64_t size);
+
+.B #include <highwayhash/highwayhash.h> /* C++ */
+
+  using namespace highwayhash;
+
+  void HighwayHashT(State* HH_RESTRICT state,
+                    const char* HH_RESTRICT bytes, const size_t size,
+                    Result* HH_RESTRICT hash);
+
+.B #include <highwayhash/sip_hash.h> /* C++ */
+
+  using namespace highwayhash;
+
+  HH_U64 SipHash(const SipHashState::Key& key, const char* bytes,
+                 const HH_U64 size);
+
+Link with
+.I
+-lhighwayhash
+
+.SH DESCRIPTION
+
+Hash functions are widely used, so it is desirable to increase their speed and
+security. This package provides two 'strong' (well-distributed and
+unpredictable) hash functions: a faster version of SipHash, and an even faster
+algorithm we call HighwayHash.
+
+SipHash is a fast but 'cryptographically strong' pseudo-random function by
+Aumasson and Bernstein [https://www.131002.net/siphash/siphash.pdf].
+
+HighwayHash is a new way of mixing inputs which may inspire new
+cryptographically strong hashes. Large inputs are processed at a rate of 0.24
+cycles per byte, and latency remains low even for small inputs. HighwayHash is
+faster than SipHash for all input sizes, with 5 times higher throughput at 1
+KiB. We discuss design choices and provide statistical analysis and preliminary
+cryptanalysis in https://arxiv.org/abs/1612.06257.
+
+.I
+Note: SipHash expects a uint64_t[2] key, while HighwayHash expects a
+uint64_t[4] key.
+
+.SH EXAMPLES
+
+64-bit SipHash for any CPU:
+
+    #include "highwayhash/sip_hash.h"
+    using namespace highwayhash;
+    HH_ALIGNAS(16) const HH_U64 key2[2] = {1234, 5678};
+    char in[8] = {1};
+    return SipHash(key2, in, 8);
+
+64, 128 or 256 bit HighwayHash for the CPU determined by compiler flags:
+
+    #include "highwayhash/highwayhash.h"
+    using namespace highwayhash;
+    HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4};
+    char in[8] = {1};
+    HHResult64 result;  // or HHResult128 or HHResult256
+    HHStateT<HH_TARGET> state(key);
+    HighwayHashT(&state, in, 8, &result);
+
+64, 128 or 256 bit HighwayHash for the CPU on which we're currently running:
+
+    #include "highwayhash/highwayhash_target.h"
+    #include "highwayhash/instruction_sets.h"
+    using namespace highwayhash;
+    HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4};
+    char in[8] = {1};
+    HHResult64 result;  // or HHResult128 or HHResult256
+    InstructionSets::Run<HighwayHash>(key, in, 8, &result);
+
+C-callable 64-bit HighwayHash for the CPU on which we're currently running:
+
+    #include "highwayhash/c_bindings.h"
+    const uint64_t key[4] = {1, 2, 3, 4};
+    char in[8] = {1};
+    return HighwayHash64(key, in, 8);
+
+.SH SEE ALSO
+
+/usr/include/highwayhash/c_bindings.h (C)
+
+/usr/include/highwayhash/highwayhash.h (C++)
+
+.SH BUGS
+
+https://github.com/google/highwayhash/issues
+
+.SH AUTHOR
+
+Upstream authors are Jan Wassenberg and Jyrki Alakuijala, updated 2017-02-07.
+
+This manpage was created by Adam Borowski and completed by Zhou Mo according
+to the upstream readme and header files.
\ No newline at end of file
diff --git a/highwayhash/highwayhash/arch_specific.cc b/highwayhash/highwayhash/arch_specific.cc
new file mode 100644
index 000000000..2a05860ce
--- /dev/null
+++ b/highwayhash/highwayhash/arch_specific.cc
@@ -0,0 +1,193 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "highwayhash/arch_specific.h" + +#include + +#if HH_ARCH_X64 && !HH_MSC_VERSION +#include +#endif + +#if HH_ARCH_PPC +#if __GLIBC__ +#include // __ppc_get_timebase_freq +#elif __FreeBSD__ +// clang-format off +#include +#include /* must come after sys/types.h */ +// clang-format on +#endif +#endif + +#include // memcpy +#include + +namespace highwayhash { + +const char* TargetName(const TargetBits target_bit) { + switch (target_bit) { + case HH_TARGET_Portable: + return "Portable"; + case HH_TARGET_SSE41: + return "SSE41"; + case HH_TARGET_AVX2: + return "AVX2"; + case HH_TARGET_VSX: + return "VSX"; + case HH_TARGET_NEON: + return "NEON"; + default: + return nullptr; // zero, multiple, or unknown bits + } +} + +#if HH_ARCH_X64 + +namespace { + +std::string BrandString() { + char brand_string[49]; + uint32_t abcd[4]; + + // Check if brand string is supported (it is on all reasonable Intel/AMD) + Cpuid(0x80000000U, 0, abcd); + if (abcd[0] < 0x80000004U) { + return std::string(); + } + + for (int i = 0; i < 3; ++i) { + Cpuid(0x80000002U + i, 0, abcd); + memcpy(brand_string + i * 16, &abcd, sizeof(abcd)); + } + brand_string[48] = 0; + return brand_string; +} + +} // namespace + +void Cpuid(const uint32_t level, const uint32_t count, + uint32_t* HH_RESTRICT abcd) { +#if HH_MSC_VERSION + int regs[4]; + __cpuidex(regs, level, count); + for (int i = 0; i < 4; ++i) { + abcd[i] = regs[i]; + } +#else + uint32_t a, b, c, d; + __cpuid_count(level, count, a, b, c, d); + abcd[0] = a; + abcd[1] = b; + abcd[2] = c; + abcd[3] = d; +#endif +} + +uint32_t ApicId() { + uint32_t abcd[4]; + Cpuid(1, 0, abcd); + return abcd[1] >> 24; // ebx +} + +#endif // HH_ARCH_X64 + +namespace { + +double DetectNominalClockRate() { +#if HH_ARCH_X64 + const std::string& brand_string = BrandString(); + // Brand strings include the maximum configured frequency. These prefixes are + // defined by Intel CPUID documentation. + const char* prefixes[3] = {"MHz", "GHz", "THz"}; + const double multipliers[3] = {1E6, 1E9, 1E12}; + for (size_t i = 0; i < 3; ++i) { + const size_t pos_prefix = brand_string.find(prefixes[i]); + if (pos_prefix != std::string::npos) { + const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); + if (pos_space != std::string::npos) { + const std::string digits = + brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); + return std::stod(digits) * multipliers[i]; + } + } + } +#elif HH_ARCH_PPC + double freq = -1; +#if __linux__ + char line[200]; + char* s; + char* value; + + FILE* f = fopen("/proc/cpuinfo", "r"); + if (f != nullptr) { + while (fgets(line, sizeof(line), f) != nullptr) { + // NOTE: the ':' is the only character we can rely on + if (!(value = strchr(line, ':'))) continue; + // terminate the valuename + *value++ = '\0'; + // skip any leading spaces + while (*value == ' ') value++; + if ((s = strchr(value, '\n'))) *s = '\0'; + + if (!strncasecmp(line, "clock", strlen("clock")) && + sscanf(value, "%lf", &freq) == 1) { + freq *= 1E6; + break; + } + } + fclose(f); + return freq; + } +#elif __FreeBSD__ + size_t length = sizeof(freq); + sysctlbyname("dev.cpu.0.freq", &freq, &length, NULL, 0); + freq *= 1E6; + return freq; +#endif +#endif + + return 0.0; +} + +} // namespace + +double NominalClockRate() { + // Thread-safe caching - this is called several times. 
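+  // (C++11 guarantees thread-safe initialization of function-local statics,
+  // so concurrent first calls are safe.)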
+ static const double cycles_per_second = DetectNominalClockRate(); + return cycles_per_second; +} + +double InvariantTicksPerSecond() { +#if HH_ARCH_PPC +#if __GLIBC__ + static const double cycles_per_second = __ppc_get_timebase_freq(); +#elif __FreeBSD__ + double cycles_per_second = 0; + size_t length = sizeof(cycles_per_second); + sysctlbyname("kern.timecounter.tc.timebase.frequency", &cycles_per_second, + &length, NULL, 0); +#elif __OpenBSD__ + /* There is currently no method of retrieving this via userland. + * This value is correct for Power8 and Power9. + */ + static const double cycles_per_second = 512000000; +#endif + return cycles_per_second; +#else + return NominalClockRate(); +#endif +} + +} // namespace highwayhash diff --git a/highwayhash/highwayhash/arch_specific.h b/highwayhash/highwayhash/arch_specific.h new file mode 100644 index 000000000..0b8c38417 --- /dev/null +++ b/highwayhash/highwayhash/arch_specific.h @@ -0,0 +1,179 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_ARCH_SPECIFIC_H_ +#define HIGHWAYHASH_ARCH_SPECIFIC_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. +// +// Background: older GCC/Clang require flags such as -mavx2 before AVX2 SIMD +// intrinsics can be used. These intrinsics are only used within blocks that +// first verify CPU capabilities. However, the flag also allows the compiler to +// generate AVX2 code in other places. This can violate the One Definition Rule, +// which requires multiple instances of a function with external linkage +// (e.g. extern inline in a header) to be "equivalent". To prevent the resulting +// crashes on non-AVX2 CPUs, any header (transitively) included from a +// translation unit compiled with different flags is "restricted". This means +// all function definitions must have internal linkage (e.g. static inline), or +// reside in namespace HH_TARGET_NAME, which expands to a name unique to the +// current compiler flags. +// +// Most C system headers are safe to include, but C++ headers should generally +// be avoided because they often do not specify static linkage and cannot +// reliably be wrapped in a namespace. 
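+//
+// For example, a non-static inline function defined in such a header could be
+// compiled into AVX2 code in one translation unit; the linker keeps a single
+// copy, which would then fault with an illegal instruction when reached on a
+// non-AVX2 CPU.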
+ +#include "highwayhash/compiler_specific.h" + +#include + +#if HH_MSC_VERSION +#include // _byteswap_* +#endif + +namespace highwayhash { + +#if defined(__x86_64__) || defined(_M_X64) +#define HH_ARCH_X64 1 +#else +#define HH_ARCH_X64 0 +#endif + +#if defined(__aarch64__) || defined(__arm64__) +#define HH_ARCH_AARCH64 1 +#else +#define HH_ARCH_AARCH64 0 +#endif + +#ifdef __arm__ +#define HH_ARCH_ARM 1 +#else +#define HH_ARCH_ARM 0 +#endif + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define HH_ARCH_NEON 1 +#else +#define HH_ARCH_NEON 0 +#endif + +#if defined(__powerpc64__) || defined(_M_PPC) +#define HH_ARCH_PPC 1 +#else +#define HH_ARCH_PPC 0 +#endif + +// Target := instruction set extension(s) such as SSE41. A translation unit can +// only provide a single target-specific implementation because they require +// different compiler flags. + +// Either the build system specifies the target by defining HH_TARGET_NAME +// (which is necessary for Portable on X64, and SSE41 on MSVC), or we'll choose +// the most efficient one that can be compiled given the current flags: +#ifndef HH_TARGET_NAME + +// To avoid excessive code size and dispatch overhead, we only support a few +// groups of extensions, e.g. FMA+BMI2+AVX+AVX2 =: "AVX2". These names must +// match the HH_TARGET_* suffixes below. +#ifdef __AVX2__ +#define HH_TARGET_NAME AVX2 +// MSVC does not set SSE4_1, but it does set AVX; checking for the latter means +// we at least get SSE4 on machines supporting AVX but not AVX2. +// https://stackoverflow.com/questions/18563978/detect-the-availability-of-sse-sse2-instruction-set-in-visual-studio +#elif defined(__SSE4_1__) || (HH_MSC_VERSION != 0 && defined(__AVX__)) +#define HH_TARGET_NAME SSE41 +#elif defined(__VSX__) +#define HH_TARGET_NAME VSX +#elif HH_ARCH_NEON +#define HH_TARGET_NAME NEON +#else +#define HH_TARGET_NAME Portable +#endif + +#endif // HH_TARGET_NAME + +#define HH_CONCAT(first, second) first##second +// Required due to macro expansion rules. +#define HH_EXPAND_CONCAT(first, second) HH_CONCAT(first, second) +// Appends HH_TARGET_NAME to "identifier_prefix". +#define HH_ADD_TARGET_SUFFIX(identifier_prefix) \ + HH_EXPAND_CONCAT(identifier_prefix, HH_TARGET_NAME) + +// HH_TARGET expands to an integer constant. Typical usage: HHStateT. +// This ensures your code will work correctly when compiler flags are changed, +// and benefit from subsequently added targets/specializations. +#define HH_TARGET HH_ADD_TARGET_SUFFIX(HH_TARGET_) + +// Deprecated former name of HH_TARGET; please use HH_TARGET instead. +#define HH_TARGET_PREFERRED HH_TARGET + +// Associate targets with integer literals so the preprocessor can compare them +// with HH_TARGET. Do not instantiate templates with these values - use +// HH_TARGET instead. Must be unique powers of two, see TargetBits. Always +// defined even if unavailable on this HH_ARCH to allow calling TargetName. +// The suffixes must match the HH_TARGET_NAME identifiers. +#define HH_TARGET_Portable 1 +#define HH_TARGET_SSE41 2 +#define HH_TARGET_AVX2 4 +#define HH_TARGET_VSX 8 +#define HH_TARGET_NEON 16 + +// Bit array for one or more HH_TARGET_*. Used to indicate which target(s) are +// supported or were called by InstructionSets::RunAll. +using TargetBits = unsigned; + +namespace HH_TARGET_NAME { + +// Calls func(bit_value) for every nonzero bit in "bits". 
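+// The loop isolates the lowest set bit via bits & (~bits + 1) (two's
+// complement) and then clears it, so e.g.
+// ForeachTarget(HH_TARGET_SSE41 | HH_TARGET_AVX2, func) invokes
+// func(HH_TARGET_SSE41) and then func(HH_TARGET_AVX2).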
+template +void ForeachTarget(TargetBits bits, const Func& func) { + while (bits != 0) { + const TargetBits lowest = bits & (~bits + 1); + func(lowest); + bits &= ~lowest; + } +} + +} // namespace HH_TARGET_NAME + +// Returns a brief human-readable string literal identifying one of the above +// bits, or nullptr if zero, multiple, or unknown bits are set. +const char* TargetName(const TargetBits target_bit); + +// Returns the nominal (without Turbo Boost) CPU clock rate [Hertz]. Useful for +// (roughly) characterizing the CPU speed. +double NominalClockRate(); + +// Returns tsc_timer frequency, useful for converting ticks to seconds. This is +// unaffected by CPU throttling ("invariant"). Thread-safe. Returns timebase +// frequency on PPC and NominalClockRate on all other platforms. +double InvariantTicksPerSecond(); + +#if HH_ARCH_X64 + +// Calls CPUID instruction with eax=level and ecx=count and returns the result +// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd). +void Cpuid(const uint32_t level, const uint32_t count, + uint32_t* HH_RESTRICT abcd); + +// Returns the APIC ID of the CPU on which we're currently running. +uint32_t ApicId(); + +#endif // HH_ARCH_X64 + +} // namespace highwayhash + +#endif // HIGHWAYHASH_ARCH_SPECIFIC_H_ diff --git a/highwayhash/highwayhash/benchmark.cc b/highwayhash/highwayhash/benchmark.cc new file mode 100644 index 000000000..7cc304ffc --- /dev/null +++ b/highwayhash/highwayhash/benchmark.cc @@ -0,0 +1,331 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Measures hash function throughput for various input sizes. + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" +#include "highwayhash/instruction_sets.h" +#include "highwayhash/nanobenchmark.h" +#include "highwayhash/robust_statistics.h" + +// Which functions to enable (includes check for compiler support) +#define BENCHMARK_SIP 0 +#define BENCHMARK_SIP_TREE 0 +#define BENCHMARK_HIGHWAY 1 +#define BENCHMARK_HIGHWAY_CAT 1 +#define BENCHMARK_FARM 0 +#define BENCHMARK_INTERNAL 0 + +#include "highwayhash/highwayhash_test_target.h" +#if BENCHMARK_SIP +#include "highwayhash/sip_hash.h" +#endif +#if BENCHMARK_SIP_TREE +#include "highwayhash/scalar_sip_tree_hash.h" +#include "highwayhash/sip_tree_hash.h" +#endif +#if BENCHMARK_FARM +#include "third_party/farmhash/src/farmhash.h" +#endif + +#if BENCHMARK_INTERNAL +// Placeholder for include +#endif + +namespace highwayhash { +namespace { + +// Stores time measurements from benchmarks, with support for printing them +// as LaTeX figures or tables. +class Measurements { + public: + void Add(const char* caption, const size_t bytes, const double cycles) { + const float cpb = static_cast(cycles / bytes); + results_.emplace_back(caption, static_cast(bytes), cpb); + } + + // Prints results as a LaTeX table (only for in_sizes matching the + // desired values). 
+ void PrintTable(const std::vector& in_sizes) { + std::vector unique = in_sizes; + std::sort(unique.begin(), unique.end()); + unique.erase(std::unique(unique.begin(), unique.end()), unique.end()); + + printf("\\begin{tabular}{"); + for (size_t i = 0; i < unique.size() + 1; ++i) { + printf("%s", i == 0 ? "r" : "|r"); + } + printf("}\n\\toprule\nAlgorithm"); + for (const size_t in_size : unique) { + printf(" & %zu", in_size); + } + printf("\\\\\n\\midrule\n"); + + const SpeedsForCaption cpb_for_caption = SortByCaptionFilterBySize(unique); + for (const auto& item : cpb_for_caption) { + printf("%22s", item.first.c_str()); + for (const float cpb : item.second) { + printf(" & %5.2f", cpb); + } + printf("\\\\\n"); + } + } + + // Prints results suitable for pgfplots. + void PrintPlots() { + const SpeedsForCaption cpb_for_caption = SortByCaption(); + assert(!cpb_for_caption.empty()); + const size_t num_sizes = cpb_for_caption.begin()->second.size(); + + printf("Size "); + // Flatten per-caption vectors into one iterator. + std::vector::const_iterator> iterators; + for (const auto& item : cpb_for_caption) { + printf("%21s ", item.first.c_str()); + assert(item.second.size() == num_sizes); + iterators.push_back(item.second.begin()); + } + printf("\n"); + + const std::vector& sizes = UniqueSizes(); + assert(num_sizes == sizes.size()); + for (int i = 0; i < static_cast(num_sizes); ++i) { + printf("%d ", sizes[i]); + for (auto& it : iterators) { + printf("%5.2f ", 1.0f / *it); // bytes per cycle + ++it; + } + printf("\n"); + } + } + + private: + struct Result { + Result(const char* caption, const int in_size, const float cpb) + : caption(caption), in_size(in_size), cpb(cpb) {} + + // Algorithm name. + std::string caption; + // Size of the input data [bytes]. + int in_size; + // Measured throughput [cycles per byte]. + float cpb; + }; + + // Returns set of all input sizes for the first column of a size/speed plot. + std::vector UniqueSizes() { + std::vector sizes; + sizes.reserve(results_.size()); + for (const Result& result : results_) { + sizes.push_back(result.in_size); + } + std::sort(sizes.begin(), sizes.end()); + sizes.erase(std::unique(sizes.begin(), sizes.end()), sizes.end()); + return sizes; + } + + using SpeedsForCaption = std::map>; + + SpeedsForCaption SortByCaption() const { + SpeedsForCaption cpb_for_caption; + for (const Result& result : results_) { + cpb_for_caption[result.caption].push_back(result.cpb); + } + return cpb_for_caption; + } + + // Only includes measurement results matching one of the given sizes. 
+ SpeedsForCaption SortByCaptionFilterBySize( + const std::vector& in_sizes) const { + SpeedsForCaption cpb_for_caption; + for (const Result& result : results_) { + for (const size_t in_size : in_sizes) { + if (result.in_size == static_cast(in_size)) { + cpb_for_caption[result.caption].push_back(result.cpb); + } + } + } + return cpb_for_caption; + } + + std::vector results_; +}; + +void AddMeasurements(DurationsForInputs* input_map, const char* caption, + Measurements* measurements) { + for (size_t i = 0; i < input_map->num_items; ++i) { + const DurationsForInputs::Item& item = input_map->items[i]; + std::vector durations(item.durations, + item.durations + item.num_durations); + const float median_ticks = Median(&durations); + const float variability = MedianAbsoluteDeviation(durations, median_ticks); + const double median_cpu_cycles = + (median_ticks / InvariantTicksPerSecond()) * NominalClockRate(); + printf("%s %4zu: median=%6.1f ticks; median L1 norm =%4.1f ticks\n", + caption, item.input, median_ticks, variability); + measurements->Add(caption, item.input, median_cpu_cycles); + } + input_map->num_items = 0; +} + +#if BENCHMARK_SIP || BENCHMARK_FARM || BENCHMARK_INTERNAL || \ + (BENCHMARK_SIP_TREE && defined(__AVX2__)) + +void MeasureAndAdd(DurationsForInputs* input_map, const char* caption, + const Func func, Measurements* measurements) { + MeasureDurations(func, input_map); + AddMeasurements(input_map, caption, measurements); +} + +#endif + +// InstructionSets::RunAll callback. +void AddMeasurementsWithPrefix(const char* prefix, const char* target_name, + DurationsForInputs* input_map, void* context) { + std::string caption(prefix); + caption += target_name; + AddMeasurements(input_map, caption.c_str(), + static_cast(context)); +} + +#if BENCHMARK_SIP + +uint64_t RunSip(const void*, const size_t size) { + HH_ALIGNAS(16) const HH_U64 key2[2] = {0, 1}; + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return SipHash(key2, in, size); +} + +uint64_t RunSip13(const void*, const size_t size) { + HH_ALIGNAS(16) const HH_U64 key2[2] = {0, 1}; + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return SipHash13(key2, in, size); +} + +#endif + +#if BENCHMARK_SIP_TREE + +uint64_t RunSipTree(const void*, const size_t size) { + HH_ALIGNAS(32) const HH_U64 key4[4] = {0, 1, 2, 3}; + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return SipTreeHash(key4, in, size); +} + +uint64_t RunSipTree13(const void*, const size_t size) { + HH_ALIGNAS(32) const HH_U64 key4[4] = {0, 1, 2, 3}; + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return SipTreeHash13(key4, in, size); +} + +#endif + +#if BENCHMARK_FARM + +uint64_t RunFarm(const void*, const size_t size) { + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return farmhash::Fingerprint64(reinterpret_cast(in), size); +} + +#endif + +#if BENCHMARK_INTERNAL +uint64_t RunInternal(const void*, const size_t size) { + char in[kMaxBenchmarkInputSize]; + memcpy(in, &size, sizeof(size)); + return in[rand() % size]; +} +#endif + +void AddMeasurements(const std::vector& in_sizes, + Measurements* measurements) { + DurationsForInputs input_map(in_sizes.data(), in_sizes.size(), 40); +#if BENCHMARK_SIP + MeasureAndAdd(&input_map, "SipHash", &RunSip, measurements); + MeasureAndAdd(&input_map, "SipHash13", &RunSip13, measurements); +#endif + +#if BENCHMARK_SIP_TREE && defined(__AVX2__) + MeasureAndAdd(&input_map, "SipTreeHash", &RunSipTree, measurements); + 
MeasureAndAdd(&input_map, "SipTreeHash13", &RunSipTree13, measurements); +#endif + +#if BENCHMARK_FARM + MeasureAndAdd(&input_map, "Farm", &RunFarm, measurements); +#endif + +#if BENCHMARK_INTERNAL + MeasureAndAdd(&input_map, "Internal", &RunInternal, measurements); +#endif + +#if BENCHMARK_HIGHWAY + InstructionSets::RunAll( + &input_map, &AddMeasurementsWithPrefix, measurements); +#endif + +#if BENCHMARK_HIGHWAY_CAT + InstructionSets::RunAll( + &input_map, &AddMeasurementsWithPrefix, measurements); +#endif +} + +void PrintTable() { + const std::vector in_sizes = { + 7, 8, 31, 32, 63, 64, kMaxBenchmarkInputSize}; + Measurements measurements; + AddMeasurements(in_sizes, &measurements); + measurements.PrintTable(in_sizes); +} + +void PrintPlots() { + std::vector in_sizes; + for (int num_vectors = 0; num_vectors < 12; ++num_vectors) { + for (int remainder : {0, 9, 18, 27}) { + in_sizes.push_back(num_vectors * 32 + remainder); + assert(in_sizes.back() <= kMaxBenchmarkInputSize); + } + } + + Measurements measurements; + AddMeasurements(in_sizes, &measurements); + measurements.PrintPlots(); +} + +} // namespace +} // namespace highwayhash + +int main(int argc, char* argv[]) { + // No argument or t => table + if (argc < 2 || argv[1][0] == 't') { + highwayhash::PrintTable(); + } else if (argv[1][0] == 'p') { + highwayhash::PrintPlots(); + } + return 0; +} diff --git a/highwayhash/highwayhash/c_bindings.cc b/highwayhash/highwayhash/c_bindings.cc new file mode 100644 index 000000000..7e0488fb4 --- /dev/null +++ b/highwayhash/highwayhash/c_bindings.cc @@ -0,0 +1,35 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "highwayhash/c_bindings.h" + +#include "highwayhash/highwayhash_target.h" +#include "highwayhash/instruction_sets.h" + +using highwayhash::InstructionSets; +using highwayhash::HighwayHash; + +extern "C" { + +// Ideally this would reside in highwayhash_target.cc, but that file is +// compiled multiple times and we must only define this function once. +uint64_t HighwayHash64(const HHKey key, const char* bytes, + const uint64_t size) { + HHResult64 result; + InstructionSets::Run(*reinterpret_cast(key), bytes, + size, &result); + return result; +} + +} // extern "C" diff --git a/highwayhash/highwayhash/c_bindings.h b/highwayhash/highwayhash/c_bindings.h new file mode 100644 index 000000000..903aabc0f --- /dev/null +++ b/highwayhash/highwayhash/c_bindings.h @@ -0,0 +1,57 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_C_BINDINGS_H_ +#define HIGHWAYHASH_C_BINDINGS_H_ + +// C-callable function prototypes, documented in the other header files. + +#include + +#include "hh_types.h" + +#ifdef __cplusplus +extern "C" { + +// Bring the symbols out of the namespace. +using highwayhash::HHKey; +using highwayhash::HHPacket; +using highwayhash::HHResult128; +using highwayhash::HHResult256; +using highwayhash::HHResult64; +#endif + +uint64_t SipHashC(const uint64_t* key, const char* bytes, const uint64_t size); +uint64_t SipHash13C(const uint64_t* key, const char* bytes, + const uint64_t size); + +// Uses the best implementation of HighwayHash for the current CPU and +// calculates 64-bit hash of given data. +uint64_t HighwayHash64(const HHKey key, const char* bytes, const uint64_t size); + +// Defined by highwayhash_target.cc, which requires a _Target* suffix. +uint64_t HighwayHash64_TargetPortable(const HHKey key, const char* bytes, + const uint64_t size); +uint64_t HighwayHash64_TargetSSE41(const HHKey key, const char* bytes, + const uint64_t size); +uint64_t HighwayHash64_TargetAVX2(const HHKey key, const char* bytes, + const uint64_t size); +uint64_t HighwayHash64_TargetVSX(const HHKey key, const char* bytes, + const uint64_t size); + +#ifdef __cplusplus +} +#endif + +#endif // HIGHWAYHASH_C_BINDINGS_H_ diff --git a/highwayhash/highwayhash/compiler_specific.h b/highwayhash/highwayhash/compiler_specific.h new file mode 100644 index 000000000..4789f9a61 --- /dev/null +++ b/highwayhash/highwayhash/compiler_specific.h @@ -0,0 +1,90 @@ +// Copyright 2015 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_COMPILER_SPECIFIC_H_ +#define HIGHWAYHASH_COMPILER_SPECIFIC_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +// Compiler + +// #if is shorter and safer than #ifdef. *_VERSION are zero if not detected, +// otherwise 100 * major + minor version. Note that other packages check for +// #ifdef COMPILER_MSVC, so we cannot use that same name. 
+
+#ifdef _MSC_VER
+#define HH_MSC_VERSION _MSC_VER
+#else
+#define HH_MSC_VERSION 0
+#endif
+
+#ifdef __GNUC__
+#define HH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#define HH_GCC_VERSION 0
+#endif
+
+#ifdef __clang__
+#define HH_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
+#else
+#define HH_CLANG_VERSION 0
+#endif
+
+//-----------------------------------------------------------------------------
+
+#if HH_GCC_VERSION && HH_GCC_VERSION < 408
+#define HH_ALIGNAS(multiple) __attribute__((aligned(multiple)))
+#else
+#define HH_ALIGNAS(multiple) alignas(multiple)  // C++11
+#endif
+
+#if HH_MSC_VERSION
+#define HH_RESTRICT __restrict
+#elif HH_GCC_VERSION
+#define HH_RESTRICT __restrict__
+#else
+#define HH_RESTRICT
+#endif
+
+#if HH_MSC_VERSION
+#define HH_INLINE __forceinline
+#define HH_NOINLINE __declspec(noinline)
+#else
+#define HH_INLINE inline
+#define HH_NOINLINE __attribute__((noinline))
+#endif
+
+#if HH_MSC_VERSION
+// Unsupported, __assume is not the same.
+#define HH_LIKELY(expr) expr
+#define HH_UNLIKELY(expr) expr
+#else
+#define HH_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define HH_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#endif
+
+#if HH_MSC_VERSION
+#include <intrin.h>
+#pragma intrinsic(_ReadWriteBarrier)
+#define HH_COMPILER_FENCE _ReadWriteBarrier()
+#elif HH_GCC_VERSION
+#define HH_COMPILER_FENCE asm volatile("" : : : "memory")
+#else
+#define HH_COMPILER_FENCE
+#endif
+
+#endif  // HIGHWAYHASH_COMPILER_SPECIFIC_H_
diff --git a/highwayhash/highwayhash/data_parallel.h b/highwayhash/highwayhash/data_parallel.h
new file mode 100644
index 000000000..d72afc953
--- /dev/null
+++ b/highwayhash/highwayhash/data_parallel.h
@@ -0,0 +1,341 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_DATA_PARALLEL_H_
+#define HIGHWAYHASH_DATA_PARALLEL_H_
+
+// Portable C++11 alternative to OpenMP for data-parallel computations:
+// provides low-overhead ThreadPool, plus PerThread with support for reduction.
+
+#include <stdio.h>
+#include <algorithm>  // find_if
+#include <atomic>
+#include <condition_variable>  //NOLINT
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <memory>
+#include <mutex>   //NOLINT
+#include <thread>  //NOLINT
+#include <utility>
+#include <vector>
+
+#define DATA_PARALLEL_CHECK(condition)                           \
+  while (!(condition)) {                                         \
+    printf("data_parallel check failed at line %d\n", __LINE__); \
+    abort();                                                     \
+  }
+
+namespace highwayhash {
+
+// Highly scalable thread pool, especially suitable for data-parallel
+// computations in the fork-join model, where clients need to know when all
+// tasks have completed.
+//
+// Thread pools usually store small numbers of heterogeneous tasks in a queue.
+// When tasks are identical or differ only by an integer input parameter, it is
+// much faster to store just one function of an integer parameter and call it
+// for each value.
+//
+// This thread pool can efficiently load-balance millions of tasks using an
+// atomic counter, thus avoiding per-task syscalls.
+// With 48 hyperthreads and 1M tasks that add to an atomic counter, overall
+// runtime is 10-20x higher when using std::async, and up to 200x for a
+// queue-based ThreadPool.
+//
+// Usage:
+// ThreadPool pool;
+// pool.Run(0, 1000000, [](const int i) { Func1(i); });
+// // When Run returns, all of its tasks have finished.
+//
+// pool.RunTasks({Func2, Func3, Func4});
+// // The destructor waits until all worker threads have exited cleanly.
+class ThreadPool {
+ public:
+  // Starts the given number of worker threads and blocks until they are ready.
+  // "num_threads" defaults to one per hyperthread.
+  explicit ThreadPool(
+      const int num_threads = std::thread::hardware_concurrency())
+      : num_threads_(num_threads) {
+    DATA_PARALLEL_CHECK(num_threads_ > 0);
+    threads_.reserve(num_threads_);
+    for (int i = 0; i < num_threads_; ++i) {
+      threads_.emplace_back(ThreadFunc, this);
+    }
+
+    padding_[0] = 0;  // avoid unused member warning.
+
+    WorkersReadyBarrier();
+  }
+
+  ThreadPool(const ThreadPool&) = delete;
+  ThreadPool& operator=(const ThreadPool&) = delete;
+
+  // Waits for all threads to exit.
+  ~ThreadPool() {
+    StartWorkers(kWorkerExit);
+
+    for (std::thread& thread : threads_) {
+      thread.join();
+    }
+  }
+
+  // Runs func(i) on worker thread(s) for every i in [begin, end).
+  // Not thread-safe - no two calls to Run and RunTasks may overlap.
+  // Subsequent calls will reuse the same threads.
+  //
+  // Precondition: 0 <= begin <= end.
+  template <class Func>
+  void Run(const int begin, const int end, const Func& func) {
+    DATA_PARALLEL_CHECK(0 <= begin && begin <= end);
+    if (begin == end) {
+      return;
+    }
+    const WorkerCommand worker_command = (WorkerCommand(end) << 32) + begin;
+    // Ensure the inputs do not result in a reserved command.
+    DATA_PARALLEL_CHECK(worker_command != kWorkerWait);
+    DATA_PARALLEL_CHECK(worker_command != kWorkerExit);
+
+    // If Func is large (many captures), this will allocate memory, but it is
+    // still slower to use a std::ref wrapper.
+    task_ = func;
+    num_reserved_.store(0);
+
+    StartWorkers(worker_command);
+    WorkersReadyBarrier();
+  }
+
+  // Runs each task (closure, typically a lambda function) on worker thread(s).
+  // Not thread-safe - no two calls to Run and RunTasks may overlap.
+  // Subsequent calls will reuse the same threads.
+  //
+  // This is a more conventional interface for heterogeneous tasks that may be
+  // independent/unrelated.
+  void RunTasks(const std::vector<std::function<void(void)>>& tasks) {
+    Run(0, static_cast<int>(tasks.size()),
+        [&tasks](const int i) { tasks[i](); });
+  }
+
+  // Statically (and deterministically) splits [begin, end) into ranges and
+  // calls "func" for each of them. Useful when "func" involves some overhead
+  // (e.g. for PerThread::Get or random seeding) that should be amortized over
+  // a range of values. "func" is void(int chunk, uint32_t begin, uint32_t end).
+  template <class Func>
+  void RunRanges(const uint32_t begin, const uint32_t end, const Func& func) {
+    const uint32_t length = end - begin;
+
+    // Use constant rather than num_threads_ for machine-independent splitting.
+    const uint32_t chunk = std::max(1U, (length + 127) / 128);
+    std::vector<std::pair<uint32_t, uint32_t>> ranges;  // begin/end
+    ranges.reserve(length / chunk + 1);
+    for (uint32_t i = 0; i < length; i += chunk) {
+      ranges.emplace_back(begin + i, begin + std::min(i + chunk, length));
+    }
+
+    Run(0, static_cast<int>(ranges.size()), [&ranges, func](const int i) {
+      func(i, ranges[i].first, ranges[i].second);
+    });
+  }
+
+ private:
+  // After construction and between calls to Run, workers are "ready", i.e.
+  // waiting on worker_start_cv_. They are "started" by sending a "command"
+  // and notifying all worker_start_cv_ waiters. (That is why all workers
+  // must be ready/waiting - otherwise, the notification will not reach all of
+  // them and the main thread waits in vain for them to report readiness.)
+  using WorkerCommand = uint64_t;
+
+  // Special values; all others encode the begin/end parameters.
+  static constexpr WorkerCommand kWorkerWait = 0;
+  static constexpr WorkerCommand kWorkerExit = ~0ULL;
+
+  void WorkersReadyBarrier() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    workers_ready_cv_.wait(
+        lock, [this]() { return workers_ready_ == num_threads_; });
+    workers_ready_ = 0;
+  }
+
+  // Precondition: all workers are ready.
+  void StartWorkers(const WorkerCommand worker_command) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    worker_start_command_ = worker_command;
+    // Workers will need this lock, so release it before they wake up.
+    lock.unlock();
+    worker_start_cv_.notify_all();
+  }
+
+  // Attempts to reserve and perform some work from the global range of tasks,
+  // which is encoded within "command". Returns after all tasks are reserved.
+  static void RunRange(ThreadPool* self, const WorkerCommand command) {
+    const int begin = command & 0xFFFFFFFF;
+    const int end = command >> 32;
+    const int num_tasks = end - begin;
+
+    // OpenMP introduced several "schedule" strategies:
+    // "single" (static assignment of exactly one chunk per thread): slower.
+    // "dynamic" (allocates k tasks at a time): competitive for well-chosen k.
+    // "guided" (allocates k tasks, decreases k): computing k = remaining/n
+    //   is faster than halving k each iteration. We prefer this strategy
+    //   because it avoids user-specified parameters.
+
+    for (;;) {
+      const int num_reserved = self->num_reserved_.load();
+      const int num_remaining = num_tasks - num_reserved;
+      const int my_size =
+          std::max(num_remaining / (self->num_threads_ * 2), 1);
+      const int my_begin = begin + self->num_reserved_.fetch_add(my_size);
+      const int my_end = std::min(my_begin + my_size, begin + num_tasks);
+      // Another thread already reserved the last task.
+      if (my_begin >= my_end) {
+        break;
+      }
+      for (int i = my_begin; i < my_end; ++i) {
+        self->task_(i);
+      }
+    }
+  }
+
+  static void ThreadFunc(ThreadPool* self) {
+    // Until kWorkerExit command received:
+    for (;;) {
+      std::unique_lock<std::mutex> lock(self->mutex_);
+      // Notify main thread that this thread is ready.
+      if (++self->workers_ready_ == self->num_threads_) {
+        self->workers_ready_cv_.notify_one();
+      }
+    RESUME_WAIT:
+      // Wait for a command.
+      self->worker_start_cv_.wait(lock);
+      const WorkerCommand command = self->worker_start_command_;
+      switch (command) {
+        case kWorkerWait:    // spurious wakeup:
+          goto RESUME_WAIT;  // lock still held, avoid incrementing ready.
+        case kWorkerExit:
+          return;  // exits thread
+      }
+
+      lock.unlock();
+      RunRange(self, command);
+    }
+  }
+
+  const int num_threads_;
+
+  // Unmodified after ctor, but cannot be const because we call thread::join().
+  std::vector<std::thread> threads_;
+
+  std::mutex mutex_;  // guards both cv and their variables.
+  std::condition_variable workers_ready_cv_;
+  int workers_ready_ = 0;
+  std::condition_variable worker_start_cv_;
+  WorkerCommand worker_start_command_;
+
+  // Written by main thread, read by workers (after mutex lock/unlock).
+  std::function<void(int)> task_;
+
+  // Updated by workers; alignment/padding avoids false sharing.
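+  // (64-byte alignment matches the typical x86 cache-line size; the int
+  // padding fills out the rest of the line so that workers hammering
+  // num_reserved_ do not invalidate the members above.)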
+  alignas(64) std::atomic<int> num_reserved_{0};
+  int padding_[15];
+};
+
+// Thread-local storage with support for reduction (combining into one result).
+// The "T" type must be unique to the call site because the list of threads'
+// copies is a static member. (With knowledge of the underlying threads, we
+// could eliminate this list and T allocations, but that is difficult to
+// arrange and we prefer this to be usable independently of ThreadPool.)
+//
+// Usage:
+// for (int i = 0; i < N; ++i) {
+//   // in each thread:
+//   T& my_copy = PerThread<T>::Get();
+//   my_copy.Modify();
+//
+//   // single-threaded:
+//   T& combined = PerThread<T>::Reduce();
+//   Use(combined);
+//   PerThread<T>::Destroy();
+// }
+//
+// T is duck-typed and implements the following interface:
+//
+// // Returns true if T is default-initialized or Destroy was called without
+// // any subsequent re-initialization.
+// bool IsNull() const;
+//
+// // Releases any resources. Postcondition: IsNull() == true.
+// void Destroy();
+//
+// // Merges in data from "victim". Precondition: !IsNull() && !victim.IsNull().
+// void Assimilate(const T& victim);
+template <class T>
+class PerThread {
+ public:
+  // Returns reference to this thread's T instance (dynamically allocated,
+  // so its address is unique). Callers are responsible for any initialization
+  // beyond the default ctor.
+  static T& Get() {
+    static thread_local T* t;
+    if (t == nullptr) {
+      t = new T;
+      static std::mutex mutex;
+      std::lock_guard<std::mutex> lock(mutex);
+      Threads().push_back(t);
+    }
+    return *t;
+  }
+
+  // Returns vector of all per-thread T. Used inside Reduce() or by clients
+  // that require direct access to T instead of Assimilating them.
+  // Function wrapper avoids separate static member variable definition.
+  static std::vector<T*>& Threads() {
+    static std::vector<T*> threads;
+    return threads;
+  }
+
+  // Returns the first non-null T after assimilating all other threads' T
+  // into it. Precondition: at least one non-null T exists (caller must have
+  // called Get() and initialized the result).
+  static T& Reduce() {
+    std::vector<T*>& threads = Threads();
+
+    // Find first non-null T
+    const auto it = std::find_if(threads.begin(), threads.end(),
+                                 [](const T* t) { return !t->IsNull(); });
+    if (it == threads.end()) {
+      abort();
+    }
+    T* const first = *it;
+
+    for (const T* t : threads) {
+      if (t != first && !t->IsNull()) {
+        first->Assimilate(*t);
+      }
+    }
+    return *first;
+  }
+
+  // Calls each thread's T::Destroy to release resources and/or prepare for
+  // reuse by the same threads/ThreadPool. Note that all T remain allocated
+  // (we need thread-independent pointers for iterating over each thread's T,
+  // and deleting them would leave dangling pointers in each thread, which is
+  // unacceptable because the same thread may call Get() again later.)
+  static void Destroy() {
+    for (T* t : Threads()) {
+      t->Destroy();
+    }
+  }
+};
+
+}  // namespace highwayhash
+
+#endif  // HIGHWAYHASH_DATA_PARALLEL_H_
diff --git a/highwayhash/highwayhash/data_parallel_benchmark.cc b/highwayhash/highwayhash/data_parallel_benchmark.cc
new file mode 100644
index 000000000..b8817c5c1
--- /dev/null
+++ b/highwayhash/highwayhash/data_parallel_benchmark.cc
@@ -0,0 +1,157 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <atomic>
+#include <cmath>
+#include <future>  //NOLINT
+#include <thread>
+
+#include "testing/base/public/gunit.h"
+#include "third_party/absl/container/btree_set.h"
+#include "third_party/absl/time/clock.h"
+#include "third_party/absl/time/time.h"
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/data_parallel.h"
+#include "thread/threadpool.h"
+
+namespace highwayhash {
+namespace {
+
+constexpr int kBenchmarkTasks = 1000000;
+
+// Returns elapsed time [nanoseconds] for std::async.
+double BenchmarkAsync(uint64_t* total) {
+  const absl::Time t0 = absl::Now();
+  std::atomic<uint64_t> sum1{0};
+  std::atomic<uint64_t> sum2{0};
+
+  std::vector<std::future<void>> futures;
+  futures.reserve(kBenchmarkTasks);
+  for (int i = 0; i < kBenchmarkTasks; ++i) {
+    futures.push_back(std::async(
+        [&sum1, &sum2](const int i) {
+          sum1.fetch_add(i);
+          sum2.fetch_add(1);
+        },
+        i));
+  }
+
+  for (auto& future : futures) {
+    future.get();
+  }
+
+  const absl::Time t1 = absl::Now();
+  *total = sum1.load() + sum2.load();
+  return absl::ToDoubleNanoseconds(t1 - t0);
+}
+
+// Returns elapsed time [nanoseconds] for (atomic) ThreadPool.
+double BenchmarkPoolA(uint64_t* total) {
+  const absl::Time t0 = absl::Now();
+  std::atomic<uint64_t> sum1{0};
+  std::atomic<uint64_t> sum2{0};
+
+  ThreadPool pool;
+  pool.Run(0, kBenchmarkTasks, [&sum1, &sum2](const int i) {
+    sum1.fetch_add(i);
+    sum2.fetch_add(1);
+  });
+
+  const absl::Time t1 = absl::Now();
+  *total = sum1.load() + sum2.load();
+  return absl::ToDoubleNanoseconds(t1 - t0);
+}
+
+// Returns elapsed time [nanoseconds] for ::ThreadPool.
+double BenchmarkPoolG(uint64_t* total) {
+  const absl::Time t0 = absl::Now();
+  std::atomic<uint64_t> sum1{0};
+  std::atomic<uint64_t> sum2{0};
+
+  {
+    ::ThreadPool pool(std::thread::hardware_concurrency());
+    pool.StartWorkers();
+    for (int i = 0; i < kBenchmarkTasks; ++i) {
+      pool.Schedule([&sum1, &sum2, i]() {
+        sum1.fetch_add(i);
+        sum2.fetch_add(1);
+      });
+    }
+  }
+
+  const absl::Time t1 = absl::Now();
+  *total = sum1.load() + sum2.load();
+  return absl::ToDoubleNanoseconds(t1 - t0);
+}
+
+// Compares ThreadPool speed to std::async and ::ThreadPool.
+TEST(DataParallelTest, Benchmarks) {
+  uint64_t sum1, sum2, sum3;
+  const double async_ns = BenchmarkAsync(&sum1);
+  const double poolA_ns = BenchmarkPoolA(&sum2);
+  const double poolG_ns = BenchmarkPoolG(&sum3);
+
+  printf("Async %11.0f ns\nPoolA %11.0f ns\nPoolG %11.0f ns\n", async_ns,
+         poolA_ns, poolG_ns);
+  // baseline 20x, 10x with asan or msan, 5x with tsan
+  EXPECT_GT(async_ns, poolA_ns * 4);
+  // baseline 200x, 180x with asan, 70x with msan, 50x with tsan.
+  EXPECT_GT(poolG_ns, poolA_ns * 20);
+
+  // Should reach same result.
+  EXPECT_EQ(sum1, sum2);
+  EXPECT_EQ(sum2, sum3);
+}
+
+#if HH_ARCH_X64
+// Ensures multiple hardware threads are used (decided by the OS scheduler).
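+// Each worker records its APIC ID, a per-core identifier on x86, so seeing
+// more than one distinct ID shows that the pool really ran on several cores.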
+TEST(DataParallelTest, TestApicIds) {
+  for (int num_threads = 1; num_threads <= std::thread::hardware_concurrency();
+       ++num_threads) {
+    ThreadPool pool(num_threads);
+
+    std::mutex mutex;
+    absl::btree_set<uint32_t> ids;
+    double total = 0.0;
+    pool.Run(0, 2 * num_threads, [&mutex, &ids, &total](const int i) {
+      // Useless computations to keep the processor busy so that threads
+      // can't just reuse the same processor.
+      double sum = 0.0;
+      for (int rep = 0; rep < 900 * (i + 30); ++rep) {
+        sum += pow(rep, 0.5);
+      }
+
+      mutex.lock();
+      ids.insert(ApicId());
+      total += sum;
+      mutex.unlock();
+    });
+
+    // No core ID / APIC ID available
+    if (num_threads > 1 && ids.size() == 1) {
+      EXPECT_EQ(0, *ids.begin());
+    } else {
+      // (The Linux scheduler doesn't use all available HTs, but the
+      // computations should at least keep most cores busy.)
+      EXPECT_GT(ids.size() + 2, num_threads / 4);
+    }
+
+    // (Ensure the busy-work is not elided.)
+    EXPECT_GT(total, 1E4);
+  }
+}
+#endif  // HH_ARCH_X64
+
+}  // namespace
+}  // namespace highwayhash
diff --git a/highwayhash/highwayhash/data_parallel_test.cc b/highwayhash/highwayhash/data_parallel_test.cc
new file mode 100644
index 000000000..2728b7d3a
--- /dev/null
+++ b/highwayhash/highwayhash/data_parallel_test.cc
@@ -0,0 +1,175 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+#include <atomic>
+
+#include "testing/base/public/gunit.h"
+#include "highwayhash/data_parallel.h"
+
+namespace highwayhash {
+namespace {
+
+int PopulationCount(uint64_t bits) {
+  int num_set = 0;
+  while (bits != 0) {
+    num_set += bits & 1;
+    bits >>= 1;
+  }
+  return num_set;
+}
+
+std::atomic<int> func_counts{0};
+
+void Func2() {
+  usleep(200000);
+  func_counts.fetch_add(4);
+}
+
+void Func3() {
+  usleep(300000);
+  func_counts.fetch_add(16);
+}
+
+void Func4() {
+  usleep(400000);
+  func_counts.fetch_add(256);
+}
+
+// Exercises the RunTasks feature (running arbitrary tasks/closures)
+TEST(DataParallelTest, TestRunTasks) {
+  ThreadPool pool(4);
+  pool.RunTasks({Func2, Func3, Func4});
+  EXPECT_EQ(276, func_counts.load());
+}
+
+// Ensures task parameter is in bounds, every parameter is reached,
+// pool can be reused (multiple consecutive Run calls), pool can be destroyed
+// (joining with its threads).
+TEST(DataParallelTest, TestPool) {
+  for (int num_threads = 1; num_threads <= 18; ++num_threads) {
+    ThreadPool pool(num_threads);
+    for (int num_tasks = 0; num_tasks < 32; ++num_tasks) {
+      std::vector<int> mementos(num_tasks, 0);
+      for (int begin = 0; begin < 32; ++begin) {
+        std::fill(mementos.begin(), mementos.end(), 0);
+        pool.Run(begin, begin + num_tasks,
+                 [begin, num_tasks, &mementos](const int i) {
+                   // Parameter is in the given range
+                   EXPECT_GE(i, begin);
+                   EXPECT_LT(i, begin + num_tasks);
+
+                   // Store mementos to be sure we visited each i.
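+                   // (1000 + i is distinct from the zero fill, so a skipped
+                   // or repeated i would be detected below.)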
+                   mementos.at(i - begin) = 1000 + i;
+                 });
+        for (int i = begin; i < begin + num_tasks; ++i) {
+          EXPECT_EQ(1000 + i, mementos.at(i - begin));
+        }
+      }
+    }
+  }
+}
+
+TEST(DataParallelTest, TestRunRanges) {
+  for (int num_threads = 1; num_threads <= 18; ++num_threads) {
+    ThreadPool pool(num_threads);
+    for (int num_tasks = 0; num_tasks < 32; ++num_tasks) {
+      std::vector<int> mementos(num_tasks, 0);
+      for (int begin = 0; begin < 32; ++begin) {
+        std::fill(mementos.begin(), mementos.end(), 0);
+        pool.RunRanges(begin, begin + num_tasks,
+                       [begin, num_tasks, &mementos](const int chunk,
+                                                     const uint32_t my_begin,
+                                                     const uint32_t my_end) {
+                         for (uint32_t i = my_begin; i < my_end; ++i) {
+                           // Parameter is in the given range
+                           EXPECT_GE(i, begin);
+                           EXPECT_LT(i, begin + num_tasks);
+
+                           // Store mementos to be sure we visited each i.
+                           mementos.at(i - begin) = 1000 + i;
+                         }
+                       });
+        for (int i = begin; i < begin + num_tasks; ++i) {
+          EXPECT_EQ(1000 + i, mementos.at(i - begin));
+        }
+      }
+    }
+  }
+}
+
+// Ensures each of N threads processes exactly 1 of N tasks, i.e. the
+// work distribution is perfectly fair for small counts.
+TEST(DataParallelTest, TestSmallAssignments) {
+  for (int num_threads = 1; num_threads <= 64; ++num_threads) {
+    ThreadPool pool(num_threads);
+
+    std::atomic<int> counter{0};
+    // (Avoid mutex because it may perturb the worker thread scheduling)
+    std::atomic<uint64_t> id_bits{0};
+
+    pool.Run(0, num_threads, [&counter, num_threads, &id_bits](const int i) {
+      const int id = counter.fetch_add(1);
+      EXPECT_LT(id, num_threads);
+      uint64_t bits = id_bits.load(std::memory_order_relaxed);
+      while (!id_bits.compare_exchange_weak(bits, bits | (1ULL << id))) {
+      }
+    });
+
+    const int num_participants = PopulationCount(id_bits.load());
+    EXPECT_EQ(num_threads, num_participants);
+  }
+}
+
+// Test payload for PerThread.
+struct CheckUniqueIDs {
+  bool IsNull() const { return false; }
+  void Destroy() { id_bits = 0; }
+  void Assimilate(const CheckUniqueIDs& victim) {
+    // Cannot overlap because each PerThread has unique bits.
+    EXPECT_EQ(0, id_bits & victim.id_bits);
+    id_bits |= victim.id_bits;
+  }
+
+  uint64_t id_bits = 0;
+};
+
+// Ensures each thread has a PerThread instance, that they are successfully
+// combined/reduced into a single result, and that reuse is possible after
+// Destroy().
+TEST(DataParallelTest, TestPerThread) {
+  // We use a uint64_t bit array for convenience => no more than 64 threads.
+  const int max_threads = std::min(64U, std::thread::hardware_concurrency());
+  for (int num_threads = 1; num_threads <= max_threads; ++num_threads) {
+    ThreadPool pool(num_threads);
+
+    std::atomic<int> counter{0};
+    pool.Run(0, num_threads, [&counter, num_threads](const int i) {
+      const int id = counter.fetch_add(1);
+      EXPECT_LT(id, num_threads);
+      PerThread<CheckUniqueIDs>::Get().id_bits |= 1ULL << id;
+    });
+
+    // Verify each thread's bit is set.
+    const uint64_t all_bits = PerThread<CheckUniqueIDs>::Reduce().id_bits;
+    // Avoid shifting by 64 (undefined).
+    const uint64_t expected =
+        num_threads == 64 ? ~0ULL : (1ULL << num_threads) - 1;
+    EXPECT_EQ(expected, all_bits);
+    PerThread<CheckUniqueIDs>::Destroy();
+  }
+}
+
+}  // namespace
+}  // namespace highwayhash
diff --git a/highwayhash/highwayhash/endianess.h b/highwayhash/highwayhash/endianess.h
new file mode 100644
index 000000000..776a02fa2
--- /dev/null
+++ b/highwayhash/highwayhash/endianess.h
@@ -0,0 +1,108 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_ENDIANESS_H_
+#define HIGHWAYHASH_ENDIANESS_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stdint.h>
+
+#if defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN)
+
+  /* Someone has already included <endian.h> or equivalent. */
+
+#elif defined(__LITTLE_ENDIAN__)
+
+#  define HH_IS_LITTLE_ENDIAN 1
+#  define HH_IS_BIG_ENDIAN 0
+#  ifdef __BIG_ENDIAN__
+#    error "Platform is both little and big endian?"
+#  endif
+
+#elif defined(__BIG_ENDIAN__)
+
+#  define HH_IS_LITTLE_ENDIAN 0
+#  define HH_IS_BIG_ENDIAN 1
+
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
+      defined(__ORDER_LITTLE_ENDIAN__)
+
+#  define HH_IS_LITTLE_ENDIAN (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#  define HH_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+
+#elif defined(__linux__) || defined(__CYGWIN__) || defined( __GNUC__ ) || \
+      defined( __GNU_LIBRARY__ )
+
+#  include <endian.h>
+
+#elif defined(__OpenBSD__) || defined(__NetBSD__) || defined(__FreeBSD__) || \
+      defined(__DragonFly__)
+
+#  include <sys/endian.h>
+
+#elif defined(_WIN32)
+
+#define HH_IS_LITTLE_ENDIAN 1
+#define HH_IS_BIG_ENDIAN 0
+
+#else
+
+#  error "Unsupported platform. Cannot determine byte order."
+
+#endif
+
+
+#ifndef HH_IS_LITTLE_ENDIAN
+#  define HH_IS_LITTLE_ENDIAN (BYTE_ORDER == LITTLE_ENDIAN)
+#  define HH_IS_BIG_ENDIAN (BYTE_ORDER == BIG_ENDIAN)
+#endif
+
+
+namespace highwayhash {
+
+#if HH_IS_LITTLE_ENDIAN
+
+static inline uint32_t le32_from_host(uint32_t x) { return x; }
+static inline uint32_t host_from_le32(uint32_t x) { return x; }
+static inline uint64_t le64_from_host(uint64_t x) { return x; }
+static inline uint64_t host_from_le64(uint64_t x) { return x; }
+
+#elif !HH_IS_BIG_ENDIAN
+
+#  error "Unsupported byte order."
+
+#elif defined(_WIN16) || defined(_WIN32) || defined(_WIN64)
+
+#include <stdlib.h>
+
+static inline uint32_t host_from_le32(uint32_t x) { return _byteswap_ulong(x); }
+static inline uint32_t le32_from_host(uint32_t x) { return _byteswap_ulong(x); }
+static inline uint64_t host_from_le64(uint64_t x) { return _byteswap_uint64(x); }
+static inline uint64_t le64_from_host(uint64_t x) { return _byteswap_uint64(x); }
+
+#else
+
+static inline uint32_t host_from_le32(uint32_t x) { return __builtin_bswap32(x); }
+static inline uint32_t le32_from_host(uint32_t x) { return __builtin_bswap32(x); }
+static inline uint64_t host_from_le64(uint64_t x) { return __builtin_bswap64(x); }
+static inline uint64_t le64_from_host(uint64_t x) { return __builtin_bswap64(x); }
+
+#endif
+
+}  // namespace highwayhash
+
+#endif  // HIGHWAYHASH_ENDIANESS_H_
diff --git a/highwayhash/highwayhash/example.cc b/highwayhash/highwayhash/example.cc
new file mode 100644
index 000000000..e3939dd4a
--- /dev/null
+++ b/highwayhash/highwayhash/example.cc
@@ -0,0 +1,40 @@
+// Minimal usage example: prints a hash. Tested on x86, ppc, arm.
+
+#include <cstddef>
+#include <cstring>
+#include <iostream>
+
+#include "highwayhash/highwayhash.h"
+
+using namespace highwayhash;
+
+int main(int argc, char* argv[]) {
+  // We read from the args on purpose, to ensure a compile time constant will
+  // not be used, for verifying assembly on the supported platforms.
+  if (argc != 2) {
+    std::cout << "Please provide 1 argument with a text to hash" << std::endl;
+    return 1;
+  }
+
+  // Please use a different key to ensure your hashes aren't identical.
+  HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4};
+
+  // Aligning inputs to 32 bytes may help but is not required.
+  const char* in = argv[1];
+  const size_t size = strlen(in);
+
+  // Type determines the hash size; can also be HHResult128 or HHResult256.
+  HHResult64 result;
+
+  // HH_TARGET_PREFERRED expands to the best specialization available for the
+  // CPU detected via compiler flags (e.g. AVX2 #ifdef __AVX2__).
+  HHStateT<HH_TARGET_PREFERRED> state(key);
+  HighwayHashT(&state, in, size, &result);
+  std::cout << "Hash   : " << result << std::endl;
+
+  HighwayHashCatT<HH_TARGET_PREFERRED> cat(key);
+  cat.Append(in, size);
+  cat.Finalize(&result);
+  std::cout << "HashCat: " << result << std::endl;
+  return 0;
+}
diff --git a/highwayhash/highwayhash/hh_avx2.cc b/highwayhash/highwayhash/hh_avx2.cc
new file mode 100644
index 000000000..7e3ddff0d
--- /dev/null
+++ b/highwayhash/highwayhash/hh_avx2.cc
@@ -0,0 +1,19 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME AVX2
+#include "highwayhash/highwayhash_target.cc"
diff --git a/highwayhash/highwayhash/hh_avx2.h b/highwayhash/highwayhash/hh_avx2.h
new file mode 100644
index 000000000..db44f533c
--- /dev/null
+++ b/highwayhash/highwayhash/hh_avx2.h
@@ -0,0 +1,381 @@
+// Copyright 2015-2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_AVX2_H_
+#define HIGHWAYHASH_HH_AVX2_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_buffer.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/load3.h"
+#include "highwayhash/vector128.h"
+#include "highwayhash/vector256.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents (otherwise compilation fails because -mavx2 is not specified).
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+namespace highwayhash {
+// See vector128.h for why this namespace is necessary; matching it here makes
+// it easier to use the vector128 symbols, but requires textual inclusion.
+namespace HH_TARGET_NAME {
+
+class HHStateAVX2 {
+ public:
+  explicit HH_INLINE HHStateAVX2(const HHKey key_lanes) { Reset(key_lanes); }
+
+  HH_INLINE void Reset(const HHKey key_lanes) {
+    // "Nothing up my sleeve" numbers, concatenated hex digits of Pi from
+    // http://www.numberworld.org/digits/Pi/, retrieved Feb 22, 2016.
+    //
+    // We use this python code to generate the fourth number to have
+    // more even mixture of bits:
+    /*
+def x(a,b,c):
+  retval = 0
+  for i in range(64):
+    count = ((a >> i) & 1) + ((b >> i) & 1) + ((c >> i) & 1)
+    if (count <= 1):
+      retval |= 1 << i
+  return retval
+    */
+    const V4x64U init0(0x243f6a8885a308d3ull, 0x13198a2e03707344ull,
+                       0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full);
+    const V4x64U init1(0x452821e638d01377ull, 0xbe5466cf34e90c6cull,
+                       0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull);
+    const V4x64U key = LoadUnaligned<V4x64U>(key_lanes);
+    v0 = key ^ init0;
+    v1 = Rotate64By32(key) ^ init1;
+    mul0 = init0;
+    mul1 = init1;
+  }
+
+  HH_INLINE void Update(const HHPacket& packet_bytes) {
+    const uint64_t* HH_RESTRICT packet =
+        reinterpret_cast<const uint64_t*>(packet_bytes);
+    Update(LoadUnaligned<V4x64U>(packet));
+  }
+
+  HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+    // 'Length padding' differentiates zero-valued inputs that have the same
+    // size/32. mod32 is sufficient because each Update behaves as if a
+    // counter were injected, because the state is large and mixed thoroughly.
+    const V8x32U size256(
+        _mm256_broadcastd_epi32(_mm_cvtsi64_si128(size_mod32)));
+    // Equivalent to storing size_mod32 in packet.
+    v0 += V4x64U(size256);
+    // Boosts the avalanche effect of mod32.
+    v1 = Rotate32By(v1, size256);
+
+    const char* remainder = bytes + (size_mod32 & ~3);
+    const size_t size_mod4 = size_mod32 & 3;
+
+    const V4x32U size(_mm256_castsi256_si128(size256));
+
+    // (Branching is faster than a single _mm256_maskload_epi32.)
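+    // The two branches below load whole 4-byte lanes via a masked load and
+    // pick up the trailing 0..3 bytes with Load3, so no byte outside
+    // [bytes, bytes + size_mod32) is accessed.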
+    if (HH_UNLIKELY(size_mod32 & 16)) {  // 16..31 bytes left
+      const V4x32U packetL =
+          LoadUnaligned<V4x32U>(reinterpret_cast<const uint32_t*>(bytes));
+
+      const V4x32U int_mask = IntMask<16>()(size);
+      const V4x32U int_lanes = MaskedLoadInt(bytes + 16, int_mask);
+      const uint32_t last4 =
+          Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+
+      // The upper four bytes of packetH are zero, so insert there.
+      const V4x32U packetH(_mm_insert_epi32(int_lanes, last4, 3));
+      Update(packetH, packetL);
+    } else {  // size_mod32 < 16
+      const V4x32U int_mask = IntMask<0>()(size);
+      const V4x32U packetL = MaskedLoadInt(bytes, int_mask);
+      const uint64_t last3 =
+          Load3()(Load3::AllowUnordered(), remainder, size_mod4);
+
+      // Rather than insert into packetL[3], it is faster to initialize
+      // the otherwise empty packetH.
+      const V4x32U packetH(_mm_cvtsi64_si128(last3));
+      Update(packetH, packetL);
+    }
+  }
+
+  HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
+    // Mix together all lanes. It is slightly better to permute v0 than v1;
+    // it will be added to v1.
+    Update(Permute(v0));
+    Update(Permute(v0));
+    Update(Permute(v0));
+    Update(Permute(v0));
+
+    const V2x64U sum0(_mm256_castsi256_si128(v0 + mul0));
+    const V2x64U sum1(_mm256_castsi256_si128(v1 + mul1));
+    const V2x64U hash = sum0 + sum1;
+    // Each lane is sufficiently mixed, so just truncate to 64 bits.
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(result), hash);
+  }
+
+  HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
+    for (int n = 0; n < 6; n++) {
+      Update(Permute(v0));
+    }
+
+    const V2x64U sum0(_mm256_castsi256_si128(v0 + mul0));
+    const V2x64U sum1(_mm256_extracti128_si256(v1 + mul1, 1));
+    const V2x64U hash = sum0 + sum1;
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(result), hash);
+  }
+
+  HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
+    for (int n = 0; n < 10; n++) {
+      Update(Permute(v0));
+    }
+
+    const V4x64U sum0 = v0 + mul0;
+    const V4x64U sum1 = v1 + mul1;
+    const V4x64U hash = ModularReduction(sum1, sum0);
+    StoreUnaligned(hash, &(*result)[0]);
+  }
+
+  // "buffer" must be 32-byte aligned.
+  static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer) {
+    const __m256i zero = _mm256_setzero_si256();
+    _mm256_store_si256(reinterpret_cast<__m256i*>(buffer), zero);
+  }
+
+  // "buffer" must be 32-byte aligned.
+  static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
+                                    const size_t size_mod32,
+                                    char* HH_RESTRICT buffer) {
+    const V4x32U size(size_mod32);
+    const uint32_t* const HH_RESTRICT from_u32 =
+        reinterpret_cast<const uint32_t*>(from);
+    uint32_t* const HH_RESTRICT buffer_u32 =
+        reinterpret_cast<uint32_t*>(buffer);
+    if (HH_UNLIKELY(size_mod32 & 16)) {  // Copying 16..31 bytes
+      const V4x32U inL = LoadUnaligned<V4x32U>(from_u32);
+      Store(inL, buffer_u32);
+      const V4x32U inH = Load0To16<16, Load3::AllowReadBefore>(
+          from + 16, size_mod32 - 16, size);
+      Store(inH, buffer_u32 + V4x32U::N);
+    } else {  // Copying 0..15 bytes
+      const V4x32U inL = Load0To16<>(from, size_mod32, size);
+      Store(inL, buffer_u32);
+      // No need to change upper 16 bytes of buffer.
+    }
+  }
+
+  // "buffer" must be 32-byte aligned.
+  static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
+                                      const size_t size_mod32,
+                                      char* HH_RESTRICT buffer,
+                                      const size_t buffer_valid) {
+    const V4x32U size(size_mod32);
+    uint32_t* const HH_RESTRICT buffer_u32 =
+        reinterpret_cast<uint32_t*>(buffer);
+    // buffer_valid + size <= 32 => appending 0..16 bytes inside upper 16 bytes.
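+    // (If bit 4 of buffer_valid is set, the lower 16 buffer bytes are already
+    // full, so only the upper half needs to change.)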
+    if (HH_UNLIKELY(buffer_valid & 16)) {
+      const V4x32U suffix = Load0To16<>(from, size_mod32, size);
+      const V4x32U bufferH = Load<V4x32U>(buffer_u32 + V4x32U::N);
+      const V4x32U outH = Concatenate(bufferH, buffer_valid - 16, suffix);
+      Store(outH, buffer_u32 + V4x32U::N);
+    } else {  // Appending 0..32 bytes starting at offset 0..15.
+      const V4x32U bufferL = Load<V4x32U>(buffer_u32);
+      const V4x32U suffixL = Load0To16<>(from, size_mod32, size);
+      const V4x32U outL = Concatenate(bufferL, buffer_valid, suffixL);
+      Store(outL, buffer_u32);
+      const size_t offsetH = sizeof(V4x32U) - buffer_valid;
+      // Do we have enough input to start filling the upper 16 buffer bytes?
+      if (size_mod32 > offsetH) {
+        const size_t sizeH = size_mod32 - offsetH;
+        const V4x32U outH = Load0To16<>(from + offsetH, sizeH, V4x32U(sizeH));
+        Store(outH, buffer_u32 + V4x32U::N);
+      }
+    }
+  }
+
+  // "buffer" must be 32-byte aligned.
+  HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
+                                 const size_t size_mod32,
+                                 const char* HH_RESTRICT buffer,
+                                 const size_t buffer_valid) {
+    const V4x32U size(size_mod32);
+    const uint32_t* const HH_RESTRICT buffer_u32 =
+        reinterpret_cast<const uint32_t*>(buffer);
+    // buffer_valid + size <= 32 => appending 0..16 bytes inside upper 16 bytes.
+    if (HH_UNLIKELY(buffer_valid & 16)) {
+      const V4x32U suffix = Load0To16<>(from, size_mod32, size);
+      const V4x32U packetL = Load<V4x32U>(buffer_u32);
+      const V4x32U bufferH = Load<V4x32U>(buffer_u32 + V4x32U::N);
+      const V4x32U packetH = Concatenate(bufferH, buffer_valid - 16, suffix);
+      Update(packetH, packetL);
+    } else {  // Appending 0..32 bytes starting at offset 0..15.
+      const V4x32U bufferL = Load<V4x32U>(buffer_u32);
+      const V4x32U suffixL = Load0To16<>(from, size_mod32, size);
+      const V4x32U packetL = Concatenate(bufferL, buffer_valid, suffixL);
+      const size_t offsetH = sizeof(V4x32U) - buffer_valid;
+      V4x32U packetH = packetL - packetL;  // zero
+      // Do we have enough input to start filling the upper 16 packet bytes?
+      if (size_mod32 > offsetH) {
+        const size_t sizeH = size_mod32 - offsetH;
+        packetH = Load0To16<>(from + offsetH, sizeH, V4x32U(sizeH));
+      }
+
+      Update(packetH, packetL);
+    }
+  }
+
+ private:
+  static HH_INLINE V4x32U MaskedLoadInt(const char* from,
+                                        const V4x32U& int_mask) {
+    // No faults will be raised when reading n=0..3 ints from "from" provided
+    // int_mask[n] = 0.
+    const int* HH_RESTRICT int_from = reinterpret_cast<const int*>(from);
+    return V4x32U(_mm_maskload_epi32(int_from, int_mask));
+  }
+
+  // Loads <= 16 bytes without accessing any byte outside [from, from + size).
+  // from[i] is loaded into lane i; from[i >= size] is undefined.
+  template <uint32_t kSizeOffset = 0,
+            class Load3Policy = Load3::AllowReadBefore>
+  static HH_INLINE V4x32U Load0To16(const char* from, const size_t size_mod32,
+                                    const V4x32U& size) {
+    const char* remainder = from + (size_mod32 & ~3);
+    const uint64_t last3 = Load3()(Load3Policy(), remainder, size_mod32 & 3);
+    const V4x32U int_mask = IntMask<kSizeOffset>()(size);
+    const V4x32U int_lanes = MaskedLoadInt(from, int_mask);
+    return Insert4AboveMask(last3, int_mask, int_lanes);
+  }
+
+  static HH_INLINE V4x64U Rotate64By32(const V4x64U& v) {
+    return V4x64U(_mm256_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
+  }
+
+  // Rotates 32-bit lanes by "count" bits.
+  static HH_INLINE V4x64U Rotate32By(const V4x64U& v, const V8x32U& count) {
+    // Use variable shifts because sll_epi32 has 4 cycle latency (presumably
+    // to broadcast the shift count).
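+    // Left and right variable shifts are OR-ed together to form the rotate.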
+    const V4x64U shifted_left(_mm256_sllv_epi32(v, count));
+    const V4x64U shifted_right(_mm256_srlv_epi32(v, V8x32U(32) - count));
+    return shifted_left | shifted_right;
+  }
+
+  static HH_INLINE V4x64U Permute(const V4x64U& v) {
+    // For complete mixing, we need to swap the upper and lower 128-bit halves;
+    // we also swap all 32-bit halves. This is faster than extracti128 plus
+    // inserti128 followed by Rotate64By32.
+    const V4x64U indices(0x0000000200000003ull, 0x0000000000000001ull,
+                         0x0000000600000007ull, 0x0000000400000005ull);
+    return V4x64U(_mm256_permutevar8x32_epi32(v, indices));
+  }
+
+  static HH_INLINE V4x64U MulLow32(const V4x64U& a, const V4x64U& b) {
+    return V4x64U(_mm256_mul_epu32(a, b));
+  }
+
+  static HH_INLINE V4x64U ZipperMerge(const V4x64U& v) {
+    // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+    // varying degrees. In descending order of goodness, bytes
+    // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+    // As expected, the upper and lower bytes are much worse.
+    // For each 64-bit lane, our objectives are:
+    // 1) maximizing and equalizing total goodness across the four lanes.
+    // 2) mixing with bytes from the neighboring lane (AVX-2 makes it difficult
+    //    to cross the 128-bit wall, but PermuteAndUpdate takes care of that);
+    // 3) placing the worst bytes in the upper 32 bits because those will not
+    //    be used in the next 32x32 multiplication.
+    const uint64_t hi = 0x070806090D0A040Bull;
+    const uint64_t lo = 0x000F010E05020C03ull;
+    return V4x64U(_mm256_shuffle_epi8(v, V4x64U(hi, lo, hi, lo)));
+  }
+
+  // Updates four hash lanes in parallel by injecting four 64-bit packets.
+  HH_INLINE void Update(const V4x64U& packet) {
+    v1 += packet;
+    v1 += mul0;
+    mul0 ^= MulLow32(v1, v0 >> 32);
+    HH_COMPILER_FENCE;
+    v0 += mul1;
+    mul1 ^= MulLow32(v0, v1 >> 32);
+    HH_COMPILER_FENCE;
+    v0 += ZipperMerge(v1);
+    v1 += ZipperMerge(v0);
+  }
+
+  HH_INLINE void Update(const V4x32U& packetH, const V4x32U& packetL) {
+    const __m256i packetL256 = _mm256_castsi128_si256(packetL);
+    Update(V4x64U(_mm256_inserti128_si256(packetL256, packetH, 1)));
+  }
+
+  // XORs a << 1 and a << 2 into *out after clearing the upper two bits of a.
+  // Also does the same for the upper 128 bit lane "b". Bit shifts are only
+  // possible on independent 64-bit lanes. We therefore insert the upper bits
+  // of a[0] that were lost into a[1]. Thanks to D. Lemire for helpful
+  // comments!
+  static HH_INLINE void XorByShift128Left12(const V4x64U& ba,
+                                            V4x64U* HH_RESTRICT out) {
+    const V4x64U zero = ba ^ ba;
+    const V4x64U top_bits2 = ba >> (64 - 2);
+    const V4x64U ones = ba == ba;              // FF .. FF
+    const V4x64U shifted1_unmasked = ba + ba;  // (avoids needing port0)
+    HH_COMPILER_FENCE;
+
+    // Only the lower halves of top_bits1's 128 bit lanes will be used, so we
+    // can compute it before clearing the upper two bits of ba.
+    const V4x64U top_bits1 = ba >> (64 - 1);
+    const V4x64U upper_8bytes(_mm256_slli_si256(ones, 8));  // F 0 F 0
+    const V4x64U shifted2 = shifted1_unmasked + shifted1_unmasked;
+    HH_COMPILER_FENCE;
+
+    const V4x64U upper_bit_of_128 = upper_8bytes << 63;  // 80..00 80..00
+    const V4x64U new_low_bits2(_mm256_unpacklo_epi64(zero, top_bits2));
+    *out ^= shifted2;
+    HH_COMPILER_FENCE;
+
+    // The result must be as if the upper two bits of the input had been clear,
+    // otherwise we're no longer computing a reduction.
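+    // (For ba << 2 the top two bits fall out of the upper qword on their own;
+    // for ba << 1 only the topmost does, so AndNot clears the remaining
+    // offender at bit 127 of each 128-bit lane.)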
+    const V4x64U shifted1 = AndNot(upper_bit_of_128, shifted1_unmasked);
+    *out ^= new_low_bits2;
+    HH_COMPILER_FENCE;
+
+    const V4x64U new_low_bits1(_mm256_unpacklo_epi64(zero, top_bits1));
+    *out ^= shifted1;
+
+    *out ^= new_low_bits1;
+  }
+
+  // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
+  // Input: two 256-bit numbers a3210 and b3210, interleaved in 2 vectors.
+  // The upper and lower 128-bit halves are processed independently.
+  static HH_INLINE V4x64U ModularReduction(const V4x64U& b32a32,
+                                           const V4x64U& b10a10) {
+    // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
+    V4x64U out = b10a10;
+    XorByShift128Left12(b32a32, &out);
+    return out;
+  }
+
+  V4x64U v0;
+  V4x64U v1;
+  V4x64U mul0;
+  V4x64U mul1;
+};
+
+}  // namespace HH_TARGET_NAME
+}  // namespace highwayhash
+
+#endif  // HH_DISABLE_TARGET_SPECIFIC
+#endif  // HIGHWAYHASH_HH_AVX2_H_
diff --git a/highwayhash/highwayhash/hh_buffer.h b/highwayhash/highwayhash/hh_buffer.h
new file mode 100644
index 000000000..7b1dad0d1
--- /dev/null
+++ b/highwayhash/highwayhash/hh_buffer.h
@@ -0,0 +1,116 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_BUFFER_H_
+#define HIGHWAYHASH_HH_BUFFER_H_
+
+// Helper functions used by hh_avx2 and hh_sse41.
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#if HH_TARGET == HH_TARGET_NEON
+#include "highwayhash/vector_neon.h"
+#else
+#include "highwayhash/vector128.h"
+#endif
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents (otherwise compilation fails because -msse4.1 is not specified).
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+namespace highwayhash {
+// To prevent ODR violations when including this from multiple translation
+// units (TU) that are compiled with different flags, the contents must reside
+// in a namespace whose name is unique to the TU. NOTE: this behavior is
+// incompatible with precompiled modules and requires textual inclusion instead.
+namespace HH_TARGET_NAME {
+
+template <uint32_t kSizeOffset>
+struct IntMask {};  // primary template
+
+template <>
+struct IntMask<0> {
+  // Returns 32-bit lanes : ~0U if that lane can be loaded given "size" bytes.
+  // Typical case: size = 0..16, nothing deducted.
+  HH_INLINE V4x32U operator()(const V4x32U& size) const {
+    // Lane n is valid if size >= (n + 1) * 4; subtract one because we only
+    // have greater-than comparisons and don't want a negated mask.
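+    // Example: size = 6 gives 6 > 3 but not 6 > 7, so only lane 0 is
+    // selected; the trailing two bytes are handled separately by Load3.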
+#if HH_TARGET == HH_TARGET_NEON
+    return V4x32U(vcgtq_u32(size, V4x32U(15, 11, 7, 3)));
+#else
+    return V4x32U(_mm_cmpgt_epi32(size, V4x32U(15, 11, 7, 3)));
+#endif
+  }
+};
+
+template <>
+struct IntMask<16> {
+  // "size" is 16..31; this is for loading the upper half of a packet, so
+  // effectively deduct 16 from size by changing the comparands.
+  HH_INLINE V4x32U operator()(const V4x32U& size) const {
+#if HH_TARGET == HH_TARGET_NEON
+    return V4x32U(vcgtq_u32(size, V4x32U(31, 27, 23, 19)));
+#else
+    return V4x32U(_mm_cmpgt_epi32(size, V4x32U(31, 27, 23, 19)));
+#endif
+  }
+};
+
+// Inserts "bytes4" into "prev" at the lowest i such that mask[i] = 0.
+// Assumes prev[j] == 0 if mask[j] = 0.
+HH_INLINE V4x32U Insert4AboveMask(const uint32_t bytes4, const V4x32U& mask,
+                                  const V4x32U& prev) {
+  // There is no 128-bit shift by a variable count. Using shuffle_epi8 with a
+  // control mask requires a table lookup. We know the shift count is a
+  // multiple of 4 bytes, so we can broadcastd_epi32 and clear all lanes except
+  // those where mask != 0. This works because any upper output lanes need not
+  // be zero.
+  return prev | AndNot(mask, V4x32U(bytes4));
+}
+
+#if HH_TARGET == HH_TARGET_AVX2
+// Shifts "suffix" left by "prefix_len" = 0..15 bytes, clears upper bytes of
+// "prefix", and returns the merged/concatenated bytes.
+HH_INLINE V4x32U Concatenate(const V4x32U& prefix, const size_t prefix_len,
+                             const V4x32U& suffix) {
+  static const uint64_t table[V16x8U::N][V2x64U::N] = {
+      {0x0706050403020100ull, 0x0F0E0D0C0B0A0908ull},
+      {0x06050403020100FFull, 0x0E0D0C0B0A090807ull},
+      {0x050403020100FFFFull, 0x0D0C0B0A09080706ull},
+      {0x0403020100FFFFFFull, 0x0C0B0A0908070605ull},
+      {0x03020100FFFFFFFFull, 0x0B0A090807060504ull},
+      {0x020100FFFFFFFFFFull, 0x0A09080706050403ull},
+      {0x0100FFFFFFFFFFFFull, 0x0908070605040302ull},
+      {0x00FFFFFFFFFFFFFFull, 0x0807060504030201ull},
+      {0xFFFFFFFFFFFFFFFFull, 0x0706050403020100ull},
+      {0xFFFFFFFFFFFFFFFFull, 0x06050403020100FFull},
+      {0xFFFFFFFFFFFFFFFFull, 0x050403020100FFFFull},
+      {0xFFFFFFFFFFFFFFFFull, 0x0403020100FFFFFFull},
+      {0xFFFFFFFFFFFFFFFFull, 0x03020100FFFFFFFFull},
+      {0xFFFFFFFFFFFFFFFFull, 0x020100FFFFFFFFFFull},
+      {0xFFFFFFFFFFFFFFFFull, 0x0100FFFFFFFFFFFFull},
+      {0xFFFFFFFFFFFFFFFFull, 0x00FFFFFFFFFFFFFFull}};
+  const V2x64U control = Load<V2x64U>(&table[prefix_len][0]);
+  const V2x64U shifted_suffix(_mm_shuffle_epi8(suffix, control));
+  return V4x32U(_mm_blendv_epi8(shifted_suffix, prefix, control));
+}
+#endif
+}  // namespace HH_TARGET_NAME
+}  // namespace highwayhash
+
+#endif  // HH_DISABLE_TARGET_SPECIFIC
+#endif  // HIGHWAYHASH_HH_BUFFER_H_
diff --git a/highwayhash/highwayhash/hh_neon.cc b/highwayhash/highwayhash/hh_neon.cc
new file mode 100644
index 000000000..981c094db
--- /dev/null
+++ b/highwayhash/highwayhash/hh_neon.cc
@@ -0,0 +1,22 @@
+// Copyright 2017-2019 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME NEON
+// GCC 4.5.4 only defines the former; 5.4 defines both.
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include "highwayhash/highwayhash_target.cc"
+#endif
diff --git a/highwayhash/highwayhash/hh_neon.h b/highwayhash/highwayhash/hh_neon.h
new file mode 100644
index 000000000..286ad7ec0
--- /dev/null
+++ b/highwayhash/highwayhash/hh_neon.h
@@ -0,0 +1,336 @@
+// Copyright 2015-2019 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_NEON_H_
+#define HIGHWAYHASH_HH_NEON_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_buffer.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/load3.h"
+#include "highwayhash/vector_neon.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents.
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+namespace highwayhash {
+
+// See vector_neon.h for why this namespace is necessary; matching it here
+// makes it easier to use the vector_neon symbols, but requires textual
+// inclusion.
+namespace HH_TARGET_NAME {
+
+// J-lanes tree hashing: see https://doi.org/10.4236/jis.2014.53010
+// Uses the same method that SSE4.1 uses, only with NEON used instead.
+class HHStateNEON {
+ public:
+  explicit HH_INLINE HHStateNEON(const HHKey key) { Reset(key); }
+
+  HH_INLINE void Reset(const HHKey key) {
+    // "Nothing up my sleeve numbers"; see HHStateTAVX2.
+    const V2x64U init0L(0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full);
+    const V2x64U init0H(0x243f6a8885a308d3ull, 0x13198a2e03707344ull);
+    const V2x64U init1L(0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull);
+    const V2x64U init1H(0x452821e638d01377ull, 0xbe5466cf34e90c6cull);
+    const V2x64U keyL = LoadUnaligned<V2x64U>(key + 0);
+    const V2x64U keyH = LoadUnaligned<V2x64U>(key + 2);
+    v0L = keyL ^ init0L;
+    v0H = keyH ^ init0H;
+    v1L = Rotate64By32(keyL) ^ init1L;
+    v1H = Rotate64By32(keyH) ^ init1H;
+    mul0L = init0L;
+    mul0H = init0H;
+    mul1L = init1L;
+    mul1H = init1H;
+  }
+
+  HH_INLINE void Update(const HHPacket& packet_bytes) {
+    const uint64_t* HH_RESTRICT packet =
+        reinterpret_cast<const uint64_t*>(packet_bytes);
+    const V2x64U packetL = LoadUnaligned<V2x64U>(packet + 0);
+    const V2x64U packetH = LoadUnaligned<V2x64U>(packet + 2);
+    Update(packetH, packetL);
+  }
+
+  HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+    // 'Length padding' differentiates zero-valued inputs that have the same
+    // size/32. mod32 is sufficient because each Update behaves as if a
+    // counter were injected, because the state is large and mixed thoroughly.
+
+    // We can't use vshl/vsra because it needs a constant expression.
+    // In order to do this right now, we would need a switch statement.
+    const int32x4_t vsize_mod32(
+        vdupq_n_s32(static_cast<int32_t>(size_mod32)));
+    // = size_mod32 - 32 (negative, so vshlq shifts right)
+    const int32x4_t shift_right_amt =
+        vdupq_n_s32(static_cast<int32_t>(size_mod32) + (~32 + 1));
+    // Equivalent to storing size_mod32 in packet.
+    v0L += V2x64U(vreinterpretq_u64_s32(vsize_mod32));
+    v0H += V2x64U(vreinterpretq_u64_s32(vsize_mod32));
+
+    // Boosts the avalanche effect of mod32.
+    v1L = V2x64U(vreinterpretq_u64_u32(
+        vorrq_u32(vshlq_u32(vreinterpretq_u32_u64(v1L), vsize_mod32),
+                  vshlq_u32(vreinterpretq_u32_u64(v1L), shift_right_amt))));
+    v1H = V2x64U(vreinterpretq_u64_u32(
+        vorrq_u32(vshlq_u32(vreinterpretq_u32_u64(v1H), vsize_mod32),
+                  vshlq_u32(vreinterpretq_u32_u64(v1H), shift_right_amt))));
+
+    const size_t size_mod4 = size_mod32 & 3;
+    const char* HH_RESTRICT remainder = bytes + (size_mod32 & ~3);
+
+    if (HH_UNLIKELY(size_mod32 & 16)) {  // 16..31 bytes left
+      const V2x64U packetL =
+          LoadUnaligned<V2x64U>(reinterpret_cast<const uint64_t*>(bytes));
+
+      V2x64U packetH = LoadMultipleOfFour(bytes + 16, size_mod32);
+
+      const uint32_t last4 =
+          Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+
+      // The upper four bytes of packetH are zero, so insert there.
+      packetH = V2x64U(vreinterpretq_u64_u32(
+          vsetq_lane_u32(last4, vreinterpretq_u32_u64(packetH), 3)));
+      Update(packetH, packetL);
+    } else {  // size_mod32 < 16
+      const V2x64U packetL = LoadMultipleOfFour(bytes, size_mod32);
+
+      const uint64_t last4 =
+          Load3()(Load3::AllowUnordered(), remainder, size_mod4);
+
+      // Rather than insert into packetL[3], it is faster to initialize
+      // the otherwise empty packetH.
+      HH_ALIGNAS(16) uint64_t tmp[2] = {last4, 0};
+      const V2x64U packetH(vld1q_u64(tmp));
+      Update(packetH, packetL);
+    }
+  }
+
+  HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
+    // Mix together all lanes.
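+    // (Four rounds suffice for the 64-bit result; the 128- and 256-bit
+    // finalizers below use six and ten.)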
+    for (int n = 0; n < 4; n++) {
+      PermuteAndUpdate();
+    }
+
+    const V2x64U sum0 = v0L + mul0L;
+    const V2x64U sum1 = v1L + mul1L;
+    const V2x64U hash = sum0 + sum1;
+    vst1q_low_u64(reinterpret_cast<uint64_t*>(result), hash);
+  }
+
+  HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
+    for (int n = 0; n < 6; n++) {
+      PermuteAndUpdate();
+    }
+
+    const V2x64U sum0 = v0L + mul0L;
+    const V2x64U sum1 = v1H + mul1H;
+    const V2x64U hash = sum0 + sum1;
+    StoreUnaligned(hash, &(*result)[0]);
+  }
+
+  HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
+    for (int n = 0; n < 10; n++) {
+      PermuteAndUpdate();
+    }
+
+    const V2x64U sum0L = v0L + mul0L;
+    const V2x64U sum1L = v1L + mul1L;
+    const V2x64U sum0H = v0H + mul0H;
+    const V2x64U sum1H = v1H + mul1H;
+    const V2x64U hashL = ModularReduction(sum1L, sum0L);
+    const V2x64U hashH = ModularReduction(sum1H, sum0H);
+    StoreUnaligned(hashL, &(*result)[0]);
+    StoreUnaligned(hashH, &(*result)[2]);
+  }
+
+  static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer_bytes) {
+    for (size_t i = 0; i < sizeof(HHPacket); ++i) {
+      buffer_bytes[i] = 0;
+    }
+  }
+
+  static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
+                                    const size_t size_mod32,
+                                    char* HH_RESTRICT buffer) {
+    for (size_t i = 0; i < size_mod32; ++i) {
+      buffer[i] = from[i];
+    }
+  }
+
+  static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
+                                      const size_t size_mod32,
+                                      char* HH_RESTRICT buffer,
+                                      const size_t buffer_valid) {
+    for (size_t i = 0; i < size_mod32; ++i) {
+      buffer[buffer_valid + i] = from[i];
+    }
+  }
+
+  HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
+                                 const size_t size_mod32,
+                                 const char* HH_RESTRICT buffer,
+                                 const size_t buffer_valid) {
+    HH_ALIGNAS(32) HHPacket tmp;
+    for (size_t i = 0; i < buffer_valid; ++i) {
+      tmp[i] = buffer[i];
+    }
+    for (size_t i = 0; i < size_mod32; ++i) {
+      tmp[buffer_valid + i] = from[i];
+    }
+    Update(tmp);
+  }
+
+ private:
+  // Swap 32-bit halves of each lane (caller swaps 128-bit halves)
+  static HH_INLINE V2x64U Rotate64By32(const V2x64U& v) {
+    return V2x64U(vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64(v))));
+  }
+
+  static HH_INLINE V2x64U ZipperMerge(const V2x64U& v) {
+    // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+    // varying degrees. In descending order of goodness, bytes
+    // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+    // As expected, the upper and lower bytes are much worse.
+    // For each 64-bit lane, our objectives are:
+    // 1) maximizing and equalizing total goodness across each lane's bytes;
+    // 2) mixing with bytes from the neighboring lane;
+    // 3) placing the worst bytes in the upper 32 bits because those will not
+    //    be used in the next 32x32 multiplication.
+
+    // The positions of each byte in the new vector.
+    const uint8_t shuffle_positions[] = {3,  12, 2,  5,  14, 1,  15, 0,
+                                         11, 4,  10, 13, 9,  6,  8,  7};
+    const uint8x16_t tbl = vld1q_u8(shuffle_positions);
+
+    // Note: vqtbl1q_u8 is polyfilled for ARMv7a in vector_neon.h.
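+    // (Output byte i is taken from input byte shuffle_positions[i].)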
+    return V2x64U(
+        vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(v), tbl)));
+  }
+
+  HH_INLINE void Update(const V2x64U& packetH, const V2x64U& packetL) {
+    v1L += packetL;
+    v1H += packetH;
+    v1L += mul0L;
+    v1H += mul0H;
+    // mul0L ^= (v1L & 0xFFFFFFFF) * (v0L >> 32);
+    mul0L ^= V2x64U(vmull_u32(vmovn_u64(v1L), vshrn_n_u64(v0L, 32)));
+    // mul0H ^= (v1H & 0xFFFFFFFF) * (v0H >> 32);
+    mul0H ^= V2x64U(vmull_u32(vmovn_u64(v1H), vshrn_n_u64(v0H, 32)));
+    v0L += mul1L;
+    v0H += mul1H;
+    // mul1L ^= (v0L & 0xFFFFFFFF) * (v1L >> 32);
+    mul1L ^= V2x64U(vmull_u32(vmovn_u64(v0L), vshrn_n_u64(v1L, 32)));
+    // mul1H ^= (v0H & 0xFFFFFFFF) * (v1H >> 32);
+    mul1H ^= V2x64U(vmull_u32(vmovn_u64(v0H), vshrn_n_u64(v1H, 32)));
+    v0L += ZipperMerge(v1L);
+    v0H += ZipperMerge(v1H);
+    v1L += ZipperMerge(v0L);
+    v1H += ZipperMerge(v0H);
+  }
+
+  HH_INLINE void PermuteAndUpdate() {
+    // It is slightly better to permute v0 than v1; it will be added to v1.
+    Update(Rotate64By32(v0L), Rotate64By32(v0H));
+  }
+
+  // Returns zero-initialized vector with the lower "size" = 0, 4, 8 or 12
+  // bytes loaded from "bytes". Serves as a replacement for AVX2 maskload_epi32.
+  static HH_INLINE V2x64U LoadMultipleOfFour(const char* bytes,
+                                             const size_t size) {
+    const uint32_t* words = reinterpret_cast<const uint32_t*>(bytes);
+    // Mask of 1-bits where the final 4 bytes should be inserted (replacement
+    // for variable shift/insert using broadcast+blend).
+    alignas(16) const uint64_t mask_pattern[2] = {0xFFFFFFFFULL, 0};
+    V2x64U mask4(vld1q_u64(mask_pattern));  // 'insert' into lane 0
+    V2x64U ret(vdupq_n_u64(0));
+    if (size & 8) {
+      ret = V2x64U(vld1q_low_u64(reinterpret_cast<const uint64_t*>(words)));
+      // mask4 = 0 ~0 0 0 ('insert' into lane 2)
+      mask4 = V2x64U(vshlq_n_u128(mask4, 8));
+      words += 2;
+    }
+    // Final 4 (possibly after the 8 above); 'insert' into lane 0 or 2 of ret.
+    if (size & 4) {
+      // = 0 word2 0 word2; mask4 will select which lane to keep.
+      const V2x64U broadcast(
+          vreinterpretq_u64_u32(vdupq_n_u32(LoadUnaligned<uint32_t>(words))));
+      // (slightly faster than blendv_epi8)
+      ret |= V2x64U(broadcast & mask4);
+    }
+    return ret;
+  }
+
+  // XORs x << 1 and x << 2 into *out after clearing the upper two bits of x.
+  // Bit shifts are only possible on independent 64-bit lanes. We therefore
+  // insert the upper bits of x[0] that were lost into x[1].
+  // Thanks to D. Lemire for helpful comments!
+  static HH_INLINE void XorByShift128Left12(const V2x64U& x,
+                                            V2x64U* HH_RESTRICT out) {
+    const V4x32U zero(vdupq_n_u32(0));
+    const V2x64U sign_bit128(
+        vreinterpretq_u64_u32(vsetq_lane_u32(0x80000000u, zero, 3)));
+    const V2x64U top_bits2 = x >> (64 - 2);
+    HH_COMPILER_FENCE;
+    const V2x64U shifted1_unmasked = x + x;  // (avoids needing port0)
+
+    // Only the lower half of top_bits1 will be used, so we
+    // can compute it before clearing the upper two bits of x.
+    const V2x64U top_bits1 = x >> (64 - 1);
+    const V2x64U shifted2 = shifted1_unmasked + shifted1_unmasked;
+    HH_COMPILER_FENCE;
+
+    const V2x64U new_low_bits2(vshlq_n_u128(top_bits2, 8));
+    *out ^= shifted2;
+    // The result must be as if the upper two bits of the input had been clear,
+    // otherwise we're no longer computing a reduction.
+    const V2x64U shifted1 = AndNot(sign_bit128, shifted1_unmasked);
+    HH_COMPILER_FENCE;
+
+    const V2x64U new_low_bits1(vshlq_n_u128(top_bits1, 8));
+    *out ^= new_low_bits2;
+    *out ^= shifted1;
+    *out ^= new_low_bits1;
+  }
+
+  // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
+  // Input: a 256-bit number a3210.
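+  // In scalar terms, the reduction below computes
+  //   m = a10 ^ (a32 << 1) ^ (a32 << 2)
+  // over GF(2), using that x^128 == x^2 + x (mod x^128 + x^2 + x); the
+  // 128-bit shifts are emulated on 64-bit lanes by XorByShift128Left12.
+  // (The portable ModularReduction in hh_portable.h spells this out with
+  // explicit uint64_t pairs.)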
+ static HH_INLINE V2x64U ModularReduction(const V2x64U& a32_unmasked, + const V2x64U& a10) { + // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf. + V2x64U out = a10; + XorByShift128Left12(a32_unmasked, &out); + return out; + } + + V2x64U v0L; + V2x64U v0H; + V2x64U v1L; + V2x64U v1H; + V2x64U mul0L; + V2x64U mul0H; + V2x64U mul1L; + V2x64U mul1H; +}; + +} // namespace HH_TARGET_NAME +} // namespace highwayhash + +#endif // HH_DISABLE_TARGET_SPECIFIC +#endif // HIGHWAYHASH_HH_NEON_H_ diff --git a/highwayhash/highwayhash/hh_portable.cc b/highwayhash/highwayhash/hh_portable.cc new file mode 100644 index 000000000..3e0de9ed9 --- /dev/null +++ b/highwayhash/highwayhash/hh_portable.cc @@ -0,0 +1,19 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#define HH_TARGET_NAME Portable +#include "highwayhash/highwayhash_target.cc" diff --git a/highwayhash/highwayhash/hh_portable.h b/highwayhash/highwayhash/hh_portable.h new file mode 100644 index 000000000..ab6e2faf2 --- /dev/null +++ b/highwayhash/highwayhash/hh_portable.h @@ -0,0 +1,302 @@ +// Copyright 2015-2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_HH_PORTABLE_H_ +#define HIGHWAYHASH_HH_PORTABLE_H_ + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. This header and its +// dependencies must not define any function unless it is static inline and/or +// within namespace HH_TARGET_NAME. See arch_specific.h for details. + +#include "highwayhash/arch_specific.h" +#include "highwayhash/compiler_specific.h" +#include "highwayhash/endianess.h" +#include "highwayhash/hh_types.h" +#include "highwayhash/load3.h" + +namespace highwayhash { +// See vector128.h for why this namespace is necessary; we match it here for +// consistency. As a result, this header requires textual inclusion. 
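+// For example, hh_portable.cc (above) defines HH_TARGET_NAME to Portable and
+// then includes highwayhash_target.cc, so this header is compiled once per
+// target, each time with that target's compiler flags.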
+namespace HH_TARGET_NAME {
+
+class HHStatePortable {
+ public:
+  static const int kNumLanes = 4;
+  using Lanes = uint64_t[kNumLanes];
+
+  explicit HH_INLINE HHStatePortable(const HHKey keys) { Reset(keys); }
+
+  HH_INLINE void Reset(const HHKey keys) {
+    static const Lanes init0 = {0xdbe6d5d5fe4cce2full, 0xa4093822299f31d0ull,
+                                0x13198a2e03707344ull, 0x243f6a8885a308d3ull};
+    static const Lanes init1 = {0x3bd39e10cb0ef593ull, 0xc0acf169b5f18a8cull,
+                                0xbe5466cf34e90c6cull, 0x452821e638d01377ull};
+    Lanes rotated_keys;
+    Rotate64By32(keys, &rotated_keys);
+    Copy(init0, &mul0);
+    Copy(init1, &mul1);
+    Xor(init0, keys, &v0);
+    Xor(init1, rotated_keys, &v1);
+  }
+
+  HH_INLINE void Update(const HHPacket& packet) {
+    Lanes packet_lanes;
+    CopyPartial(&packet[0], sizeof(HHPacket),
+                reinterpret_cast<char*>(&packet_lanes));
+    for (int lane = 0; lane < kNumLanes; ++lane) {
+      packet_lanes[lane] = host_from_le64(packet_lanes[lane]);
+    }
+    Update(packet_lanes);
+  }
+
+  HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+    // 'Length padding' differentiates zero-valued inputs that have the same
+    // size/32. mod32 is sufficient because each Update behaves as if a
+    // counter were injected, because the state is large and mixed thoroughly.
+    const uint64_t mod32_pair =
+        (static_cast<uint64_t>(size_mod32) << 32) + size_mod32;
+    for (int lane = 0; lane < kNumLanes; ++lane) {
+      v0[lane] += mod32_pair;
+    }
+    Rotate32By(reinterpret_cast<uint32_t*>(&v1), size_mod32);
+
+    const size_t size_mod4 = size_mod32 & 3;
+    const char* remainder = bytes + (size_mod32 & ~3);
+
+    HH_ALIGNAS(32) HHPacket packet = {0};
+    CopyPartial(bytes, remainder - bytes, &packet[0]);
+
+    if (size_mod32 & 16) {  // 16..31 bytes left
+      // Read the last 0..3 bytes and previous 1..4 into the upper bits.
+      // Insert into the upper four bytes of packet, which are zero.
+      uint32_t last4 =
+          Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+      last4 = host_from_le32(last4);
+
+      CopyPartial(reinterpret_cast<const char*>(&last4), 4, &packet[28]);
+    } else {  // size_mod32 < 16
+      uint64_t last4 = Load3()(Load3::AllowUnordered(), remainder, size_mod4);
+      last4 = host_from_le64(last4);
+
+      // Rather than insert at packet + 28, it is faster to initialize
+      // the otherwise empty packet + 16 with up to 64 bits of padding.
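+      // For example (a sketch, little-endian host): with size_mod32 = 5,
+      // packet[0..3] already hold input bytes 0..3, last4 holds byte 4 in its
+      // low bits, and the copy below sets packet[16] = byte 4, leaving all
+      // other packet bytes zero.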
+      CopyPartial(reinterpret_cast<const char*>(&last4), sizeof(last4),
+                  &packet[16]);
+    }
+    Update(packet);
+  }
+
+  HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
+    for (int n = 0; n < 4; n++) {
+      PermuteAndUpdate();
+    }
+
+    *result = v0[0] + v1[0] + mul0[0] + mul1[0];
+  }
+
+  HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
+    for (int n = 0; n < 6; n++) {
+      PermuteAndUpdate();
+    }
+
+    (*result)[0] = v0[0] + mul0[0] + v1[2] + mul1[2];
+    (*result)[1] = v0[1] + mul0[1] + v1[3] + mul1[3];
+  }
+
+  HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
+    for (int n = 0; n < 10; n++) {
+      PermuteAndUpdate();
+    }
+
+    ModularReduction(v1[1] + mul1[1], v1[0] + mul1[0], v0[1] + mul0[1],
+                     v0[0] + mul0[0], &(*result)[1], &(*result)[0]);
+    ModularReduction(v1[3] + mul1[3], v1[2] + mul1[2], v0[3] + mul0[3],
+                     v0[2] + mul0[2], &(*result)[3], &(*result)[2]);
+  }
+
+  static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer) {
+    for (size_t i = 0; i < sizeof(HHPacket); ++i) {
+      buffer[i] = 0;
+    }
+  }
+
+  static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
+                                    const size_t size_mod32,
+                                    char* HH_RESTRICT buffer) {
+    for (size_t i = 0; i < size_mod32; ++i) {
+      buffer[i] = from[i];
+    }
+  }
+
+  static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
+                                      const size_t size_mod32,
+                                      char* HH_RESTRICT buffer,
+                                      const size_t buffer_valid) {
+    for (size_t i = 0; i < size_mod32; ++i) {
+      buffer[buffer_valid + i] = from[i];
+    }
+  }
+
+  HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
+                                 const size_t size_mod32,
+                                 const char* HH_RESTRICT buffer,
+                                 const size_t buffer_valid) {
+    HH_ALIGNAS(32) HHPacket tmp;
+    for (size_t i = 0; i < buffer_valid; ++i) {
+      tmp[i] = buffer[i];
+    }
+    for (size_t i = 0; i < size_mod32; ++i) {
+      tmp[buffer_valid + i] = from[i];
+    }
+    Update(tmp);
+  }
+
+ private:
+  static HH_INLINE void Copy(const Lanes& source, Lanes* HH_RESTRICT dest) {
+    for (int lane = 0; lane < kNumLanes; ++lane) {
+      (*dest)[lane] = source[lane];
+    }
+  }
+
+  static HH_INLINE void Add(const Lanes& source, Lanes* HH_RESTRICT dest) {
+    for (int lane = 0; lane < kNumLanes; ++lane) {
+      (*dest)[lane] += source[lane];
+    }
+  }
+
+  template <typename LanesOrPointer>
+  static HH_INLINE void Xor(const Lanes& op1, const LanesOrPointer& op2,
+                            Lanes* HH_RESTRICT dest) {
+    for (int lane = 0; lane < kNumLanes; ++lane) {
+      (*dest)[lane] = op1[lane] ^ op2[lane];
+    }
+  }
+
+// Clears all bits except one byte at the given offset.
+#define MASK(v, bytes) ((v) & (0xFFull << ((bytes)*8)))
+
+  // 16-byte permutation; shifting is about 10% faster than byte loads.
+  // Adds zipper-merge result to add*.
+  static HH_INLINE void ZipperMergeAndAdd(const uint64_t v1, const uint64_t v0,
+                                          uint64_t* HH_RESTRICT add1,
+                                          uint64_t* HH_RESTRICT add0) {
+    *add0 += ((MASK(v0, 3) + MASK(v1, 4)) >> 24) +
+             ((MASK(v0, 5) + MASK(v1, 6)) >> 16) + MASK(v0, 2) +
+             (MASK(v0, 1) << 32) + (MASK(v1, 7) >> 8) + (v0 << 56);
+
+    *add1 += ((MASK(v1, 3) + MASK(v0, 4)) >> 24) + MASK(v1, 2) +
+             (MASK(v1, 5) >> 16) + (MASK(v1, 1) << 24) + (MASK(v0, 6) >> 8) +
+             (MASK(v1, 0) << 48) + MASK(v0, 7);
+  }
+
+#undef MASK
+
+  // For inputs that are already in native byte order (e.g. PermuteAndUpdate)
+  HH_INLINE void Update(const Lanes& packet_lanes) {
+    Add(packet_lanes, &v1);
+    Add(mul0, &v1);
+
+    // (Loop is faster than unrolling)
+    for (int lane = 0; lane < kNumLanes; ++lane) {
+      const uint32_t v1_32 = static_cast<uint32_t>(v1[lane]);
+      mul0[lane] ^= v1_32 * (v0[lane] >> 32);
+      v0[lane] += mul1[lane];
+      const uint32_t v0_32 = static_cast<uint32_t>(v0[lane]);
+      mul1[lane] ^= v0_32 * (v1[lane] >> 32);
+    }
+
+    ZipperMergeAndAdd(v1[1], v1[0], &v0[1], &v0[0]);
+    ZipperMergeAndAdd(v1[3], v1[2], &v0[3], &v0[2]);
+
+    ZipperMergeAndAdd(v0[1], v0[0], &v1[1], &v1[0]);
+    ZipperMergeAndAdd(v0[3], v0[2], &v1[3], &v1[2]);
+  }
+
+  static HH_INLINE uint64_t Rotate64By32(const uint64_t x) {
+    return (x >> 32) | (x << 32);
+  }
+
+  template <typename LanesOrPointer>
+  static HH_INLINE void Rotate64By32(const LanesOrPointer& v,
+                                     Lanes* HH_RESTRICT rotated) {
+    for (int i = 0; i < kNumLanes; ++i) {
+      (*rotated)[i] = Rotate64By32(v[i]);
+    }
+  }
+
+  static HH_INLINE void Rotate32By(uint32_t* halves, const uint64_t count) {
+    for (int i = 0; i < 2 * kNumLanes; ++i) {
+      const uint32_t x = halves[i];
+      halves[i] = (x << count) | (x >> (32 - count));
+    }
+  }
+
+  static HH_INLINE void Permute(const Lanes& v, Lanes* HH_RESTRICT permuted) {
+    (*permuted)[0] = Rotate64By32(v[2]);
+    (*permuted)[1] = Rotate64By32(v[3]);
+    (*permuted)[2] = Rotate64By32(v[0]);
+    (*permuted)[3] = Rotate64By32(v[1]);
+  }
+
+  HH_INLINE void PermuteAndUpdate() {
+    Lanes permuted;
+    Permute(v0, &permuted);
+    Update(permuted);
+  }
+
+  // Computes a << kBits for 128-bit a = (a1, a0).
+  // Bit shifts are only possible on independent 64-bit lanes. We therefore
+  // insert the upper bits of a0 that were lost into a1. This is slightly
+  // shorter than Lemire's (a << 1) | (((a >> 8) << 1) << 8) approach.
+  template <int kBits>
+  static HH_INLINE void Shift128Left(uint64_t* HH_RESTRICT a1,
+                                     uint64_t* HH_RESTRICT a0) {
+    const uint64_t shifted1 = (*a1) << kBits;
+    const uint64_t top_bits = (*a0) >> (64 - kBits);
+    *a0 <<= kBits;
+    *a1 = shifted1 | top_bits;
+  }
+
+  // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
+  // Input: a 256-bit number a3210.
+  static HH_INLINE void ModularReduction(const uint64_t a3_unmasked,
+                                         const uint64_t a2, const uint64_t a1,
+                                         const uint64_t a0,
+                                         uint64_t* HH_RESTRICT m1,
+                                         uint64_t* HH_RESTRICT m0) {
+    // The upper two bits must be clear, otherwise a3 << 2 would lose bits,
+    // in which case we're no longer computing a reduction.
+    const uint64_t a3 = a3_unmasked & 0x3FFFFFFFFFFFFFFFull;
+    // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
+    uint64_t a3_shl1 = a3;
+    uint64_t a2_shl1 = a2;
+    uint64_t a3_shl2 = a3;
+    uint64_t a2_shl2 = a2;
+    Shift128Left<1>(&a3_shl1, &a2_shl1);
+    Shift128Left<2>(&a3_shl2, &a2_shl2);
+    *m1 = a1 ^ a3_shl1 ^ a3_shl2;
+    *m0 = a0 ^ a2_shl1 ^ a2_shl2;
+  }
+
+  Lanes v0;
+  Lanes v1;
+  Lanes mul0;
+  Lanes mul1;
+};
+
+}  // namespace HH_TARGET_NAME
+}  // namespace highwayhash
+
+#endif  // HIGHWAYHASH_HH_PORTABLE_H_
diff --git a/highwayhash/highwayhash/hh_sse41.cc b/highwayhash/highwayhash/hh_sse41.cc
new file mode 100644
index 000000000..9d6a0b968
--- /dev/null
+++ b/highwayhash/highwayhash/hh_sse41.cc
@@ -0,0 +1,19 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME SSE41
+#include "highwayhash/highwayhash_target.cc"
diff --git a/highwayhash/highwayhash/hh_sse41.h b/highwayhash/highwayhash/hh_sse41.h
new file mode 100644
index 000000000..333db1d1b
--- /dev/null
+++ b/highwayhash/highwayhash/hh_sse41.h
@@ -0,0 +1,336 @@
+// Copyright 2015-2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_SSE41_H_
+#define HIGHWAYHASH_HH_SSE41_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <cstring>  // memcpy
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_buffer.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/load3.h"
+#include "highwayhash/vector128.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents (otherwise compilation fails because -msse4.1 is not specified).
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+namespace highwayhash {
+// See vector128.h for why this namespace is necessary; matching it here makes
+// it easier to use the vector128 symbols, but requires textual inclusion.
+namespace HH_TARGET_NAME {
+
+template <typename T>
+HH_INLINE T LoadUnaligned(const void* from) {
+  T ret;
+  memcpy(&ret, from, sizeof(ret));
+  return ret;
+}
+
+// J-lanes tree hashing: see https://doi.org/10.4236/jis.2014.53010
+// Uses pairs of SSE4.1 instructions to emulate the AVX-2 algorithm.
+class HHStateSSE41 {
+ public:
+  explicit HH_INLINE HHStateSSE41(const HHKey key) { Reset(key); }
+
+  HH_INLINE void Reset(const HHKey key) {
+    // "Nothing up my sleeve numbers"; see HHStateTAVX2.
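+    // (Several of these lanes, e.g. 0x243f6a8885a308d3, are leading
+    // hexadecimal digits of pi, the same values that seed Blowfish's P-array.)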
+    const V2x64U init0L(0xa4093822299f31d0ull, 0xdbe6d5d5fe4cce2full);
+    const V2x64U init0H(0x243f6a8885a308d3ull, 0x13198a2e03707344ull);
+    const V2x64U init1L(0xc0acf169b5f18a8cull, 0x3bd39e10cb0ef593ull);
+    const V2x64U init1H(0x452821e638d01377ull, 0xbe5466cf34e90c6cull);
+    const V2x64U keyL = LoadUnaligned<V2x64U>(key + 0);
+    const V2x64U keyH = LoadUnaligned<V2x64U>(key + 2);
+    v0L = keyL ^ init0L;
+    v0H = keyH ^ init0H;
+    v1L = Rotate64By32(keyL) ^ init1L;
+    v1H = Rotate64By32(keyH) ^ init1H;
+    mul0L = init0L;
+    mul0H = init0H;
+    mul1L = init1L;
+    mul1H = init1H;
+  }
+
+  HH_INLINE void Update(const HHPacket& packet_bytes) {
+    const uint64_t* HH_RESTRICT packet =
+        reinterpret_cast<const uint64_t*>(packet_bytes);
+    const V2x64U packetL = LoadUnaligned<V2x64U>(packet + 0);
+    const V2x64U packetH = LoadUnaligned<V2x64U>(packet + 2);
+    Update(packetH, packetL);
+  }
+
+  HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+    // 'Length padding' differentiates zero-valued inputs that have the same
+    // size/32. mod32 is sufficient because each Update behaves as if a
+    // counter were injected, because the state is large and mixed thoroughly.
+    const V4x32U vsize_mod32(static_cast<uint32_t>(size_mod32));
+    // Equivalent to storing size_mod32 in packet.
+    v0L += V2x64U(vsize_mod32);
+    v0H += V2x64U(vsize_mod32);
+    // Boosts the avalanche effect of mod32.
+    Rotate32By(&v1H, &v1L, size_mod32);
+
+    const size_t size_mod4 = size_mod32 & 3;
+    const char* HH_RESTRICT remainder = bytes + (size_mod32 & ~3);
+
+    if (HH_UNLIKELY(size_mod32 & 16)) {  // 16..31 bytes left
+      const V2x64U packetL =
+          LoadUnaligned<V2x64U>(reinterpret_cast<const uint64_t*>(bytes));
+
+      V2x64U packetH = LoadMultipleOfFour(bytes + 16, size_mod32);
+
+      const uint32_t last4 =
+          Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+
+      // The upper four bytes of packetH are zero, so insert there.
+      packetH = V2x64U(_mm_insert_epi32(packetH, last4, 3));
+      Update(packetH, packetL);
+    } else {  // size_mod32 < 16
+      const V2x64U packetL = LoadMultipleOfFour(bytes, size_mod32);
+
+      const uint64_t last4 =
+          Load3()(Load3::AllowUnordered(), remainder, size_mod4);
+
+      // Rather than insert into packetL[3], it is faster to initialize
+      // the otherwise empty packetH.
+      const V2x64U packetH(_mm_cvtsi64_si128(last4));
+      Update(packetH, packetL);
+    }
+  }
+
+  HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
+    // Mix together all lanes.
+ for (int n = 0; n < 4; n++) { + PermuteAndUpdate(); + } + + const V2x64U sum0 = v0L + mul0L; + const V2x64U sum1 = v1L + mul1L; + const V2x64U hash = sum0 + sum1; + _mm_storel_epi64(reinterpret_cast<__m128i*>(result), hash); + } + + HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) { + for (int n = 0; n < 6; n++) { + PermuteAndUpdate(); + } + + const V2x64U sum0 = v0L + mul0L; + const V2x64U sum1 = v1H + mul1H; + const V2x64U hash = sum0 + sum1; + StoreUnaligned(hash, &(*result)[0]); + } + + HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) { + for (int n = 0; n < 10; n++) { + PermuteAndUpdate(); + } + + const V2x64U sum0L = v0L + mul0L; + const V2x64U sum1L = v1L + mul1L; + const V2x64U sum0H = v0H + mul0H; + const V2x64U sum1H = v1H + mul1H; + const V2x64U hashL = ModularReduction(sum1L, sum0L); + const V2x64U hashH = ModularReduction(sum1H, sum0H); + StoreUnaligned(hashL, &(*result)[0]); + StoreUnaligned(hashH, &(*result)[2]); + } + + static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer_bytes) { + __m128i* buffer = reinterpret_cast<__m128i*>(buffer_bytes); + const __m128i zero = _mm_setzero_si128(); + _mm_store_si128(buffer + 0, zero); + _mm_store_si128(buffer + 1, zero); + } + + static HH_INLINE void CopyPartial(const char* HH_RESTRICT from, + const size_t size_mod32, + char* HH_RESTRICT buffer) { + for (size_t i = 0; i < size_mod32; ++i) { + buffer[i] = from[i]; + } + } + + static HH_INLINE void AppendPartial(const char* HH_RESTRICT from, + const size_t size_mod32, + char* HH_RESTRICT buffer, + const size_t buffer_valid) { + for (size_t i = 0; i < size_mod32; ++i) { + buffer[buffer_valid + i] = from[i]; + } + } + + HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from, + const size_t size_mod32, + const char* HH_RESTRICT buffer, + const size_t buffer_valid) { + HH_ALIGNAS(32) HHPacket tmp; + for (size_t i = 0; i < buffer_valid; ++i) { + tmp[i] = buffer[i]; + } + for (size_t i = 0; i < size_mod32; ++i) { + tmp[buffer_valid + i] = from[i]; + } + Update(tmp); + } + + private: + // Swap 32-bit halves of each lane (caller swaps 128-bit halves) + static HH_INLINE V2x64U Rotate64By32(const V2x64U& v) { + return V2x64U(_mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1))); + } + + // Rotates 32-bit lanes by "count" bits. + static HH_INLINE void Rotate32By(V2x64U* HH_RESTRICT vH, + V2x64U* HH_RESTRICT vL, + const uint64_t count) { + // WARNING: the shift count is 64 bits, so we can't reuse vsize_mod32, + // which is broadcast into 32-bit lanes. + const __m128i count_left = _mm_cvtsi64_si128(count); + const __m128i count_right = _mm_cvtsi64_si128(32 - count); + const V2x64U shifted_leftL(_mm_sll_epi32(*vL, count_left)); + const V2x64U shifted_leftH(_mm_sll_epi32(*vH, count_left)); + const V2x64U shifted_rightL(_mm_srl_epi32(*vL, count_right)); + const V2x64U shifted_rightH(_mm_srl_epi32(*vH, count_right)); + *vL = shifted_leftL | shifted_rightL; + *vH = shifted_leftH | shifted_rightH; + } + + static HH_INLINE V2x64U ZipperMerge(const V2x64U& v) { + // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + // varying degrees. In descending order of goodness, bytes + // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + // As expected, the upper and lower bytes are much worse. 
+    // For each 64-bit lane, our objectives are:
+    // 1) maximizing and equalizing total goodness across each lane's bytes;
+    // 2) mixing with bytes from the neighboring lane;
+    // 3) placing the worst bytes in the upper 32 bits because those will not
+    //    be used in the next 32x32 multiplication.
+    const uint64_t hi = 0x070806090D0A040Bull;
+    const uint64_t lo = 0x000F010E05020C03ull;
+    return V2x64U(_mm_shuffle_epi8(v, V2x64U(hi, lo)));
+  }
+
+  HH_INLINE void Update(const V2x64U& packetH, const V2x64U& packetL) {
+    v1L += packetL;
+    v1H += packetH;
+    v1L += mul0L;
+    v1H += mul0H;
+    mul0L ^= V2x64U(_mm_mul_epu32(v1L, Rotate64By32(v0L)));
+    mul0H ^= V2x64U(_mm_mul_epu32(v1H, v0H >> 32));
+    v0L += mul1L;
+    v0H += mul1H;
+    mul1L ^= V2x64U(_mm_mul_epu32(v0L, Rotate64By32(v1L)));
+    mul1H ^= V2x64U(_mm_mul_epu32(v0H, v1H >> 32));
+    v0L += ZipperMerge(v1L);
+    v0H += ZipperMerge(v1H);
+    v1L += ZipperMerge(v0L);
+    v1H += ZipperMerge(v0H);
+  }
+
+  HH_INLINE void PermuteAndUpdate() {
+    // It is slightly better to permute v0 than v1; it will be added to v1.
+    // AVX-2 Permute also swaps 128-bit halves, so swap input operands.
+    Update(Rotate64By32(v0L), Rotate64By32(v0H));
+  }
+
+  // Returns zero-initialized vector with the lower "size" = 0, 4, 8 or 12
+  // bytes loaded from "bytes". Serves as a replacement for AVX2 maskload_epi32.
+  static HH_INLINE V2x64U LoadMultipleOfFour(const char* bytes,
+                                             const size_t size) {
+    const uint32_t* words = reinterpret_cast<const uint32_t*>(bytes);
+    // Mask of 1-bits where the final 4 bytes should be inserted (replacement
+    // for variable shift/insert using broadcast+blend).
+    V2x64U mask4(_mm_cvtsi64_si128(0xFFFFFFFFULL));  // 'insert' into lane 0
+    V2x64U ret(0);
+    if (size & 8) {
+      ret = V2x64U(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(words)));
+      // mask4 = 0 ~0 0 0 ('insert' into lane 2)
+      mask4 = V2x64U(_mm_slli_si128(mask4, 8));
+      words += 2;
+    }
+    // Final 4 (possibly after the 8 above); 'insert' into lane 0 or 2 of ret.
+    if (size & 4) {
+      const __m128i word2 = _mm_cvtsi32_si128(LoadUnaligned<int32_t>(words));
+      // = 0 word2 0 word2; mask4 will select which lane to keep.
+      const V2x64U broadcast(_mm_shuffle_epi32(word2, 0x00));
+      // (slightly faster than blendv_epi8)
+      ret |= V2x64U(broadcast & mask4);
+    }
+    return ret;
+  }
+
+  // XORs x << 1 and x << 2 into *out after clearing the upper two bits of x.
+  // Bit shifts are only possible on independent 64-bit lanes. We therefore
+  // insert the upper bits of x[0] that were lost into x[1].
+  // Thanks to D. Lemire for helpful comments!
+  static HH_INLINE void XorByShift128Left12(const V2x64U& x,
+                                            V2x64U* HH_RESTRICT out) {
+    const V2x64U zero(_mm_setzero_si128());
+    const V2x64U sign_bit128(_mm_insert_epi32(zero, 0x80000000u, 3));
+    const V2x64U top_bits2 = x >> (64 - 2);
+    HH_COMPILER_FENCE;
+    const V2x64U shifted1_unmasked = x + x;  // (avoids needing port0)
+
+    // Only the lower half of top_bits1 will be used, so we
+    // can compute it before clearing the upper two bits of x.
+    const V2x64U top_bits1 = x >> (64 - 1);
+    const V2x64U shifted2 = shifted1_unmasked + shifted1_unmasked;
+    HH_COMPILER_FENCE;
+
+    const V2x64U new_low_bits2(_mm_slli_si128(top_bits2, 8));
+    *out ^= shifted2;
+    // The result must be as if the upper two bits of the input had been clear,
+    // otherwise we're no longer computing a reduction.
+    const V2x64U shifted1 = AndNot(sign_bit128, shifted1_unmasked);
+    HH_COMPILER_FENCE;
+
+    const V2x64U new_low_bits1(_mm_slli_si128(top_bits1, 8));
+    *out ^= new_low_bits2;
+    *out ^= shifted1;
+    *out ^= new_low_bits1;
+  }
+
+  // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
+  // Input: a 256-bit number a3210.
+  static HH_INLINE V2x64U ModularReduction(const V2x64U& a32_unmasked,
+                                           const V2x64U& a10) {
+    // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
+    V2x64U out = a10;
+    XorByShift128Left12(a32_unmasked, &out);
+    return out;
+  }
+
+  V2x64U v0L;
+  V2x64U v0H;
+  V2x64U v1L;
+  V2x64U v1H;
+  V2x64U mul0L;
+  V2x64U mul0H;
+  V2x64U mul1L;
+  V2x64U mul1H;
+};
+
+}  // namespace HH_TARGET_NAME
+}  // namespace highwayhash
+
+#endif  // HH_DISABLE_TARGET_SPECIFIC
+#endif  // HIGHWAYHASH_HH_SSE41_H_
diff --git a/highwayhash/highwayhash/hh_types.h b/highwayhash/highwayhash/hh_types.h
new file mode 100644
index 000000000..f350d70f6
--- /dev/null
+++ b/highwayhash/highwayhash/hh_types.h
@@ -0,0 +1,50 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_TYPES_H_
+#define HIGHWAYHASH_HH_TYPES_H_
+
+// WARNING: included from c_bindings => must be C-compatible.
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stddef.h>  // size_t
+#include <stdint.h>
+
+#ifdef __cplusplus
+namespace highwayhash {
+#endif
+
+// 256-bit secret key that should remain unknown to attackers.
+// We recommend initializing it to a random value.
+typedef uint64_t HHKey[4];
+
+// How much input is hashed by one call to HHStateT::Update.
+typedef char HHPacket[32];
+
+// Hash 'return' types.
+typedef uint64_t HHResult64;  // returned directly
+typedef uint64_t HHResult128[2];
+typedef uint64_t HHResult256[4];
+
+// Called if a test fails, indicating which target and size.
+typedef void (*HHNotify)(const char*, size_t);
+
+#ifdef __cplusplus
+}  // namespace highwayhash
+#endif
+
+#endif  // HIGHWAYHASH_HH_TYPES_H_
diff --git a/highwayhash/highwayhash/hh_vsx.cc b/highwayhash/highwayhash/hh_vsx.cc
new file mode 100644
index 000000000..6479a7a80
--- /dev/null
+++ b/highwayhash/highwayhash/hh_vsx.cc
@@ -0,0 +1,22 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME VSX
+
+#ifdef __VSX__
+#include "highwayhash/highwayhash_target.cc"
+#endif
diff --git a/highwayhash/highwayhash/hh_vsx.h b/highwayhash/highwayhash/hh_vsx.h
new file mode 100644
index 000000000..e503abe1f
--- /dev/null
+++ b/highwayhash/highwayhash/hh_vsx.h
@@ -0,0 +1,335 @@
+// Copyright 2015-2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HH_VSX_H_
+#define HIGHWAYHASH_HH_VSX_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/load3.h"
+
+// For auto-dependency generation, we need to include all headers but not their
+// contents
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+
+#include <altivec.h>
+#undef vector
+#undef pixel
+#undef bool
+
+namespace highwayhash {
+
+typedef __vector unsigned long long PPC_VEC_U64;  // NOLINT
+typedef __vector unsigned int PPC_VEC_U32;
+typedef __vector unsigned char PPC_VEC_U8;
+
+// See vector128.h for why this namespace is necessary;
+namespace HH_TARGET_NAME {
+
+// Helper Functions
+
+// gcc doesn't support vec_mule() and vec_mulo() for vector long.
+// Use the generic version, which is defined here only for gcc.
+
+#ifndef __clang__
+static HH_INLINE PPC_VEC_U64 vec_mule(PPC_VEC_U32 a, PPC_VEC_U32 b) {  // NOLINT
+  PPC_VEC_U64 result;  // NOLINT
+#ifdef __LITTLE_ENDIAN__
+  asm("vmulouw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+#else
+  asm("vmuleuw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+#endif
+  return result;
+}
+#endif
+
+// LoadUnaligned uses vec_vsx_ld(offset, address) format,
+// Offset here is number of bytes and is 0 for this implementation.
+static HH_INLINE PPC_VEC_U64
+LoadUnaligned(const uint64_t* const HH_RESTRICT from) {
+  const PPC_VEC_U64* const HH_RESTRICT p =
+      reinterpret_cast<const PPC_VEC_U64*>(from);
+  return vec_vsx_ld(0, p);
+}
+
+static HH_INLINE void StoreUnaligned(const PPC_VEC_U64& hash,
+                                     uint64_t* const HH_RESTRICT to) {
+  PPC_VEC_U64* HH_RESTRICT p = reinterpret_cast<PPC_VEC_U64*>(to);
+  vec_vsx_st(hash, 0, p);
+}
+
+static HH_INLINE PPC_VEC_U64 MultiplyVectors(const PPC_VEC_U64& vec1,
+                                             const PPC_VEC_U64& vec2) {
+  return vec_mule(reinterpret_cast<PPC_VEC_U32>(vec1),
+                  reinterpret_cast<PPC_VEC_U32>(vec2));
+}
+
+// J-lanes tree hashing: see https://doi.org/10.4236/jis.2014.53010
+class HHStateVSX {
+ public:
+  explicit HH_INLINE HHStateVSX(const HHKey key) { Reset(key); }
+
+  HH_INLINE void Reset(const HHKey key) {
+    // "Nothing up my sleeve numbers";
+    const PPC_VEC_U64 init0L = {0xdbe6d5d5fe4cce2full, 0xa4093822299f31d0ull};
+    const PPC_VEC_U64 init0H = {0x13198a2e03707344ull, 0x243f6a8885a308d3ull};
+    const PPC_VEC_U64 init1L = {0x3bd39e10cb0ef593ull, 0xc0acf169b5f18a8cull};
+    const PPC_VEC_U64 init1H = {0xbe5466cf34e90c6cull, 0x452821e638d01377ull};
+    const PPC_VEC_U64 keyL = LoadUnaligned(key);
+    const PPC_VEC_U64 keyH = LoadUnaligned(key + 2);
+    v0L = keyL ^ init0L;
+    v0H = keyH ^ init0H;
+    v1L = Rotate64By32(keyL) ^ init1L;
+    v1H = Rotate64By32(keyH) ^ init1H;
+    mul0L = init0L;
+    mul0H = init0H;
+    mul1L = init1L;
+    mul1H = init1H;
+  }
+
+  HH_INLINE void Update(const HHPacket& packet_bytes) {
+    const uint64_t* HH_RESTRICT packet =
+        reinterpret_cast<const uint64_t*>(packet_bytes);
+    const PPC_VEC_U64 packetL = LoadUnaligned(packet);
+    const PPC_VEC_U64 packetH = LoadUnaligned(packet + 2);
+    Update(packetH, packetL);
+  }
+
+  HH_INLINE void UpdateRemainder(const char* bytes, const size_t size_mod32) {
+    // 'Length padding' differentiates zero-valued inputs that have the same
+    // size/32. mod32 is sufficient because each Update behaves as if a
+    // counter were injected, because the state is large and mixed thoroughly.
+    uint32_t size_rounded = static_cast<uint32_t>(size_mod32);
+    PPC_VEC_U32 vsize_mod32 = {size_rounded, size_rounded, size_rounded,
+                               size_rounded};
+    // Equivalent to storing size_mod32 in packet.
+    v0L += reinterpret_cast<PPC_VEC_U64>(vsize_mod32);
+    v0H += reinterpret_cast<PPC_VEC_U64>(vsize_mod32);
+
+    // Boosts the avalanche effect of mod32.
+    Rotate32By(&v1H, &v1L, size_mod32);
+
+    const size_t size_mod4 = size_mod32 & 3;
+    const char* HH_RESTRICT remainder = bytes + (size_mod32 & ~3);
+
+    if (HH_UNLIKELY(size_mod32 & 16)) {  // 16..31 bytes left
+      const PPC_VEC_U64 packetL =
+          vec_vsx_ld(0, reinterpret_cast<const uint64_t*>(bytes));
+
+      PPC_VEC_U64 packetH = LoadMultipleOfFour(bytes + 16, size_mod32);
+
+      const uint32_t last4 =
+          Load3()(Load3::AllowReadBeforeAndReturn(), remainder, size_mod4);
+
+      // The upper four bytes of packetH are zero, so insert there.
+      PPC_VEC_U32 packetH_32 = reinterpret_cast<PPC_VEC_U32>(packetH);
+      packetH_32[3] = last4;
+      packetH = reinterpret_cast<PPC_VEC_U64>(packetH_32);
+      Update(packetH, packetL);
+    } else {  // size_mod32 < 16
+      const PPC_VEC_U64 packetL = LoadMultipleOfFour(bytes, size_mod32);
+
+      const uint64_t last4 =
+          Load3()(Load3::AllowUnordered(), remainder, size_mod4);
+
+      // Rather than insert into packetL[3], it is faster to initialize
+      // the otherwise empty packetH.
+      const PPC_VEC_U64 packetH = {last4, 0};
+      Update(packetH, packetL);
+    }
+  }
+
+  HH_INLINE void Finalize(HHResult64* HH_RESTRICT result) {
+    // Mix together all lanes.
+    for (int n = 0; n < 4; n++) {
+      PermuteAndUpdate();
+    }
+    const PPC_VEC_U64 hash = v0L + v1L + mul0L + mul1L;
+    *result = hash[0];
+  }
+
+  HH_INLINE void Finalize(HHResult128* HH_RESTRICT result) {
+    for (int n = 0; n < 6; n++) {
+      PermuteAndUpdate();
+    }
+    const PPC_VEC_U64 hash = v0L + mul0L + v1H + mul1H;
+    StoreUnaligned(hash, *result);
+  }
+
+  HH_INLINE void Finalize(HHResult256* HH_RESTRICT result) {
+    for (int n = 0; n < 10; n++) {
+      PermuteAndUpdate();
+    }
+    const PPC_VEC_U64 sum0L = v0L + mul0L;
+    const PPC_VEC_U64 sum1L = v1L + mul1L;
+    const PPC_VEC_U64 sum0H = v0H + mul0H;
+    const PPC_VEC_U64 sum1H = v1H + mul1H;
+    const PPC_VEC_U64 hashL = ModularReduction(sum1L, sum0L);
+    const PPC_VEC_U64 hashH = ModularReduction(sum1H, sum0H);
+    StoreUnaligned(hashL, *result);
+    StoreUnaligned(hashH, *result + 2);
+  }
+
+  static HH_INLINE void ZeroInitialize(char* HH_RESTRICT buffer_bytes) {
+    for (size_t i = 0; i < sizeof(HHPacket); ++i) {
+      buffer_bytes[i] = 0;
+    }
+  }
+
+  static HH_INLINE void CopyPartial(const char* HH_RESTRICT from,
+                                    const size_t size_mod32,
+                                    char* HH_RESTRICT buffer) {
+    for (size_t i = 0; i < size_mod32; ++i) {
+      buffer[i] = from[i];
+    }
+  }
+
+  static HH_INLINE void AppendPartial(const char* HH_RESTRICT from,
+                                      const size_t size_mod32,
+                                      char* HH_RESTRICT buffer,
+                                      const size_t buffer_valid) {
+    for (size_t i = 0; i < size_mod32; ++i) {
+      buffer[buffer_valid + i] = from[i];
+    }
+  }
+
+  HH_INLINE void AppendAndUpdate(const char* HH_RESTRICT from,
+                                 const size_t size_mod32,
+                                 const char* HH_RESTRICT buffer,
+                                 const size_t buffer_valid) {
+    HH_ALIGNAS(32) HHPacket tmp;
+    for (size_t i = 0; i < buffer_valid; ++i) {
+      tmp[i] = buffer[i];
+    }
+    for (size_t i = 0; i < size_mod32; ++i) {
+      tmp[buffer_valid + i] = from[i];
+    }
+    Update(tmp);
+  }
+
+ private:
+  // Swap 32-bit halves of each lane (caller swaps 128-bit halves)
+  static HH_INLINE PPC_VEC_U64 Rotate64By32(const PPC_VEC_U64& v) {
+    PPC_VEC_U64 shuffle_vec = {32, 32};
+    return vec_rl(v, shuffle_vec);
+  }
+
+  // Rotates 32-bit lanes by "count" bits.
+  static HH_INLINE void Rotate32By(PPC_VEC_U64* HH_RESTRICT vH,
+                                   PPC_VEC_U64* HH_RESTRICT vL,
+                                   const uint64_t count) {
+    // WARNING: the shift count is 64 bits, so we can't reuse vsize_mod32,
+    // which is broadcast into 32-bit lanes.
+    uint32_t count_rl = uint32_t(count);
+    PPC_VEC_U32 rot_left = {count_rl, count_rl, count_rl, count_rl};
+    *vL = reinterpret_cast<PPC_VEC_U64>(vec_rl(PPC_VEC_U32(*vL), rot_left));
+    *vH = reinterpret_cast<PPC_VEC_U64>(vec_rl(PPC_VEC_U32(*vH), rot_left));
+  }
+
+  static HH_INLINE PPC_VEC_U64 ZipperMerge(const PPC_VEC_U64& v) {
+    // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+    // varying degrees. In descending order of goodness, bytes
+    // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+    // As expected, the upper and lower bytes are much worse.
+    // For each 64-bit lane, our objectives are:
+    // 1) maximizing and equalizing total goodness across each lane's bytes;
+    // 2) mixing with bytes from the neighboring lane;
+    // 3) placing the worst bytes in the upper 32 bits because those will not
+    //    be used in the next 32x32 multiplication.
+
+    const PPC_VEC_U64 mask = {0x000F010E05020C03ull, 0x070806090D0A040Bull};
+    return vec_vperm(v, v, reinterpret_cast<PPC_VEC_U8>(mask));
+  }
+
+  HH_INLINE void Update(const PPC_VEC_U64& packetH,
+                        const PPC_VEC_U64& packetL) {
+    // Tried rearranging the instructions below and benchmarks are similar
+    v1L += packetL + mul0L;
+    v1H += packetH + mul0H;
+    mul0L ^= MultiplyVectors(v1L, Rotate64By32(v0L));
+    mul0H ^= MultiplyVectors(v1H, v0H >> 32);
+    v0L += mul1L;
+    v0H += mul1H;
+    mul1L ^= MultiplyVectors(v0L, Rotate64By32(v1L));
+    mul1H ^= MultiplyVectors(v0H, v1H >> 32);
+    v0L += ZipperMerge(v1L);
+    v1L += ZipperMerge(v0L);
+    v0H += ZipperMerge(v1H);
+    v1H += ZipperMerge(v0H);
+  }
+
+  HH_INLINE void PermuteAndUpdate() {
+    // Permutes v0L and V0H by swapping 32 bits halves of each lane
+    Update(Rotate64By32(v0L), Rotate64By32(v0H));
+  }
+
+  // Returns zero-initialized vector with the lower "size" = 0, 4, 8 or 12
+  // bytes loaded from "bytes". Serves as a replacement for AVX2 maskload_epi32.
+  static HH_INLINE PPC_VEC_U64 LoadMultipleOfFour(const char* bytes,
+                                                  const size_t size) {
+    const uint32_t* words = reinterpret_cast<const uint32_t*>(bytes);
+    // Updating the entries, as if done by vec_insert function call
+    PPC_VEC_U32 ret = {0, 0, 0, 0};
+    if (size & 8) {
+      ret[0] = words[0];
+      ret[1] = words[1];
+      words += 2;
+      if (size & 4) {
+        ret[2] = words[0];
+      }
+    } else if (size & 4) {
+      ret[0] = words[0];
+    }
+    return reinterpret_cast<PPC_VEC_U64>(ret);
+  }
+
+  // Modular reduction by the irreducible polynomial (x^128 + x^2 + x).
+  // Input: a 256-bit number a3210.
+  static HH_INLINE PPC_VEC_U64 ModularReduction(
+      const PPC_VEC_U64& a32_unmasked, const PPC_VEC_U64& a10) {
+    // See Lemire, https://arxiv.org/pdf/1503.03465v8.pdf.
+    PPC_VEC_U64 out = a10;
+    const PPC_VEC_U64 shifted1 = reinterpret_cast<PPC_VEC_U64>(
+        vec_sll(reinterpret_cast<PPC_VEC_U8>(a32_unmasked), vec_splat_u8(1)));
+    const PPC_VEC_U64 shifted2 = reinterpret_cast<PPC_VEC_U64>(
+        vec_sll(reinterpret_cast<PPC_VEC_U8>(a32_unmasked), vec_splat_u8(2)));
+    // The result must be as if the upper two bits of the input had been clear,
+    // otherwise we're no longer computing a reduction.
+    const PPC_VEC_U64 mask = {0xFFFFFFFFFFFFFFFFull, 0x7FFFFFFFFFFFFFFFull};
+    const PPC_VEC_U64 shifted1_masked = shifted1 & mask;
+    out ^= shifted1_masked ^ shifted2;
+    return out;
+  }
+
+  PPC_VEC_U64 v0L;
+  PPC_VEC_U64 v0H;
+  PPC_VEC_U64 v1L;
+  PPC_VEC_U64 v1H;
+  PPC_VEC_U64 mul0L;
+  PPC_VEC_U64 mul0H;
+  PPC_VEC_U64 mul1L;
+  PPC_VEC_U64 mul1H;
+};
+
+}  // namespace HH_TARGET_NAME
+}  // namespace highwayhash
+
+#endif  // HH_DISABLE_TARGET_SPECIFIC
+#endif  // HIGHWAYHASH_HH_VSX_H_
diff --git a/highwayhash/highwayhash/highwayhash.h b/highwayhash/highwayhash/highwayhash.h
new file mode 100644
index 000000000..fea4922b2
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash.h
@@ -0,0 +1,216 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HIGHWAYHASH_H_
+#define HIGHWAYHASH_HIGHWAYHASH_H_
+
+// This header's templates are useful for inlining into other CPU-specific
+// code: template<TargetBits Target> CodeUsingHash() { HighwayHashT<Target>(...); },
+// and can also be instantiated with HH_TARGET when callers don't care about the
+// exact implementation. Otherwise, they are implementation details of the
+// highwayhash_target wrapper. Use that instead if you need to detect the best
+// available implementation at runtime.
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_types.h"
+
+#if HH_ARCH_X64
+#include "highwayhash/iaca.h"
+#endif
+
+// Include exactly one (see arch_specific.h) header, which defines a state
+// object in a target-specific namespace, e.g. AVX2::HHStateAVX2.
+// Attempts to use "computed includes" (#define MACRO "path/or_just_filename",
+// #include MACRO) fail with 'file not found', so we need an #if chain.
+#if HH_TARGET == HH_TARGET_AVX2
+#include "highwayhash/hh_avx2.h"
+#elif HH_TARGET == HH_TARGET_SSE41
+#include "highwayhash/hh_sse41.h"
+#elif HH_TARGET == HH_TARGET_VSX
+#include "highwayhash/hh_vsx.h"
+#elif HH_TARGET == HH_TARGET_NEON
+#include "highwayhash/hh_neon.h"
+#elif HH_TARGET == HH_TARGET_Portable
+#include "highwayhash/hh_portable.h"
+#else
+#error "Unknown target, add its hh_*.h include here."
+#endif
+
+#ifndef HH_DISABLE_TARGET_SPECIFIC
+namespace highwayhash {
+
+// Alias templates (HHStateT) cannot be specialized, so we need a helper struct.
+// Note that hh_*.h don't just specialize HHStateT directly because vector128.h
+// must reside in a distinct namespace (to allow including it from multiple
+// translation units), and it is easier if its users, i.e. the concrete HHState,
+// also reside in that same namespace, which precludes specialization.
+template <TargetBits Target>
+struct HHStateForTarget {};
+
+template <>
+struct HHStateForTarget<HH_TARGET> {
+  // (The namespace is sufficient and the additional HH_TARGET_NAME suffix is
+  // technically redundant, but it makes searching easier.)
+  using type = HH_TARGET_NAME::HH_ADD_TARGET_SUFFIX(HHState);
+};
+
+// Typically used as HHStateT<HH_TARGET>. It would be easier to just have a
+// concrete type HH_STATE, but this alias template is required by the
+// templates in highwayhash_target.cc.
+template <TargetBits Target>
+using HHStateT = typename HHStateForTarget<Target>::type;
+
+// Computes HighwayHash of "bytes" using the implementation chosen by "State".
+//
+// "state" is a HHStateT<> initialized with a key.
+// "bytes" is the data to hash (possibly unaligned).
+// "size" is the number of bytes to hash; we do not read any additional bytes.
+// "hash" is a HHResult* (either 64, 128 or 256 bits).
+//
+// HighwayHash is a strong pseudorandom function with security claims
+// [https://arxiv.org/abs/1612.06257]. It is intended as a safer general-purpose
+// hash, about 4x faster than SipHash and 10x faster than BLAKE2.
+//
+// This template allows callers (e.g. tests) to invoke a specific
+// implementation. It must be compiled with the flags required by the desired
+// implementation. If the entire program cannot be built with these flags, use
+// the wrapper in highwayhash_target.h instead.
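+//
+// A minimal usage sketch (assuming this translation unit is compiled with the
+// target's flags; HH_TARGET then names that target):
+//   HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4};
+//   char in[8] = {1};
+//   HHResult64 result;
+//   HHStateT<HH_TARGET> state(key);
+//   HighwayHashT(&state, in, 8, &result);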
+//
+// Callers wanting to hash multiple pieces of data should duplicate this
+// function, calling HHStateT::Update for each input and only Finalizing once.
+template <class State, typename Result>
+HH_INLINE void HighwayHashT(State* HH_RESTRICT state,
+                            const char* HH_RESTRICT bytes, const size_t size,
+                            Result* HH_RESTRICT hash) {
+  // BeginIACA();
+  const size_t remainder = size & (sizeof(HHPacket) - 1);
+  const size_t truncated = size & ~(sizeof(HHPacket) - 1);
+  for (size_t offset = 0; offset < truncated; offset += sizeof(HHPacket)) {
+    state->Update(*reinterpret_cast<const HHPacket*>(bytes + offset));
+  }
+
+  if (remainder != 0) {
+    state->UpdateRemainder(bytes + truncated, remainder);
+  }
+
+  state->Finalize(hash);
+  // EndIACA();
+}
+
+// Wrapper class for incrementally hashing a series of data ranges. The final
+// result is the same as HighwayHashT of the concatenation of all the ranges.
+// This is useful for computing the hash of cords, iovecs, and similar
+// data structures.
+template <TargetBits Target>
+class HighwayHashCatT {
+ public:
+  HH_INLINE HighwayHashCatT(const HHKey& key) : state_(key) {
+    // Avoids msan uninitialized-memory warnings.
+    HHStateT<Target>::ZeroInitialize(buffer_);
+  }
+
+  // Resets the state of the hasher so it can be used to hash a new string.
+  HH_INLINE void Reset(const HHKey& key) {
+    state_.Reset(key);
+    buffer_usage_ = 0;
+  }
+
+  // Adds "bytes" to the internal buffer, feeding it to HHStateT::Update as
+  // required. Call this as often as desired. Only reads bytes within the
+  // interval [bytes, bytes + num_bytes). "num_bytes" == 0 has no effect.
+  //
+  // Beware that this implies hashing two strings {"A", ""} has the same result
+  // as {"", "A"}. To prevent this when hashing independent fields, you can
+  // append some extra (non-empty) data when a field is empty, or
+  // unconditionally also Append the field length. Either option would ensure
+  // the two examples above result in a different hash.
+  //
+  // There are no alignment requirements.
+  HH_INLINE void Append(const char* HH_RESTRICT bytes, size_t num_bytes) {
+    // BeginIACA();
+    const size_t capacity = sizeof(HHPacket) - buffer_usage_;
+    // New bytes fit within buffer, but still not enough to Update.
+    if (HH_UNLIKELY(num_bytes < capacity)) {
+      HHStateT<Target>::AppendPartial(bytes, num_bytes, buffer_, buffer_usage_);
+      buffer_usage_ += num_bytes;
+      return;
+    }
+
+    // HACK: ensures the state is kept in SIMD registers; otherwise, Update
+    // constantly load/stores its operands, which is much slower.
+    // Restrict-qualified pointers to external state or the state_ member are
+    // not sufficient for keeping this in registers.
+    HHStateT<Target> state_copy = state_;
+
+    // Have prior bytes to flush.
+    const size_t buffer_usage = buffer_usage_;
+    if (HH_LIKELY(buffer_usage != 0)) {
+      // Calls update with prior buffer contents plus new data. Does not modify
+      // the buffer because some implementations can load into SIMD registers
+      // and Append to them directly.
+      state_copy.AppendAndUpdate(bytes, capacity, buffer_, buffer_usage);
+      bytes += capacity;
+      num_bytes -= capacity;
+    }
+
+    // Buffer currently empty => Update directly from the source.
+    while (num_bytes >= sizeof(HHPacket)) {
+      state_copy.Update(*reinterpret_cast<const HHPacket*>(bytes));
+      bytes += sizeof(HHPacket);
+      num_bytes -= sizeof(HHPacket);
+    }
+
+    // Unconditionally assign even if zero because we didn't reset to zero
+    // after the AppendAndUpdate above.
+    buffer_usage_ = num_bytes;
+
+    state_ = state_copy;
+
+    // Store any remainders in buffer, no-op if multiple of a packet.
+    if (HH_LIKELY(num_bytes != 0)) {
+      HHStateT<Target>::CopyPartial(bytes, num_bytes, buffer_);
+    }
+    // EndIACA();
+  }
+
+  // Stores the resulting 64, 128 or 256-bit hash of data previously passed to
+  // Append since construction or a prior call to Reset.
+  template <typename Result>  // HHResult*
+  HH_INLINE void Finalize(Result* HH_RESTRICT hash) const {
+    // BeginIACA();
+    HHStateT<Target> state_copy = state_;
+    const size_t buffer_usage = buffer_usage_;
+    if (HH_LIKELY(buffer_usage != 0)) {
+      state_copy.UpdateRemainder(buffer_, buffer_usage);
+    }
+    state_copy.Finalize(hash);
+    // EndIACA();
+  }
+
+ private:
+  HH_ALIGNAS(64) HHPacket buffer_;
+  HH_ALIGNAS(32) HHStateT<Target> state_;
+  // How many bytes in buffer_ (starting with offset 0) are valid.
+  size_t buffer_usage_ = 0;
+};
+
+}  // namespace highwayhash
+#endif  // HH_DISABLE_TARGET_SPECIFIC
+#endif  // HIGHWAYHASH_HIGHWAYHASH_H_
diff --git a/highwayhash/highwayhash/highwayhash_fuzzer.cc b/highwayhash/highwayhash/highwayhash_fuzzer.cc
new file mode 100644
index 000000000..5234fcb01
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_fuzzer.cc
@@ -0,0 +1,25 @@
+#include "highwayhash/highwayhash_target.h"
+#include "highwayhash/instruction_sets.h"
+
+using highwayhash::HHKey;
+using highwayhash::HHResult64;
+using highwayhash::HighwayHash;
+using highwayhash::InstructionSets;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  if (size < sizeof(uint64_t) * 4) {
+    return 0;
+  }
+
+  // Generate the key.
+  const uint64_t *u64s = reinterpret_cast<const uint64_t *>(data);
+  HH_ALIGNAS(32) const HHKey key = {u64s[0], u64s[1], u64s[2], u64s[3]};
+  data += sizeof(uint64_t) * 4;
+  size -= sizeof(uint64_t) * 4;
+
+  // Compute the hash.
+  HHResult64 result;
+  InstructionSets::Run<HighwayHash>(
+      key, reinterpret_cast<const char *>(data), size, &result);
+  return 0;
+}
diff --git a/highwayhash/highwayhash/highwayhash_target.cc b/highwayhash/highwayhash/highwayhash_target.cc
new file mode 100644
index 000000000..74022f64b
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_target.cc
@@ -0,0 +1,104 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
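+
+// A minimal usage sketch of the dispatcher defined below (the same pattern as
+// highwayhash_fuzzer.cc above):
+//   HH_ALIGNAS(32) const HHKey key = {1, 2, 3, 4};
+//   HHResult64 result;
+//   InstructionSets::Run<HighwayHash>(key, bytes, size, &result);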
+ +#include "highwayhash/highwayhash_target.h" + +#include "highwayhash/highwayhash.h" + +#ifndef HH_DISABLE_TARGET_SPECIFIC +namespace highwayhash { + +extern "C" { +uint64_t HH_ADD_TARGET_SUFFIX(HighwayHash64_)(const HHKey key, + const char* bytes, + const uint64_t size) { + HHStateT state(key); + HHResult64 result; + HighwayHashT(&state, bytes, size, &result); + return result; +} +} // extern "C" + +template +void HighwayHash::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const size_t size, + HHResult64* HH_RESTRICT hash) const { + HHStateT state(key); + HighwayHashT(&state, bytes, size, hash); +} + +template +void HighwayHash::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const size_t size, + HHResult128* HH_RESTRICT hash) const { + HHStateT state(key); + HighwayHashT(&state, bytes, size, hash); +} + +template +void HighwayHash::operator()(const HHKey& key, + const char* HH_RESTRICT bytes, + const size_t size, + HHResult256* HH_RESTRICT hash) const { + HHStateT state(key); + HighwayHashT(&state, bytes, size, hash); +} + +template +void HighwayHashCat::operator()(const HHKey& key, + const StringView* HH_RESTRICT fragments, + const size_t num_fragments, + HHResult64* HH_RESTRICT hash) const { + HighwayHashCatT cat(key); + for (size_t i = 0; i < num_fragments; ++i) { + cat.Append(fragments[i].data, fragments[i].num_bytes); + } + cat.Finalize(hash); +} + +template +void HighwayHashCat::operator()(const HHKey& key, + const StringView* HH_RESTRICT fragments, + const size_t num_fragments, + HHResult128* HH_RESTRICT hash) const { + HighwayHashCatT cat(key); + for (size_t i = 0; i < num_fragments; ++i) { + cat.Append(fragments[i].data, fragments[i].num_bytes); + } + cat.Finalize(hash); +} + +template +void HighwayHashCat::operator()(const HHKey& key, + const StringView* HH_RESTRICT fragments, + const size_t num_fragments, + HHResult256* HH_RESTRICT hash) const { + HighwayHashCatT cat(key); + for (size_t i = 0; i < num_fragments; ++i) { + cat.Append(fragments[i].data, fragments[i].num_bytes); + } + cat.Finalize(hash); +} + +// Instantiate for the current target. +template struct HighwayHash; +template struct HighwayHashCat; + +} // namespace highwayhash +#endif // HH_DISABLE_TARGET_SPECIFIC diff --git a/highwayhash/highwayhash/highwayhash_target.h b/highwayhash/highwayhash/highwayhash_target.h new file mode 100644 index 000000000..08b803f19 --- /dev/null +++ b/highwayhash/highwayhash/highwayhash_target.h @@ -0,0 +1,91 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ +#define HIGHWAYHASH_HIGHWAYHASH_TARGET_H_ + +// Adapter for the InstructionSets::Run dispatcher, which invokes the best +// implementations available on the current CPU. + +// WARNING: this is a "restricted" header because it is included from +// translation units compiled with different flags. 
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_types.h"
+
+namespace highwayhash {
+
+// Usage: InstructionSets::Run<HighwayHash>(key, bytes, size, hash).
+// This incurs some small dispatch overhead. If the entire program is compiled
+// for the target CPU, you can instead call HighwayHashT directly to avoid any
+// overhead. This template is instantiated in the source file, which is
+// compiled once for every target with the required flags (e.g. -mavx2).
+template <TargetBits Target>
+struct HighwayHash {
+  // Stores a 64/128/256 bit hash of "bytes" using the HighwayHashT
+  // implementation for the "Target" CPU. The hash result is identical
+  // regardless of which implementation is used.
+  //
+  // "key" is a (randomly generated or hard-coded) HHKey.
+  // "bytes" is the data to hash (possibly unaligned).
+  // "size" is the number of bytes to hash; we do not read any additional bytes.
+  // "hash" is a HHResult* (either 64, 128 or 256 bits).
+  //
+  // HighwayHash is a strong pseudorandom function with security claims
+  // [https://arxiv.org/abs/1612.06257]. It is intended as a safer
+  // general-purpose hash, 5x faster than SipHash and 10x faster than BLAKE2.
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const size_t size, HHResult64* HH_RESTRICT hash) const;
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const size_t size, HHResult128* HH_RESTRICT hash) const;
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const size_t size, HHResult256* HH_RESTRICT hash) const;
+};
+
+// Replacement for C++17 std::string_view that avoids dependencies.
+// A struct requires fewer allocations when calling HighwayHashCat with
+// non-const "num_fragments".
+struct StringView {
+  const char* data;  // not necessarily aligned/padded
+  size_t num_bytes;  // possibly zero
+};
+
+// Note: this interface avoids dispatch overhead per fragment.
+template <TargetBits Target>
+struct HighwayHashCat {
+  // Stores a 64/128/256 bit hash of all "num_fragments" "fragments" using the
+  // HighwayHashCatT implementation for "Target". The hash result is identical
+  // to HighwayHash of the flattened data, regardless of Target.
+  //
+  // "key" is a (randomly generated or hard-coded) HHKey.
+  // "fragments" contain unaligned pointers and the number of valid bytes.
+  // "num_fragments" indicates the number of entries in "fragments".
+  // "hash" is a HHResult* (either 64, 128 or 256 bits).
+  void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments,
+                  const size_t num_fragments,
+                  HHResult64* HH_RESTRICT hash) const;
+  void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments,
+                  const size_t num_fragments,
+                  HHResult128* HH_RESTRICT hash) const;
+  void operator()(const HHKey& key, const StringView* HH_RESTRICT fragments,
+                  const size_t num_fragments,
+                  HHResult256* HH_RESTRICT hash) const;
+};
+
+}  // namespace highwayhash
+
+#endif  // HIGHWAYHASH_HIGHWAYHASH_TARGET_H_
diff --git a/highwayhash/highwayhash/highwayhash_test.cc b/highwayhash/highwayhash/highwayhash_test.cc
new file mode 100644
index 000000000..aed9a9eed
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_test.cc
@@ -0,0 +1,391 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Ensures each implementation of HighwayHash returns consistent and unchanging
+// hash values.
+
+#include "highwayhash/highwayhash_test_target.h"
+
+#include <stddef.h>
+#include <atomic>
+#include <cstdio>
+#include <cstdlib>
+
+#ifdef HH_GOOGLETEST
+#include "testing/base/public/gunit.h"
+#endif
+
+#include "highwayhash/data_parallel.h"
+#include "highwayhash/highwayhash_target.h"
+#include "highwayhash/instruction_sets.h"
+
+// Define to nonzero in order to print the (new) golden outputs.
+// WARNING: HighwayHash is frozen, so the golden values must not change.
+#define PRINT_RESULTS 0
+
+namespace highwayhash {
+namespace {
+
+// Known-good outputs are verified for all lengths in [0, 64].
+const size_t kMaxSize = 64;
+
+#if PRINT_RESULTS
+void Print(const HHResult64 result) { printf("0x%016lXull,\n", result); }
+
+// For HHResult128/256.
+template <int kNumLanes>
+void Print(const HHResult64 (&result)[kNumLanes]) {
+  printf("{ ");
+  for (int i = 0; i < kNumLanes; ++i) {
+    if (i != 0) {
+      printf(", ");
+    }
+    printf("0x%016lXull", result[i]);
+  }
+  printf("},\n");
+}
+#endif  // PRINT_RESULTS
+
+// Called when any test fails; exits immediately because one mismatch usually
+// implies many others.
+void OnFailure(const char* target_name, const size_t size) {
+  printf("Mismatch at size %zu for target %s\n", size, target_name);
+#ifdef HH_GOOGLETEST
+  EXPECT_TRUE(false);
+#endif
+  exit(1);
+}
+
+// Verifies every combination of implementation and input size. Returns which
+// targets were run/verified.
+template <typename Result>
+TargetBits VerifyImplementations(const Result (&known_good)[kMaxSize + 1]) {
+  const HHKey key = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL,
+                     0x1716151413121110ULL, 0x1F1E1D1C1B1A1918ULL};
+
+  TargetBits targets = ~0U;
+
+  // For each test input: empty string, 00, 00 01, ...
+  char in[kMaxSize + 1] = {0};
+  // Fast enough that we don't need a thread pool.
+  for (uint64_t size = 0; size <= kMaxSize; ++size) {
+    in[size] = static_cast<char>(size);
+#if PRINT_RESULTS
+    Result actual;
+    targets &= InstructionSets::Run<HighwayHash>(key, in, size, &actual);
+    Print(actual);
+#else
+    const Result* expected = &known_good[size];
+    targets &= InstructionSets::RunAll<HighwayHashTest>(key, in, size, expected,
+                                                        &OnFailure);
+#endif
+  }
+  return targets;
+}
+
+// Cat
+
+void OnCatFailure(const char* target_name, const size_t size) {
+  printf("Cat mismatch at size %zu\n", size);
+#ifdef HH_GOOGLETEST
+  EXPECT_TRUE(false);
+#endif
+  exit(1);
+}
+
+// Returns which targets were run/verified.
+template <typename Result>
+TargetBits VerifyCat(ThreadPool* pool) {
+  // Reversed order vs prior test.
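+  // (Same lanes as the VerifyImplementations key, in reverse order.)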
+  const HHKey key = {0x1F1E1D1C1B1A1918ULL, 0x1716151413121110ULL,
+                     0x0F0E0D0C0B0A0908ULL, 0x0706050403020100ULL};
+
+  const size_t kMaxSize = 3 * 35;
+  char flat[kMaxSize];
+  srand(129);
+  for (size_t size = 0; size < kMaxSize; ++size) {
+    flat[size] = static_cast<char>(rand() & 0xFF);
+  }
+
+  std::atomic<TargetBits> targets{~0U};
+
+  pool->Run(0, kMaxSize, [&key, &flat, &targets](const uint32_t i) {
+    Result dummy;
+    targets.fetch_and(InstructionSets::RunAll<HighwayHashCatTest>(
+        key, flat, i, &dummy, &OnCatFailure));
+  });
+  return targets.load();
+}
+
+// WARNING: HighwayHash is frozen, so the golden values must not change.
+const HHResult64 kExpected64[kMaxSize + 1] = {
+    0x907A56DE22C26E53ull, 0x7EAB43AAC7CDDD78ull, 0xB8D0569AB0B53D62ull,
+    0x5C6BEFAB8A463D80ull, 0xF205A46893007EDAull, 0x2B8A1668E4A94541ull,
+    0xBD4CCC325BEFCA6Full, 0x4D02AE1738F59482ull, 0xE1205108E55F3171ull,
+    0x32D2644EC77A1584ull, 0xF6E10ACDB103A90Bull, 0xC3BBF4615B415C15ull,
+    0x243CC2040063FA9Cull, 0xA89A58CE65E641FFull, 0x24B031A348455A23ull,
+    0x40793F86A449F33Bull, 0xCFAB3489F97EB832ull, 0x19FE67D2C8C5C0E2ull,
+    0x04DD90A69C565CC2ull, 0x75D9518E2371C504ull, 0x38AD9B1141D3DD16ull,
+    0x0264432CCD8A70E0ull, 0xA9DB5A6288683390ull, 0xD7B05492003F028Cull,
+    0x205F615AEA59E51Eull, 0xEEE0C89621052884ull, 0x1BFC1A93A7284F4Full,
+    0x512175B5B70DA91Dull, 0xF71F8976A0A2C639ull, 0xAE093FEF1F84E3E7ull,
+    0x22CA92B01161860Full, 0x9FC7007CCF035A68ull, 0xA0C964D9ECD580FCull,
+    0x2C90F73CA03181FCull, 0x185CF84E5691EB9Eull, 0x4FC1F5EF2752AA9Bull,
+    0xF5B7391A5E0A33EBull, 0xB9B84B83B4E96C9Cull, 0x5E42FE712A5CD9B4ull,
+    0xA150F2F90C3F97DCull, 0x7FA522D75E2D637Dull, 0x181AD0CC0DFFD32Bull,
+    0x3889ED981E854028ull, 0xFB4297E8C586EE2Dull, 0x6D064A45BB28059Cull,
+    0x90563609B3EC860Cull, 0x7AA4FCE94097C666ull, 0x1326BAC06B911E08ull,
+    0xB926168D2B154F34ull, 0x9919848945B1948Dull, 0xA2A98FC534825EBEull,
+    0xE9809095213EF0B6ull, 0x582E5483707BC0E9ull, 0x086E9414A88A6AF5ull,
+    0xEE86B98D20F6743Dull, 0xF89B7FF609B1C0A7ull, 0x4C7D9CC19E22C3E8ull,
+    0x9A97005024562A6Full, 0x5DD41CF423E6EBEFull, 0xDF13609C0468E227ull,
+    0x6E0DA4F64188155Aull, 0xB755BA4B50D7D4A1ull, 0x887A3484647479BDull,
+    0xAB8EEBE9BF2139A0ull, 0x75542C5D4CD2A6FFull};
+
+// WARNING: HighwayHash is frozen, so the golden values must not change.
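+// 128-bit golden values: two 64-bit lanes per entry.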
+const HHResult128 kExpected128[kMaxSize + 1] = { + {0x0FED268F9D8FFEC7ull, 0x33565E767F093E6Full}, + {0xD6B0A8893681E7A8ull, 0xDC291DF9EB9CDCB4ull}, + {0x3D15AD265A16DA04ull, 0x78085638DC32E868ull}, + {0x0607621B295F0BEBull, 0xBFE69A0FD9CEDD79ull}, + {0x26399EB46DACE49Eull, 0x2E922AD039319208ull}, + {0x3250BDC386D12ED8ull, 0x193810906C63C23Aull}, + {0x6F476AB3CB896547ull, 0x7CDE576F37ED1019ull}, + {0x2A401FCA697171B4ull, 0xBE1F03FF9F02796Cull}, + {0xA1E96D84280552E8ull, 0x695CF1C63BEC0AC2ull}, + {0x142A2102F31E63B2ull, 0x1A85B98C5B5000CCull}, + {0x51A1B70E26B6BC5Bull, 0x929E1F3B2DA45559ull}, + {0x88990362059A415Bull, 0xBED21F22C47B7D13ull}, + {0xCD1F1F5F1CAF9566ull, 0xA818BA8CE0F9C8D4ull}, + {0xA225564112FE6157ull, 0xB2E94C78B8DDB848ull}, + {0xBD492FEBD1CC0919ull, 0xCECD1DBC025641A2ull}, + {0x142237A52BC4AF54ull, 0xE0796C0B6E26BCD7ull}, + {0x414460FFD5A401ADull, 0x029EA3D5019F18C8ull}, + {0xC52A4B96C51C9962ull, 0xECB878B1169B5EA0ull}, + {0xD940CA8F11FBEACEull, 0xF93A46D616F8D531ull}, + {0x8AC49D0AE5C0CBF5ull, 0x3FFDBF8DF51D7C93ull}, + {0xAC6D279B852D00A8ull, 0x7DCD3A6BA5EBAA46ull}, + {0xF11621BD93F08A56ull, 0x3173C398163DD9D5ull}, + {0x0C4CE250F68CF89Full, 0xB3123CDA411898EDull}, + {0x15AB97ED3D9A51CEull, 0x7CE274479169080Eull}, + {0xCD001E198D4845B8ull, 0xD0D9D98BD8AA2D77ull}, + {0x34F3D617A0493D79ull, 0x7DD304F6397F7E16ull}, + {0x5CB56890A9F4C6B6ull, 0x130829166567304Full}, + {0x30DA6F8B245BD1C0ull, 0x6F828B7E3FD9748Cull}, + {0xE0580349204C12C0ull, 0x93F6DA0CAC5F441Cull}, + {0xF648731BA5073045ull, 0x5FB897114FB65976ull}, + {0x024F8354738A5206ull, 0x509A4918EB7E0991ull}, + {0x06E7B465E8A57C29ull, 0x52415E3A07F5D446ull}, + {0x1984DF66C1434AAAull, 0x16FC1958F9B3E4B9ull}, + {0x111678AFE0C6C36Cull, 0xF958B59DE5A2849Dull}, + {0x773FBC8440FB0490ull, 0xC96ED5D243658536ull}, + {0x91E3DC710BB6C941ull, 0xEA336A0BC1EEACE9ull}, + {0x25CFE3815D7AD9D4ull, 0xF2E94F8C828FC59Eull}, + {0xB9FB38B83CC288F2ull, 0x7479C4C8F850EC04ull}, + {0x1D85D5C525982B8Cull, 0x6E26B1C16F48DBF4ull}, + {0x8A4E55BD6060BDE7ull, 0x2134D599058B3FD0ull}, + {0x2A958FF994778F36ull, 0xE8052D1AE61D6423ull}, + {0x89233AE6BE453233ull, 0x3ACF9C87D7E8C0B9ull}, + {0x4458F5E27EA9C8D5ull, 0x418FB49BCA2A5140ull}, + {0x090301837ED12A68ull, 0x1017F69633C861E6ull}, + {0x330DD84704D49590ull, 0x339DF1AD3A4BA6E4ull}, + {0x569363A663F2C576ull, 0x363B3D95E3C95EF6ull}, + {0xACC8D08586B90737ull, 0x2BA0E8087D4E28E9ull}, + {0x39C27A27C86D9520ull, 0x8DB620A45160932Eull}, + {0x8E6A4AEB671A072Dull, 0x6ED3561A10E47EE6ull}, + {0x0011D765B1BEC74Aull, 0xD80E6E656EDE842Eull}, + {0x2515D62B936AC64Cull, 0xCE088794D7088A7Dull}, + {0x91621552C16E23AFull, 0x264F0094EB23CCEFull}, + {0x1E21880D97263480ull, 0xD8654807D3A31086ull}, + {0x39D76AAF097F432Dull, 0xA517E1E09D074739ull}, + {0x0F17A4F337C65A14ull, 0x2F51215F69F976D4ull}, + {0xA0FB5CDA12895E44ull, 0x568C3DC4D1F13CD1ull}, + {0x93C8FC00D89C46CEull, 0xBAD5DA947E330E69ull}, + {0x817C07501D1A5694ull, 0x584D6EE72CBFAC2Bull}, + {0x91D668AF73F053BFull, 0xF98E647683C1E0EDull}, + {0x5281E1EF6B3CCF8Bull, 0xBC4CC3DF166083D8ull}, + {0xAAD61B6DBEAAEEB9ull, 0xFF969D000C16787Bull}, + {0x4325D84FC0475879ull, 0x14B919BD905F1C2Dull}, + {0x79A176D1AA6BA6D1ull, 0xF1F720C5A53A2B86ull}, + {0x74BD7018022F3EF0ull, 0x3AEA94A8AD5F4BCBull}, + {0x98BB1F7198D4C4F2ull, 0xE0BC0571DE918FC8ull}}; + +// WARNING: HighwayHash is frozen, so the golden values must not change. 
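+// 256-bit golden values: four 64-bit lanes per entry.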
+const HHResult256 kExpected256[kMaxSize + 1] = { + {0xDD44482AC2C874F5ull, 0xD946017313C7351Full, 0xB3AEBECCB98714FFull, + 0x41DA233145751DF4ull}, + {0xEDB941BCE45F8254ull, 0xE20D44EF3DCAC60Full, 0x72651B9BCB324A47ull, + 0x2073624CB275E484ull}, + {0x3FDFF9DF24AFE454ull, 0x11C4BF1A1B0AE873ull, 0x115169CC6922597Aull, + 0x1208F6590D33B42Cull}, + {0x480AA0D70DD1D95Cull, 0x89225E7C6911D1D0ull, 0x8EA8426B8BBB865Aull, + 0xE23DFBC390E1C722ull}, + {0xC9CFC497212BE4DCull, 0xA85F9DF6AFD2929Bull, 0x1FDA9F211DF4109Eull, + 0x07E4277A374D4F9Bull}, + {0xB4B4F566A4DC85B3ull, 0xBF4B63BA5E460142ull, 0x15F48E68CDDC1DE3ull, + 0x0F74587D388085C6ull}, + {0x6445C70A86ADB9B4ull, 0xA99CFB2784B4CEB6ull, 0xDAE29D40A0B2DB13ull, + 0xB6526DF29A9D1170ull}, + {0xD666B1A00987AD81ull, 0xA4F1F838EB8C6D37ull, 0xE9226E07D463E030ull, + 0x5754D67D062C526Cull}, + {0xF1B905B0ED768BC0ull, 0xE6976FF3FCFF3A45ull, 0x4FBE518DD9D09778ull, + 0xD9A0AFEB371E0D33ull}, + {0x80D8E4D70D3C2981ull, 0xF10FBBD16424F1A1ull, 0xCF5C2DBE9D3F0CD1ull, + 0xC0BFE8F701B673F2ull}, + {0xADE48C50E5A262BEull, 0x8E9492B1FDFE38E0ull, 0x0784B74B2FE9B838ull, + 0x0E41D574DB656DCDull}, + {0xA1BE77B9531807CFull, 0xBA97A7DE6A1A9738ull, 0xAF274CEF9C8E261Full, + 0x3E39B935C74CE8E8ull}, + {0x15AD3802E3405857ull, 0x9D11CBDC39E853A0ull, 0x23EA3E993C31B225ull, + 0x6CD9E9E3CAF4212Eull}, + {0x01C96F5EB1D77C36ull, 0xA367F9C1531F95A6ull, 0x1F94A3427CDADCB8ull, + 0x97F1000ABF3BD5D3ull}, + {0x0815E91EEEFF8E41ull, 0x0E0C28FA6E21DF5Dull, 0x4EAD8E62ED095374ull, + 0x3FFD01DA1C9D73E6ull}, + {0xC11905707842602Eull, 0x62C3DB018501B146ull, 0x85F5AD17FA3406C1ull, + 0xC884F87BD4FEC347ull}, + {0xF51AD989A1B6CD1Full, 0xF7F075D62A627BD9ull, 0x7E01D5F579F28A06ull, + 0x1AD415C16A174D9Full}, + {0x19F4CFA82CA4068Eull, 0x3B9D4ABD3A9275B9ull, 0x8000B0DDE9C010C6ull, + 0x8884D50949215613ull}, + {0x126D6C7F81AB9F5Dull, 0x4EDAA3C5097716EEull, 0xAF121573A7DD3E49ull, + 0x9001AC85AA80C32Dull}, + {0x06AABEF9149155FAull, 0xDF864F4144E71C3Dull, 0xFDBABCE860BC64DAull, + 0xDE2BA54792491CB6ull}, + {0xADFC6B4035079FDBull, 0xA087B7328E486E65ull, 0x46D1A9935A4623EAull, + 0xE3895C440D3CEE44ull}, + {0xB5F9D31DEEA3B3DFull, 0x8F3024E20A06E133ull, 0xF24C38C8288FE120ull, + 0x703F1DCF9BD69749ull}, + {0x2B3C0B854794EFE3ull, 0x1C5D3F969BDACEA0ull, 0x81F16AAFA563AC2Eull, + 0x23441C5A79D03075ull}, + {0x418AF8C793FD3762ull, 0xBC6B8E9461D7F924ull, 0x776FF26A2A1A9E78ull, + 0x3AA0B7BFD417CA6Eull}, + {0xCD03EA2AD255A3C1ull, 0x0185FEE5B59C1B2Aull, 0xD1F438D44F9773E4ull, + 0xBE69DD67F83B76E4ull}, + {0xF951A8873887A0FBull, 0x2C7B31D2A548E0AEull, 0x44803838B6186EFAull, + 0xA3C78EC7BE219F72ull}, + {0x958FF151EA0D8C08ull, 0x4B7E8997B4F63488ull, 0xC78E074351C5386Dull, + 0xD95577556F20EEFAull}, + {0x29A917807FB05406ull, 0x3318F884351F578Cull, 0xDD24EA6EF6F6A7FAull, + 0xE74393465E97AEFFull}, + {0x98240880935E6CCBull, 0x1FD0D271B09F97DAull, 0x56E786472700B183ull, + 0x291649F99F747817ull}, + {0x1BD4954F7054C556ull, 0xFFDB2EFF7C596CEBull, 0x7C6AC69A1BAB6B5Bull, + 0x0F037670537FC153ull}, + {0x8825E38897597498ull, 0x647CF6EBAF6332C1ull, 0x552BD903DC28C917ull, + 0x72D7632C00BFC5ABull}, + {0x6880E276601A644Dull, 0xB3728B20B10FB7DAull, 0xD0BD12060610D16Eull, + 0x8AEF14EF33452EF2ull}, + {0xBCE38C9039A1C3FEull, 0x42D56326A3C11289ull, 0xE35595F764FCAEA9ull, + 0xC9B03C6BC9475A99ull}, + {0xF60115CBF034A6E5ull, 0x6C36EA75BFCE46D0ull, 0x3B17C8D382725990ull, + 0x7EDAA2ED11007A35ull}, + {0x1326E959EDF9DEA2ull, 0xC4776801739F720Cull, 0x5169500FD762F62Full, + 0x8A0DD0D90A2529ABull}, + {0x935149D503D442D4ull, 0xFF6BB41302DAD144ull, 0x339CB012CD9D36ECull, + 
0xE61D53619ECC2230ull}, + {0x528BC888AA50B696ull, 0xB8AEECA36084E1FCull, 0xA158151EC0243476ull, + 0x02C14AAD097CEC44ull}, + {0xBED688A72217C327ull, 0x1EE65114F760873Full, 0x3F5C26B37D3002A6ull, + 0xDDF2E895631597B9ull}, + {0xE7DB21CF2B0B51ADull, 0xFAFC6324F4B0AB6Cull, 0xB0857244C22D9C5Bull, + 0xF0AD888D1E05849Cull}, + {0x05519793CD4DCB00ull, 0x3C594A3163067DEBull, 0xAC75081ACF119E34ull, + 0x5AC86297805CB094ull}, + {0x09228D8C22B5779Eull, 0x19644DB2516B7E84ull, 0x2B92C8ABF83141A0ull, + 0x7F785AD725E19391ull}, + {0x59C42E5D46D0A74Bull, 0x5EA53C65CA036064ull, 0x48A9916BB635AEB4ull, + 0xBAE6DF143F54E9D4ull}, + {0x5EB623696D03D0E3ull, 0xD53D78BCB41DA092ull, 0xFE2348DC52F6B10Dull, + 0x64802457632C8C11ull}, + {0x43B61BB2C4B85481ull, 0xC6318C25717E80A1ull, 0x8C4A7F4D6F9C687Dull, + 0xBD0217E035401D7Cull}, + {0x7F51CA5743824C37ull, 0xB04C4D5EB11D703Aull, 0x4D511E1ECBF6F369ull, + 0xD66775EA215456E2ull}, + {0x39B409EEF87E45CCull, 0x52B8E8C459FC79B3ull, 0x44920918D1858C24ull, + 0x80F07B645EEE0149ull}, + {0xCE8694D1BE9AD514ull, 0xBFA19026526836E7ull, 0x1EA4FDF6E4902A7Dull, + 0x380C4458D696E1FEull}, + {0xD189E18BF823A0A4ull, 0x1F3B353BE501A7D7ull, 0xA24F77B4E02E2884ull, + 0x7E94646F74F9180Cull}, + {0xAFF8C635D325EC48ull, 0x2C2E0AA414038D0Bull, 0x4ED37F611A447467ull, + 0x39EC38E33B501489ull}, + {0x2A2BFDAD5F83F197ull, 0x013D3E6EBEF274CCull, 0xE1563C0477726155ull, + 0xF15A8A5DE932037Eull}, + {0xD5D1F91EC8126332ull, 0x10110B9BF9B1FF11ull, 0xA175AB26541C6032ull, + 0x87BADC5728701552ull}, + {0xC7B5A92CD8082884ull, 0xDDA62AB61B2EEEFBull, 0x8F9882ECFEAE732Full, + 0x6B38BD5CC01F4FFBull}, + {0xCF6EF275733D32F0ull, 0xA3F0822DA2BF7D8Bull, 0x304E7435F512406Aull, + 0x0B28E3EFEBB3172Dull}, + {0xE698F80701B2E9DBull, 0x66AE2A819A8A8828ull, 0x14EA9024C9B8F2C9ull, + 0xA7416170523EB5A4ull}, + {0x3A917E87E307EDB7ull, 0x17B4DEDAE34452C1ull, 0xF689F162E711CC70ull, + 0x29CE6BFE789CDD0Eull}, + {0x0EFF3AD8CB155D8Eull, 0x47CD9EAD4C0844A2ull, 0x46C8E40EE6FE21EBull, + 0xDEF3C25DF0340A51ull}, + {0x03FD86E62B82D04Dull, 0x32AB0D600717136Dull, 0x682B0E832B857A89ull, + 0x138CE3F1443739B1ull}, + {0x2F77C754C4D7F902ull, 0x1053E0A9D9ADBFEAull, 0x58E66368544AE70Aull, + 0xC48A829C72DD83CAull}, + {0xF900EB19E466A09Full, 0x31BE9E01A8C7D314ull, 0x3AFEC6B8CA08F471ull, + 0xB8C0EB0F87FFE7FBull}, + {0xDB277D8FBE3C8EFBull, 0x53CE6877E11AA57Bull, 0x719C94D20D9A7E7Dull, + 0xB345B56392453CC9ull}, + {0x37639C3BDBA4F2C9ull, 0x6095E7B336466DC8ull, 0x3A8049791E65B88Aull, + 0x82C988CDE5927CD5ull}, + {0x6B1FB1A714234AE4ull, 0x20562E255BA6467Eull, 0x3E2B892D40F3D675ull, + 0xF40CE3FBE41ED768ull}, + {0x8EE11CB1B287C92Aull, 0x8FC2AAEFF63D266Dull, 0x66643487E6EB9F03ull, + 0x578AA91DE8D56873ull}, + {0xF5B1F8266A3AEB67ull, 0x83B040BE4DEC1ADDull, 0x7FE1C8635B26FBAEull, + 0xF4A3A447DEFED79Full}, + {0x90D8E6FF6AC12475ull, 0x1A422A196EDAC1F2ull, 0x9E3765FE1F8EB002ull, + 0xC1BDD7C4C351CFBEull}}; + +void RunTests() { + // TODO(janwas): detect number of cores. + ThreadPool pool(4); + + TargetBits tested = ~0U; + tested &= VerifyImplementations(kExpected64); + tested &= VerifyImplementations(kExpected128); + tested &= VerifyImplementations(kExpected256); + // Any failure causes immediate exit, so apparently all succeeded. 
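+  // Print one "OK" line per target that was run and verified.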
+  HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) {
+    printf("%10s: OK\n", TargetName(target));
+  });
+
+  tested = ~0U;
+  tested &= VerifyCat<HHResult64>(&pool);
+  tested &= VerifyCat<HHResult128>(&pool);
+  tested &= VerifyCat<HHResult256>(&pool);
+  HH_TARGET_NAME::ForeachTarget(tested, [](const TargetBits target) {
+    printf("%10sCat: OK\n", TargetName(target));
+  });
+}
+
+#ifdef HH_GOOGLETEST
+TEST(HighwayhashTest, OutputMatchesExpectations) { RunTests(); }
+#endif
+
+}  // namespace
+}  // namespace highwayhash
+
+#ifndef HH_GOOGLETEST
+int main(int argc, char* argv[]) {
+  highwayhash::RunTests();
+  return 0;
+}
+#endif
diff --git a/highwayhash/highwayhash/highwayhash_test_avx2.cc b/highwayhash/highwayhash/highwayhash_test_avx2.cc
new file mode 100644
index 000000000..f1efe0b5f
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_test_avx2.cc
@@ -0,0 +1,19 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME AVX2
+#include "highwayhash/highwayhash_test_target.cc"
diff --git a/highwayhash/highwayhash/highwayhash_test_neon.cc b/highwayhash/highwayhash/highwayhash_test_neon.cc
new file mode 100644
index 000000000..df5058829
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_test_neon.cc
@@ -0,0 +1,22 @@
+// Copyright 2017-2019 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME NEON
+// GCC 4.5.4 only defines the former; 5.4 defines both.
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include "highwayhash/highwayhash_test_target.cc"
+#endif
diff --git a/highwayhash/highwayhash/highwayhash_test_portable.cc b/highwayhash/highwayhash/highwayhash_test_portable.cc
new file mode 100644
index 000000000..04930a7e1
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_test_portable.cc
@@ -0,0 +1,19 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#define HH_TARGET_NAME Portable +#include "highwayhash/highwayhash_test_target.cc" diff --git a/highwayhash/highwayhash/highwayhash_test_sse41.cc b/highwayhash/highwayhash/highwayhash_test_sse41.cc new file mode 100644 index 000000000..2d6e83d66 --- /dev/null +++ b/highwayhash/highwayhash/highwayhash_test_sse41.cc @@ -0,0 +1,19 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#define HH_TARGET_NAME SSE41 +#include "highwayhash/highwayhash_test_target.cc" diff --git a/highwayhash/highwayhash/highwayhash_test_target.cc b/highwayhash/highwayhash/highwayhash_test_target.cc new file mode 100644 index 000000000..65afd4e91 --- /dev/null +++ b/highwayhash/highwayhash/highwayhash_test_target.cc @@ -0,0 +1,220 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// WARNING: this is a "restricted" source file; avoid including any headers +// unless they are also restricted. See arch_specific.h for details. + +#include "highwayhash/highwayhash_test_target.h" + +#include "highwayhash/highwayhash.h" + +#ifndef HH_DISABLE_TARGET_SPECIFIC +namespace highwayhash { +namespace { + +void NotifyIfUnequal(const size_t size, const HHResult64& expected, + const HHResult64& actual, const HHNotify notify) { + if (expected != actual) { + (*notify)(TargetName(HH_TARGET), size); + } +} + +// Overload for HHResult128 or HHResult256 (arrays). 
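+// Notifies at most once per call, even if several lanes mismatch.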
+template <size_t kNumLanes>
+void NotifyIfUnequal(const size_t size, const uint64_t (&expected)[kNumLanes],
+                     const uint64_t (&actual)[kNumLanes],
+                     const HHNotify notify) {
+  for (size_t i = 0; i < kNumLanes; ++i) {
+    if (expected[i] != actual[i]) {
+      (*notify)(TargetName(HH_TARGET), size);
+      return;
+    }
+  }
+}
+
+// Shared logic for all HighwayHashTest::operator() overloads.
+template <TargetBits Target, typename Result>
+void TestHighwayHash(HHStateT<Target>* HH_RESTRICT state,
+                     const char* HH_RESTRICT bytes, const size_t size,
+                     const Result* expected, const HHNotify notify) {
+  // TODO(janwas): investigate (length=33)
+#if HH_TARGET == HH_TARGET_Portable && HH_GCC_VERSION && !HH_CLANG_VERSION
+  return;
+#endif
+  Result actual;
+  HighwayHashT(state, bytes, size, &actual);
+  NotifyIfUnequal(size, *expected, actual, notify);
+}
+
+// Shared logic for all HighwayHashCatTest::operator() overloads.
+template <TargetBits Target, typename Result>
+void TestHighwayHashCat(const HHKey& key, const char* HH_RESTRICT bytes,
+                        const size_t size, const Result* expected,
+                        const HHNotify notify) {
+  // TODO(janwas): investigate (length=33)
+#if HH_TARGET == HH_TARGET_Portable && HH_GCC_VERSION && !HH_CLANG_VERSION
+  return;
+#endif
+
+  // Slightly faster to compute the expected prefix hashes only once.
+  // Use new instead of vector to avoid headers with inline functions.
+  Result* results = new Result[size + 1];
+  for (size_t i = 0; i <= size; ++i) {
+    HHStateT<Target> state_flat(key);
+    HighwayHashT(&state_flat, bytes, i, &results[i]);
+  }
+
+  // Splitting into three fragments/Append should cover all codepaths.
+  const size_t max_fragment_size = size / 3;
+  for (size_t size1 = 0; size1 < max_fragment_size; ++size1) {
+    for (size_t size2 = 0; size2 < max_fragment_size; ++size2) {
+      for (size_t size3 = 0; size3 < max_fragment_size; ++size3) {
+        HighwayHashCatT<Target> cat(key);
+        const char* pos = bytes;
+        cat.Append(pos, size1);
+        pos += size1;
+        cat.Append(pos, size2);
+        pos += size2;
+        cat.Append(pos, size3);
+        pos += size3;
+
+        Result result_cat;
+        cat.Finalize(&result_cat);
+
+        const size_t total_size = pos - bytes;
+        NotifyIfUnequal(total_size, results[total_size], result_cat, notify);
+      }
+    }
+  }
+
+  delete[] results;
+}
+
+}  // namespace
+
+template <TargetBits Target>
+void HighwayHashTest<Target>::operator()(const HHKey& key,
+                                         const char* HH_RESTRICT bytes,
+                                         const size_t size,
+                                         const HHResult64* expected,
+                                         const HHNotify notify) const {
+  HHStateT<Target> state(key);
+  TestHighwayHash(&state, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashTest<Target>::operator()(const HHKey& key,
+                                         const char* HH_RESTRICT bytes,
+                                         const size_t size,
+                                         const HHResult128* expected,
+                                         const HHNotify notify) const {
+  HHStateT<Target> state(key);
+  TestHighwayHash(&state, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashTest<Target>::operator()(const HHKey& key,
+                                         const char* HH_RESTRICT bytes,
+                                         const size_t size,
+                                         const HHResult256* expected,
+                                         const HHNotify notify) const {
+  HHStateT<Target> state(key);
+  TestHighwayHash(&state, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashCatTest<Target>::operator()(const HHKey& key,
+                                            const char* HH_RESTRICT bytes,
+                                            const uint64_t size,
+                                            const HHResult64* expected,
+                                            const HHNotify notify) const {
+  TestHighwayHashCat<Target>(key, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashCatTest<Target>::operator()(const HHKey& key,
+                                            const char* HH_RESTRICT bytes,
+                                            const uint64_t size,
+                                            const HHResult128* expected,
+                                            const HHNotify notify) const {
+  TestHighwayHashCat<Target>(key, bytes, size, expected, notify);
+}
+
+template <TargetBits Target>
+void HighwayHashCatTest<Target>::operator()(const HHKey& key,
+                                            const char* HH_RESTRICT bytes,
+                                            const uint64_t size,
+                                            const HHResult256* expected,
+                                            const HHNotify notify) const {
+  TestHighwayHashCat<Target>(key, bytes, size, expected, notify);
+}
+
+// Instantiate for the current target.
+template struct HighwayHashTest<HH_TARGET>;
+template struct HighwayHashCatTest<HH_TARGET>;
+
+//-----------------------------------------------------------------------------
+// benchmark
+
+namespace {
+
+template <TargetBits Target>
+uint64_t RunHighway(const void*, const size_t size) {
+  HH_ALIGNAS(32) static const HHKey key = {0, 1, 2, 3};
+  char in[kMaxBenchmarkInputSize];
+  in[0] = static_cast<char>(size & 0xFF);
+  HHResult64 result;
+  HHStateT<Target> state(key);
+  HighwayHashT(&state, in, size, &result);
+  return result;
+}
+
+template <TargetBits Target>
+uint64_t RunHighwayCat(const void*, const size_t size) {
+  HH_ALIGNAS(32) static const HHKey key = {0, 1, 2, 3};
+  HH_ALIGNAS(64) HighwayHashCatT<Target> cat(key);
+  char in[kMaxBenchmarkInputSize];
+  in[0] = static_cast<char>(size & 0xFF);
+  const size_t half_size = size / 2;
+  cat.Append(in, half_size);
+  cat.Append(in + half_size, size - half_size);
+  HHResult64 result;
+  cat.Finalize(&result);
+  return result;
+}
+
+}  // namespace
+
+template <TargetBits Target>
+void HighwayHashBenchmark<Target>::operator()(DurationsForInputs* input_map,
+                                              NotifyBenchmark notify,
+                                              void* context) const {
+  MeasureDurations(&RunHighway<Target>, input_map);
+  notify("HighwayHash", TargetName(Target), input_map, context);
+}
+
+template <TargetBits Target>
+void HighwayHashCatBenchmark<Target>::operator()(DurationsForInputs* input_map,
+                                                 NotifyBenchmark notify,
+                                                 void* context) const {
+  MeasureDurations(&RunHighwayCat<Target>, input_map);
+  notify("HighwayHashCat", TargetName(Target), input_map, context);
+}
+
+// Instantiate for the current target.
+template struct HighwayHashBenchmark<HH_TARGET>;
+template struct HighwayHashCatBenchmark<HH_TARGET>;
+
+}  // namespace highwayhash
+#endif  // HH_DISABLE_TARGET_SPECIFIC
diff --git a/highwayhash/highwayhash/highwayhash_test_target.h b/highwayhash/highwayhash/highwayhash_test_target.h
new file mode 100644
index 000000000..56ae960ba
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_test_target.h
@@ -0,0 +1,90 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_HIGHWAYHASH_TEST_TARGET_H_
+#define HIGHWAYHASH_HIGHWAYHASH_TEST_TARGET_H_
+
+// Tests called by InstructionSets::RunAll, so we can verify all
+// implementations supported by the current CPU.
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include <stddef.h>
+
+#include "highwayhash/arch_specific.h"
+#include "highwayhash/compiler_specific.h"
+#include "highwayhash/hh_types.h"
+#include "highwayhash/highwayhash.h"
+#include "highwayhash/nanobenchmark.h"
+
+namespace highwayhash {
+
+// Verifies the hash result matches "expected" and calls "notify" if not.
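+// Explicitly instantiated in highwayhash_test_target.cc, which is compiled
+// once per target via the highwayhash_test_*.cc wrappers.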
+template <TargetBits Target>
+struct HighwayHashTest {
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const size_t size, const HHResult64* expected,
+                  const HHNotify notify) const;
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const size_t size, const HHResult128* expected,
+                  const HHNotify notify) const;
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const size_t size, const HHResult256* expected,
+                  const HHNotify notify) const;
+};
+
+// For every possible partition of "bytes" into zero to three fragments,
+// verifies HighwayHashCat returns the same result as HighwayHashT of the
+// concatenated fragments, and calls "notify" if not. The value of "expected"
+// is ignored; it is only used for overloading.
+template <TargetBits Target>
+struct HighwayHashCatTest {
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const uint64_t size, const HHResult64* expected,
+                  const HHNotify notify) const;
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const uint64_t size, const HHResult128* expected,
+                  const HHNotify notify) const;
+  void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
+                  const uint64_t size, const HHResult256* expected,
+                  const HHNotify notify) const;
+};
+
+// Called by benchmark with prefix, target_name, input_map, context.
+// This function must set input_map->num_items to 0.
+using NotifyBenchmark = void (*)(const char*, const char*, DurationsForInputs*,
+                                 void*);
+
+constexpr size_t kMaxBenchmarkInputSize = 1024;
+
+// Calls "notify" with benchmark results for the input sizes specified by
+// "input_map" (<= kMaxBenchmarkInputSize) plus a "context" parameter.
+template <TargetBits Target>
+struct HighwayHashBenchmark {
+  void operator()(DurationsForInputs* input_map, NotifyBenchmark notify,
+                  void* context) const;
+};
+
+template <TargetBits Target>
+struct HighwayHashCatBenchmark {
+  void operator()(DurationsForInputs* input_map, NotifyBenchmark notify,
+                  void* context) const;
+};
+
+}  // namespace highwayhash
+
+#endif  // HIGHWAYHASH_HIGHWAYHASH_TEST_TARGET_H_
diff --git a/highwayhash/highwayhash/highwayhash_test_vsx.cc b/highwayhash/highwayhash/highwayhash_test_vsx.cc
new file mode 100644
index 000000000..224a65efe
--- /dev/null
+++ b/highwayhash/highwayhash/highwayhash_test_vsx.cc
@@ -0,0 +1,22 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// WARNING: this is a "restricted" source file; avoid including any headers
+// unless they are also restricted. See arch_specific.h for details.
+
+#define HH_TARGET_NAME VSX
+
+#ifdef __VSX__
+#include "highwayhash/highwayhash_test_target.cc"
+#endif
diff --git a/highwayhash/highwayhash/iaca.h b/highwayhash/highwayhash/iaca.h
new file mode 100644
index 000000000..80e1013ae
--- /dev/null
+++ b/highwayhash/highwayhash/iaca.h
@@ -0,0 +1,63 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_IACA_H_
+#define HIGHWAYHASH_IACA_H_
+
+// WARNING: this is a "restricted" header because it is included from
+// translation units compiled with different flags. This header and its
+// dependencies must not define any function unless it is static inline and/or
+// within namespace HH_TARGET_NAME. See arch_specific.h for details.
+
+#include "highwayhash/compiler_specific.h"
+
+// IACA (Intel's Code Analyzer, go/intel-iaca) analyzes instruction latencies,
+// but only for code between special markers. These functions embed such
+// markers in an executable, but only for reading via IACA - they deliberately
+// trigger a crash if executed to ensure they are removed in normal builds.
+
+// Default off; callers must `#define HH_ENABLE_IACA 1` before including this.
+#ifndef HH_ENABLE_IACA
+#define HH_ENABLE_IACA 0
+#endif
+
+namespace highwayhash {
+
+#if HH_ENABLE_IACA && (HH_GCC_VERSION || HH_CLANG_VERSION)
+
+// Call before the region of interest. Fences hopefully prevent reordering.
+static HH_INLINE void BeginIACA() {
+  HH_COMPILER_FENCE;
+  asm volatile(
+      ".byte 0x0F, 0x0B\n\t"  // UD2
+      "movl $111, %ebx\n\t"
+      ".byte 0x64, 0x67, 0x90\n\t");
+  HH_COMPILER_FENCE;
+}
+
+// Call after the region of interest. Fences hopefully prevent reordering.
+static HH_INLINE void EndIACA() {
+  HH_COMPILER_FENCE;
+  asm volatile(
+      "movl $222, %ebx\n\t"
+      ".byte 0x64, 0x67, 0x90\n\t"
+      ".byte 0x0F, 0x0B\n\t");  // UD2
+  HH_COMPILER_FENCE;
+}
+
+#endif
+
+}  // namespace highwayhash
+
+#endif  // HIGHWAYHASH_IACA_H_
diff --git a/highwayhash/highwayhash/instruction_sets.cc b/highwayhash/highwayhash/instruction_sets.cc
new file mode 100644
index 000000000..ab6775b10
--- /dev/null
+++ b/highwayhash/highwayhash/instruction_sets.cc
@@ -0,0 +1,144 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "highwayhash/instruction_sets.h"
+#include "highwayhash/arch_specific.h"
+
+// Currently there are only specialized targets for X64; other architectures
+// only use HH_TARGET_Portable, in which case Supported() just returns that.
+#if HH_ARCH_X64
+
+#include <atomic>
+
+namespace highwayhash {
+
+namespace {
+
+bool IsBitSet(const uint32_t reg, const int index) {
+  return (reg & (1U << index)) != 0;
+}
+
+// Returns the lower 32 bits of extended control register 0.
+// Requires CPU support for "OSXSAVE" (see below).
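+// (The raw .byte sequence below encodes XGETBV, for assemblers that lack the
+// mnemonic.)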
+uint32_t ReadXCR0() {
+#if HH_MSC_VERSION
+  return static_cast<uint32_t>(_xgetbv(0));
+#else
+  uint32_t xcr0, xcr0_high;
+  const uint32_t index = 0;
+  asm volatile(".byte 0x0F, 0x01, 0xD0"
+               : "=a"(xcr0), "=d"(xcr0_high)
+               : "c"(index));
+  return xcr0;
+#endif
+}
+
+// 0 iff not yet initialized by Supported().
+// Not function-local => no compiler-generated locking.
+std::atomic<TargetBits> supported_{0};
+
+// Bits indicating which instruction set extensions are supported.
+enum {
+  kBitSSE = 1 << 0,
+  kBitSSE2 = 1 << 1,
+  kBitSSE3 = 1 << 2,
+  kBitSSSE3 = 1 << 3,
+  kBitSSE41 = 1 << 4,
+  kBitSSE42 = 1 << 5,
+  kBitAVX = 1 << 6,
+  kBitAVX2 = 1 << 7,
+  kBitFMA = 1 << 8,
+  kBitLZCNT = 1 << 9,
+  kBitBMI = 1 << 10,
+  kBitBMI2 = 1 << 11,
+
+  kGroupAVX2 = kBitAVX | kBitAVX2 | kBitFMA | kBitLZCNT | kBitBMI | kBitBMI2,
+  kGroupSSE41 = kBitSSE | kBitSSE2 | kBitSSE3 | kBitSSSE3 | kBitSSE41
+};
+
+}  // namespace
+
+TargetBits InstructionSets::Supported() {
+  TargetBits supported = supported_.load(std::memory_order_acquire);
+  // Already initialized, return that.
+  if (HH_LIKELY(supported)) {
+    return supported;
+  }
+
+  uint32_t flags = 0;
+  uint32_t abcd[4];
+
+  Cpuid(0, 0, abcd);
+  const uint32_t max_level = abcd[0];
+
+  // Standard feature flags
+  Cpuid(1, 0, abcd);
+  flags |= IsBitSet(abcd[3], 25) ? kBitSSE : 0;
+  flags |= IsBitSet(abcd[3], 26) ? kBitSSE2 : 0;
+  flags |= IsBitSet(abcd[2], 0) ? kBitSSE3 : 0;
+  flags |= IsBitSet(abcd[2], 9) ? kBitSSSE3 : 0;
+  flags |= IsBitSet(abcd[2], 19) ? kBitSSE41 : 0;
+  flags |= IsBitSet(abcd[2], 20) ? kBitSSE42 : 0;
+  flags |= IsBitSet(abcd[2], 12) ? kBitFMA : 0;
+  flags |= IsBitSet(abcd[2], 28) ? kBitAVX : 0;
+  const bool has_xsave = IsBitSet(abcd[2], 26);
+  const bool has_osxsave = IsBitSet(abcd[2], 27);
+
+  // Extended feature flags
+  Cpuid(0x80000001U, 0, abcd);
+  flags |= IsBitSet(abcd[2], 5) ? kBitLZCNT : 0;
+
+  // Extended features
+  if (max_level >= 7) {
+    Cpuid(7, 0, abcd);
+    flags |= IsBitSet(abcd[1], 3) ? kBitBMI : 0;
+    flags |= IsBitSet(abcd[1], 5) ? kBitAVX2 : 0;
+    flags |= IsBitSet(abcd[1], 8) ? kBitBMI2 : 0;
+  }
+
+  // Verify OS support for XSAVE, without which XMM/YMM registers are not
+  // preserved across context switches and are not safe to use.
+  if (has_xsave && has_osxsave) {
+    const uint32_t xcr0 = ReadXCR0();
+    // XMM/YMM
+    if ((xcr0 & 2) == 0 || (xcr0 & 4) == 0) {
+      flags &= ~(kBitAVX | kBitAVX2);
+    }
+  } else {
+    // Clear the AVX/AVX2 bits if the CPU or OS does not support XSAVE.
+    //
+    // The lower 128 bits of XMM0-XMM15 are guaranteed to be preserved across
+    // context switches on x86_64 and any modern 32-bit system, so only AVX2
+    // needs to be disabled.
+    flags &= ~(kBitAVX | kBitAVX2);
+  }
+
+  // Also indicates "supported" has been initialized.
+  supported = HH_TARGET_Portable;
+
+  // Set target bit(s) if all of the group's flags are set.
+  if ((flags & kGroupAVX2) == kGroupAVX2) {
+    supported |= HH_TARGET_AVX2;
+  }
+  if ((flags & kGroupSSE41) == kGroupSSE41) {
+    supported |= HH_TARGET_SSE41;
+  }
+
+  supported_.store(supported, std::memory_order_release);
+  return supported;
+}
+
+}  // namespace highwayhash
+
+#endif  // HH_ARCH_X64
diff --git a/highwayhash/highwayhash/instruction_sets.h b/highwayhash/highwayhash/instruction_sets.h
new file mode 100644
index 000000000..aa7bd6b3f
--- /dev/null
+++ b/highwayhash/highwayhash/instruction_sets.h
@@ -0,0 +1,118 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAYHASH_INSTRUCTION_SETS_H_
+#define HIGHWAYHASH_INSTRUCTION_SETS_H_
+
+// Calls the best specialization of a template supported by the current CPU.
+//
+// Usage: for each dispatch site, declare a Functor template with a 'Target'
+// argument, add a source file defining its operator() and instantiating
+// Functor<HH_TARGET>, add a cc_library_for_targets rule for that source file,
+// and call InstructionSets::Run<Functor>(/*args*/).
+
+#include <utility>  // std::forward
+
+#include "highwayhash/arch_specific.h"  // HH_TARGET_*
+#include "highwayhash/compiler_specific.h"
+
+namespace highwayhash {
+
+// Detects TargetBits and calls specializations of a user-defined functor.
+class InstructionSets {
+ public:
+// Returns bit array of HH_TARGET_* supported by the current CPU.
+// The HH_TARGET_Portable bit is guaranteed to be set.
+#if HH_ARCH_X64
+  static TargetBits Supported();
+#elif HH_ARCH_PPC
+  static HH_INLINE TargetBits Supported() {
+    return HH_TARGET_VSX | HH_TARGET_Portable;
+  }
+#elif HH_ARCH_NEON
+  static HH_INLINE TargetBits Supported() {
+    return HH_TARGET_NEON | HH_TARGET_Portable;
+  }
+#else
+  static HH_INLINE TargetBits Supported() { return HH_TARGET_Portable; }
+#endif
+
+  // Chooses the best available "Target" for the current CPU, runs the
+  // corresponding Func<Target>::operator()(args) and returns that Target
+  // (a single bit). The overhead of dispatching is low, about 4 cycles, but
+  // this should only be called infrequently (e.g. hoisting it out of loops).
+  template <template <TargetBits> class Func, typename... Args>