diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 00000000..6f269154 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,26 @@ +# After changing this file, check it on: +# http://lint.travis-ci.org/ +language: python +sudo: false + +python: + - '3.6' + - 'nightly' + - 'pypy3' + +env: + - INCLUDE_10K= + +matrix: + allow_failures: + - python: 'nightly' + - python: 'pypy3' + +install: + - travis_retry pip install pytest pytest-cov + - travis_retry pip install -r requirements.txt + - travis_retry python setup.py sdist bdist_wheel + - travis_retry pip install -e . + +script: + - pytest --cov=anonlink diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 65144cb9..aee7a49d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,23 @@ +0.7.0 +----- + +Introduces support for comparing "arbitrary" length cryptographic linkage keys. +The benchmark is much more comprehensive and more comparable between releases; see the +README for an example report. + +Other improvements +~~~~~~~~~~~~~~~~~~ + +- Internal C/C++ cleanup/refactoring and optimization. +- Expose the native popcount implementation to Python. +- Bug fix to avoid configuring a logger. +- Testing is now with `py.test` and runs on `travis-ci <https://travis-ci.org/n1analytics/anonlink/>`_. + +0.6.3 +----- + +Small fix to logging setup. + 0.6.2 - Changelog init --------------------- diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index 65f5fe2a..00000000 --- a/Jenkinsfile +++ /dev/null @@ -1,145 +0,0 @@ -void setBuildStatus(String message, String state) { - step([ - $class: "GitHubCommitStatusSetter", - reposSource: [$class: "ManuallyEnteredRepositorySource", url: "https://github.com/n1analytics/anonlink"], - contextSource: [$class: 'ManuallyEnteredCommitContextSource', context: 'jenkins'], - statusResultSource: [ $class: "ConditionalStatusResultSource", results: [[$class: "AnyBuildResult", message: message, state: state]] ] - ]); -} - -def isMaster = env.BRANCH_NAME == 'master' -def isDevelop = env.BRANCH_NAME == 'develop' - -def configs = [ - [label: 'GPU 1', pythons: ['python3.4', 'python3.5', 'python3.6'], compilers: ['clang', 'gcc']], - [label: 'osx', pythons: ['python3.5'], compilers: ['clang', 'gcc']] -] - -def build(python_version, compiler, label, release=false) { - try { - def workspace = pwd(); - echo "${label}" - echo "workspace directory is ${workspace}" - env.PATH = "${workspace}/env/bin:/usr/bin:${env.PATH}" - - withEnv(["VENV=${workspace}/env"]) { - // ${workspace} contains an absolute path to job workspace (not available within a stage) - - sh "test -d ${workspace}/env && rm -rf ${workspace}/env || echo 'no env, skipping cleanup'" - - // The stage below is attempting to get the latest version of our application code. - // Since this is a multi-branch project the 'checkout scm' command is used. If you're working with a standard - // pipeline project then you can replace this with the regular 'git url:' pipeline command. - // The 'checkout scm' command will automatically pull down the code from the appropriate branch that triggered this build.
- checkout scm - - def testsError = null - - clkhashPackageName = "clkhash-*-py2.py3-none-any.whl" - - step ([$class: 'CopyArtifact', - projectName: 'clkhash/master', - fingerprint: true, - flatten: true, - filter: 'dist/' + clkhashPackageName - ]); - - try { - sh """#!/usr/bin/env bash - set -xe - - # Jenkins logs in as a non-interactive shell, so we don't even have /usr/local/bin in PATH - export PATH="/usr/local/bin:\${PATH}" - printenv - - rm -fr build - ${python_version} -m venv --clear ${VENV} - ${VENV}/bin/python ${VENV}/bin/pip install --upgrade pip coverage setuptools wheel - ${VENV}/bin/python ${VENV}/bin/pip install --quiet --upgrade ${clkhashPackageName} - - ${VENV}/bin/python ${VENV}/bin/pip install -r requirements.txt - - CC=${compiler} ${VENV}/bin/python setup.py sdist bdist_wheel - ${VENV}/bin/python ${VENV}/bin/pip install -e . - ${VENV}/bin/python ${VENV}/bin/nosetests \ - --with-xunit --with-coverage --cover-inclusive \ - --cover-package=anonlink - - """ - - if(release) { - // This will be the official release - archiveArtifacts artifacts: "dist/anonlink-*.whl" - archiveArtifacts artifacts: "dist/anonlink-*.tar.gz" - } - } - catch(err) { - testsError = err - currentBuild.result = 'FAILURE' - setBuildStatus("Build failed", "FAILURE"); - } - finally { - - if (!release) { - junit 'nosetests.xml' - } else { - // Code coverage only needs to be done once - sh '''#!/usr/bin/env bash - set -xe - - ${VENV}/bin/python ${VENV}/bin/coverage xml --omit="*/cpp_code/*" --omit="*build_matcher.py*" - - ''' - - step([$class: 'CoberturaPublisher', coberturaReportFile: 'coverage.xml']) - } - - if (testsError) { - throw testsError - } - } - } - - } finally { - deleteDir() - } -} - - -def builders = [:] -for (config in configs) { - def label = config["label"] - def pythons = config["pythons"] - def compilers = config["compilers"] - - for (_py_version in pythons) { - for (_compiler in compilers) { - - def py_version = _py_version - def compiler = _compiler - def combinedName = "${label}-${py_version}-${compiler}" - - builders[combinedName] = { - node(label) { - stage(combinedName) { - build(py_version, compiler, label, false) - } - } - } - } - } -} - -node { - checkout scm - setBuildStatus("Build in progress", "PENDING"); -} - -parallel builders - -node('GPU 1') { - stage('Release') { - build('python3.5', 'gcc', 'GPU 1', true) - setBuildStatus("Tests Passed", "SUCCESS"); - } -} \ No newline at end of file diff --git a/Jenkinsfile.groovy b/Jenkinsfile.groovy new file mode 100644 index 00000000..416a2e1f --- /dev/null +++ b/Jenkinsfile.groovy @@ -0,0 +1,127 @@ +@Library("N1Pipeline@0.0.5") +import com.n1analytics.git.GitUtils; +import com.n1analytics.git.GitCommit; +import com.n1analytics.n1.docker.N1EngineContainer; +import com.n1analytics.python.PythonVirtualEnvironment; + +def isMaster = env.BRANCH_NAME == 'master' +def isDevelop = env.BRANCH_NAME == 'develop' + +VENV_DIRECTORY = "env" + +GIT_CONTEXT = "jenkins" + +def configs = [ + [label: 'GPU 1', pythons: ['python3.4', 'python3.5', 'python3.6'], compilers: ['clang', 'gcc']], + //[label: 'osx', pythons: ['python3.5'], compilers: ['clang', 'gcc']] + [label: 'McNode', pythons: ['python3.5'], compilers: ['clang']] +] + +def PythonVirtualEnvironment prepareVirtualEnvironment(String pythonVersion, clkhashPackageName, compiler, venv_directory = VENV_DIRECTORY) { + PythonVirtualEnvironment venv = new PythonVirtualEnvironment(this, venv_directory, pythonVersion); + venv.create(); + venv.runPipCommand("install --upgrade pip coverage setuptools 
wheel") + venv.runPipCommand("install --quiet --upgrade ${clkhashPackageName}"); + venv.runPipCommand("install -r requirements.txt"); + String cc = "CC=" + compiler; + withEnv([cc]) { + venv.runCommand("setup.py sdist bdist_wheel --universal"); + } + venv.runPipCommand("install -e ."); + return venv; +} + +def build(python_version, compiler, label, release = false) { + GitUtils.checkoutFromSCM(this) + Exception testsError = null; + try { + clkhashPackageName = "clkhash-*-py2.py3-none-any.whl" + copyArtifacts( + projectName: 'clkhash/master', + fingerprint: true, + flatten : true, + filter : 'dist/' + clkhashPackageName + ) + + PythonVirtualEnvironment venv = prepareVirtualEnvironment(python_version, clkhashPackageName, compiler) + try { + venv.runChosenCommand("pytest --cov=anonlink --junit-xml=testoutput.xml --cov-report=xml:coverage.xml") + if (release) { + // This will be the official release + archiveArtifacts artifacts: "dist/anonlink-*.whl" + archiveArtifacts artifacts: "dist/anonlink-*.tar.gz" + } + } catch (Exception err) { + testsError = err + } finally { + if (!release) { + junit 'testoutput.xml' + } else { + venv.runChosenCommand("coverage xml --omit=\"*/cpp_code/*\" --omit=\"*build_matcher.py*\"") + cobertura coberturaReportFile: 'coverage.xml' + } + if (testsError != null) { + throw testsError + } + } + + } finally { + try { + deleteDir() + } catch (Exception e) { + echo "Error during 'deleteDir':\n" + e.toString() + } + } +} + + +def builders = [:] +for (config in configs) { + def label = config["label"] + def pythons = config["pythons"] + def compilers = config["compilers"] + + for (_py_version in pythons) { + for (_compiler in compilers) { + + def py_version = _py_version + def compiler = _compiler + def combinedName = "${label}-${py_version}-${compiler}" + + builders[combinedName] = { + node(label) { + stage(combinedName) { + build(py_version, compiler, label, false) + } + } + } + } + } +} + +GitCommit commit; +node() { + commit = GitUtils.checkoutFromSCM(this); + commit.setInProgressStatus(GIT_CONTEXT); +} + +try { + parallel builders +} catch (Exception err) { + node() { + commit.setFailStatus("Build failed", GIT_CONTEXT); + } + throw err +} + +node('GPU 1') { + stage('Release') { + try { + build('python3.5', 'gcc', 'GPU 1', true) + commit.setSuccessStatus(GIT_CONTEXT) + } catch (Exception e) { + commit.setFailStatus("Release failed", GIT_CONTEXT); + throw e; + } + } +} \ No newline at end of file diff --git a/README.rst b/README.rst index 9c025eb5..2d645231 100644 --- a/README.rst +++ b/README.rst @@ -1,3 +1,8 @@ + +.. 
image:: https://travis-ci.org/n1analytics/anonlink.svg?branch=master + :target: https://travis-ci.org/n1analytics/anonlink + + A Python (and optimised C++) implementation of **anonymous linkage** using *cryptographic linkage keys* as described by Rainer Schnell, Tobias Bachteler, and Jörg Reiher in `A Novel Error-Tolerant Anonymous Linking @@ -43,42 +48,95 @@ For linux with: Benchmark --------- +You can run the benchmark with: + :: - $ python -m anonlink.benchmark - 100000 x 1024 bit popcounts in 0.016376 seconds - Popcount speed: 745.42 MiB/s - Size 1 | Size 2 | Comparisons | Compute Time | Million Comparisons per second - 1000 | 1000 | 1000000 | 0.060s | 16.632 - 2000 | 2000 | 4000000 | 0.159s | 25.232 - 3000 | 3000 | 9000000 | 0.316s | 28.524 - 4000 | 4000 | 16000000 | 0.486s | 32.943 - 5000 | 5000 | 25000000 | 0.584s | 42.825 - 6000 | 6000 | 36000000 | 0.600s | 60.027 - 7000 | 7000 | 49000000 | 0.621s | 78.875 - 8000 | 8000 | 64000000 | 0.758s | 84.404 - 9000 | 9000 | 81000000 | 0.892s | 90.827 - 10000 | 10000 | 100000000 | 1.228s | 81.411 - 20000 | 20000 | 400000000 | 3.980s | 100.504 - 30000 | 30000 | 900000000 | 9.280s | 96.986 - 40000 | 40000 | 1600000000 | 17.318s | 92.391 - -C++ version uses cpu instruction ``POPCNT`` for bitcount in a 64bit -word. http://wm.ite.pl/articles/sse-popcount.html + $ python3 -m anonlink.benchmark + Anonlink benchmark -- see README for explanation + ------------------------------------------------ + 100000 x 1024 bit popcounts + Implementation | Time (ms) | Bandwidth (MiB/s) | Throughput (1e6 popc/s) + Python (bitarray.count()): | 17.78 | 686.54 | 5.62 + Native code (no copy): | 1.00 | 12243.76 | 100.30 + Native code (w/ copy): | 344.17 | 35.47 | 0.29 (99.7% copying) + + Threshold: 0.5 + Size 1 | Size 2 | Comparisons | Total Time (s) | Throughput + | | (match %) | (comparisons / matching)| (1e6 cmp/s) + -------+--------+------------------+-------------------------+------------- + 1000 | 1000 | 1e6 (50.20%) | 0.249 (88.6% / 11.4%) | 4.525 + 2000 | 2000 | 4e6 (50.51%) | 1.069 (88.5% / 11.5%) | 4.227 + 3000 | 3000 | 9e6 (50.51%) | 2.412 (85.3% / 14.7%) | 4.375 + 4000 | 4000 | 16e6 (50.56%) | 4.316 (83.6% / 16.4%) | 4.434 + + Threshold: 0.7 + Size 1 | Size 2 | Comparisons | Total Time (s) | Throughput + | | (match %) | (comparisons / matching)| (1e6 cmp/s) + -------+--------+------------------+-------------------------+------------- + 1000 | 1000 | 1e6 ( 0.01%) | 0.017 (99.8% / 0.2%) | 59.605 + 2000 | 2000 | 4e6 ( 0.01%) | 0.056 (99.8% / 0.2%) | 71.484 + 3000 | 3000 | 9e6 ( 0.01%) | 0.118 (99.9% / 0.1%) | 76.500 + 4000 | 4000 | 16e6 ( 0.01%) | 0.202 (99.9% / 0.1%) | 79.256 + 5000 | 5000 | 25e6 ( 0.01%) | 0.309 (99.9% / 0.1%) | 81.093 + 6000 | 6000 | 36e6 ( 0.01%) | 0.435 (99.9% / 0.1%) | 82.841 + 7000 | 7000 | 49e6 ( 0.01%) | 0.590 (99.9% / 0.1%) | 83.164 + 8000 | 8000 | 64e6 ( 0.01%) | 0.757 (99.9% / 0.1%) | 84.619 + 9000 | 9000 | 81e6 ( 0.01%) | 0.962 (99.8% / 0.2%) | 84.358 + 10000 | 10000 | 100e6 ( 0.01%) | 1.166 (99.8% / 0.2%) | 85.895 + 20000 | 20000 | 400e6 ( 0.01%) | 4.586 (99.9% / 0.1%) | 87.334 + +The tables are interpreted as follows. The first section compares the +bandwidth doing popcounts through (i) the Python bitarray library and +(ii) a native code implementation in assembler. The latter +implementation is measured in two ways: the first measures just the +time taken to compute the popcounts, while the second includes the +time taken to copy the data out of the running Python instance as well +as copying the result back into Python. 
The "% copying" measure is the +proportion of time spent doing this copying. + +The second section includes two tables that measure the throughput of +the Dice coefficient comparison function. The two tables correspond to +two different choices of "matching threshold", 0.5 and 0.7, which were +chosen to characterise two different performance scenarios. Since the +data used for comparisons is randomly generated, the first threshold +value will cause about 50% of the candidates to "match", while the +second threshold value will cause <0.01% of the candidates to match +(these values are reported in the "match %" column). In both cases, +all matches above the threshold are returned and passed to the +solver. In the first case, the large number of matches means that much +of the time is spent keeping the candidates in order so that the top +`k` matches can be returned. In the latter case, the tiny number of +candidate matches means that the throughput is determined primarily by +the comparison code itself. + +Finally, the Total Time column includes indications as to the +proportion of time spent calculating the (sparse) similarity matrix +`comparisons` and the proportion of time spent `matching` in the +greedy solver. This latter is determined by the size of the similarity +matrix, which will be approximately `#comparisons * match% / 100`. Tests ===== -Run unit tests with nose +Run unit tests with `pytest`: :: - $ python -m nose - ......................SS.............................. - ---------------------------------------------------------------------- - Ran 54 tests in 6.615s + $ pytest + ====================================== test session starts ====================================== + platform linux -- Python 3.6.4, pytest-3.2.5, py-1.4.34, pluggy-0.4.0 + rootdir: /home/hlaw/src/n1-anonlink, inifile: + collected 71 items + + tests/test_benchmark.py ... + tests/test_bloommatcher.py .............. + tests/test_e2e.py .............ss.... + tests/test_matcher.py ..x.....x......x....x.. + tests/test_similarity.py ......... + tests/test_util.py ... - OK (SKIP=2) + ======================== 65 passed, 2 skipped, 4 xfailed in 4.01 seconds ======================== To enable slightly larger tests add the following environment variables: @@ -91,8 +149,6 @@ Limitations - The linkage process has order n^2 time complexity - although algorithms exist to significantly speed this up. Several possible speedups are described in http://dbs.uni-leipzig.de/file/P4Join-BTW2015.pdf -- The C++ code makes an assumption of 1024 bit keys (although this would be easy - to change). 
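+
+One such speedup is already used internally: a candidate pair whose
+popcounts differ too much can never reach the matching threshold, so it
+is skipped before the full comparison. Below is a minimal Python sketch
+of the Dice coefficient and that pruning bound (the helper names are
+illustrative only, not part of the anonlink API)::
+
+    from bitarray import bitarray
+    import random
+
+    def dice(u, v):
+        # Sorensen-Dice: 2*|u AND v| / (|u| + |v|); 0.0 when both are empty.
+        denom = u.count() + v.count()
+        return 2.0 * (u & v).count() / denom if denom else 0.0
+
+    def may_match(u_popc, v_popc, threshold):
+        # Mirrors calculate_max_difference in the C++ code: a candidate can
+        # only reach `threshold` if |u_popc - v_popc| <= 2*u_popc*(1/threshold - 1).
+        return abs(u_popc - v_popc) <= 2 * u_popc * (1 / threshold - 1)
+
+    u = bitarray([random.getrandbits(1) for _ in range(1024)])
+    v = bitarray([random.getrandbits(1) for _ in range(1024)])
+    if may_match(u.count(), v.count(), 0.7):
+        print(dice(u, v))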
License diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py index fb7e1161..6277f3df 100644 --- a/_cffi_build/build_matcher.py +++ b/_cffi_build/build_matcher.py @@ -15,13 +15,14 @@ "_entitymatcher", source, source_extension='.cpp', - extra_compile_args=['-Wall', '-Wextra', '-Werror', '-O3', '-std=c++11', '-mssse3', '-mpopcnt'], + extra_compile_args=['-Wall', '-Wextra', '-Werror', '-O3', '-std=c++11', '-march=native', '-mssse3', '-mpopcnt', '-fvisibility=hidden' + ], ) ffibuilder.cdef(""" - int match_one_against_many_dice(const char * one, const char * many, int n, double * score); - int match_one_against_many_dice_1024_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, uint32_t k, double threshold, int *indices, double *scores); - double dice_coeff_1024(const char *e1, const char *e2); + int match_one_against_many_dice_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, int keybytes, uint32_t k, double threshold, int *indices, double *scores); + double dice_coeff(const char *array1, const char *array2, int array_bytes); + double popcount_arrays(uint32_t *counts, const char *arrays, int narrays, int array_bytes); """) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 6182585c..c5093365 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -1,217 +1,451 @@ #include #include #include -#include +#include #include -#include +#include +#include +#include +static constexpr int WORD_BYTES = sizeof(uint64_t); -#define WORDBYTES (sizeof(uint64_t)) -#define WORDBITS (WORDBYTES * 8) -#define KEYBITS 1024 -#define KEYBYTES (KEYBITS / 8) -// KEYWORDS must be divisible by 4. It is currently equal to 16. -#define KEYWORDS (KEYBYTES / WORDBYTES) +/** + * The popcount of n elements of buf is the sum of c0, c1, c2, c3. + */ +template +static inline void +popcount( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, + const uint64_t *buf) { + popcount<4>(c0, c1, c2, c3, buf); + popcount(c0, c1, c2, c3, buf + 4); +} +// Fast Path +// // Source: http://danluu.com/assembly-intrinsics/ // https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance // -// NB: Dan Luu's original assembly is incorrect because it -// clobbers registers marked as "input only" (see warning at +// NB: Dan Luu's original assembly (and the SO answer it was based on) +// is incorrect because it clobbers registers marked as "input only" +// (see warning at // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands -// -- this mistake does not materialise with GCC (4.9), but it -// does with Clang (3.6 and 3.8)). We fix the mistake by -// explicitly loading the contents of buf into registers and using -// these same registers for the intermediate popcnts. -static inline uint32_t -builtin_popcnt_unrolled_errata_manual(const uint64_t* buf) { - uint64_t b0, b1, b2, b3; - uint64_t c0, c1, c2, c3; - c0 = c1 = c2 = c3 = 0; - - // We unroll this manually because some versions of GCC don't do so - // of their own volition. Speedup from this in such cases is >20%. 
-#undef LOOP_BODY -#define LOOP_BODY(i) do { \ - b0 = buf[i]; b1 = buf[i + 1]; b2 = buf[i + 2]; b3 = buf[i + 3]; \ - __asm__( \ - "popcnt %4, %4 \n\t" \ - "add %4, %0 \n\t" \ - "popcnt %5, %5 \n\t" \ - "add %5, %1 \n\t" \ - "popcnt %6, %6 \n\t" \ - "add %6, %2 \n\t" \ - "popcnt %7, %7 \n\t" \ - "add %7, %3 \n\t" \ - : "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3), \ - "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3)); \ - } while (0) - - LOOP_BODY(0); - LOOP_BODY(4); - LOOP_BODY(8); - LOOP_BODY(12); - - return c0 + c1 + c2 + c3; +// -- this mistake does not materialise with GCC (4.9), but it does +// with Clang (3.6 and 3.8)). We fix the mistake by explicitly +// loading the contents of buf into registers and using these same +// registers for the intermediate popcnts. +/** + * The popcount of 4 elements of buf is the sum of c0, c1, c2, c3. + */ +template<> +inline void +popcount<4>( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, + const uint64_t *buf) { + uint64_t b0, b1, b2, b3; + b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3]; + __asm__( + "popcnt %4, %4 \n\t" + "add %4, %0 \n\t" + "popcnt %5, %5 \n\t" + "add %5, %1 \n\t" + "popcnt %6, %6 \n\t" + "add %6, %2 \n\t" + "popcnt %7, %7 \n\t" + "add %7, %3 \n\t" + : "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3), + "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3)); +} + +// Slow paths +// TODO: Assumes sizeof(long) == 8 +// +// NB: The specialisation to n=3 is not currently used but included +// for completeness (i.e. so that popcount is defined for all +// non-negative n) and in anticipation of its use in the near future. +#if 0 +template<> +inline void +popcount<3>( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &, + const uint64_t *buf) { + c0 += __builtin_popcountl(buf[0]); + c1 += __builtin_popcountl(buf[1]); + c2 += __builtin_popcountl(buf[2]); } +#endif /** - * Compute the Dice coefficient similarity measure of two bit patterns. + * The popcount of 2 elements of buf is the sum of c0, c1. */ -static double -dice_coeff_1024(const char *e1, const char *e2) { - const uint64_t *comp1 = (const uint64_t *) e1; - const uint64_t *comp2 = (const uint64_t *) e2; +template<> +inline void +popcount<2>( + uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &, + const uint64_t *buf) { + c0 += __builtin_popcountl(buf[0]); + c1 += __builtin_popcountl(buf[1]); +} + +/** + * The popcount *buf is in c0. + */ +template<> +inline void +popcount<1>( + uint64_t &c0, uint64_t &, uint64_t &, uint64_t &, + const uint64_t *buf) { + c0 += __builtin_popcountl(buf[0]); +} - uint32_t count_both = 0; - count_both += builtin_popcnt_unrolled_errata_manual(comp1); - count_both += builtin_popcnt_unrolled_errata_manual(comp2); - if(count_both == 0) { - return 0.0; +/** + * Calculate population counts of an array of inputs of nwords elements. + * + * 'arrays' must point to narrays*nwords*WORD_BYTES bytes + * 'counts' must point to narrays*sizeof(uint32_t) bytes. + * For i = 0 to narrays - 1, the population count of the nwords elements + * + * arrays[i * nwords] ... arrays[(i + 1) * nwords - 1] + * + * is put in counts[i]. 
+ */ +template +static void +_popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) { + uint64_t c0, c1, c2, c3; + for (int i = 0; i < narrays; ++i, arrays += nwords) { + c0 = c1 = c2 = c3 = 0; + popcount(c0, c1, c2, c3, arrays); + counts[i] = c0 + c1 + c2 + c3; } +} - uint64_t combined[KEYWORDS]; - for (unsigned int i = 0 ; i < KEYWORDS; i++ ) { - combined[i] = comp1[i] & comp2[i]; +/** + * Return the popcount of the nwords elements starting at array. + */ +static uint32_t +_popcount_array(const uint64_t *array, int nwords) { + uint64_t c0, c1, c2, c3; + c0 = c1 = c2 = c3 = 0; + + while (nwords >= 16) { + popcount<16>(c0, c1, c2, c3, array); + array += 16; + nwords -= 16; } + // nwords < 16 + if (nwords >= 8) { + popcount<8>(c0, c1, c2, c3, array); + array += 8; + nwords -= 8; + } + // nwords < 8 + if (nwords >= 4) { + popcount<4>(c0, c1, c2, c3, array); + array += 4; + nwords -= 4; + } + // nwords < 4 + if (nwords >= 2) { + popcount<2>(c0, c1, c2, c3, array); + array += 2; + nwords -= 2; + } + // nwords < 2 + if (nwords == 1) + popcount<1>(c0, c1, c2, c3, array); + return c0 + c1 + c2 + c3; +} - uint32_t count_and = builtin_popcnt_unrolled_errata_manual(combined); +/** + * The popcount of the logical AND of n corresponding elements of buf1 + * and buf2 is the sum of c0, c1, c2, c3. + */ +template +static inline void +popcount_logand( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, + const uint64_t *buf1, const uint64_t *buf2) { + popcount_logand<4>(c0, c1, c2, c3, buf1, buf2); + popcount_logand(c0, c1, c2, c3, buf1 + 4, buf2 + 4); +} - return 2 * count_and / (double)count_both; +/** + * The popcount of the logical AND of 4 corresponding elements of buf1 + * and buf2 is the sum of c0, c1, c2, c3. + */ +template<> +inline void +popcount_logand<4>( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, + const uint64_t *buf1, const uint64_t *buf2) { + uint64_t b[4]; + b[0] = buf1[0] & buf2[0]; + b[1] = buf1[1] & buf2[1]; + b[2] = buf1[2] & buf2[2]; + b[3] = buf1[3] & buf2[3]; + popcount<4>(c0, c1, c2, c3, b); } +/** + * Return the popcount of the logical AND of len corresponding + * elements of u and v. + */ +static uint32_t +_popcount_logand_array(const uint64_t *u, const uint64_t *v, int len) { + // NB: The switch statement at the end of this function must have + // cases for all i = 1, ..., LOOP_LEN - 1. + static constexpr int LOOP_LEN = 4; + uint64_t c0, c1, c2, c3; + c0 = c1 = c2 = c3 = 0; + + int i = 0; + for ( ; i + LOOP_LEN <= len; i += LOOP_LEN) { + popcount_logand(c0, c1, c2, c3, u, v); + u += LOOP_LEN; + v += LOOP_LEN; + } + + // NB: The "fall through" comments are necessary to tell GCC and + // Clang not to complain about the fact that the case clauses + // don't have break statements in them. + switch (len - i) { + case 3: c2 += __builtin_popcountl(u[2] & v[2]); /* fall through */ + case 2: c1 += __builtin_popcountl(u[1] & v[1]); /* fall through */ + case 1: c0 += __builtin_popcountl(u[0] & v[0]); /* fall through */ + } -class Node { + return c0 + c1 + c2 + c3; +} +/** + * Return the Sorensen-Dice coefficient of nwords length arrays u and + * v, whose popcounts are given in u_popc and v_popc respectively. 
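+ * That is, it returns 2 * popcount(u AND v) / (u_popc + v_popc).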
+ */ +static inline double +_dice_coeff_generic( + const uint64_t *u, uint32_t u_popc, + const uint64_t *v, uint32_t v_popc, + int nwords) { + uint32_t uv_popc = _popcount_logand_array(u, v, nwords); + return (2 * uv_popc) / (double) (u_popc + v_popc); +} + +/** + * Return the Sorensen-Dice coefficient of nwords length arrays u and + * v, whose popcounts are given in u_popc and v_popc respectively. + */ +template +static inline double +_dice_coeff( + const uint64_t *u, uint32_t u_popc, + const uint64_t *v, uint32_t v_popc) { + uint64_t c0, c1, c2, c3; + c0 = c1 = c2 = c3 = 0; + popcount_logand(c0, c1, c2, c3, u, v); + uint32_t uv_popc = c0 + c1 + c2 + c3; + return (2 * uv_popc) / (double) (u_popc + v_popc); +} + +class Node { public: int index; double score; // Constructor with default Node( int n_index = -1, double n_score = -1.0 ) - :index(n_index), score( n_score ) - { - } + :index(n_index), score( n_score ) { } }; -struct score_cmp{ - bool operator()(const Node& a, const Node& b) const{ +struct score_cmp { + bool operator()(const Node& a, const Node& b) const { return a.score > b.score; } }; /** - * Count lots of bits. + * */ -static void popcount_1024_array(const char *many, int n, uint32_t *counts_many) { - for (int i = 0; i < n; i++) { - const uint64_t *sig = (const uint64_t *) many + i * KEYWORDS; - counts_many[i] = builtin_popcnt_unrolled_errata_manual(sig); - } +static inline uint32_t +calculate_max_difference(uint32_t popcnt_a, double threshold) { + return 2 * popcnt_a * (1/threshold - 1); } /** + * Convert clock measurement t to milliseconds. * + * t should have been obtained as the difference of calls to clock() + * for this to make sense. */ -static uint32_t calculate_max_difference(uint32_t popcnt_a, double threshold) -{ - return 2 * popcnt_a * (1/threshold - 1); +static inline double +to_millis(clock_t t) { + static constexpr double CPS = (double)CLOCKS_PER_SEC; + return t * 1.0E3 / CPS; +} + +static inline uint32_t +abs_diff(uint32_t a, uint32_t b) { + if (a > b) + return a - b; + return b - a; } + extern "C" { /** - * Calculate up to the top k indices and scores. - * Returns the number matched above a threshold. + * Calculate population counts of an array of inputs; return how + * long it took in milliseconds. + * + * 'arrays' must point to narrays*array_bytes bytes + * 'counts' must point to narrays*sizeof(uint32_t) bytes. + * For i = 0 to n - 1, the population count of the array_bytes*8 bits + * + * arrays[i * array_bytes] ... arrays[(i + 1) * array_bytes - 1] + * + * is put in counts[i]. + * + * ASSUMES: array_bytes is divisible by 8. 
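+ * For example, 1024-bit keys give array_bytes = 128 and hence nwords = 16,
+ * which hits one of the specialised cases in the switch below.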
*/ - int match_one_against_many_dice_1024_k_top( - const char *one, - const char *many, - const uint32_t *counts_many, - int n, - uint32_t k, - double threshold, - int *indices, - double *scores) { - - const uint64_t *comp1 = (const uint64_t *) one; - const uint64_t *comp2 = (const uint64_t *) many; - - std::priority_queue, score_cmp> max_k_scores; - - uint32_t count_one = builtin_popcnt_unrolled_errata_manual(comp1); - - uint64_t combined[KEYWORDS]; + double + popcount_arrays( + uint32_t *counts, + const char *arrays, int narrays, int array_bytes) { + // assumes WORD_BYTES divides array_bytes + int nwords = array_bytes / WORD_BYTES; + const uint64_t *u = reinterpret_cast(arrays); + + // assumes WORD_PER_POPCOUNT divides nwords + clock_t t = clock(); + switch (nwords) { + case 32: _popcount_arrays<32>(counts, u, narrays); break; + case 16: _popcount_arrays<16>(counts, u, narrays); break; + case 8: _popcount_arrays< 8>(counts, u, narrays); break; + default: + for (int i = 0; i < narrays; ++i, u += nwords) + counts[i] = _popcount_array(u, nwords); + } + return to_millis(clock() - t); + } - double *all_scores = new double[n]; + /** + * Compute the Dice coefficient similarity measure of two arrays + * of length array_bytes. + * + * ASSUMES: array_bytes is divisible by 8. + */ + double + dice_coeff( + const char *array1, + const char *array2, + int array_bytes) { + const uint64_t *u, *v; + uint32_t u_popc, v_popc; + // assumes WORD_BYTES divides array_bytes + int nwords = array_bytes / WORD_BYTES; + + u = reinterpret_cast(array1); + v = reinterpret_cast(array2); + + // If the popcount of one of the arrays is zero, then the + // popcount of the "intersection" (logical AND) will be zero, + // hence the whole Dice coefficient will be zero. + u_popc = _popcount_array(u, nwords); + if (u_popc == 0) + return 0.0; + v_popc = _popcount_array(v, nwords); + if (v_popc == 0) + return 0.0; + + return _dice_coeff_generic(u, u_popc, v, v_popc, nwords); + } - uint32_t max_popcnt_delta = 1024; + /** + * Calculate up to the top k indices and scores. Returns the + * number matched above the given threshold or -1 if keybytes is + * not a multiple of 8. + */ + int match_one_against_many_dice_k_top( + const char *one, + const char *many, + const uint32_t *counts_many, + int n, + int keybytes, + uint32_t k, + double threshold, + int *indices, + double *scores) { + + if (keybytes % WORD_BYTES != 0) + return -1; + int keywords = keybytes / WORD_BYTES; + const uint64_t *comp1 = reinterpret_cast(one); + const uint64_t *comp2 = reinterpret_cast(many); + + // Here we create top_k_scores on the stack by providing it + // with a vector in which to put its elements. We do this so + // that we can reserve the amount of space needed for the + // scores in advance and avoid potential memory reallocation + // and copying. 
+ typedef std::vector node_vector; + typedef std::priority_queue, score_cmp> node_queue; + node_vector vec; + vec.reserve(k + 1); + node_queue top_k_scores(score_cmp(), std::move(vec)); + + uint32_t count_one = _popcount_array(comp1, keywords); + if (count_one == 0) + return 0; + + uint32_t max_popcnt_delta = keybytes * CHAR_BIT; // = bits per key if(threshold > 0) { max_popcnt_delta = calculate_max_difference(count_one, threshold); } - uint32_t current_delta; - for (int j = 0; j < n; j++) { - const uint64_t *current = comp2 + j * KEYWORDS; - - if(count_one > counts_many[j]){ - current_delta = count_one - counts_many[j]; - } else { - current_delta = counts_many[j] - count_one; + auto push_score = [&](double score, int idx) { + if (score >= threshold) { + top_k_scores.push(Node(idx, score)); + if (top_k_scores.size() > k) { + // Popping the top element is O(log(k))! + top_k_scores.pop(); + } } - - if(current_delta <= max_popcnt_delta){ - for (unsigned int i = 0 ; i < KEYWORDS; i++ ) { - combined[i] = current[i] & comp1[i]; + }; + + const uint64_t *current = comp2; + + // NB: For any key length that must run at maximum speed, we + // need to specialise a block in the following 'if' statement + // (which is an example of specialising to keywords == 16). + if (keywords == 16) { + for (int j = 0; j < n; j++, current += 16) { + const uint32_t counts_many_j = counts_many[j]; + if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) { + double score = _dice_coeff<16>(comp1, count_one, current, counts_many_j); + push_score(score, j); } - - uint32_t count_curr = builtin_popcnt_unrolled_errata_manual(combined); - - double score = 2 * count_curr / (double) (count_one + counts_many[j]); - all_scores[j] = score; - } else { - // Skipping because popcount difference too large - all_scores[j] = -1; } - } - - for (int j = 0; j < n; j++) { - - if(all_scores[j] >= threshold) { - max_k_scores.push(Node(j, all_scores[j])); + } else { + for (int j = 0; j < n; j++, current += keywords) { + const uint32_t counts_many_j = counts_many[j]; + if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) { + double score = _dice_coeff_generic(comp1, count_one, current, counts_many_j, keywords); + push_score(score, j); + } } - - if(max_k_scores.size() > k) max_k_scores.pop(); } - delete[] all_scores; - int i = 0; - while (!max_k_scores.empty()) { - - scores[i] = max_k_scores.top().score; - indices[i] = max_k_scores.top().index; - - max_k_scores.pop(); - i+=1; + while ( ! top_k_scores.empty()) { + scores[i] = top_k_scores.top().score; + indices[i] = top_k_scores.top().index; + // Popping the top element is O(log(k))! 
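+            // The min-heap yields its contents in increasing order, so
+            // indices and scores are filled from lowest score to highest.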
+ top_k_scores.pop(); + i += 1; } - return i; - } - - int match_one_against_many_dice(const char *one, const char *many, int n, double *score) { - - static const double threshold = 0.0; - static const int k = 1; - int idx_unused; - uint32_t *counts_many = new uint32_t[n]; - popcount_1024_array(many, n, counts_many); - int res = match_one_against_many_dice_1024_k_top( - one, many, counts_many, n, k, threshold, &idx_unused, score); - delete[] counts_many; - return res; + return i; } } - diff --git a/anonlink/__init__.py b/anonlink/__init__.py index 79168898..ce35a08c 100644 --- a/anonlink/__init__.py +++ b/anonlink/__init__.py @@ -5,4 +5,4 @@ from anonlink import network_flow __version__ = pkg_resources.get_distribution('anonlink').version -__author__ = 'Stephen Hardy, Brian Thorne' \ No newline at end of file +__author__ = 'N1 Analytics' diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py index c1fba03d..2fd0e438 100644 --- a/anonlink/benchmark.py +++ b/anonlink/benchmark.py @@ -8,7 +8,6 @@ from anonlink.entitymatch import * from anonlink.util import popcount_vector, generate_clks, generate_bitarray -from anonlink.distributed_processing import calculate_filter_similarity some_filters = generate_clks(10000) @@ -18,54 +17,74 @@ def compute_popcount_speed(n): """ Just do as much counting of bits. """ - clks = [generate_bitarray(1024) for _ in range(n)] - start = timer() - popcounts = popcount_vector(clks) - end = timer() - elapsed_time = end - start - print("{:6d} x 1024 bit popcounts in {:.6f} seconds".format(n, elapsed_time)) - speed_in_MiB = n / (1024 * 8 * elapsed_time) - print("Popcount speed: {:.2f} MiB/s".format(speed_in_MiB)) - return speed_in_MiB + clk_bits = 1024 + clk_bytes = clk_bits / 8 + clks_MiB = n * clk_bytes * 1.0 / (1 << 20) + clks = [generate_bitarray(clk_bits) for _ in range(n)] -def print_comparison_header(): - print("Size 1 | Size 2 | Comparisons | Compute Time | Million Comparisons per second") + print("{:6d} x {:d} bit popcounts".format(n, clk_bits)) + print("Implementation | Time (ms) | Bandwidth (MiB/s) | Throughput (1e6 popc/s)") + # Python + popcounts, elapsed_time = popcount_vector(clks, use_python=True) + python_speed_in_MiB = clks_MiB / elapsed_time + python_Mops = n / (1e6 * elapsed_time) + elapsed_time_ms = elapsed_time * 1e3 + print("Python (bitarray.count()): | {:7.2f} | {:9.2f} | {:7.2f}" + .format(elapsed_time_ms, python_speed_in_MiB, python_Mops)) -def compute_comparison_speed(n1=100, n2=100): - """ - Using the greedy solver, how fast can hashes be computed using one core. 
- """ - - filters1 = [some_filters[random.randrange(0, 8000)] for _ in range(n1)] - filters2 = [some_filters[random.randrange(2000, 10000)] for _ in range(n2)] - + # Native start = timer() - result3 = calculate_mapping_greedy(filters1, filters2) + popcounts, elapsed_nocopy = popcount_vector(clks, use_python=False) end = timer() elapsed_time = end - start - print("{:6d} | {:6d} | {:12d} | {:8.3f}s | {:12.3f}".format( - n1, n2, n1*n2, elapsed_time, (n1*n2)/(1e6*elapsed_time))) - return elapsed_time + copy_percent = 100*(elapsed_time - elapsed_nocopy) / elapsed_time + elapsed_time_ms = elapsed_time * 1e3 + elapsed_nocopy_ms = elapsed_nocopy * 1e3 + native_speed_in_MiB = clks_MiB / elapsed_time + native_speed_in_MiB_nocopy = clks_MiB / elapsed_nocopy + native_Mops = n / (1e6 * elapsed_time) + native_Mops_nocopy = n / (1e6 * elapsed_nocopy) + print("Native code (no copy): | {:7.2f} | {:9.2f} | {:7.2f}" + .format(elapsed_nocopy_ms, native_speed_in_MiB_nocopy, native_Mops_nocopy)) + print("Native code (w/ copy): | {:7.2f} | {:9.2f} | {:7.2f} ({:.1f}% copying)" + .format(elapsed_time_ms, native_speed_in_MiB, native_Mops, copy_percent)) + return python_speed_in_MiB -def compute_comparison_speed_parallel(n1=100, n2=100): + +def print_comparison_header(threshold): + print("\nThreshold:", threshold) + print("Size 1 | Size 2 | Comparisons | Total Time (s) | Throughput") + print(" | | (match %) | (comparisons / matching)| (1e6 cmp/s)") + print("-------+--------+------------------+-------------------------+-------------") + + +def compute_comparison_speed(n1, n2, threshold): """ - Using the greedy solver in chunks, how fast can hashes be computed. + Using the greedy solver, how fast can hashes be computed using one core. """ filters1 = [some_filters[random.randrange(0, 8000)] for _ in range(n1)] filters2 = [some_filters[random.randrange(2000, 10000)] for _ in range(n2)] - start = timer() - calculate_filter_similarity(filters1, filters2) - + sparse_matrix = calculate_filter_similarity(filters1, filters2, k=len(filters2), threshold=threshold) + t1 = timer() + res = greedy_solver(sparse_matrix) end = timer() + + similarity_time = t1 - start + solver_time = end - t1 elapsed_time = end - start - print("{:6d} | {:6d} | {:12d} | {:8.3f}s | {:12.3f}".format( - n1, n2, n1*n2, elapsed_time, (n1*n2)/(1e6*elapsed_time))) + print("{:6d} | {:6d} | {:4d}e6 ({:5.2f}%) | {:6.3f} ({:4.1f}% / {:4.1f}%) | {:8.3f}".format( + n1, n2, n1*n2 // 1000000, + 100.0*len(sparse_matrix)/(n1*n2), + elapsed_time, + 100.0*similarity_time/elapsed_time, + 100.0*solver_time/elapsed_time, + (n1*n2)/(1e6*similarity_time))) return elapsed_time @@ -111,13 +130,13 @@ def compare_python_c(ntotal=10000, nsubset=6000, frac=0.8): def benchmark(size, compare): + print("Anonlink benchmark -- see README for explanation") + print("------------------------------------------------") if compare: print(compare_python_c(ntotal=1000, nsubset=600)) compute_popcount_speed(100000) - print_comparison_header() - possible_test_sizes = [ 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, @@ -127,15 +146,19 @@ def benchmark(size, compare): 2000000 ] + thld = 0.5 + print_comparison_header(thld) for test_size in possible_test_sizes: if test_size <= size: - compute_comparison_speed_parallel( - test_size, test_size - ) - - print("Single Core:") - compute_comparison_speed(5000, 5000) + compute_comparison_speed(test_size, test_size, thld) + thld = 0.7 + print_comparison_header(thld) + size *= 5 + for test_size in possible_test_sizes: + if test_size <= size: + 
compute_comparison_speed(test_size, test_size, thld) if __name__ == '__main__': - benchmark(20000, False) \ No newline at end of file + benchmark(4000, False) + #benchmark(20000, False) diff --git a/anonlink/bloommatcher.py b/anonlink/bloommatcher.py index 198e8d31..24675c3b 100644 --- a/anonlink/bloommatcher.py +++ b/anonlink/bloommatcher.py @@ -10,7 +10,9 @@ def dicecoeff_pure_python(e1, e2): """ Dice coefficient measures the similarity of two bit patterns. - :param e1,e2: bitset arrays of same length + Implemented exclusively in Python. + + :param e1, e2: bitarrays of same length :return: real 0-1 similarity measure """ count1 = e1.count() @@ -22,18 +24,27 @@ else: return 2.0 * overlap_count / combined_count - -def dicecoeff(e1, e2): +def dicecoeff_native(e1, e2): """ - Dice coefficient measures the similarity of two bit patterns + Dice coefficient measures the similarity of two bit patterns. + + Implemented in native code, via the compiled CFFI extension. + + :param e1, e2: bitarrays of same length :return: real 0-1 similarity measure """ e1array = ffi.new("char[]", e1.tobytes()) e2array = ffi.new("char[]", e2.tobytes()) + return lib.dice_coeff(e1array, e2array, len(e1array)) - if len(e1) == 1024 and len(e2) == 1024: - return lib.dice_coeff_1024(e1array, e2array) +def dicecoeff(e1, e2): + """ + Dice coefficient measures the similarity of two bit patterns. + + :return: real 0-1 similarity measure + """ + if e1.length() == e2.length() and (e1.length() // 8) % 8 == 0: + return dicecoeff_native(e1, e2) else: return dicecoeff_pure_python(e1, e2) diff --git a/anonlink/entitymatch.py b/anonlink/entitymatch.py index 1484b12d..fae7b4f7 100644 --- a/anonlink/entitymatch.py +++ b/anonlink/entitymatch.py @@ -1,5 +1,4 @@ import logging -from operator import itemgetter from anonlink._entitymatcher import ffi, lib @@ -8,7 +7,7 @@ from . import bloommatcher as bm from . import util -logging.basicConfig(level=logging.WARNING) +log = logging.getLogger('anonlink.entitymatch') def python_filter_similarity(filters1, filters2): @@ -34,15 +33,25 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold): """Accelerated method for determining Bloom Filter similarity. + + Assumes all filters are the same length, being a multiple of 64 + bits.
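+
+    Each filter is a tuple whose first element is the bitarray and whose
+    third element is its precomputed popcount (f[0] and f[2] below).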
+    """ length_f1 = len(filters1) length_f2 = len(filters2) - # We assume the length is 1024 bit = 128 Bytes - match_one_against_many_dice_1024_k_top = lib.match_one_against_many_dice_1024_k_top + if length_f1 == 0: + return [] + + filter_bits = len(filters1[0][0]) + assert filter_bits % 64 == 0, 'Filter length must be a multiple of 64 bits.' + filter_bytes = filter_bits // 8 + + match_one_against_many_dice_k_top = lib.match_one_against_many_dice_k_top # An array of the *one* filter - clist1 = [ffi.new("char[128]", bytes(f[0].tobytes())) + clist1 = [ffi.new("char[{}]".format(filter_bytes), bytes(f[0].tobytes())) for f in filters1] if sys.version_info < (3, 0): @@ -52,10 +61,10 @@ for b in f[0].tobytes(): data.append(b) - carr2 = ffi.new("char[{}]".format(128 * length_f2), ''.join(data)) + carr2 = ffi.new("char[{}]".format(filter_bytes * length_f2), ''.join(data)) else: # Works in Python 3+ - carr2 = ffi.new("char[{}]".format(128 * length_f2), + carr2 = ffi.new("char[{}]".format(filter_bytes * length_f2), bytes([b for f in filters2 for b in f[0].tobytes()])) c_popcounts = ffi.new("uint32_t[{}]".format(length_f2), [f[2] for f in filters2]) @@ -67,18 +76,20 @@ result = [] for i, f1 in enumerate(filters1): - assert len(clist1[i]) == 128 - assert len(carr2) % 64 == 0 - matches = match_one_against_many_dice_1024_k_top( + assert len(clist1[i]) == filter_bytes + matches = match_one_against_many_dice_k_top( clist1[i], carr2, c_popcounts, length_f2, + filter_bytes, k, threshold, c_indices, c_scores) + if matches < 0: + raise ValueError('Internal error: bad key length') for j in range(matches): ind = c_indices[j] assert ind < len(filters2) @@ -119,7 +130,7 @@ :returns A mapping dictionary of index in filters1 to index in filters2.
""" - logging.info('Solving with greedy solver') + log.info('Solving with greedy solver') sparse_matrix = calculate_filter_similarity(filters1, filters2, k, threshold) return greedy_solver(sparse_matrix) diff --git a/anonlink/network_flow.py b/anonlink/network_flow.py index ffc3619b..5b679ce2 100644 --- a/anonlink/network_flow.py +++ b/anonlink/network_flow.py @@ -8,7 +8,7 @@ import networkx as nx from networkx.algorithms import bipartite -logging.basicConfig(level=logging.WARNING) +log = logging.getLogger('anonlink.networkflow') def calculate_network(similarity, cutoff): @@ -62,17 +62,17 @@ def calculate_entity_mapping(G, method=None): """ if method == 'bipartite': - logging.info('Solving entity matches with bipartite maximum matching solver') + log.info('Solving entity matches with bipartite maximum matching solver') network = bipartite.maximum_matching(G) entity_map = _to_int_map(network, lambda network, node: network[node]) elif method == 'weighted': - logging.info('Solving entity matches with networkx maximum weight matching solver') + log.info('Solving entity matches with networkx maximum weight matching solver') network = nx.max_weight_matching(G) entity_map = _to_int_map(network, lambda network, node: network[node]) elif method == 'flow' or method is None: - logging.info('Solving entity matches with networkx maximum flow solver') + log.info('Solving entity matches with networkx maximum flow solver') # The maximum flow solver requires a SOURCE and SINK num_rows, num_cols = 0, 0 for i, node in enumerate(G.nodes()): @@ -87,9 +87,9 @@ def calculate_entity_mapping(G, method=None): # This method produces a quality metric `flow`, however # it needs to be compared to the number of entities if flow_value < num_rows: - logging.info('Matching not perfect - {:.3f}'.format(flow_value)) + log.info('Matching not perfect - {:.3f}'.format(flow_value)) else: - logging.info('Matching complete. (perfect matching)') + log.info('Matching complete. (perfect matching)') def find_pair(network, node): # Make sure to deal with unconnected nodes diff --git a/anonlink/util.py b/anonlink/util.py index dcafa80f..1fec0d96 100644 --- a/anonlink/util.py +++ b/anonlink/util.py @@ -3,7 +3,9 @@ import os import random from bitarray import bitarray +from timeit import default_timer as timer +from anonlink._entitymatcher import ffi, lib def generate_bitarray(length): a = bitarray(endian=['little', 'big'][random.randint(0, 1)]) @@ -19,25 +21,36 @@ def generate_clks(n): return res -def popcount_vector(bitarrays): - """ - Note, due to the overhead of converting bitarrays into - bytes, it is more expensive to call our C implementation - than just calling bitarray.count() +def popcount_vector(bitarrays, use_python=True): + """Return a list containing the popcounts of the elements of + bitarrays, and the time (in seconds) it took. If use_python is + False, use the native code implementation instead of Python; in + this case the returned time is the time spent in the native code, + NOT including copying to and from the Python runtime. 
+    Note, due to the overhead of converting bitarrays into bytes, +    it is currently more expensive to call our C implementation +    than just calling bitarray.count() """ - return [clk.count() for clk in bitarrays] + # Use Python + if use_python: + start = timer() + counts = [clk.count() for clk in bitarrays] + elapsed = timer() - start + return counts, elapsed + + # Use native code + n = len(bitarrays) + arr_bytes = bitarrays[0].length() // 8 + c_popcounts = ffi.new("uint32_t[{}]".format(n)) + many = ffi.new("char[{}]".format(arr_bytes * n), + bytes([b for f in bitarrays for b in f.tobytes()])) + ms = lib.popcount_arrays(c_popcounts, many, n, arr_bytes) - # n = len(clks) - # c_popcounts = ffi.new("uint32_t[{}]".format(n)) - # many = ffi.new("char[{}]".format(128 * n), - # bytes([b for f in clks for b in f.tobytes()])) - # lib.popcount_1024_array(many, n, c_popcounts) - # - # return [c_popcounts[i] for i in range(n)] + return [c_popcounts[i] for i in range(n)], ms * 1e-3 def chunks(l, n): """Yield successive n-sized chunks from l.""" for i in range(0, len(l), n): - yield l[i:i + n] \ No newline at end of file + yield l[i:i + n] diff --git a/requirements.txt b/requirements.txt index f4a36d17..2a2ad3a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ bitarray==0.8.1 networkx==1.11 cffi>=1.7 -nose==1.3.7 -clkhash==0.8.0 \ No newline at end of file +pytest>=3.4 +pytest-cov>=2.5 +clkhash==0.8.0 diff --git a/setup.py b/setup.py index d3ae9419..742f67d1 100644 --- a/setup.py +++ b/setup.py @@ -8,13 +8,12 @@ "bitarray==0.8.1", "networkx==1.11", "cffi>=1.7", + "clkhash>=0.10" ] setup( name="anonlink", - author='Brian Thorne', - author_email='brian.thorne@data61.csiro.au', - version='0.6.2', + version='0.7.0.rc1', description='Anonymous linkage using cryptographic hashes and bloom filters', url='https://github.com/n1analytics/anonlink', license='Apache', diff --git a/tests/bitarray_utils.py b/tests/bitarray_utils.py new file mode 100644 index 00000000..76426911 --- /dev/null +++ b/tests/bitarray_utils.py @@ -0,0 +1,20 @@ +from bitarray import bitarray +from itertools import combinations_with_replacement + +def bitarrays_of_length(L): + """ + Return a list of bitarrays, each of length L*64, whose contents are + combinations of the words 0, 2^64-1, 1 and 2^63 (i.e. all zeros, all + ones, or a one in the least or most significant position). + """ + special_words = [64*bitarray('0'), + 63*bitarray('0') + bitarray('1'), + bitarray('1') + 63*bitarray('0'), + 64*bitarray('1')] + # '+' on bitarrays is concatenation + return [sum(word, bitarray()) + for word in combinations_with_replacement(special_words, L)] + +# Interesting key lengths (usually around 2^something +/-1).
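+# The 8-, 16- and 32-word lengths hit the specialised popcount paths in
+# the native code; their neighbours exercise the unrolled tails.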
+key_lengths = [1, 2, 3, 4, 8, 9, 10, 15, 16, 17, + 23, 24, 25, 30, 31, 32, 33, 63, 64, 65] diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index acc6fc06..cb6f0a2f 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -11,10 +11,7 @@ def test_benchmarking_popcount(self): self.assertGreater(speed, 50, "Popcounting at less than 50MiB/s") def test_comparison_speed_benchmark(self): - benchmark.compute_comparison_speed() - - def test_parallel_comparison_speed_benchmark(self): - benchmark.compute_comparison_speed_parallel() + benchmark.compute_comparison_speed(100, 100, 0.7) def test_comparing_python_c_bench(self): benchmark.compare_python_c(500, 30, frac=0.8) diff --git a/tests/test_bloommatcher.py b/tests/test_bloommatcher.py index 4c0f06fd..8fff895e 100644 --- a/tests/test_bloommatcher.py +++ b/tests/test_bloommatcher.py @@ -1,8 +1,12 @@ import unittest +import pytest import random +import os +from collections import deque from bitarray import bitarray from anonlink import bloommatcher as bm +from tests import bitarray_utils __author__ = 'shardy' @@ -70,6 +74,26 @@ def test_dice_4_c(self): self.assertEqual(result, 0.0) +@pytest.mark.parametrize("L", bitarray_utils.key_lengths) +def test_dicecoeff(L): + """ + Check that the native and pure-Python Dice coefficient implementations + agree on pairs of bitarrays drawn from bas. rotations is the number of + times we rotate bas to generate pairs; 10 rotations take around 10s, + 100 around 60s. + """ + rotations = 100 if "INCLUDE_10K" in os.environ else 10 + bas = bitarray_utils.bitarrays_of_length(L) + + # We check that the native code and Python versions of dicecoeff + # don't ever differ by more than 10^{-6}. + eps = 0.000001 + d = deque(bas) + for _ in range(min(rotations, len(bas))): + for a, b in zip(bas, d): + diff = bm.dicecoeff_pure_python(a, b) - bm.dicecoeff_native(a, b) + assert abs(diff) < eps + d.rotate(1) if __name__ == '__main__': unittest.main() diff --git a/tests/test_util.py b/tests/test_util.py index 521cada5..2715de3d 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,7 +1,10 @@ #!/usr/bin/env python3.4 import unittest +import pytest from anonlink import util +from anonlink import bloommatcher as bm +from tests import bitarray_utils class TestUtilDataGeneration(unittest.TestCase): @@ -19,10 +22,12 @@ def test_generate_clks(self): self.assertEqual(len(clk[0]), 1024) self.assertEqual(clk[0].count(), clk[2]) - def test_popcount_vector(self): - bas = [util.generate_bitarray(1024) for i in range(100)] - popcounts = util.popcount_vector(bas) +@pytest.mark.parametrize("L", bitarray_utils.key_lengths) +def test_popcount_vector(L): + bas = bitarray_utils.bitarrays_of_length(L) + bas_counts = [b.count() for b in bas] - self.assertEquals(len(popcounts), 100) - for i, cnt in enumerate(popcounts): - self.assertEquals(cnt, bas[i].count()) + popcounts, _ = util.popcount_vector(bas, use_python=True) + assert popcounts == bas_counts + popcounts, _ = util.popcount_vector(bas, use_python=False) + assert popcounts == bas_counts
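
The new native popcount entry point can also be exercised directly through
the compiled CFFI module; the following sketch mirrors what
``anonlink.util.popcount_vector(..., use_python=False)`` does above, and
assumes the extension has been built (e.g. via ``pip install -e .``)::

    from bitarray import bitarray
    from anonlink._entitymatcher import ffi, lib

    arrays = [bitarray('10' * 512) for _ in range(4)]   # four 1024-bit keys
    n = len(arrays)
    array_bytes = arrays[0].length() // 8               # must be a multiple of 8

    c_counts = ffi.new("uint32_t[{}]".format(n))
    c_arrays = ffi.new("char[{}]".format(n * array_bytes),
                       b"".join(a.tobytes() for a in arrays))
    ms = lib.popcount_arrays(c_counts, c_arrays, n, array_bytes)  # elapsed ms
    print([c_counts[i] for i in range(n)], "in", ms, "ms")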