diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 00000000..6f269154 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,26 @@ +# After changing this file, check it on: +# http://lint.travis-ci.org/ +language: python +sudo: false + +python: + - '3.6' + - 'nightly' + - 'pypy3' + +env: + - INCLUDE_10K= + +matrix: + allow_failures: + - python: 'nightly' + - python: 'pypy3' + +install: + - travis_retry pip install pytest pytest-cov + - travis_retry pip install -r requirements.txt + - travis_retry python setup.py sdist bdist_wheel + - travis_retry pip install -e . + +script: + - pytest --cov=anonlink diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 65144cb9..aee7a49d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,23 @@ +0.7.0 +----- + +Introduces support for comparing "arbitrary" length cryptographic linkage keys. +The benchmark is much more comprehensive and more comparable between releases; see the +README for an example report. + +Other improvements +~~~~~~~~~~~~~~~~~~ + +- Internal C/C++ cleanup/refactoring and optimization. +- Expose the native popcount implementation to Python. +- Bug fix to avoid configuring a logger. +- Testing is now with `py.test` and runs on `travis-ci <https://travis-ci.org/n1analytics/anonlink/>`_. + +0.6.3 +----- + +Small fix to logging setup. + 0.6.2 - Changelog init --------------------- diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index 65f5fe2a..00000000 --- a/Jenkinsfile +++ /dev/null @@ -1,145 +0,0 @@ -void setBuildStatus(String message, String state) { - step([ - $class: "GitHubCommitStatusSetter", - reposSource: [$class: "ManuallyEnteredRepositorySource", url: "https://github.com/n1analytics/anonlink"], - contextSource: [$class: 'ManuallyEnteredCommitContextSource', context: 'jenkins'], - statusResultSource: [ $class: "ConditionalStatusResultSource", results: [[$class: "AnyBuildResult", message: message, state: state]] ] - ]); -} - -def isMaster = env.BRANCH_NAME == 'master' -def isDevelop = env.BRANCH_NAME == 'develop' - -def configs = [ - [label: 'GPU 1', pythons: ['python3.4', 'python3.5', 'python3.6'], compilers: ['clang', 'gcc']], - [label: 'osx', pythons: ['python3.5'], compilers: ['clang', 'gcc']] -] - -def build(python_version, compiler, label, release=false) { - try { - def workspace = pwd(); - echo "${label}" - echo "workspace directory is ${workspace}" - env.PATH = "${workspace}/env/bin:/usr/bin:${env.PATH}" - - withEnv(["VENV=${workspace}/env"]) { - // ${workspace} contains an absolute path to job workspace (not available within a stage) - - sh "test -d ${workspace}/env && rm -rf ${workspace}/env || echo 'no env, skipping cleanup'" - - // The stage below is attempting to get the latest version of our application code. - // Since this is a multi-branch project the 'checkout scm' command is used. If you're working with a standard - // pipeline project then you can replace this with the regular 'git url:' pipeline command. - // The 'checkout scm' command will automatically pull down the code from the appropriate branch that triggered this build.
- checkout scm - - def testsError = null - - clkhashPackageName = "clkhash-*-py2.py3-none-any.whl" - - step ([$class: 'CopyArtifact', - projectName: 'clkhash/master', - fingerprint: true, - flatten: true, - filter: 'dist/' + clkhashPackageName - ]); - - try { - sh """#!/usr/bin/env bash - set -xe - - # Jenkins logs in as a non-interactive shell, so we don't even have /usr/local/bin in PATH - export PATH="/usr/local/bin:\${PATH}" - printenv - - rm -fr build - ${python_version} -m venv --clear ${VENV} - ${VENV}/bin/python ${VENV}/bin/pip install --upgrade pip coverage setuptools wheel - ${VENV}/bin/python ${VENV}/bin/pip install --quiet --upgrade ${clkhashPackageName} - - ${VENV}/bin/python ${VENV}/bin/pip install -r requirements.txt - - CC=${compiler} ${VENV}/bin/python setup.py sdist bdist_wheel - ${VENV}/bin/python ${VENV}/bin/pip install -e . - ${VENV}/bin/python ${VENV}/bin/nosetests \ - --with-xunit --with-coverage --cover-inclusive \ - --cover-package=anonlink - - """ - - if(release) { - // This will be the official release - archiveArtifacts artifacts: "dist/anonlink-*.whl" - archiveArtifacts artifacts: "dist/anonlink-*.tar.gz" - } - } - catch(err) { - testsError = err - currentBuild.result = 'FAILURE' - setBuildStatus("Build failed", "FAILURE"); - } - finally { - - if (!release) { - junit 'nosetests.xml' - } else { - // Code coverage only needs to be done once - sh '''#!/usr/bin/env bash - set -xe - - ${VENV}/bin/python ${VENV}/bin/coverage xml --omit="*/cpp_code/*" --omit="*build_matcher.py*" - - ''' - - step([$class: 'CoberturaPublisher', coberturaReportFile: 'coverage.xml']) - } - - if (testsError) { - throw testsError - } - } - } - - } finally { - deleteDir() - } -} - - -def builders = [:] -for (config in configs) { - def label = config["label"] - def pythons = config["pythons"] - def compilers = config["compilers"] - - for (_py_version in pythons) { - for (_compiler in compilers) { - - def py_version = _py_version - def compiler = _compiler - def combinedName = "${label}-${py_version}-${compiler}" - - builders[combinedName] = { - node(label) { - stage(combinedName) { - build(py_version, compiler, label, false) - } - } - } - } - } -} - -node { - checkout scm - setBuildStatus("Build in progress", "PENDING"); -} - -parallel builders - -node('GPU 1') { - stage('Release') { - build('python3.5', 'gcc', 'GPU 1', true) - setBuildStatus("Tests Passed", "SUCCESS"); - } -} \ No newline at end of file diff --git a/Jenkinsfile.groovy b/Jenkinsfile.groovy new file mode 100644 index 00000000..416a2e1f --- /dev/null +++ b/Jenkinsfile.groovy @@ -0,0 +1,127 @@ +@Library("N1Pipeline@0.0.5") +import com.n1analytics.git.GitUtils; +import com.n1analytics.git.GitCommit; +import com.n1analytics.n1.docker.N1EngineContainer; +import com.n1analytics.python.PythonVirtualEnvironment; + +def isMaster = env.BRANCH_NAME == 'master' +def isDevelop = env.BRANCH_NAME == 'develop' + +VENV_DIRECTORY = "env" + +GIT_CONTEXT = "jenkins" + +def configs = [ + [label: 'GPU 1', pythons: ['python3.4', 'python3.5', 'python3.6'], compilers: ['clang', 'gcc']], + //[label: 'osx', pythons: ['python3.5'], compilers: ['clang', 'gcc']] + [label: 'McNode', pythons: ['python3.5'], compilers: ['clang']] +] + +def PythonVirtualEnvironment prepareVirtualEnvironment(String pythonVersion, clkhashPackageName, compiler, venv_directory = VENV_DIRECTORY) { + PythonVirtualEnvironment venv = new PythonVirtualEnvironment(this, venv_directory, pythonVersion); + venv.create(); + venv.runPipCommand("install --upgrade pip coverage setuptools 
wheel") + venv.runPipCommand("install --quiet --upgrade ${clkhashPackageName}"); + venv.runPipCommand("install -r requirements.txt"); + String cc = "CC=" + compiler; + withEnv([cc]) { + venv.runCommand("setup.py sdist bdist_wheel --universal"); + } + venv.runPipCommand("install -e ."); + return venv; +} + +def build(python_version, compiler, label, release = false) { + GitUtils.checkoutFromSCM(this) + Exception testsError = null; + try { + clkhashPackageName = "clkhash-*-py2.py3-none-any.whl" + copyArtifacts( + projectName: 'clkhash/master', + fingerprint: true, + flatten : true, + filter : 'dist/' + clkhashPackageName + ) + + PythonVirtualEnvironment venv = prepareVirtualEnvironment(python_version, clkhashPackageName, compiler) + try { + venv.runChosenCommand("pytest --cov=anonlink --junit-xml=testoutput.xml --cov-report=xml:coverage.xml") + if (release) { + // This will be the official release + archiveArtifacts artifacts: "dist/anonlink-*.whl" + archiveArtifacts artifacts: "dist/anonlink-*.tar.gz" + } + } catch (Exception err) { + testsError = err + } finally { + if (!release) { + junit 'testoutput.xml' + } else { + venv.runChosenCommand("coverage xml --omit=\"*/cpp_code/*\" --omit=\"*build_matcher.py*\"") + cobertura coberturaReportFile: 'coverage.xml' + } + if (testsError != null) { + throw testsError + } + } + + } finally { + try { + deleteDir() + } catch (Exception e) { + echo "Error during 'deleteDir':\n" + e.toString() + } + } +} + + +def builders = [:] +for (config in configs) { + def label = config["label"] + def pythons = config["pythons"] + def compilers = config["compilers"] + + for (_py_version in pythons) { + for (_compiler in compilers) { + + def py_version = _py_version + def compiler = _compiler + def combinedName = "${label}-${py_version}-${compiler}" + + builders[combinedName] = { + node(label) { + stage(combinedName) { + build(py_version, compiler, label, false) + } + } + } + } + } +} + +GitCommit commit; +node() { + commit = GitUtils.checkoutFromSCM(this); + commit.setInProgressStatus(GIT_CONTEXT); +} + +try { + parallel builders +} catch (Exception err) { + node() { + commit.setFailStatus("Build failed", GIT_CONTEXT); + } + throw err +} + +node('GPU 1') { + stage('Release') { + try { + build('python3.5', 'gcc', 'GPU 1', true) + commit.setSuccessStatus(GIT_CONTEXT) + } catch (Exception e) { + commit.setFailStatus("Release failed", GIT_CONTEXT); + throw e; + } + } +} \ No newline at end of file diff --git a/README.rst b/README.rst index 9c025eb5..2d645231 100644 --- a/README.rst +++ b/README.rst @@ -1,3 +1,8 @@ + +.. 
image:: https://travis-ci.org/n1analytics/anonlink.svg?branch=master + :target: https://travis-ci.org/n1analytics/anonlink + + A Python (and optimised C++) implementation of **anonymous linkage** using *cryptographic linkage keys* as described by Rainer Schnell, Tobias Bachteler, and Jörg Reiher in `A Novel Error-Tolerant Anonymous Linking @@ -43,42 +48,95 @@ For linux with: Benchmark --------- +You can run the benchmark with: + :: - $ python -m anonlink.benchmark - 100000 x 1024 bit popcounts in 0.016376 seconds - Popcount speed: 745.42 MiB/s - Size 1 | Size 2 | Comparisons | Compute Time | Million Comparisons per second - 1000 | 1000 | 1000000 | 0.060s | 16.632 - 2000 | 2000 | 4000000 | 0.159s | 25.232 - 3000 | 3000 | 9000000 | 0.316s | 28.524 - 4000 | 4000 | 16000000 | 0.486s | 32.943 - 5000 | 5000 | 25000000 | 0.584s | 42.825 - 6000 | 6000 | 36000000 | 0.600s | 60.027 - 7000 | 7000 | 49000000 | 0.621s | 78.875 - 8000 | 8000 | 64000000 | 0.758s | 84.404 - 9000 | 9000 | 81000000 | 0.892s | 90.827 - 10000 | 10000 | 100000000 | 1.228s | 81.411 - 20000 | 20000 | 400000000 | 3.980s | 100.504 - 30000 | 30000 | 900000000 | 9.280s | 96.986 - 40000 | 40000 | 1600000000 | 17.318s | 92.391 - -C++ version uses cpu instruction ``POPCNT`` for bitcount in a 64bit -word. http://wm.ite.pl/articles/sse-popcount.html + $ python3 -m anonlink.benchmark + Anonlink benchmark -- see README for explanation + ------------------------------------------------ + 100000 x 1024 bit popcounts + Implementation | Time (ms) | Bandwidth (MiB/s) | Throughput (1e6 popc/s) + Python (bitarray.count()): | 17.78 | 686.54 | 5.62 + Native code (no copy): | 1.00 | 12243.76 | 100.30 + Native code (w/ copy): | 344.17 | 35.47 | 0.29 (99.7% copying) + + Threshold: 0.5 + Size 1 | Size 2 | Comparisons | Total Time (s) | Throughput + | | (match %) | (comparisons / matching)| (1e6 cmp/s) + -------+--------+------------------+-------------------------+------------- + 1000 | 1000 | 1e6 (50.20%) | 0.249 (88.6% / 11.4%) | 4.525 + 2000 | 2000 | 4e6 (50.51%) | 1.069 (88.5% / 11.5%) | 4.227 + 3000 | 3000 | 9e6 (50.51%) | 2.412 (85.3% / 14.7%) | 4.375 + 4000 | 4000 | 16e6 (50.56%) | 4.316 (83.6% / 16.4%) | 4.434 + + Threshold: 0.7 + Size 1 | Size 2 | Comparisons | Total Time (s) | Throughput + | | (match %) | (comparisons / matching)| (1e6 cmp/s) + -------+--------+------------------+-------------------------+------------- + 1000 | 1000 | 1e6 ( 0.01%) | 0.017 (99.8% / 0.2%) | 59.605 + 2000 | 2000 | 4e6 ( 0.01%) | 0.056 (99.8% / 0.2%) | 71.484 + 3000 | 3000 | 9e6 ( 0.01%) | 0.118 (99.9% / 0.1%) | 76.500 + 4000 | 4000 | 16e6 ( 0.01%) | 0.202 (99.9% / 0.1%) | 79.256 + 5000 | 5000 | 25e6 ( 0.01%) | 0.309 (99.9% / 0.1%) | 81.093 + 6000 | 6000 | 36e6 ( 0.01%) | 0.435 (99.9% / 0.1%) | 82.841 + 7000 | 7000 | 49e6 ( 0.01%) | 0.590 (99.9% / 0.1%) | 83.164 + 8000 | 8000 | 64e6 ( 0.01%) | 0.757 (99.9% / 0.1%) | 84.619 + 9000 | 9000 | 81e6 ( 0.01%) | 0.962 (99.8% / 0.2%) | 84.358 + 10000 | 10000 | 100e6 ( 0.01%) | 1.166 (99.8% / 0.2%) | 85.895 + 20000 | 20000 | 400e6 ( 0.01%) | 4.586 (99.9% / 0.1%) | 87.334 + +The tables are interpreted as follows. The first section compares the +bandwidth doing popcounts through (i) the Python bitarray library and +(ii) a native code implementation in assembler. The latter +implementation is measured in two ways: the first measures just the +time taken to compute the popcounts, while the second includes the +time taken to copy the data out of the running Python instance as well +as copying the result back into Python. 
The "% copying" measure is the +proportion of time spent doing this copying. + +The second section includes two tables that measure the throughput of +the Dice coefficient comparison function. The two tables correspond to +two different choices of "matching threshold", 0.5 and 0.7, which were +chosen to characterise two different performance scenarios. Since the +data used for comparisons is randomly generated, the first threshold +value will cause about 50% of the candidates to "match", while the +second threshold value will cause <0.01% of the candidates to match +(these values are reported in the "match %" column). In both cases, +all matches above the threshold are returned and passed to the +solver. In the first case, the large number of matches means that much +of the time is spent keeping the candidates in order so that the top +`k` matches can be returned. In the latter case, the tiny number of +candidate matches means that the throughput is determined primarily by +the comparison code itself. + +Finally, the Total Time column includes indications as to the +proportion of time spent calculating the (sparse) similarity matrix +`comparisons` and the proportion of time spent `matching` in the +greedy solver. This latter is determined by the size of the similarity +matrix, which will be approximately `#comparisons * match% / 100`. Tests ===== -Run unit tests with nose +Run unit tests with `pytest`: :: - $ python -m nose - ......................SS.............................. - ---------------------------------------------------------------------- - Ran 54 tests in 6.615s + $ pytest + ====================================== test session starts ====================================== + platform linux -- Python 3.6.4, pytest-3.2.5, py-1.4.34, pluggy-0.4.0 + rootdir: /home/hlaw/src/n1-anonlink, inifile: + collected 71 items + + tests/test_benchmark.py ... + tests/test_bloommatcher.py .............. + tests/test_e2e.py .............ss.... + tests/test_matcher.py ..x.....x......x....x.. + tests/test_similarity.py ......... + tests/test_util.py ... - OK (SKIP=2) + ======================== 65 passed, 2 skipped, 4 xfailed in 4.01 seconds ======================== To enable slightly larger tests add the following environment variables: @@ -91,8 +149,6 @@ Limitations - The linkage process has order n^2 time complexity - although algorithms exist to significantly speed this up. Several possible speedups are described in http://dbs.uni-leipzig.de/file/P4Join-BTW2015.pdf -- The C++ code makes an assumption of 1024 bit keys (although this would be easy - to change). 
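+
+One such speedup is already used internally: a candidate pair whose
+popcounts differ too much can never reach the matching threshold, so it
+is skipped before the full comparison. Below is a minimal Python sketch
+of the Dice coefficient and that pruning bound (the helper names are
+illustrative only, not part of the anonlink API)::
+
+    from bitarray import bitarray
+    import random
+
+    def dice(u, v):
+        # Sorensen-Dice: 2*|u AND v| / (|u| + |v|); 0.0 when both are empty.
+        denom = u.count() + v.count()
+        return 2.0 * (u & v).count() / denom if denom else 0.0
+
+    def may_match(u_popc, v_popc, threshold):
+        # Mirrors calculate_max_difference in the C++ code: a candidate can
+        # only reach `threshold` if |u_popc - v_popc| <= 2*u_popc*(1/threshold - 1).
+        return abs(u_popc - v_popc) <= 2 * u_popc * (1 / threshold - 1)
+
+    u = bitarray([random.getrandbits(1) for _ in range(1024)])
+    v = bitarray([random.getrandbits(1) for _ in range(1024)])
+    if may_match(u.count(), v.count(), 0.7):
+        print(dice(u, v))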
License diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py index fb7e1161..6277f3df 100644 --- a/_cffi_build/build_matcher.py +++ b/_cffi_build/build_matcher.py @@ -15,13 +15,14 @@ "_entitymatcher", source, source_extension='.cpp', - extra_compile_args=['-Wall', '-Wextra', '-Werror', '-O3', '-std=c++11', '-mssse3', '-mpopcnt'], + extra_compile_args=['-Wall', '-Wextra', '-Werror', '-O3', '-std=c++11', '-march=native', '-mssse3', '-mpopcnt', '-fvisibility=hidden' + ], ) ffibuilder.cdef(""" - int match_one_against_many_dice(const char * one, const char * many, int n, double * score); - int match_one_against_many_dice_1024_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, uint32_t k, double threshold, int *indices, double *scores); - double dice_coeff_1024(const char *e1, const char *e2); + int match_one_against_many_dice_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, int keybytes, uint32_t k, double threshold, int *indices, double *scores); + double dice_coeff(const char *array1, const char *array2, int array_bytes); + double popcount_arrays(uint32_t *counts, const char *arrays, int narrays, int array_bytes); """) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 6182585c..c5093365 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -1,217 +1,451 @@ #include #include #include -#include +#include #include -#include +#include +#include +#include +static constexpr int WORD_BYTES = sizeof(uint64_t); -#define WORDBYTES (sizeof(uint64_t)) -#define WORDBITS (WORDBYTES * 8) -#define KEYBITS 1024 -#define KEYBYTES (KEYBITS / 8) -// KEYWORDS must be divisible by 4. It is currently equal to 16. -#define KEYWORDS (KEYBYTES / WORDBYTES) +/** + * The popcount of n elements of buf is the sum of c0, c1, c2, c3. + */ +template +static inline void +popcount( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, + const uint64_t *buf) { + popcount<4>(c0, c1, c2, c3, buf); + popcount(c0, c1, c2, c3, buf + 4); +} +// Fast Path +// // Source: http://danluu.com/assembly-intrinsics/ // https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance // -// NB: Dan Luu's original assembly is incorrect because it -// clobbers registers marked as "input only" (see warning at +// NB: Dan Luu's original assembly (and the SO answer it was based on) +// is incorrect because it clobbers registers marked as "input only" +// (see warning at // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands -// -- this mistake does not materialise with GCC (4.9), but it -// does with Clang (3.6 and 3.8)). We fix the mistake by -// explicitly loading the contents of buf into registers and using -// these same registers for the intermediate popcnts. -static inline uint32_t -builtin_popcnt_unrolled_errata_manual(const uint64_t* buf) { - uint64_t b0, b1, b2, b3; - uint64_t c0, c1, c2, c3; - c0 = c1 = c2 = c3 = 0; - - // We unroll this manually because some versions of GCC don't do so - // of their own volition. Speedup from this in such cases is >20%. 
-#undef LOOP_BODY -#define LOOP_BODY(i) do { \ - b0 = buf[i]; b1 = buf[i + 1]; b2 = buf[i + 2]; b3 = buf[i + 3]; \ - __asm__( \ - "popcnt %4, %4 \n\t" \ - "add %4, %0 \n\t" \ - "popcnt %5, %5 \n\t" \ - "add %5, %1 \n\t" \ - "popcnt %6, %6 \n\t" \ - "add %6, %2 \n\t" \ - "popcnt %7, %7 \n\t" \ - "add %7, %3 \n\t" \ - : "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3), \ - "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3)); \ - } while (0) - - LOOP_BODY(0); - LOOP_BODY(4); - LOOP_BODY(8); - LOOP_BODY(12); - - return c0 + c1 + c2 + c3; +// -- this mistake does not materialise with GCC (4.9), but it does +// with Clang (3.6 and 3.8)). We fix the mistake by explicitly +// loading the contents of buf into registers and using these same +// registers for the intermediate popcnts. +/** + * The popcount of 4 elements of buf is the sum of c0, c1, c2, c3. + */ +template<> +inline void +popcount<4>( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, + const uint64_t *buf) { + uint64_t b0, b1, b2, b3; + b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3]; + __asm__( + "popcnt %4, %4 \n\t" + "add %4, %0 \n\t" + "popcnt %5, %5 \n\t" + "add %5, %1 \n\t" + "popcnt %6, %6 \n\t" + "add %6, %2 \n\t" + "popcnt %7, %7 \n\t" + "add %7, %3 \n\t" + : "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3), + "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3)); +} + +// Slow paths +// TODO: Assumes sizeof(long) == 8 +// +// NB: The specialisation to n=3 is not currently used but included +// for completeness (i.e. so that popcount is defined for all +// non-negative n) and in anticipation of its use in the near future. +#if 0 +template<> +inline void +popcount<3>( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &, + const uint64_t *buf) { + c0 += __builtin_popcountl(buf[0]); + c1 += __builtin_popcountl(buf[1]); + c2 += __builtin_popcountl(buf[2]); } +#endif /** - * Compute the Dice coefficient similarity measure of two bit patterns. + * The popcount of 2 elements of buf is the sum of c0, c1. */ -static double -dice_coeff_1024(const char *e1, const char *e2) { - const uint64_t *comp1 = (const uint64_t *) e1; - const uint64_t *comp2 = (const uint64_t *) e2; +template<> +inline void +popcount<2>( + uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &, + const uint64_t *buf) { + c0 += __builtin_popcountl(buf[0]); + c1 += __builtin_popcountl(buf[1]); +} + +/** + * The popcount *buf is in c0. + */ +template<> +inline void +popcount<1>( + uint64_t &c0, uint64_t &, uint64_t &, uint64_t &, + const uint64_t *buf) { + c0 += __builtin_popcountl(buf[0]); +} - uint32_t count_both = 0; - count_both += builtin_popcnt_unrolled_errata_manual(comp1); - count_both += builtin_popcnt_unrolled_errata_manual(comp2); - if(count_both == 0) { - return 0.0; +/** + * Calculate population counts of an array of inputs of nwords elements. + * + * 'arrays' must point to narrays*nwords*WORD_BYTES bytes + * 'counts' must point to narrays*sizeof(uint32_t) bytes. + * For i = 0 to narrays - 1, the population count of the nwords elements + * + * arrays[i * nwords] ... arrays[(i + 1) * nwords - 1] + * + * is put in counts[i]. 
+ */ +template +static void +_popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) { + uint64_t c0, c1, c2, c3; + for (int i = 0; i < narrays; ++i, arrays += nwords) { + c0 = c1 = c2 = c3 = 0; + popcount(c0, c1, c2, c3, arrays); + counts[i] = c0 + c1 + c2 + c3; } +} - uint64_t combined[KEYWORDS]; - for (unsigned int i = 0 ; i < KEYWORDS; i++ ) { - combined[i] = comp1[i] & comp2[i]; +/** + * Return the popcount of the nwords elements starting at array. + */ +static uint32_t +_popcount_array(const uint64_t *array, int nwords) { + uint64_t c0, c1, c2, c3; + c0 = c1 = c2 = c3 = 0; + + while (nwords >= 16) { + popcount<16>(c0, c1, c2, c3, array); + array += 16; + nwords -= 16; } + // nwords < 16 + if (nwords >= 8) { + popcount<8>(c0, c1, c2, c3, array); + array += 8; + nwords -= 8; + } + // nwords < 8 + if (nwords >= 4) { + popcount<4>(c0, c1, c2, c3, array); + array += 4; + nwords -= 4; + } + // nwords < 4 + if (nwords >= 2) { + popcount<2>(c0, c1, c2, c3, array); + array += 2; + nwords -= 2; + } + // nwords < 2 + if (nwords == 1) + popcount<1>(c0, c1, c2, c3, array); + return c0 + c1 + c2 + c3; +} - uint32_t count_and = builtin_popcnt_unrolled_errata_manual(combined); +/** + * The popcount of the logical AND of n corresponding elements of buf1 + * and buf2 is the sum of c0, c1, c2, c3. + */ +template +static inline void +popcount_logand( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, + const uint64_t *buf1, const uint64_t *buf2) { + popcount_logand<4>(c0, c1, c2, c3, buf1, buf2); + popcount_logand(c0, c1, c2, c3, buf1 + 4, buf2 + 4); +} - return 2 * count_and / (double)count_both; +/** + * The popcount of the logical AND of 4 corresponding elements of buf1 + * and buf2 is the sum of c0, c1, c2, c3. + */ +template<> +inline void +popcount_logand<4>( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, + const uint64_t *buf1, const uint64_t *buf2) { + uint64_t b[4]; + b[0] = buf1[0] & buf2[0]; + b[1] = buf1[1] & buf2[1]; + b[2] = buf1[2] & buf2[2]; + b[3] = buf1[3] & buf2[3]; + popcount<4>(c0, c1, c2, c3, b); } +/** + * Return the popcount of the logical AND of len corresponding + * elements of u and v. + */ +static uint32_t +_popcount_logand_array(const uint64_t *u, const uint64_t *v, int len) { + // NB: The switch statement at the end of this function must have + // cases for all i = 1, ..., LOOP_LEN - 1. + static constexpr int LOOP_LEN = 4; + uint64_t c0, c1, c2, c3; + c0 = c1 = c2 = c3 = 0; + + int i = 0; + for ( ; i + LOOP_LEN <= len; i += LOOP_LEN) { + popcount_logand(c0, c1, c2, c3, u, v); + u += LOOP_LEN; + v += LOOP_LEN; + } + + // NB: The "fall through" comments are necessary to tell GCC and + // Clang not to complain about the fact that the case clauses + // don't have break statements in them. + switch (len - i) { + case 3: c2 += __builtin_popcountl(u[2] & v[2]); /* fall through */ + case 2: c1 += __builtin_popcountl(u[1] & v[1]); /* fall through */ + case 1: c0 += __builtin_popcountl(u[0] & v[0]); /* fall through */ + } -class Node { + return c0 + c1 + c2 + c3; +} +/** + * Return the Sorensen-Dice coefficient of nwords length arrays u and + * v, whose popcounts are given in u_popc and v_popc respectively. 
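+ * That is, it returns 2 * popcount(u AND v) / (u_popc + v_popc).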
+ */ +static inline double +_dice_coeff_generic( + const uint64_t *u, uint32_t u_popc, + const uint64_t *v, uint32_t v_popc, + int nwords) { + uint32_t uv_popc = _popcount_logand_array(u, v, nwords); + return (2 * uv_popc) / (double) (u_popc + v_popc); +} + +/** + * Return the Sorensen-Dice coefficient of nwords length arrays u and + * v, whose popcounts are given in u_popc and v_popc respectively. + */ +template +static inline double +_dice_coeff( + const uint64_t *u, uint32_t u_popc, + const uint64_t *v, uint32_t v_popc) { + uint64_t c0, c1, c2, c3; + c0 = c1 = c2 = c3 = 0; + popcount_logand(c0, c1, c2, c3, u, v); + uint32_t uv_popc = c0 + c1 + c2 + c3; + return (2 * uv_popc) / (double) (u_popc + v_popc); +} + +class Node { public: int index; double score; // Constructor with default Node( int n_index = -1, double n_score = -1.0 ) - :index(n_index), score( n_score ) - { - } + :index(n_index), score( n_score ) { } }; -struct score_cmp{ - bool operator()(const Node& a, const Node& b) const{ +struct score_cmp { + bool operator()(const Node& a, const Node& b) const { return a.score > b.score; } }; /** - * Count lots of bits. + * */ -static void popcount_1024_array(const char *many, int n, uint32_t *counts_many) { - for (int i = 0; i < n; i++) { - const uint64_t *sig = (const uint64_t *) many + i * KEYWORDS; - counts_many[i] = builtin_popcnt_unrolled_errata_manual(sig); - } +static inline uint32_t +calculate_max_difference(uint32_t popcnt_a, double threshold) { + return 2 * popcnt_a * (1/threshold - 1); } /** + * Convert clock measurement t to milliseconds. * + * t should have been obtained as the difference of calls to clock() + * for this to make sense. */ -static uint32_t calculate_max_difference(uint32_t popcnt_a, double threshold) -{ - return 2 * popcnt_a * (1/threshold - 1); +static inline double +to_millis(clock_t t) { + static constexpr double CPS = (double)CLOCKS_PER_SEC; + return t * 1.0E3 / CPS; +} + +static inline uint32_t +abs_diff(uint32_t a, uint32_t b) { + if (a > b) + return a - b; + return b - a; } + extern "C" { /** - * Calculate up to the top k indices and scores. - * Returns the number matched above a threshold. + * Calculate population counts of an array of inputs; return how + * long it took in milliseconds. + * + * 'arrays' must point to narrays*array_bytes bytes + * 'counts' must point to narrays*sizeof(uint32_t) bytes. + * For i = 0 to n - 1, the population count of the array_bytes*8 bits + * + * arrays[i * array_bytes] ... arrays[(i + 1) * array_bytes - 1] + * + * is put in counts[i]. + * + * ASSUMES: array_bytes is divisible by 8. 
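+ * For example, 1024-bit keys give array_bytes = 128 and hence nwords = 16,
+ * which hits one of the specialised cases in the switch below.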
*/ - int match_one_against_many_dice_1024_k_top( - const char *one, - const char *many, - const uint32_t *counts_many, - int n, - uint32_t k, - double threshold, - int *indices, - double *scores) { - - const uint64_t *comp1 = (const uint64_t *) one; - const uint64_t *comp2 = (const uint64_t *) many; - - std::priority_queue, score_cmp> max_k_scores; - - uint32_t count_one = builtin_popcnt_unrolled_errata_manual(comp1); - - uint64_t combined[KEYWORDS]; + double + popcount_arrays( + uint32_t *counts, + const char *arrays, int narrays, int array_bytes) { + // assumes WORD_BYTES divides array_bytes + int nwords = array_bytes / WORD_BYTES; + const uint64_t *u = reinterpret_cast(arrays); + + // assumes WORD_PER_POPCOUNT divides nwords + clock_t t = clock(); + switch (nwords) { + case 32: _popcount_arrays<32>(counts, u, narrays); break; + case 16: _popcount_arrays<16>(counts, u, narrays); break; + case 8: _popcount_arrays< 8>(counts, u, narrays); break; + default: + for (int i = 0; i < narrays; ++i, u += nwords) + counts[i] = _popcount_array(u, nwords); + } + return to_millis(clock() - t); + } - double *all_scores = new double[n]; + /** + * Compute the Dice coefficient similarity measure of two arrays + * of length array_bytes. + * + * ASSUMES: array_bytes is divisible by 8. + */ + double + dice_coeff( + const char *array1, + const char *array2, + int array_bytes) { + const uint64_t *u, *v; + uint32_t u_popc, v_popc; + // assumes WORD_BYTES divides array_bytes + int nwords = array_bytes / WORD_BYTES; + + u = reinterpret_cast(array1); + v = reinterpret_cast(array2); + + // If the popcount of one of the arrays is zero, then the + // popcount of the "intersection" (logical AND) will be zero, + // hence the whole Dice coefficient will be zero. + u_popc = _popcount_array(u, nwords); + if (u_popc == 0) + return 0.0; + v_popc = _popcount_array(v, nwords); + if (v_popc == 0) + return 0.0; + + return _dice_coeff_generic(u, u_popc, v, v_popc, nwords); + } - uint32_t max_popcnt_delta = 1024; + /** + * Calculate up to the top k indices and scores. Returns the + * number matched above the given threshold or -1 if keybytes is + * not a multiple of 8. + */ + int match_one_against_many_dice_k_top( + const char *one, + const char *many, + const uint32_t *counts_many, + int n, + int keybytes, + uint32_t k, + double threshold, + int *indices, + double *scores) { + + if (keybytes % WORD_BYTES != 0) + return -1; + int keywords = keybytes / WORD_BYTES; + const uint64_t *comp1 = reinterpret_cast(one); + const uint64_t *comp2 = reinterpret_cast(many); + + // Here we create top_k_scores on the stack by providing it + // with a vector in which to put its elements. We do this so + // that we can reserve the amount of space needed for the + // scores in advance and avoid potential memory reallocation + // and copying. 
+ typedef std::vector node_vector; + typedef std::priority_queue, score_cmp> node_queue; + node_vector vec; + vec.reserve(k + 1); + node_queue top_k_scores(score_cmp(), std::move(vec)); + + uint32_t count_one = _popcount_array(comp1, keywords); + if (count_one == 0) + return 0; + + uint32_t max_popcnt_delta = keybytes * CHAR_BIT; // = bits per key if(threshold > 0) { max_popcnt_delta = calculate_max_difference(count_one, threshold); } - uint32_t current_delta; - for (int j = 0; j < n; j++) { - const uint64_t *current = comp2 + j * KEYWORDS; - - if(count_one > counts_many[j]){ - current_delta = count_one - counts_many[j]; - } else { - current_delta = counts_many[j] - count_one; + auto push_score = [&](double score, int idx) { + if (score >= threshold) { + top_k_scores.push(Node(idx, score)); + if (top_k_scores.size() > k) { + // Popping the top element is O(log(k))! + top_k_scores.pop(); + } } - - if(current_delta <= max_popcnt_delta){ - for (unsigned int i = 0 ; i < KEYWORDS; i++ ) { - combined[i] = current[i] & comp1[i]; + }; + + const uint64_t *current = comp2; + + // NB: For any key length that must run at maximum speed, we + // need to specialise a block in the following 'if' statement + // (which is an example of specialising to keywords == 16). + if (keywords == 16) { + for (int j = 0; j < n; j++, current += 16) { + const uint32_t counts_many_j = counts_many[j]; + if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) { + double score = _dice_coeff<16>(comp1, count_one, current, counts_many_j); + push_score(score, j); } - - uint32_t count_curr = builtin_popcnt_unrolled_errata_manual(combined); - - double score = 2 * count_curr / (double) (count_one + counts_many[j]); - all_scores[j] = score; - } else { - // Skipping because popcount difference too large - all_scores[j] = -1; } - } - - for (int j = 0; j < n; j++) { - - if(all_scores[j] >= threshold) { - max_k_scores.push(Node(j, all_scores[j])); + } else { + for (int j = 0; j < n; j++, current += keywords) { + const uint32_t counts_many_j = counts_many[j]; + if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) { + double score = _dice_coeff_generic(comp1, count_one, current, counts_many_j, keywords); + push_score(score, j); + } } - - if(max_k_scores.size() > k) max_k_scores.pop(); } - delete[] all_scores; - int i = 0; - while (!max_k_scores.empty()) { - - scores[i] = max_k_scores.top().score; - indices[i] = max_k_scores.top().index; - - max_k_scores.pop(); - i+=1; + while ( ! top_k_scores.empty()) { + scores[i] = top_k_scores.top().score; + indices[i] = top_k_scores.top().index; + // Popping the top element is O(log(k))! 
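+            // The min-heap yields its contents in increasing order, so
+            // indices and scores are filled from lowest score to highest.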
+ top_k_scores.pop(); + i += 1; } - return i; - } - - int match_one_against_many_dice(const char *one, const char *many, int n, double *score) { - - static const double threshold = 0.0; - static const int k = 1; - int idx_unused; - uint32_t *counts_many = new uint32_t[n]; - popcount_1024_array(many, n, counts_many); - int res = match_one_against_many_dice_1024_k_top( - one, many, counts_many, n, k, threshold, &idx_unused, score); - delete[] counts_many; - return res; + return i; } } - diff --git a/anonlink/__init__.py b/anonlink/__init__.py index 79168898..ce35a08c 100644 --- a/anonlink/__init__.py +++ b/anonlink/__init__.py @@ -5,4 +5,4 @@ from anonlink import network_flow __version__ = pkg_resources.get_distribution('anonlink').version -__author__ = 'Stephen Hardy, Brian Thorne' \ No newline at end of file +__author__ = 'N1 Analytics' diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py index c1fba03d..2fd0e438 100644 --- a/anonlink/benchmark.py +++ b/anonlink/benchmark.py @@ -8,7 +8,6 @@ from anonlink.entitymatch import * from anonlink.util import popcount_vector, generate_clks, generate_bitarray -from anonlink.distributed_processing import calculate_filter_similarity some_filters = generate_clks(10000) @@ -18,54 +17,74 @@ def compute_popcount_speed(n): """ Just do as much counting of bits. """ - clks = [generate_bitarray(1024) for _ in range(n)] - start = timer() - popcounts = popcount_vector(clks) - end = timer() - elapsed_time = end - start - print("{:6d} x 1024 bit popcounts in {:.6f} seconds".format(n, elapsed_time)) - speed_in_MiB = n / (1024 * 8 * elapsed_time) - print("Popcount speed: {:.2f} MiB/s".format(speed_in_MiB)) - return speed_in_MiB + clk_bits = 1024 + clk_bytes = clk_bits / 8 + clks_MiB = n * clk_bytes * 1.0 / (1 << 20) + clks = [generate_bitarray(clk_bits) for _ in range(n)] -def print_comparison_header(): - print("Size 1 | Size 2 | Comparisons | Compute Time | Million Comparisons per second") + print("{:6d} x {:d} bit popcounts".format(n, clk_bits)) + print("Implementation | Time (ms) | Bandwidth (MiB/s) | Throughput (1e6 popc/s)") + # Python + popcounts, elapsed_time = popcount_vector(clks, use_python=True) + python_speed_in_MiB = clks_MiB / elapsed_time + python_Mops = n / (1e6 * elapsed_time) + elapsed_time_ms = elapsed_time * 1e3 + print("Python (bitarray.count()): | {:7.2f} | {:9.2f} | {:7.2f}" + .format(elapsed_time_ms, python_speed_in_MiB, python_Mops)) -def compute_comparison_speed(n1=100, n2=100): - """ - Using the greedy solver, how fast can hashes be computed using one core. 
- """ - - filters1 = [some_filters[random.randrange(0, 8000)] for _ in range(n1)] - filters2 = [some_filters[random.randrange(2000, 10000)] for _ in range(n2)] - + # Native start = timer() - result3 = calculate_mapping_greedy(filters1, filters2) + popcounts, elapsed_nocopy = popcount_vector(clks, use_python=False) end = timer() elapsed_time = end - start - print("{:6d} | {:6d} | {:12d} | {:8.3f}s | {:12.3f}".format( - n1, n2, n1*n2, elapsed_time, (n1*n2)/(1e6*elapsed_time))) - return elapsed_time + copy_percent = 100*(elapsed_time - elapsed_nocopy) / elapsed_time + elapsed_time_ms = elapsed_time * 1e3 + elapsed_nocopy_ms = elapsed_nocopy * 1e3 + native_speed_in_MiB = clks_MiB / elapsed_time + native_speed_in_MiB_nocopy = clks_MiB / elapsed_nocopy + native_Mops = n / (1e6 * elapsed_time) + native_Mops_nocopy = n / (1e6 * elapsed_nocopy) + print("Native code (no copy): | {:7.2f} | {:9.2f} | {:7.2f}" + .format(elapsed_nocopy_ms, native_speed_in_MiB_nocopy, native_Mops_nocopy)) + print("Native code (w/ copy): | {:7.2f} | {:9.2f} | {:7.2f} ({:.1f}% copying)" + .format(elapsed_time_ms, native_speed_in_MiB, native_Mops, copy_percent)) + return python_speed_in_MiB -def compute_comparison_speed_parallel(n1=100, n2=100): + +def print_comparison_header(threshold): + print("\nThreshold:", threshold) + print("Size 1 | Size 2 | Comparisons | Total Time (s) | Throughput") + print(" | | (match %) | (comparisons / matching)| (1e6 cmp/s)") + print("-------+--------+------------------+-------------------------+-------------") + + +def compute_comparison_speed(n1, n2, threshold): """ - Using the greedy solver in chunks, how fast can hashes be computed. + Using the greedy solver, how fast can hashes be computed using one core. """ filters1 = [some_filters[random.randrange(0, 8000)] for _ in range(n1)] filters2 = [some_filters[random.randrange(2000, 10000)] for _ in range(n2)] - start = timer() - calculate_filter_similarity(filters1, filters2) - + sparse_matrix = calculate_filter_similarity(filters1, filters2, k=len(filters2), threshold=threshold) + t1 = timer() + res = greedy_solver(sparse_matrix) end = timer() + + similarity_time = t1 - start + solver_time = end - t1 elapsed_time = end - start - print("{:6d} | {:6d} | {:12d} | {:8.3f}s | {:12.3f}".format( - n1, n2, n1*n2, elapsed_time, (n1*n2)/(1e6*elapsed_time))) + print("{:6d} | {:6d} | {:4d}e6 ({:5.2f}%) | {:6.3f} ({:4.1f}% / {:4.1f}%) | {:8.3f}".format( + n1, n2, n1*n2 // 1000000, + 100.0*len(sparse_matrix)/(n1*n2), + elapsed_time, + 100.0*similarity_time/elapsed_time, + 100.0*solver_time/elapsed_time, + (n1*n2)/(1e6*similarity_time))) return elapsed_time @@ -111,13 +130,13 @@ def compare_python_c(ntotal=10000, nsubset=6000, frac=0.8): def benchmark(size, compare): + print("Anonlink benchmark -- see README for explanation") + print("------------------------------------------------") if compare: print(compare_python_c(ntotal=1000, nsubset=600)) compute_popcount_speed(100000) - print_comparison_header() - possible_test_sizes = [ 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, @@ -127,15 +146,19 @@ def benchmark(size, compare): 2000000 ] + thld = 0.5 + print_comparison_header(thld) for test_size in possible_test_sizes: if test_size <= size: - compute_comparison_speed_parallel( - test_size, test_size - ) - - print("Single Core:") - compute_comparison_speed(5000, 5000) + compute_comparison_speed(test_size, test_size, thld) + thld = 0.7 + print_comparison_header(thld) + size *= 5 + for test_size in possible_test_sizes: + if test_size <= size: + 
compute_comparison_speed(test_size, test_size, thld) if __name__ == '__main__': - benchmark(20000, False) \ No newline at end of file + benchmark(4000, False) + #benchmark(20000, False) diff --git a/anonlink/bloommatcher.py b/anonlink/bloommatcher.py index 198e8d31..24675c3b 100644 --- a/anonlink/bloommatcher.py +++ b/anonlink/bloommatcher.py @@ -10,7 +10,9 @@ def dicecoeff_pure_python(e1, e2): """ Dice coefficient measures the similarity of two bit patterns. - :param e1,e2: bitset arrays of same length + Implemented exclusively in Python. + + :param e1, e2: bitarrays of same length :return: real 0-1 similarity measure """ count1 = e1.count() @@ -22,18 +24,27 @@ else: return 2.0 * overlap_count / combined_count - -def dicecoeff(e1, e2): +def dicecoeff_native(e1, e2): """ - Dice coefficient measures the similarity of two bit patterns + Dice coefficient measures the similarity of two bit patterns. + + Implemented in native code, via the compiled CFFI extension. + + :param e1, e2: bitarrays of same length :return: real 0-1 similarity measure """ e1array = ffi.new("char[]", e1.tobytes()) e2array = ffi.new("char[]", e2.tobytes()) + return lib.dice_coeff(e1array, e2array, len(e1array)) - if len(e1) == 1024 and len(e2) == 1024: - return lib.dice_coeff_1024(e1array, e2array) +def dicecoeff(e1, e2): + """ + Dice coefficient measures the similarity of two bit patterns. + + :return: real 0-1 similarity measure + """ + if e1.length() == e2.length() and (e1.length() // 8) % 8 == 0: + return dicecoeff_native(e1, e2) else: return dicecoeff_pure_python(e1, e2) diff --git a/anonlink/entitymatch.py b/anonlink/entitymatch.py index 1484b12d..fae7b4f7 100644 --- a/anonlink/entitymatch.py +++ b/anonlink/entitymatch.py @@ -1,5 +1,4 @@ import logging -from operator import itemgetter from anonlink._entitymatcher import ffi, lib @@ -8,7 +7,7 @@ from . import bloommatcher as bm from . import util -logging.basicConfig(level=logging.WARNING) +log = logging.getLogger('anonlink.entitymatch') def python_filter_similarity(filters1, filters2): @@ -34,15 +33,25 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold): """Accelerated method for determining Bloom Filter similarity. + + Assumes all filters are the same length, being a multiple of 64 + bits.
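+
+    Each filter is a tuple whose first element is the bitarray and whose
+    third element is its precomputed popcount (f[0] and f[2] below).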
+    """ length_f1 = len(filters1) length_f2 = len(filters2) - # We assume the length is 1024 bit = 128 Bytes - match_one_against_many_dice_1024_k_top = lib.match_one_against_many_dice_1024_k_top + if length_f1 == 0: + return [] + + filter_bits = len(filters1[0][0]) + assert filter_bits % 64 == 0, 'Filter length must be a multiple of 64 bits.' + filter_bytes = filter_bits // 8 + + match_one_against_many_dice_k_top = lib.match_one_against_many_dice_k_top # An array of the *one* filter - clist1 = [ffi.new("char[128]", bytes(f[0].tobytes())) + clist1 = [ffi.new("char[{}]".format(filter_bytes), bytes(f[0].tobytes())) for f in filters1] if sys.version_info < (3, 0): @@ -52,10 +61,10 @@ for b in f[0].tobytes(): data.append(b) - carr2 = ffi.new("char[{}]".format(128 * length_f2), ''.join(data)) + carr2 = ffi.new("char[{}]".format(filter_bytes * length_f2), ''.join(data)) else: # Works in Python 3+ - carr2 = ffi.new("char[{}]".format(128 * length_f2), + carr2 = ffi.new("char[{}]".format(filter_bytes * length_f2), bytes([b for f in filters2 for b in f[0].tobytes()])) c_popcounts = ffi.new("uint32_t[{}]".format(length_f2), [f[2] for f in filters2]) @@ -67,18 +76,20 @@ result = [] for i, f1 in enumerate(filters1): - assert len(clist1[i]) == 128 - assert len(carr2) % 64 == 0 - matches = match_one_against_many_dice_1024_k_top( + assert len(clist1[i]) == filter_bytes + matches = match_one_against_many_dice_k_top( clist1[i], carr2, c_popcounts, length_f2, + filter_bytes, k, threshold, c_indices, c_scores) + if matches < 0: + raise ValueError('Internal error: bad key length') for j in range(matches): ind = c_indices[j] assert ind < len(filters2) @@ -119,7 +130,7 @@ :returns A mapping dictionary of index in filters1 to index in filters2.
""" - logging.info('Solving with greedy solver') + log.info('Solving with greedy solver') sparse_matrix = calculate_filter_similarity(filters1, filters2, k, threshold) return greedy_solver(sparse_matrix) diff --git a/anonlink/network_flow.py b/anonlink/network_flow.py index ffc3619b..5b679ce2 100644 --- a/anonlink/network_flow.py +++ b/anonlink/network_flow.py @@ -8,7 +8,7 @@ import networkx as nx from networkx.algorithms import bipartite -logging.basicConfig(level=logging.WARNING) +log = logging.getLogger('anonlink.networkflow') def calculate_network(similarity, cutoff): @@ -62,17 +62,17 @@ def calculate_entity_mapping(G, method=None): """ if method == 'bipartite': - logging.info('Solving entity matches with bipartite maximum matching solver') + log.info('Solving entity matches with bipartite maximum matching solver') network = bipartite.maximum_matching(G) entity_map = _to_int_map(network, lambda network, node: network[node]) elif method == 'weighted': - logging.info('Solving entity matches with networkx maximum weight matching solver') + log.info('Solving entity matches with networkx maximum weight matching solver') network = nx.max_weight_matching(G) entity_map = _to_int_map(network, lambda network, node: network[node]) elif method == 'flow' or method is None: - logging.info('Solving entity matches with networkx maximum flow solver') + log.info('Solving entity matches with networkx maximum flow solver') # The maximum flow solver requires a SOURCE and SINK num_rows, num_cols = 0, 0 for i, node in enumerate(G.nodes()): @@ -87,9 +87,9 @@ def calculate_entity_mapping(G, method=None): # This method produces a quality metric `flow`, however # it needs to be compared to the number of entities if flow_value < num_rows: - logging.info('Matching not perfect - {:.3f}'.format(flow_value)) + log.info('Matching not perfect - {:.3f}'.format(flow_value)) else: - logging.info('Matching complete. (perfect matching)') + log.info('Matching complete. (perfect matching)') def find_pair(network, node): # Make sure to deal with unconnected nodes diff --git a/anonlink/util.py b/anonlink/util.py index dcafa80f..1fec0d96 100644 --- a/anonlink/util.py +++ b/anonlink/util.py @@ -3,7 +3,9 @@ import os import random from bitarray import bitarray +from timeit import default_timer as timer +from anonlink._entitymatcher import ffi, lib def generate_bitarray(length): a = bitarray(endian=['little', 'big'][random.randint(0, 1)]) @@ -19,25 +21,36 @@ def generate_clks(n): return res -def popcount_vector(bitarrays): - """ - Note, due to the overhead of converting bitarrays into - bytes, it is more expensive to call our C implementation - than just calling bitarray.count() +def popcount_vector(bitarrays, use_python=True): + """Return a list containing the popcounts of the elements of + bitarrays, and the time (in seconds) it took. If use_python is + False, use the native code implementation instead of Python; in + this case the returned time is the time spent in the native code, + NOT including copying to and from the Python runtime. 
+    Note, due to the overhead of converting bitarrays into bytes, +    it is currently more expensive to call our C implementation +    than just calling bitarray.count() """ - return [clk.count() for clk in bitarrays] + # Use Python + if use_python: + start = timer() + counts = [clk.count() for clk in bitarrays] + elapsed = timer() - start + return counts, elapsed + + # Use native code + n = len(bitarrays) + arr_bytes = bitarrays[0].length() // 8 + c_popcounts = ffi.new("uint32_t[{}]".format(n)) + many = ffi.new("char[{}]".format(arr_bytes * n), + bytes([b for f in bitarrays for b in f.tobytes()])) + ms = lib.popcount_arrays(c_popcounts, many, n, arr_bytes) - # n = len(clks) - # c_popcounts = ffi.new("uint32_t[{}]".format(n)) - # many = ffi.new("char[{}]".format(128 * n), - # bytes([b for f in clks for b in f.tobytes()])) - # lib.popcount_1024_array(many, n, c_popcounts) - # - # return [c_popcounts[i] for i in range(n)] + return [c_popcounts[i] for i in range(n)], ms * 1e-3 def chunks(l, n): """Yield successive n-sized chunks from l.""" for i in range(0, len(l), n): - yield l[i:i + n] \ No newline at end of file + yield l[i:i + n] diff --git a/requirements.txt b/requirements.txt index f4a36d17..2a2ad3a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ bitarray==0.8.1 networkx==1.11 cffi>=1.7 -nose==1.3.7 -clkhash==0.8.0 \ No newline at end of file +pytest>=3.4 +pytest-cov>=2.5 +clkhash==0.8.0 diff --git a/setup.py b/setup.py index d3ae9419..742f67d1 100644 --- a/setup.py +++ b/setup.py @@ -8,13 +8,12 @@ "bitarray==0.8.1", "networkx==1.11", "cffi>=1.7", + "clkhash>=0.10" ] setup( name="anonlink", - author='Brian Thorne', - author_email='brian.thorne@data61.csiro.au', - version='0.6.2', + version='0.7.0.rc1', description='Anonymous linkage using cryptographic hashes and bloom filters', url='https://github.com/n1analytics/anonlink', license='Apache', diff --git a/tests/bitarray_utils.py b/tests/bitarray_utils.py new file mode 100644 index 00000000..76426911 --- /dev/null +++ b/tests/bitarray_utils.py @@ -0,0 +1,20 @@ +from bitarray import bitarray +from itertools import combinations_with_replacement + +def bitarrays_of_length(L): + """ + Return a list of bitarrays, each of length L*64, whose contents are + combinations of the words 0, 2^64-1, 1 and 2^63 (i.e. all zeros, all + ones, or a one in the least or most significant position). + """ + special_words = [64*bitarray('0'), + 63*bitarray('0') + bitarray('1'), + bitarray('1') + 63*bitarray('0'), + 64*bitarray('1')] + # '+' on bitarrays is concatenation + return [sum(word, bitarray()) + for word in combinations_with_replacement(special_words, L)] + +# Interesting key lengths (usually around 2^something +/-1).
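+# The 8-, 16- and 32-word lengths hit the specialised popcount paths in
+# the native code; their neighbours exercise the unrolled tails.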
+key_lengths = [1, 2, 3, 4, 8, 9, 10, 15, 16, 17, + 23, 24, 25, 30, 31, 32, 33, 63, 64, 65] diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index acc6fc06..cb6f0a2f 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -11,10 +11,7 @@ def test_benchmarking_popcount(self): self.assertGreater(speed, 50, "Popcounting at less than 50MiB/s") def test_comparison_speed_benchmark(self): - benchmark.compute_comparison_speed() - - def test_parallel_comparison_speed_benchmark(self): - benchmark.compute_comparison_speed_parallel() + benchmark.compute_comparison_speed(100, 100, 0.7) def test_comparing_python_c_bench(self): benchmark.compare_python_c(500, 30, frac=0.8) diff --git a/tests/test_bloommatcher.py b/tests/test_bloommatcher.py index 4c0f06fd..8fff895e 100644 --- a/tests/test_bloommatcher.py +++ b/tests/test_bloommatcher.py @@ -1,8 +1,12 @@ import unittest +import pytest import random +import os +from collections import deque from bitarray import bitarray from anonlink import bloommatcher as bm +from tests import bitarray_utils __author__ = 'shardy' @@ -70,6 +74,26 @@ def test_dice_4_c(self): self.assertEqual(result, 0.0) +@pytest.mark.parametrize("L", bitarray_utils.key_lengths) +def test_dicecoeff(L): + """ + Check that the native and pure-Python Dice coefficient implementations + agree on pairs of bitarrays drawn from bas. rotations is the number of + times we rotate bas to generate pairs; 10 rotations take around 10s, + 100 around 60s. + """ + rotations = 100 if "INCLUDE_10K" in os.environ else 10 + bas = bitarray_utils.bitarrays_of_length(L) + + # We check that the native code and Python versions of dicecoeff + # don't ever differ by more than 10^{-6}. + eps = 0.000001 + d = deque(bas) + for _ in range(min(rotations, len(bas))): + for a, b in zip(bas, d): + diff = bm.dicecoeff_pure_python(a, b) - bm.dicecoeff_native(a, b) + assert abs(diff) < eps + d.rotate(1) if __name__ == '__main__': unittest.main() diff --git a/tests/test_util.py b/tests/test_util.py index 521cada5..2715de3d 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,7 +1,10 @@ #!/usr/bin/env python3.4 import unittest +import pytest from anonlink import util +from anonlink import bloommatcher as bm +from tests import bitarray_utils class TestUtilDataGeneration(unittest.TestCase): @@ -19,10 +22,12 @@ def test_generate_clks(self): self.assertEqual(len(clk[0]), 1024) self.assertEqual(clk[0].count(), clk[2]) - def test_popcount_vector(self): - bas = [util.generate_bitarray(1024) for i in range(100)] - popcounts = util.popcount_vector(bas) +@pytest.mark.parametrize("L", bitarray_utils.key_lengths) +def test_popcount_vector(L): + bas = bitarray_utils.bitarrays_of_length(L) + bas_counts = [b.count() for b in bas] - self.assertEquals(len(popcounts), 100) - for i, cnt in enumerate(popcounts): - self.assertEquals(cnt, bas[i].count()) + popcounts, _ = util.popcount_vector(bas, use_python=True) + assert popcounts == bas_counts + popcounts, _ = util.popcount_vector(bas, use_python=False) + assert popcounts == bas_counts
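
The new native popcount entry point can also be exercised directly through
the compiled CFFI module; the following sketch mirrors what
``anonlink.util.popcount_vector(..., use_python=False)`` does above, and
assumes the extension has been built (e.g. via ``pip install -e .``)::

    from bitarray import bitarray
    from anonlink._entitymatcher import ffi, lib

    arrays = [bitarray('10' * 512) for _ in range(4)]   # four 1024-bit keys
    n = len(arrays)
    array_bytes = arrays[0].length() // 8               # must be a multiple of 8

    c_counts = ffi.new("uint32_t[{}]".format(n))
    c_arrays = ffi.new("char[{}]".format(n * array_bytes),
                       b"".join(a.tobytes() for a in arrays))
    ms = lib.popcount_arrays(c_counts, c_arrays, n, array_bytes)  # elapsed ms
    print([c_counts[i] for i in range(n)], "in", ms, "ms")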