Skip to content

Commit

Permalink
Initial checkin.
Browse files Browse the repository at this point in the history
  • Loading branch information
jvirkki committed Jun 17, 2012
1 parent d9e88b0 commit e2edda2
Show file tree
Hide file tree
Showing 10 changed files with 614 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
build
26 changes: 26 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

Copyright (c) 2012, Jyri J. Virkki
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
82 changes: 82 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@

# Copyright (c) 2012, Jyri J. Virkki
# All rights reserved.
#
# This file is under BSD license. See LICENSE file.
#
# By default, builds optimized 32bit libbloom (under ./build)
# Requires GNU Make, so invoke appropriately (make or gmake)
#
# Other build options:
#
#   DEBUG=1 make   - build debug instead of optimized
#   MM=-m64 make   - build 64bit library
#
# Other build targets:
#
#   make test      - build and run test code
#   make gcov      - build with code coverage and run gcov
#   make lint      - run lint
#   make clean     - the usual
#

TOP := $(shell /bin/pwd)
BUILD_OS := $(shell uname)

BUILD=$(TOP)/build
INC=-I$(TOP) -I$(TOP)/murmur2
LIB=-lm
# Recursively expanded on purpose: OPT and MM are assigned further down and
# are picked up when CC is used in a recipe.
CC=gcc ${OPT} ${MM} -std=c99 -fPIC

# Default to a 32bit build unless the caller supplied MM (e.g. MM=-m64).
ifeq ($(MM),)
MM=-m32
endif

# Record the build directory as a run path so that test-libbloom can locate
# libbloom.so without LD_LIBRARY_PATH.
ifeq ($(BUILD_OS),Linux)
RPATH=-Wl,-rpath,$(BUILD)
endif

ifeq ($(BUILD_OS),SunOS)
RPATH=-R$(BUILD)
endif

ifeq ($(DEBUG),1)
OPT=-g $(DEBUGOPT)
else
OPT=-O3
endif


all: $(BUILD)/libbloom.so $(BUILD)/test-libbloom

# These targets do not name files; declaring them phony keeps them working
# even if a file called "test", "clean", etc. appears in this directory.
.PHONY: all clean lint test gcov

$(BUILD)/libbloom.so: $(BUILD)/murmurhash2.o $(BUILD)/bloom.o
	(cd $(BUILD) && $(CC) bloom.o murmurhash2.o -shared $(LIB) -o libbloom.so)

$(BUILD)/test-libbloom: $(BUILD)/libbloom.so $(BUILD)/test.o
	(cd $(BUILD) && $(CC) test.o -L$(BUILD) $(RPATH) -lbloom -o test-libbloom)

$(BUILD)/%.o: %.c
	mkdir -p $(BUILD)
	$(CC) $(INC) -c $< -o $@

# bloom.c includes both headers; rebuild the object when either changes.
# (Prerequisites only - the recipe still comes from the pattern rule above.)
$(BUILD)/bloom.o: bloom.h murmur2/murmurhash2.h

$(BUILD)/murmurhash2.o: murmur2/MurmurHash2.c murmur2/murmurhash2.h
	mkdir -p $(BUILD)
	$(CC) $(INC) -c murmur2/MurmurHash2.c -o $(BUILD)/murmurhash2.o

clean:
	rm -rf $(BUILD)

lint:
	lint -x -errfmt=simple $(INC) $(LIB) *.c murmur2/*.c

test: $(BUILD)/test-libbloom
	$(BUILD)/test-libbloom

# Rebuild everything instrumented for coverage, run the tests from inside the
# build directory and report on bloom.c. Sources are copied next to the
# objects so gcov can find them.
gcov:
	$(MAKE) clean
	DEBUG=1 DEBUGOPT="-fprofile-arcs -ftest-coverage" $(MAKE) all
	(cd $(BUILD) && \
	    cp ../*.c . && \
	    ./test-libbloom && \
	    gcov -bf bloom.c)
	@echo Remember to make clean to remove instrumented objects
44 changes: 43 additions & 1 deletion README
Original file line number Diff line number Diff line change
@@ -1,3 +1,45 @@

libbloom
Introduction
------------
This is libbloom, a simple and small bloom filter implementation in C.

If you are reading this you probably already know about bloom filters
and why you might use one. If not, the Wikipedia article is a good intro:
http://en.wikipedia.org/wiki/Bloom_filter


Building
--------
The Makefile assumes GNU Make, so run 'make' or 'gmake' as appropriate
on your system.

By default it builds an optimized 32bit libbloom. See Makefile comments
for other build options.

The shared library will be in ./build/libbloom.so


Sample Usage
------------

#include "bloom.h"

struct bloom bloom;
bloom_init(&bloom, 1000000, 0.01);
bloom_add(&bloom, buffer, buflen);

if (bloom_check(&bloom, buffer, buflen)) {
printf("It may be there!\n");
}


Documentation
-------------
Read bloom.h for more detailed documentation on the public interfaces.


License
-------
This code (except MurmurHash2) is under BSD license. See LICENSE file.

See murmur2/README for info on MurmurHash2.
122 changes: 122 additions & 0 deletions bloom.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/*
* Copyright (c) 2012, Jyri J. Virkki
* All rights reserved.
*
* This file is under BSD license. See LICENSE file.
*/

/*
* Refer to bloom.h for documentation on the public interfaces.
*/

#include <fcntl.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "bloom.h"
#include "murmurhash2.h"


/*
 * Shared worker behind bloom_check() and bloom_add().
 *
 * Probes bloom->hashes bit positions derived from two murmurhash2 values of
 * 'buffer'. When 'add' is non-zero, every probed bit that is still clear is
 * set along the way.
 *
 * Returns 1 if every probed bit was already set, 0 if at least one was
 * clear, and -1 (after printing a message to stdout) if the filter has not
 * been initialized.
 */
static int bloom_check_add(struct bloom * bloom,
                           const void * buffer, int len, int add)
{
  if (!bloom->ready) {
    (void)printf("bloom at %p not initialized!\n", (void *)bloom);
    return -1;
  }

  // Two base hashes, the second seeded with the first. Probe k uses
  // h1 + k*h2 instead of evaluating a separate hash function per probe.
  const unsigned int h1 = murmurhash2(buffer, len, 0x9747b28c);
  const unsigned int h2 = murmurhash2(buffer, len, h1);

  int found = 0;
  unsigned int k = 0;

  while (k < bloom->hashes) {
    const unsigned int bit = (h1 + k*h2) % bloom->bits;
    unsigned char * cell = &bloom->bf[bit >> 3];   // expensive memory access
    const unsigned int mask = 1 << (bit % 8);

    if (*cell & mask) {
      found++;
    } else if (add) {
      *cell = *cell | mask;
    }

    k++;
  }

  // 1 == element already in (or collision)
  return (found == bloom->hashes) ? 1 : 0;
}


/*
 * Prepare 'bloom' for an expected 'entries' insertions at false-positive
 * probability 'error'. Refer to bloom.h for the public contract.
 *
 * Sizing:
 *   bits per element (bpe) = -ln(error) / ln(2)^2
 *   bits                   = entries * bpe      (truncated to int)
 *   bytes                  = bits / 8, rounded up
 *   hashes                 = ceil(ln(2) * bpe)
 *
 * Returns 0 on success and 1 on failure. Fails when:
 *   - entries < 1
 *   - error is not strictly between 0 and 1 (this also rejects NaN)
 *   - the computed bit count is below 1 or does not fit in an int
 *   - the bit field cannot be allocated
 * bloom->ready is left at 0 on every failure path.
 */
int bloom_init(struct bloom * bloom, int entries, double error)
{
  bloom->ready = 0;

  // Written as a negated range test so that NaN is rejected too. An error
  // of 1 or more would yield a zero or negative bits-per-element value.
  if (entries < 1 || !(error > 0.0 && error < 1.0)) {
    return 1;
  }

  bloom->entries = entries;
  bloom->error = error;

  double num = log(bloom->error);
  double denom = 0.480453013918201; // ln(2)^2
  bloom->bpe = -(num / denom);

  // Range-check in floating point before converting: casting an
  // out-of-range double to int is undefined, and a filter with zero bits
  // would make the "% bloom->bits" in bloom_check_add() divide by zero.
  double dbits = (double)entries * bloom->bpe;
  if (!(dbits >= 1.0 && dbits < (double)INT_MAX)) {
    return 1;
  }
  bloom->bits = (int)dbits;

  bloom->bytes = bloom->bits / 8;
  if (bloom->bits % 8) {
    bloom->bytes++;
  }

  bloom->hashes = (int)ceil(0.693147180559945 * bloom->bpe); // ln(2)

  // calloc so that every bit starts out clear.
  bloom->bf = calloc((size_t)bloom->bytes, sizeof(unsigned char));
  if (bloom->bf == NULL) {
    return 1;
  }

  bloom->ready = 1;
  return 0;
}


/*
 * Membership test: runs the shared probe routine in read-only mode, so the
 * bit field is never modified. Result is whatever bloom_check_add() returns.
 */
int bloom_check(struct bloom * bloom, const void * buffer, int len)
{
  const int add = 0;   // look up only

  return bloom_check_add(bloom, buffer, len, add);
}


/*
 * Insert: runs the shared probe routine with adding enabled, so any probed
 * bit that is still clear gets set. Result is whatever bloom_check_add()
 * returns.
 */
int bloom_add(struct bloom * bloom, const void * buffer, int len)
{
  const int add = 1;   // set missing bits

  return bloom_check_add(bloom, buffer, len, add);
}


void bloom_print(struct bloom * bloom)
{
(void)printf("bloom at %p\n", (void *)bloom);
(void)printf(" ->entries = %d\n", bloom->entries);
(void)printf(" ->error = %f\n", bloom->error);
(void)printf(" ->bits = %d\n", bloom->bits);
(void)printf(" ->bits per elem = %f\n", bloom->bpe);
(void)printf(" ->bytes = %d\n", bloom->bytes);
(void)printf(" ->hash functions = %d\n", bloom->hashes);
}
116 changes: 116 additions & 0 deletions bloom.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Copyright (c) 2012, Jyri J. Virkki
* All rights reserved.
*
* This file is under BSD license. See LICENSE file.
*/

#ifndef _BLOOM_H
#define _BLOOM_H

/** ***************************************************************************
 * Structure to keep track of one bloom filter. The caller allocates the
 * struct itself and passes a pointer to it to the functions below. The first
 * call for every struct must be to bloom_init(), which fills in the fields
 * and allocates the bit field.
 *
 */
struct bloom
{
  // These fields are part of the public interface of this structure.
  // Client code may read these values if desired. Client code MUST NOT
  // modify any of these.

  // Expected number of entries, as passed to bloom_init().
  int entries;
  // Requested collision probability, as passed to bloom_init().
  double error;
  // Size of the bit field in bits (entries * bits per element, truncated).
  int bits;
  // Size of the bit field in bytes (bits / 8, rounded up).
  int bytes;
  // Number of bit positions probed for each element.
  int hashes;

  // Fields below are private to the implementation. These may go away or
  // change incompatibly at any moment. Client code MUST NOT access or rely
  // on these.

  // Bits per element: -ln(error) / ln(2)^2.
  double bpe;
  // The bit field itself; 'bytes' bytes, allocated by bloom_init().
  unsigned char * bf;
  // 0 until bloom_init() has completed successfully, then 1.
  int ready;
};


/** ***************************************************************************
 * Initialize the bloom filter for use.
 *
 * The filter is initialized with a bit field and number of hash functions
 * according to the computations from the Wikipedia entry:
 *   http://en.wikipedia.org/wiki/Bloom_filter
 *
 * Optimal number of bits is:
 *     bits = -(entries * ln(error)) / ln(2)^2
 *
 * Optimal number of hash functions is:
 *     hashes = bpe * ln(2)      (bpe = bits per element; result rounded up)
 *
 * Parameters:
 * -----------
 *     bloom   - Pointer to an allocated struct bloom (see above).
 *     entries - The expected number of entries which will be inserted.
 *     error   - Probability of collision (as long as entries are not
 *               exceeded).
 *
 * Return:
 * -------
 *     0 - on success
 *     1 - on failure (invalid parameters, or the bit field could not be
 *         allocated); the filter must not be used in that case
 *
 */
int bloom_init(struct bloom * bloom, int entries, double error);


/** ***************************************************************************
 * Check if the given element is in the bloom filter. Remember this may
 * return a false positive if a collision occurred. The filter is not
 * modified by this call.
 *
 * Parameters:
 * -----------
 *     bloom  - Pointer to an allocated struct bloom (see above).
 *     buffer - Pointer to buffer containing element to check.
 *     len    - Size of 'buffer'.
 *
 * Return:
 * -------
 *     0 - element is not present
 *     1 - element is present (or false positive due to collision)
 *    -1 - bloom not initialized (a message is also printed to stdout)
 *
 */
int bloom_check(struct bloom * bloom, const void * buffer, int len);


/** ***************************************************************************
 * Add the given element to the bloom filter.
 * The return code indicates if the element (or a collision) was already in,
 * so for the common check+add use case, there is no need to call
 * bloom_check() separately.
 *
 * Parameters:
 * -----------
 *     bloom  - Pointer to an allocated struct bloom (see above).
 *     buffer - Pointer to buffer containing element to add.
 *     len    - Size of 'buffer'.
 *
 * Return:
 * -------
 *     0 - element was not present and was added
 *     1 - element (or a collision) had already been added previously
 *    -1 - bloom not initialized (a message is also printed to stdout)
 *
 */
int bloom_add(struct bloom * bloom, const void * buffer, int len);


/** ***************************************************************************
 * Print (to stdout) info about this bloom filter: its address, entries,
 * error, bits, bits per element, bytes and number of hash functions.
 * Debugging aid.
 *
 */
void bloom_print(struct bloom * bloom);

#endif
Loading

0 comments on commit e2edda2

Please sign in to comment.