diff --git a/docs/index.rst b/docs/index.rst index 175778f..7cefce4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,30 +13,30 @@ is a member of a set. The `wikipedia page >> from pybloomfilter import BloomFilter - with open("/usr/share/dict/words") as f: - for word in f: - bf.add(word.rstrip()) + >>> bf = BloomFilter(10000000, 0.01, 'filter.bloom') + >>> with open("/usr/share/dict/words") as f: + ... for word in f: + ... bf.add(word.rstrip()) - print 'apple' in bf - #outputs True + >>> print('apple' in bf) + True That wasn't so hard, was it? Now, there are a lot of other things we can do. For instance, let's say we want to create a similar -filter with just a few pieces of fruit:: +filter with just a few pieces of fruit: .. code:: python - fruitbf = bf.copy_template("fruit.bloom") - fruitbf.update(("apple", "banana", "orange", "pear")) - print fruitbf.to_base64() + >>> fruitbf = bf.copy_template("fruit.bloom") + >>> fruitbf.update(("apple", "banana", "orange", "pear")) + + >>> print(fruitbf.to_base64()) "eJzt2k13ojAUBuA9f8WFyofF5TWChlTHaPzqrlqFCtj6gQi/frqZM2N7aq3Gis59d2ye85KTRbhk" "0lyu1NRmsQrgRda0I+wZCfXIaxuWv+jqDxA8vdaf21HIOSn1u6LRE0VL9Z/qghfbBmxZoHsqM3k8" "N5XyPAxH2p22TJJoqwU9Q0y0dNDYrOHBIa3BwuznapG+KZZq69JUG0zu1tqI5weJKdpGq7PNJ6tB" @@ -76,7 +76,7 @@ Install Please have `Cython` installed. Please note that this version is for Python 3. In case you are using Python 2, please see https://github.com/axiak/pybloomfiltermmap. -To install: +To install:: $ pip install cython $ pip install pybloomfiltermmap3 diff --git a/docs/ref.rst b/docs/ref.rst index efd51dd..196bbc7 100644 --- a/docs/ref.rst +++ b/docs/ref.rst @@ -10,12 +10,12 @@ BloomFilter Class Reference .. moduleauthor:: Michael Axiak -.. class:: BloomFilter(capacity : int, error_rate : float, [filename=None : string], [perm=0755]) +.. 
class:: BloomFilter(capacity: int, error_rate: float, [filename = None: string], [mode = "rw+"], [perm=0755]) Create a new BloomFilter object with a given capacity and error_rate. **Note that we do not check capacity.** This is important, because - I want to be able to support logical OR and AND (see below). - The capacity and error_rate then together serve as a contract---you add + we want to be able to support logical OR and AND (see below). + The capacity and error_rate then together serve as a contract --- you add less than capacity items, and the Bloom Filter will have an error rate less than error_rate. @@ -24,7 +24,7 @@ Class Methods .. classmethod:: BloomFilter.open(filename) - Return a BloomFilter object using an already-existing Bloomfilter file. + Return a BloomFilter object using an already existing BloomFilter file. .. classmethod:: BloomFilter.from_base64(filename, string, [perm=0755]) @@ -35,11 +35,11 @@ Class Methods Example:: >>> bf = BloomFilter.from_base64("/tmp/mike.bf", - "eJwFwcuWgiAAANC9v+JCx7By0QKt0GHEbKSknflAQ9QmTyRfP/fW5E9XTRSX" - "qcLlqGNXphAqcfVH\nRoNv0n4JlTpIvAP0e1+RyXX6I637ggA+VPZnTYR1A4" - "Um5s9geYaZZLiT208JIiG3iwhf3Fwlzb3Y\n5NRL4uNQS6/d9OvTDJbnZMnR" - "zcrplOX5kmsVIkQziM+vw4hCDQ3OkN9m3WVfPWzGfaTeRftMCLws\nPnzEzs" - "gjAW60xZTBbj/bOAgYbK50PqjdzvgHZ6FHZw==\n") + "eJwFwcuWgiAAANC9v+JCx7By0QKt0GHEbKSknflAQ9QmTyRfP/fW5E9XTRSX" + "qcLlqGNXphAqcfVH\nRoNv0n4JlTpIvAP0e1+RyXX6I637ggA+VPZnTYR1A4" + "Um5s9geYaZZLiT208JIiG3iwhf3Fwlzb3Y\n5NRL4uNQS6/d9OvTDJbnZMnR" + "zcrplOX5kmsVIkQziM+vw4hCDQ3OkN9m3WVfPWzGfaTeRftMCLws\nPnzEzs" + "gjAW60xZTBbj/bOAgYbK50PqjdzvgHZ6FHZw==\n") >>> "MIKE" in bf True @@ -60,15 +60,20 @@ Instance Attributes .. attribute:: BloomFilter.name - The file name (compatible with file objects) + The file name (compatible with file objects). .. attribute:: BloomFilter.num_bits - The number of bits used in the filter as buckets + The number of bits used in the filter as buckets. .. 
attribute:: BloomFilter.num_hashes - The number of hash functions used when computing + The number of hash functions used when computing. + +.. attribute:: BloomFilter.read_only + + Boolean, indicating if the opened BloomFilter is read-only. + Always ``False`` for an in-memory BloomFilter. Instance Methods @@ -78,8 +83,8 @@ Instance Methods Add the item to the bloom filter. - :param item: Hashable object - :rtype: Boolean (True if item already in the filter) + :param item: hashable object + :rtype: boolean (``True`` if item already in the filter) .. method:: BloomFilter.clear_all() @@ -121,7 +126,7 @@ Instance Methods this may not be too useful. I find it useful for debugging so I can copy filters from one terminal to another in their entirety. - :rtype: Base64 encoded string representing filter + :rtype: base64 encoded string representing filter .. method:: BloomFilter.update(iterable) @@ -136,7 +141,7 @@ Instance Methods The result will occur **in place**. That is, calling:: - bf.union(bf2) + bf.union(bf2) is a way to add all the elements of bf2 to bf. @@ -147,7 +152,7 @@ Instance Methods The same as union() above except it uses a set AND instead of a set OR. - + *N.B.: Calling this function will render future calls to len() invalid.* @@ -182,11 +187,11 @@ Magic Methods .. method:: BloomFilter.__ior__(filter) -> BloomFilter - See union(filter) + See :meth:`BloomFilter.union`. .. method:: BloomFilter.__iand__(filter) -> BloomFilter - See intersection(filter) + See :meth:`BloomFilter.intersection`. Exceptions -------------- @@ -195,4 +200,3 @@ Exceptions The exception that is raised if len() is called on a BloomFilter object after |=, &=, intersection(), or union() is used. 
- diff --git a/src/mmapbitarray.c b/src/mmapbitarray.c index 4c1fb7a..43768d2 100644 --- a/src/mmapbitarray.c +++ b/src/mmapbitarray.c @@ -66,6 +66,7 @@ MBArray * mbarray_Create_Mmap(BTYPE num_bits, const char * file, const char * he MBArray * array = (MBArray *)malloc(sizeof(MBArray)); uint64_t filesize; int32_t fheaderlen; + int mmap_flags = PROT_READ; if (!array || errno) { return NULL; @@ -148,9 +149,11 @@ MBArray * mbarray_Create_Mmap(BTYPE num_bits, const char * file, const char * he } errno = 0; + // Add PROT_WRITE if we have write permissions + mmap_flags |= (oflag & O_RDWR) ? PROT_WRITE : 0; array->vector = (DTYPE *)mmap(NULL, _mmap_size(array), - PROT_READ | PROT_WRITE, + mmap_flags, MAP_SHARED, array->fd, 0); diff --git a/src/pybloomfilter.pyx b/src/pybloomfilter.pyx index 4203409..f25e09d 100644 --- a/src/pybloomfilter.pyx +++ b/src/pybloomfilter.pyx @@ -21,8 +21,20 @@ import base64 cdef extern int errno +cdef NoConstruct = object() +cdef ReadFile = object() + + +cdef _construct_access(mode): + result = os.F_OK + if 'w' in mode: + result |= os.W_OK + if 'r' in mode: + result |= os.R_OK + return result -cdef construct_mode(mode): + +cdef _construct_mode(mode): result = os.O_RDONLY if 'w' in mode: result |= os.O_RDWR @@ -32,11 +44,11 @@ cdef construct_mode(mode): result |= os.O_CREAT return result -cdef NoConstruct = object() class IndeterminateCountError(ValueError): pass + cdef class BloomFilter: """ The BloomFilter class implements a bloom filter that uses mmap'd files. 
@@ -45,37 +57,44 @@ cdef class BloomFilter: cdef cbloomfilter.BloomFilter * _bf cdef int _closed cdef int _in_memory + cdef int _oflags cdef public ReadFile - def __cinit__(self, capacity, error_rate, filename=None, perm=0755, hash_seeds=None): + def __cinit__(self, capacity, error_rate, filename=None, mode="rw+", perm=0755, hash_seeds=None): cdef char * seeds cdef long long num_bits + cdef int _capacity + self._closed = 0 self._in_memory = 0 + self._oflags = os.O_RDWR self.ReadFile = self.__class__.ReadFile - mode = "rw+" + if filename is NoConstruct: return if capacity is self.ReadFile: - mode = "rw" - capacity = 0 + # Create should not be allowed in read mode + mode = mode.replace("+", "") + _capacity = 0 + if not os.path.exists(filename): raise OSError("File %s not found" % filename) - if not os.access(filename, os.O_RDWR): + if not os.access(filename, _construct_access(mode)): raise OSError("Insufficient permissions for file %s" % filename) + else: + _capacity = capacity - mode = construct_mode(mode) - + self._oflags = _construct_mode(mode) - if not mode & os.O_CREAT: + if not self._oflags & os.O_CREAT: if os.path.exists(filename): - self._bf = cbloomfilter.bloomfilter_Create_Mmap(capacity, + self._bf = cbloomfilter.bloomfilter_Create_Mmap(_capacity, error_rate, filename.encode(), 0, - mode, + self._oflags, perm, NULL, 0) if self._bf is NULL: @@ -134,17 +153,17 @@ cdef class BloomFilter: # If a filename is provided, we should make a mmap-file # backed bloom filter. 
Otherwise, it will be malloc if filename: - self._bf = cbloomfilter.bloomfilter_Create_Mmap(capacity, + self._bf = cbloomfilter.bloomfilter_Create_Mmap(_capacity, error_rate, filename.encode(), num_bits, - mode, + self._oflags, perm, seeds, num_hashes) else: self._in_memory = 1 - self._bf = cbloomfilter.bloomfilter_Create_Malloc(capacity, + self._bf = cbloomfilter.bloomfilter_Create_Malloc(_capacity, error_rate, num_bits, seeds, @@ -197,8 +216,15 @@ cdef class BloomFilter: 'in-memory %s' % self.__class__.__name__) + if self._bf.array.filename is NULL: + return None return self._bf.array.filename + property read_only: + def __get__(self): + self._assert_open() + return not self._in_memory and not self._oflags & os.O_RDWR + def fileno(self): self._assert_open() return self._bf.array.fd @@ -215,10 +241,12 @@ cdef class BloomFilter: def sync(self): self._assert_open() + self._assert_writable() cbloomfilter.mbarray_Sync(self._bf.array) def clear_all(self): self._assert_open() + self._assert_writable() cbloomfilter.mbarray_ClearAll(self._bf.array) def __contains__(self, item_): @@ -252,6 +280,7 @@ cdef class BloomFilter: def add(self, item_): self._assert_open() + self._assert_writable() cdef cbloomfilter.Key key if isinstance(item_, str): item = item_.encode() @@ -268,7 +297,6 @@ cdef class BloomFilter: return bool(result) def update(self, iterable): - self._assert_open() for item in iterable: self.add(item) @@ -286,41 +314,38 @@ cdef class BloomFilter: cbloomfilter.bloomfilter_Destroy(self._bf) self._bf = NULL - def __ior__(self, BloomFilter other): - self._assert_open() - self._assert_comparable(other) - cbloomfilter.mbarray_Or(self._bf.array, other._bf.array) - self._bf.count_correct = 0 - return self - def union(self, BloomFilter other): self._assert_open() + self._assert_writable() other._assert_open() self._assert_comparable(other) cbloomfilter.mbarray_Or(self._bf.array, other._bf.array) self._bf.count_correct = 0 return self - def __iand__(self, BloomFilter 
other): - self._assert_open() - other._assert_open() - self._assert_comparable(other) - cbloomfilter.mbarray_And(self._bf.array, other._bf.array) - self._bf.count_correct = 0 - return self + def __ior__(self, BloomFilter other): + return self.union(other) def intersection(self, BloomFilter other): self._assert_open() + self._assert_writable() other._assert_open() self._assert_comparable(other) cbloomfilter.mbarray_And(self._bf.array, other._bf.array) self._bf.count_correct = 0 return self + def __iand__(self, BloomFilter other): + return self.intersection(other) + def _assert_open(self): if self._closed != 0: raise ValueError("I/O operation on closed file") + def _assert_writable(self): + if self.read_only: + raise ValueError("Write operation on read-only file") + def _assert_comparable(self, BloomFilter other): error = ValueError("The two %s objects are not the same type (hint, " "use copy_template)" % self.__class__.__name__) @@ -339,13 +364,13 @@ cdef class BloomFilter: return result @classmethod - def from_base64(cls, filename, string, perm=0755): - bfile_fp = os.open(filename, construct_mode('w+'), perm) + def from_base64(cls, filename, string, mode="rw+", perm=0755): + bfile_fp = os.open(filename, _construct_mode('w+'), perm) os.write(bfile_fp, zlib.decompress(base64.b64decode(zlib.decompress( base64.b64decode(string))))) os.close(bfile_fp) - return cls.open(filename) + return cls.open(filename, mode) @classmethod - def open(cls, filename): - return cls(cls.ReadFile, 0.1, filename, 0) + def open(cls, filename, mode="rw+"): + return cls(cls.ReadFile, 0.1, filename=filename, mode=mode, perm=0) diff --git a/tests/simpletest.py b/tests/simpletest.py index b9692a5..887c3ef 100755 --- a/tests/simpletest.py +++ b/tests/simpletest.py @@ -128,6 +128,43 @@ def test_open(self): bf = pybloomfilter.BloomFilter.open(self.bf.name.decode()) self._check_filter_contents(bf) + @with_test_file + def test_readonly(self, filename): + bf = 
pybloomfilter.BloomFilter(self.FILTER_SIZE, + self.FILTER_ERROR_RATE, + filename) + self._populate_filter(bf) + self._check_filter_contents(bf) + self.assertEqual(bf.read_only, False) + bf.sync() + + bfro = pybloomfilter.BloomFilter.open(filename, mode="r") + self._check_filter_contents(bfro) + self.assertEqual(bfro.read_only, True) + + def test_readonly_cannot_write(self): + bfro = pybloomfilter.BloomFilter.open(self.tempfile.name, mode="r") + self.assertRaises(ValueError, bfro.add, "test") + self.assertRaises(ValueError, bfro.update, ["test"]) + self.assertRaises(ValueError, bfro.sync) + self.assertRaises(ValueError, bfro.clear_all) + + def test_readonly_cannot_do_set_operations(self): + bf_mem = pybloomfilter.BloomFilter(self.FILTER_SIZE, + self.FILTER_ERROR_RATE) + + bfro = pybloomfilter.BloomFilter.open(self.tempfile.name, mode="r") + self.assertRaises(ValueError, bfro.union, bf_mem) + + bfro = pybloomfilter.BloomFilter.open(self.tempfile.name, mode="r") + self.assertRaises(ValueError, bfro.intersection, bf_mem) + + bfro = pybloomfilter.BloomFilter.open(self.tempfile.name, mode="r") + self.assertRaises(ValueError, bfro.__ior__, bf_mem) + + bfro = pybloomfilter.BloomFilter.open(self.tempfile.name, mode="r") + self.assertRaises(ValueError, bfro.__iand__, bf_mem) + @with_test_file def test_copy(self, filename): self._populate_filter(self.bf) @@ -190,7 +227,6 @@ def test_others_nofile(self): bf.add(elem) self.assertEqual(elem in bf, True) - #@unittest.skip("unfortunately large files cannot be tested on Travis") @with_test_file def _test_large_file(self, filename): bf = pybloomfilter.BloomFilter(400000000, 0.01, filename)