Merge pull request #25 from borgbackup/package
move code to a Package
ThomasWaldmann authored Oct 31, 2024
2 parents fb09377 + 9a27697 commit efab864
Showing 14 changed files with 321 additions and 279 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/ci.yml
@@ -54,6 +54,9 @@ jobs:
python -m pip install --upgrade pip setuptools
pip install -r requirements.d/dev.txt
- name: Install borghash
run: pip install -ve .
run: |
python setup.py build_ext --inplace
python -m build
pip install -v dist/borghash*.tar.gz
- name: run tox env
run: tox --skip-missing-interpreters
7 changes: 4 additions & 3 deletions .gitignore
@@ -1,9 +1,10 @@
.idea
.pytest_cache
.tox
build
dist
__pycache__
src/borghash.egg-info
src/borghash/_version.py
src/borghash/borghash.cpp
src/*.so
**/*.so
**/*.c
**/*.egg-info
9 changes: 9 additions & 0 deletions README.rst
@@ -129,6 +129,15 @@ Results on an Apple MacBook Pro (M3 Pro CPU) are like:
HashTableNT serialization (count=50000): write: 0.020s, read: 0.021s.


Building / Installing
---------------------
::

python setup.py build_ext --inplace
python -m build
pip install dist/borghash*.tar.gz


State of this project
---------------------

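A quick way to confirm that the install produced by the steps above works is to import the compiled extension modules; a minimal sketch, assuming the dotted module paths match the Extension names declared in setup.py further down:

    # Post-install sanity check: both extension modules should import cleanly.
    from borghash.HashTable import HashTable
    from borghash.HashTableNT import HashTableNT

    print("borghash extensions available:", HashTable, HashTableNT)
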
13 changes: 2 additions & 11 deletions pyproject.toml
@@ -30,19 +30,10 @@ dependencies = []
"Changelog" = "https://github.com/borgbackup/borghash/blob/master/changes.rst"

[project.scripts]
borghash-demo = "borghash:demo"

[tool.setuptools]
# See also the MANIFEST.in file.
# We want to install all the files in the package directories...
include-package-data = true

[tool.setuptools.exclude-package-data]
# ...except the source files which have been compiled (C extensions):
"*" = ["*.c", "*.h", "*.pyx"]
borghash-demo = "borghash.__main__:demo"

[build-system]
requires = ["setuptools", "wheel", "Cython>=3.0.3", "setuptools_scm[toml]>=6.2"]
requires = ["setuptools", "wheel", "setuptools_scm[toml]>=6.2"]
build-backend = "setuptools.build_meta"

[tool.setuptools_scm]
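The console script now resolves to borghash.__main__:demo instead of the old top-level borghash:demo, so the demo can also be invoked programmatically; a minimal equivalent of running borghash-demo:

    # Programmatic equivalent of the `borghash-demo` console script defined above.
    from borghash.__main__ import demo

    demo()  # prints the demo source and its timing results
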
1 change: 1 addition & 0 deletions requirements.d/dev.txt
@@ -3,3 +3,4 @@ pytest
pytest-benchmark
build
twine
Cython
22 changes: 18 additions & 4 deletions setup.py
@@ -1,7 +1,21 @@
from setuptools import setup
from Cython.Build import cythonize
from setuptools import Extension, setup

try:
from Cython.Build import cythonize
except ImportError:
cythonize = None # we don't have cython installed

ext = '.pyx' if cythonize else '.c'

extensions = [
Extension("borghash.HashTable", ["src/borghash/HashTable" + ext]),
Extension("borghash.HashTableNT", ["src/borghash/HashTableNT" + ext]),
]

if cythonize:
extensions = cythonize(extensions, language_level="3str")

setup(
package_data=dict(borghash=["borghash.pxd"]),
ext_modules=cythonize("borghash.pyx")
package_data={"borghash": ["*.pxd", "*.pyx"]},
ext_modules=extensions,
)
11 changes: 2 additions & 9 deletions borghash.pxd → src/borghash/HashTable.pxd
@@ -2,7 +2,8 @@ from libc.stdint cimport uint8_t, uint32_t

cdef class HashTable:
cdef int ksize, vsize
cdef int initial_capacity, capacity, used, tombstones
cdef readonly int capacity, used
cdef int initial_capacity, tombstones
cdef float max_load_factor, min_load_factor, shrink_factor, grow_factor
cdef uint32_t* table
cdef int kv_capacity, kv_used
@@ -16,11 +17,3 @@ cdef class HashTable:
cdef int _lookup_index(self, uint8_t* key_ptr, int* index_ptr)
cdef void _resize_table(self, int new_capacity)
cdef void _resize_kv(self, int new_capacity)


cdef class HashTableNT:
cdef int key_size
cdef object value_type
cdef object value_struct
cdef int value_size
cdef HashTable inner
250 changes: 0 additions & 250 deletions borghash.pyx → src/borghash/HashTable.pyx
@@ -1,12 +1,8 @@
"""
borghash - hashtable implementations in cython.
HashTable: low-level ht mapping fully random bytes keys to bytes values.
key and value length can be chosen, but is fixed afterwards.
the keys and values are stored in arrays separate from the hashtable.
the hashtable only stores the 32bit indexes into the key/value arrays.
HashTableNT: wrapper around HashTable, providing namedtuple values and serialization.
"""
from __future__ import annotations
from typing import BinaryIO, Iterator, Any
@@ -15,10 +11,7 @@ from libc.stdlib cimport malloc, free, realloc
from libc.string cimport memcpy, memset, memcmp
from libc.stdint cimport uint8_t, uint32_t

from collections import namedtuple
from collections.abc import Mapping
import json
import struct

MAGIC = b"BORGHASH"
assert len(MAGIC) == 8
@@ -335,246 +328,3 @@ cdef class HashTable:
"resize_table": self.stats_resize_table,
"resize_kv": self.stats_resize_kv,
}


cdef class HashTableNT:
def __init__(self, items=None, *,
key_size: int = 0, value_format: str = "", value_type: Any = None,
capacity: int = MIN_CAPACITY) -> None:
if not key_size:
raise ValueError("key_size must be specified and must be > 0.")
if not value_format:
raise ValueError("value_format must be specified and must be non-empty.")
if value_type is None:
raise ValueError("value_type must be specified (a namedtuple type corresponding to value_format).")
self.key_size = key_size
self.value_struct = struct.Struct(value_format)
self.value_size = self.value_struct.size
self.value_type = value_type
self.inner = HashTable(key_size=self.key_size, value_size=self.value_size, capacity=capacity)
_fill(self, items)

def clear(self) -> None:
self.inner.clear()

def _check_key(self, key: bytes) -> None:
if not isinstance(key, bytes):
raise TypeError(f"Expected an instance of bytes, got {type(key)}")
if len(key) != self.key_size:
raise ValueError(f"Key must be {self.key_size} bytes long")

def _to_binary_value(self, value: Any) -> bytes:
if not isinstance(value, self.value_type):
if isinstance(value, tuple):
value = self.value_type(*value)
else:
raise TypeError(f"Expected an instance of {self.value_type}, got {type(value)}")
return self.value_struct.pack(*value)

def _to_namedtuple_value(self, binary_value: bytes) -> Any:
unpacked_data = self.value_struct.unpack(binary_value)
return self.value_type(*unpacked_data)

def _set_raw(self, key: bytes, value: bytes) -> None:
self.inner[key] = value

def _get_raw(self, key: bytes) -> bytes:
return self.inner[key]

def __setitem__(self, key: bytes, value: Any) -> None:
self._check_key(key)
self.inner[key] = self._to_binary_value(value)

def __getitem__(self, key: bytes) -> Any:
self._check_key(key)
binary_value = self.inner[key]
return self._to_namedtuple_value(binary_value)

def __delitem__(self, key: bytes) -> None:
self._check_key(key)
del self.inner[key]

def __contains__(self, key: bytes) -> bool:
self._check_key(key)
return key in self.inner

def items(self) -> Iterator[tuple[bytes, Any]]:
for key, binary_value in self.inner.items():
yield (key, self._to_namedtuple_value(binary_value))

def __len__(self) -> int:
return len(self.inner)

def get(self, key: bytes, default: Any = None) -> Any:
self._check_key(key)
try:
binary_value = self.inner[key]
except KeyError:
return default
else:
return self._to_namedtuple_value(binary_value)

def setdefault(self, key: bytes, default: Any) -> Any:
self._check_key(key)
binary_default = self._to_binary_value(default)
binary_value = self.inner.setdefault(key, binary_default)
return self._to_namedtuple_value(binary_value)

def pop(self, key: bytes, default: Any = _NoDefault) -> Any:
self._check_key(key)
try:
binary_value = self.inner.pop(key)
except KeyError:
if default is _NoDefault:
raise
return default
else:
return self._to_namedtuple_value(binary_value)

def k_to_idx(self, key: bytes) -> int:
return self.inner.k_to_idx(key)

def idx_to_k(self, idx: int) -> bytes:
return self.inner.idx_to_k(idx)

def kv_to_idx(self, key: bytes, value: Any) -> int:
binary_value = self._to_binary_value(value)
return self.inner.kv_to_idx(key, binary_value)

def idx_to_kv(self, idx: int) -> tuple[bytes, Any]:
key, binary_value = self.inner.idx_to_kv(idx)
return key, self._to_namedtuple_value(binary_value)

@property
def stats(self) -> dict[str, int]:
return self.inner.stats

def write(self, file: BinaryIO|str|bytes):
if isinstance(file, (str, bytes)):
with open(file, 'wb') as fd:
self._write_fd(fd)
else:
self._write_fd(file)

def _write_fd(self, fd: BinaryIO):
meta = {
'key_size': self.key_size,
'value_size': self.value_size,
'value_format': self.value_struct.format,
'value_type_name': self.value_type.__name__,
'value_type_fields': self.value_type._fields,
'capacity': self.inner.capacity,
'used': self.inner.used, # count of keys / values
}
meta_bytes = json.dumps(meta).encode("utf-8")
meta_size = len(meta_bytes)
header_bytes = struct.pack(HEADER_FMT, MAGIC, VERSION, meta_size)
fd.write(header_bytes)
fd.write(meta_bytes)
count = 0
for key, value in self.inner.items():
fd.write(key)
fd.write(value)
count += 1
assert count == self.inner.used

@classmethod
def read(cls, file: BinaryIO|str|bytes):
if isinstance(file, (str, bytes)):
with open(file, 'rb') as fd:
return cls._read_fd(fd)
else:
return cls._read_fd(file)

@classmethod
def _read_fd(cls, fd: BinaryIO):
header_size = struct.calcsize(HEADER_FMT)
header_bytes = fd.read(header_size)
if len(header_bytes) < header_size:
raise ValueError(f"Invalid file, file is too short.")
magic, version, meta_size = struct.unpack(HEADER_FMT, header_bytes)
if magic != MAGIC:
raise ValueError(f"Invalid file, magic {MAGIC.decode()} not found.")
if version != VERSION:
raise ValueError(f"Unsupported file version {version}.")
meta_bytes = fd.read(meta_size)
if len(meta_bytes) < meta_size:
raise ValueError(f"Invalid file, file is too short.")
meta = json.loads(meta_bytes.decode("utf-8"))
value_type = namedtuple(meta['value_type_name'], meta['value_type_fields'])
ht = cls(key_size=meta['key_size'], value_format=meta['value_format'], value_type=value_type, capacity=meta['capacity'])
count = 0
ksize, vsize = meta['key_size'], meta['value_size']
for i in range(meta['used']):
key = fd.read(ksize)
value = fd.read(vsize)
ht._set_raw(key, value)
return ht

def size(self) -> int:
"""
do a rough worst-case estimate of the on-disk size when using .write().

the serialized size of the metadata is a bit hard to predict, but we cover that with one_time_overheads.
"""
one_time_overheads = 4096 # very rough
N = self.inner.used
return int(N * (self.key_size + self.value_size) + one_time_overheads)


def demo():
print("BorgHash demo")
print("=============")
print("Code:")
code = """
from tempfile import NamedTemporaryFile
from time import time
count = 50000
value_type = namedtuple("Chunk", ["refcount", "size"])
# 256bit (32Byte) key, 2x 32bit (4Byte) values
ht = HashTableNT(key_size=32, value_format="<II", value_type=value_type)
t0 = time()
for i in range(count):
# make up a 256bit key from i, first 32bits need to be well distributed.
key = f"{i:4x}{' '*28}".encode()
value = value_type(refcount=i, size=i * 2)
ht[key] = value
assert len(ht) == count
t1 = time()
found = 0
for key, value in ht.items():
i = int(key.decode(), 16)
expected_value = value_type(refcount=i, size=i * 2)
assert ht[key] == expected_value
found += 1
assert found == count
t2 = time()
ht_written = ht
with NamedTemporaryFile(prefix="borghash-demo-ht-read", suffix=".tmp", delete=False) as tmpfile:
ht_written.write(tmpfile)
filename = tmpfile.name
assert len(ht_written) == count, f"{len(ht_written)} != {count}"
t3 = time()
ht_read = HashTableNT.read(filename)
assert len(ht_read) == count, f"{len(ht_read)} != {count}"
t4 = time()
for i in range(count):
# make up a 256bit key from i, first 32bits need to be well distributed.
key = f"{i:4x}{' '*28}".encode()
expected_value = value_type(refcount=i, size=i * 2)
assert ht_read.pop(key) == expected_value
assert len(ht_read) == 0
t5 = time()
print("Result:")
print(f"HashTableNT in-memory ops (count={count}): insert: {t1-t0:.3f}s, lookup: {t2-t1:.3f}s, pop: {t5-t4:.3f}s.")
print(f"HashTableNT serialization (count={count}): write: {t3-t2:.3f}s, read: {t4-t3:.3f}s.")
"""
print(code)
exec(code)
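
The size() method moved out of this file estimates the serialized size as used * (key_size + value_size) plus a fixed overhead; for the demo parameters (50000 entries, 32-byte keys, 8-byte "<II" values) that works out as follows:

    # Worked example of the size() estimate shown above.
    one_time_overheads = 4096                      # rough allowance for header + JSON metadata
    N, key_size, value_size = 50000, 32, 8         # demo parameters: count, 256-bit key, "<II" value
    print(N * (key_size + value_size) + one_time_overheads)  # 2004096 bytes, just under 2 MiB
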
6 changes: 6 additions & 0 deletions src/borghash/HashTableNT.pxd
@@ -0,0 +1,6 @@
cdef class HashTableNT:
cdef int key_size
cdef object value_type
cdef object value_struct
cdef int value_size
cdef object inner
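
Since HashTableNT now lives in its own extension module and only keeps its HashTable as a plain object attribute, typical usage under the new package layout looks roughly like the sketch below, assuming the moved code keeps the constructor and mapping API shown earlier in this diff:

    # Minimal HashTableNT usage under the new src/borghash package layout (a sketch).
    from collections import namedtuple
    from borghash.HashTableNT import HashTableNT

    Chunk = namedtuple("Chunk", ["refcount", "size"])
    ht = HashTableNT(key_size=32, value_format="<II", value_type=Chunk)

    key = f"{1:4x}{' ' * 28}".encode()   # 32-byte key, built like in the demo code
    ht[key] = Chunk(refcount=1, size=42)
    print(ht[key])                       # Chunk(refcount=1, size=42)
    print(len(ht))                       # 1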
