Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add frozen support to roaring64 #688

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 92 additions & 39 deletions include/roaring/art/art.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
* chunks _differ_. This means that if there are two entries with different
* high 48 bits, then there is only one inner node containing the common key
* prefix, and two leaves.
* * Intrusive leaves: the leaf struct is included in user values. This removes
* a layer of indirection.
* * Mostly pointer-free: nodes are referred to by index rather than pointer,
* so that the structure can be deserialized with a backing buffer.
*/

// Fixed length of keys in the ART. All keys are assumed to be of this length.
Expand All @@ -33,25 +33,42 @@ namespace internal {
#endif

typedef uint8_t art_key_chunk_t;
typedef struct art_node_s art_node_t;

// Internal node reference type. Contains the node typecode in the low 8 bits,
// and the index in the relevant node array in the high 48 bits. Has a value of
// CROARING_ART_NULL_REF when pointing to a non-existent node.
typedef uint64_t art_ref_t;

typedef struct art_leaf_s art_leaf_t;
typedef struct art_node4_s art_node4_t;
typedef struct art_node16_s art_node16_t;
typedef struct art_node48_s art_node48_t;
typedef struct art_node256_s art_node256_t;

/**
* Wrapper to allow an empty tree.
* The ART is empty when root is a null ref.
*
* Each node type has its own dynamic array of node structs, indexed by
* art_ref_t. The arrays are expanded as needed, and shrink only when
* `shrink_to_fit` is called.
*/
typedef struct art_s {
art_node_t *root;
art_ref_t root;

// Indexed by node typecode, thus 1 larger than it needs to be for
// convenience. `first_free` indicates the index where the first free node
// lives, which may be equal to the capacity.
uint64_t first_free[6];
uint64_t capacities[6];

art_leaf_t *leaves;
art_node4_t *node4s;
art_node16_t *node16s;
art_node48_t *node48s;
art_node256_t *node256s;
} art_t;

/**
* Values inserted into the tree have to be cast-able to art_val_t. This
* improves performance by reducing indirection.
*
* NOTE: Value pointers must be unique! This is because each value struct
* contains the key corresponding to the value.
*/
typedef struct art_val_s {
art_key_chunk_t key[ART_KEY_BYTES];
} art_val_t;
typedef uint64_t art_val_t;

/**
* Compares two keys, returns their relative order:
Expand All @@ -63,14 +80,21 @@ int art_compare_keys(const art_key_chunk_t key1[],
const art_key_chunk_t key2[]);

/**
* Inserts the given key and value.
* Initializes the ART.
*/
void art_insert(art_t *art, const art_key_chunk_t *key, art_val_t *val);
void art_init_cleared(art_t *art);

/**
* Returns the value erased, NULL if not found.
* Inserts the given key and value. Returns a pointer to the value inserted,
* valid as long as the ART is not modified.
*/
art_val_t *art_erase(art_t *art, const art_key_chunk_t *key);
art_val_t *art_insert(art_t *art, const art_key_chunk_t *key, art_val_t val);

/**
* Returns true if a value was erased. Sets `*erased_val` to the value erased,
* if any.
*/
bool art_erase(art_t *art, const art_key_chunk_t *key, art_val_t *erased_val);

/**
* Returns the value associated with the given key, NULL if not found.
Expand All @@ -83,42 +107,39 @@ art_val_t *art_find(const art_t *art, const art_key_chunk_t *key);
bool art_is_empty(const art_t *art);

/**
* Frees the nodes of the ART except the values, which the user is expected to
* free.
* Frees the contents of the ART. Should not be called on an ART created with
* `art_frozen_view`.
*/
void art_free(art_t *art);

/**
* Returns the size in bytes of the ART. Includes size of pointers to values,
* but not the values themselves.
*/
size_t art_size_in_bytes(const art_t *art);

/**
* Prints the ART using printf, useful for debugging.
*/
void art_printf(const art_t *art);

/**
* Callback for validating the value stored in a leaf.
* Callback for validating the value stored in a leaf. `context` is a
* user-provided value passed to the callback without modification.
*
* Should return true if the value is valid, false otherwise.
* If false is returned, `*reason` should be set to a static string describing
* the reason for the failure.
*/
typedef bool (*art_validate_cb_t)(const art_val_t *val, const char **reason);
typedef bool (*art_validate_cb_t)(const art_val_t val, const char **reason,
void *context);

/**
* Validate the ART tree, ensuring it is internally consistent.
* Validate the ART tree, ensuring it is internally consistent. `context` is a
* user-provided value passed to the callback without modification.
*/
bool art_internal_validate(const art_t *art, const char **reason,
art_validate_cb_t validate_cb);
art_validate_cb_t validate_cb, void *context);

/**
* ART-internal iterator bookkeeping. Users should treat this as an opaque type.
*/
typedef struct art_iterator_frame_s {
art_node_t *node;
art_ref_t ref;
uint8_t index_in_node;
} art_iterator_frame_t;

Expand All @@ -130,6 +151,8 @@ typedef struct art_iterator_s {
art_key_chunk_t key[ART_KEY_BYTES];
art_val_t *value;

art_t *art;

uint8_t depth; // Key depth
uint8_t frame; // Node depth

Expand All @@ -143,19 +166,19 @@ typedef struct art_iterator_s {
* depending on `first`. The iterator is not valid if there are no entries in
* the ART.
*/
art_iterator_t art_init_iterator(const art_t *art, bool first);
art_iterator_t art_init_iterator(art_t *art, bool first);

/**
* Returns an initialized iterator positioned at a key equal to or greater than
* the given key, if it exists.
*/
art_iterator_t art_lower_bound(const art_t *art, const art_key_chunk_t *key);
art_iterator_t art_lower_bound(art_t *art, const art_key_chunk_t *key);

/**
* Returns an initialized iterator positioned at a key greater than the given
* key, if it exists.
*/
art_iterator_t art_upper_bound(const art_t *art, const art_key_chunk_t *key);
art_iterator_t art_upper_bound(art_t *art, const art_key_chunk_t *key);

/**
* The following iterator movement functions return true if a new entry was
Expand All @@ -174,14 +197,44 @@ bool art_iterator_lower_bound(art_iterator_t *iterator,
/**
* Inserts the value and positions the iterator at the key.
*/
void art_iterator_insert(art_t *art, art_iterator_t *iterator,
const art_key_chunk_t *key, art_val_t *val);
void art_iterator_insert(art_iterator_t *iterator, const art_key_chunk_t *key,
art_val_t val);

/**
* Erases the value pointed at by the iterator. Moves the iterator to the next
* leaf. Returns the value erased or NULL if nothing was erased.
* leaf.
* Returns true if a value was erased. Sets `*erased_val` to the value erased,
* if any.
*/
bool art_iterator_erase(art_iterator_t *iterator, art_val_t *erased_val);

/**
* Shrinks the internal arrays in the ART to remove any unused elements. Returns
* the number of bytes freed.
*/
size_t art_shrink_to_fit(art_t *art);

/**
* Returns the serialized size in bytes.
* Requires `art_shrink_to_fit` to be called first.
*/
size_t art_size_in_bytes(const art_t *art);

/**
* Serializes the ART and returns the number of bytes written. Returns 0 on
* error. Requires `art_shrink_to_fit` to be called first.
*/
size_t art_serialize(const art_t *art, char *buf);

/**
* Deserializes the ART from a serialized buffer, reading up to `maxbytes`
* bytes. Returns 0 on error. Requires `buf` to be 8 byte aligned.
*
* An ART deserialized in this way should only be used in a readonly context.
* The underlying buffer must not be freed before the ART. `art_free` should
* not be called on an ART deserialized in this way.
*/
art_val_t *art_iterator_erase(art_t *art, art_iterator_t *iterator);
size_t art_frozen_view(const char *buf, size_t maxbytes, art_t *art);

#ifdef __cplusplus
} // extern "C"
Expand Down
55 changes: 54 additions & 1 deletion include/roaring/roaring64.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ namespace api {
#endif

typedef struct roaring64_bitmap_s roaring64_bitmap_t;
typedef struct roaring64_leaf_s roaring64_leaf_t;
typedef uint64_t roaring64_leaf_t;
typedef struct roaring64_iterator_s roaring64_iterator_t;

/**
Expand Down Expand Up @@ -312,6 +312,12 @@ uint64_t roaring64_bitmap_maximum(const roaring64_bitmap_t *r);
*/
bool roaring64_bitmap_run_optimize(roaring64_bitmap_t *r);

/**
* Shrinks internal arrays to eliminate any unused capacity. Returns the number
* of bytes freed.
*/
size_t roaring64_bitmap_shrink_to_fit(roaring64_bitmap_t *r);

/**
* (For advanced users.)
* Collect statistics about the bitmap
Expand Down Expand Up @@ -564,6 +570,53 @@ size_t roaring64_bitmap_portable_deserialize_size(const char *buf,
roaring64_bitmap_t *roaring64_bitmap_portable_deserialize_safe(const char *buf,
size_t maxbytes);

/**
* Returns the number of bytes required to serialize this bitmap in a "frozen"
* format. This is not compatible with any other serialization formats.
*
* `roaring64_bitmap_shrink_to_fit()` must be called before this method.
*/
size_t roaring64_bitmap_frozen_size_in_bytes(const roaring64_bitmap_t *r);

/**
* Serializes the bitmap in a "frozen" format. The given buffer must be at least
* `roaring64_bitmap_frozen_size_in_bytes()` in size. Returns the number of
* bytes used for serialization.
*
* `roaring64_bitmap_shrink_to_fit()` must be called before this method.
*
* The frozen format is optimized for speed of (de)serialization, as well as
* allowing the user to create a bitmap based on a memory mapped file, which is
* possible because the format mimics the memory layout of the bitmap.
*
* Because the format mimics the memory layout of the bitmap, the format is not
* fixed across releases of Roaring Bitmaps, and may change in future releases.
*
* This function is endian-sensitive. If you have a big-endian system (e.g., a
* mainframe IBM s390x), the data format is going to be big-endian and not
* compatible with little-endian systems.
*/
size_t roaring64_bitmap_frozen_serialize(const roaring64_bitmap_t *r,
char *buf);

/**
* Creates a readonly bitmap that is a view of the given buffer. The buffer
* must be created with `roaring64_bitmap_frozen_serialize()`, and must be
* aligned by 64 bytes.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is 64 byte alignment really required? It seems 8 byte (64 bit) alignment is more likely

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I can tell from here, yes.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@SLieve I am not sure that it is required. I think it is preferred (for performance).

*
* Returns NULL if deserialization fails.
*
* The returned bitmap must only be used in a readonly manner. The bitmap must
* be freed using `roaring64_bitmap_free()` as normal. The backing buffer must
* only be freed after the bitmap.
*
* This function is endian-sensitive. If you have a big-endian system (e.g., a
* mainframe IBM s390x), the data format is going to be big-endian and not
* compatible with little-endian systems.
*/
roaring64_bitmap_t *roaring64_bitmap_frozen_view(const char *buf,
size_t maxbytes);

/**
* Iterate over the bitmap elements. The function `iterator` is called once for
* all the values with `ptr` (can be NULL) as the second parameter of each call.
Expand Down
4 changes: 4 additions & 0 deletions microbenchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,7 @@ add_executable(bench bench.cpp)
target_link_libraries(bench PRIVATE roaring)
target_link_libraries(bench PRIVATE benchmark::benchmark)
target_compile_definitions(bench PRIVATE BENCHMARK_DATA_DIR="${BENCHMARK_DATA_DIR}")

# Second benchmark executable, built from synthetic_bench.cpp and linked
# against the roaring library and the benchmark::benchmark target.
add_executable(synthetic_bench synthetic_bench.cpp)
target_link_libraries(synthetic_bench PRIVATE roaring)
target_link_libraries(synthetic_bench PRIVATE benchmark::benchmark)
Loading
Loading