Skip to content

Commit

Permalink
Add compression of strings in Mixed, Lst<String> and Dictionary
Browse files Browse the repository at this point in the history
  • Loading branch information
jedelbo committed Jun 12, 2024
1 parent 9f4d51c commit 731b7d2
Show file tree
Hide file tree
Showing 21 changed files with 226 additions and 152 deletions.
4 changes: 2 additions & 2 deletions src/realm/array_backlink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -225,12 +225,12 @@ void ArrayBacklink::verify() const
REALM_ASSERT(src_obj.get<Mixed>(src_col_key).get_link() == target_link);
}
else if (val.is_type(type_List)) {
DummyParent parent(src_table, val.get_ref());
DummyParent parent(src_table, val.get_ref(), src_col_key);
Lst<Mixed> list(parent, 0);
REALM_ASSERT(list.find_any(target_link) != npos);
}
else if (val.is_type(type_Dictionary)) {
DummyParent parent(src_table, val.get_ref());
DummyParent parent(src_table, val.get_ref(), src_col_key);
Dictionary dict(parent, 0);
REALM_ASSERT(dict.find_any(target_link) != npos);
}
Expand Down
77 changes: 42 additions & 35 deletions src/realm/array_mixed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -360,9 +360,8 @@ ref_type ArrayMixed::typed_write(ref_type top_ref, _impl::ArrayWriterBase& out,
2. int and pair-int arrays are used for storing integers, timestamps, floats, doubles,
decimals and links. In general we can compress them, but we need to be careful: checking the col_type
should prevent compressing data that we want to leave in the current format.
3. string array is for strings and binary data (no compression for now)
4. ref array is actually storing refs to collections. they can only be BPlusTree<int, Mixed> or
BPlusTree<string, Mixed>.
3. string array is for strings and binary data
4. ref array is actually storing refs to collections. They can only be Lst<Mixed> or Dictionary.
5. key array stores unique identifiers for collections in mixed (integers that can be compressed)
*/
Array composite(alloc);
Expand All @@ -372,41 +371,48 @@ ref_type ArrayMixed::typed_write(ref_type top_ref, _impl::ArrayWriterBase& out,
auto ref = top.get(i);
ref_type new_ref = ref;
if (ref && !(out.only_modified && alloc.is_read_only(ref))) {
if (i < 3) { // int, and pair_int
// integer arrays
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
}
else if (i == 4) { // collection in mixed
ArrayRef arr_ref(alloc);
arr_ref.init_from_ref(ref);
auto ref_sz = arr_ref.size();
TempArray written_ref_leaf(ref_sz);

for (size_t k = 0; k < ref_sz; k++) {
ref_type new_sub_ref = 0;
if (auto sub_ref = arr_ref.get(k)) {
auto header = alloc.translate(sub_ref);
// Now we have to find out if the nested collection is a
// dictionary or a list. If the top array has a size of 2
// and it is not a BplusTree inner node, then it is a dictionary
if (NodeHeader::get_size_from_header(header) == 2 &&
!NodeHeader::get_is_inner_bptree_node_from_header(header)) {
new_sub_ref = Dictionary::typed_write(sub_ref, out, alloc);
}
else {
new_sub_ref = BPlusTree<Mixed>::typed_write(sub_ref, out, alloc);
switch (i) {
case payload_idx_int:
// integer array
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
break;
case payload_idx_pair:
// integer array
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
break;
case payload_idx_str:
new_ref = ArrayString::typed_write(ref, out, alloc);
break;
case payload_idx_ref: {
// collection in mixed
ArrayRef arr_ref(alloc);
arr_ref.init_from_ref(ref);
auto ref_sz = arr_ref.size();
TempArray written_ref_leaf(ref_sz);

for (size_t k = 0; k < ref_sz; k++) {
ref_type new_sub_ref = 0;
if (auto sub_ref = arr_ref.get(k)) {
auto header = alloc.translate(sub_ref);
// Now we have to find out if the nested collection is a
// dictionary or a list. If the top array has a size of 2
// and it is not a BplusTree inner node, then it is a dictionary
if (NodeHeader::get_size_from_header(header) == 2 &&
!NodeHeader::get_is_inner_bptree_node_from_header(header)) {
new_sub_ref = Dictionary::typed_write(sub_ref, out, alloc);
}
else {
new_sub_ref = BPlusTree<Mixed>::typed_write(sub_ref, out, alloc);
}
}
written_ref_leaf.set_as_ref(k, new_sub_ref);
}
written_ref_leaf.set_as_ref(k, new_sub_ref);
new_ref = written_ref_leaf.write(out);
break;
}
new_ref = written_ref_leaf.write(out);
}
else if (i == 5) { // unique keys associated to collections in mixed
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
}
else {
// all the rest we don't want to compress it, at least for now (strings will be needed)
new_ref = Array::write(ref, alloc, out, out.only_modified, false);
case payload_idx_key:
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
break;
}
}
written_leaf.set(i, new_ref);
Expand Down Expand Up @@ -451,6 +457,7 @@ void ArrayMixed::ensure_string_array() const
m_strings.create();
m_strings.update_parent();
}
m_strings.set_string_interner(m_string_interner);
}
}

Expand Down
10 changes: 10 additions & 0 deletions src/realm/array_mixed.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,15 @@ class ArrayMixed : public ArrayPayload, private Array {
{
Array::set_parent(parent, ndx_in_parent);
}
// Reports that this array type wants a string interner: unconditionally returns true.
// NOTE(review): presumably queried by the owning container to decide whether
// set_string_interner() should be called - confirm against the base class.
bool need_string_interner() const override
{
return true;
}
// Records the interner to use for this array by storing the pointer in
// m_string_interner. The pointer is stored as given (no null check here).
// The function is const; the assignment is possible because m_string_interner
// is declared mutable. ArrayMixed::ensure_string_array() passes the stored
// pointer on to m_strings.
// NOTE(review): declared 'virtual' rather than 'override' - if this is meant to
// override a base-class function, confirm that the signatures match.
virtual void set_string_interner(StringInterner* interner) const
{
m_string_interner = interner;
}

void init_from_parent()
{
ref_type ref = get_ref_from_parent();
Expand Down Expand Up @@ -135,6 +144,7 @@ class ArrayMixed : public ArrayPayload, private Array {
mutable ArrayString m_strings;
// Used to store nested collection refs
mutable ArrayRef m_refs;
mutable StringInterner* m_string_interner = nullptr;

DataType get_type(size_t ndx) const
{
Expand Down
47 changes: 34 additions & 13 deletions src/realm/array_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

#include <realm/array_string.hpp>
#include <realm/impl/array_writer.hpp>
#include <realm/spec.hpp>
#include <realm/table.hpp>
#include <realm/mixed.hpp>

using namespace realm;
Expand Down Expand Up @@ -536,18 +536,39 @@ void ArrayString::verify() const
#endif
}

ref_type ArrayString::write(_impl::ArrayWriterBase& out, StringInterner* interner)
template <>
ref_type ArrayString::typed_write(ref_type ref, _impl::ArrayWriterBase& out, Allocator& alloc)
{
REALM_ASSERT(interner);
// we have to write out all, modified or not, to match the total cleanup
Array interned(Allocator::get_default());
auto sz = size();
interned.create(NodeHeader::type_Normal, true, sz);
for (size_t i = 0; i < sz; ++i) {
interned.set(i, interner->intern(get(i)));
Array leaf(alloc);
leaf.init_from_ref(ref);
ref_type ret_val;
auto header = leaf.get_header();
if (NodeHeader::get_hasrefs_from_header(header) ||
NodeHeader::get_wtype_from_header(header) == NodeHeader::wtype_Multiply) {
// We're interning these strings
ArrayString as(alloc);
as.init_from_ref(ref);
StringInterner* interner = out.table->get_string_interner(out.col_key);
auto sz = as.size();
Array interned(Allocator::get_default());
interned.create(NodeHeader::type_Normal, true, sz);
for (size_t i = 0; i < sz; ++i) {
interned.set(i, interner->intern(as.get(i)));
}
ret_val = interned.write(out, false, false, out.compress);
interned.destroy();
// in a transactional setting:
// Destroy all sub-arrays if present, in order to release memory in file
// This is contrary to the rest of the handling in this function, but needed
// here since sub-arrays may not have been COW'ed and therefore not freed in file.
// We rely on 'only_modified' to indicate that we're in a transactional setting.
if (out.only_modified)
leaf.destroy_deep(true);
}
else {
// whether it's the old enum strings or the new interned strings,
// just write out the array using integer leaf compression
ret_val = leaf.write(out, false, out.only_modified, out.compress);
}
auto retval = interned.write(out, false, false, out.compress);
interned.destroy();
return retval;
// return m_arr->write(out, true, false, false);
return ret_val;
}
6 changes: 2 additions & 4 deletions src/realm/array_string.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,8 @@ class ArrayString : public ArrayPayload {
static StringData get(const char* header, size_t ndx, Allocator& alloc) noexcept;

void verify() const;
// Write to 'out', if needed using 'interner' to intern any strings.
// An interner of 0 will disable interning. Interned values may be further
// compressed using leaf compression for integer arrays.
ref_type write(_impl::ArrayWriterBase& out, StringInterner* interner);
template <class T>
static ref_type typed_write(ref_type ref, T& out, Allocator& alloc);

private:
static constexpr size_t small_string_max_size = 15; // ArrayStringShort
Expand Down
24 changes: 21 additions & 3 deletions src/realm/bplustree.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ namespace realm {

class BPlusTreeBase;
class BPlusTreeInner;
class StringInterner;

/*****************************************************************************/
/* BPlusTreeNode */
Expand Down Expand Up @@ -207,6 +208,16 @@ class BPlusTreeBase {
m_root->bp_set_parent(parent, ndx_in_parent);
}

// Stores the given interner pointer in m_interner. This call only assigns the
// member; it does not touch any leaf that has already been created or initialised.
// NOTE(review): leaves pick the interner up when they are created/initialised, so
// this presumably has to be called before the tree is attached - confirm with callers.
void set_interner(StringInterner* interner)
{
m_interner = interner;
}

// Returns the interner pointer stored by set_interner(). m_interner is
// initialised to nullptr, so this returns nullptr if no interner was ever set.
// NOTE(review): the function does not modify the object and could probably be
// const-qualified - confirm no caller depends on the non-const form.
StringInterner* get_interner()
{
return m_interner;
}

virtual void erase(size_t) = 0;
virtual void clear() = 0;
virtual void swap(size_t, size_t) = 0;
Expand Down Expand Up @@ -234,6 +245,7 @@ class BPlusTreeBase {
std::unique_ptr<BPlusTreeNode> m_root;
Allocator& m_alloc;
ArrayParent* m_parent = nullptr;
StringInterner* m_interner = nullptr;
size_t m_ndx_in_parent = 0;
size_t m_size = 0;
size_t m_cached_leaf_begin;
Expand Down Expand Up @@ -300,6 +312,9 @@ class BPlusTree : public BPlusTreeBase {
// Attaches this leaf to the array at 'ref'. When the element type is StringData
// or Mixed, the interner held by the owning tree is then passed on to the leaf array.
void init_from_ref(ref_type ref) noexcept override
{
    constexpr bool leaf_may_hold_strings = realm::is_any_v<T, StringData, Mixed>;

    LeafArray::init_from_ref(ref);
    if constexpr (leaf_may_hold_strings) {
        // Only these leaf array types are handed an interner.
        auto* tree_interner = m_tree->get_interner();
        LeafArray::set_string_interner(tree_interner);
    }
}

ref_type get_ref() const override
Expand Down Expand Up @@ -574,13 +589,16 @@ class BPlusTree : public BPlusTreeBase {

std::unique_ptr<BPlusTreeLeaf> create_leaf_node() override
{
std::unique_ptr<BPlusTreeLeaf> leaf = std::make_unique<LeafNode>(this);
static_cast<LeafNode*>(leaf.get())->create();
auto leaf = std::make_unique<LeafNode>(this);
leaf->create();
if constexpr (realm::is_any_v<T, StringData, Mixed>) {
leaf->set_string_interner(m_interner);
}
return leaf;
}
std::unique_ptr<BPlusTreeLeaf> init_leaf_node(ref_type ref) override
{
std::unique_ptr<BPlusTreeLeaf> leaf = std::make_unique<LeafNode>(this);
auto leaf = std::make_unique<LeafNode>(this);
leaf->init_from_ref(ref);
return leaf;
}
Expand Down
35 changes: 10 additions & 25 deletions src/realm/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,12 @@ inline void Cluster::set_string_interner(ArrayString& arr, ColKey col_key) const
m_tree_top.set_string_interner(arr, col_key);
}

// Explicit specialisation for ArrayMixed: forwards the array and the column key
// unchanged to m_tree_top.set_string_interner().
template <>
inline void Cluster::set_string_interner(ArrayMixed& arr, ColKey col_key) const
{
m_tree_top.set_string_interner(arr, col_key);
}

template <class T>
inline void Cluster::set_spec(T&, ColKey::Idx) const
{
Expand Down Expand Up @@ -314,6 +320,7 @@ inline void Cluster::do_insert_mixed(size_t ndx, ColKey col_key, Mixed init_valu
{
ArrayMixed arr(m_alloc);
arr.set_parent(this, col_key.get_index().val + s_first_col_index);
set_string_interner(arr, col_key);
arr.init_from_parent();
arr.insert(ndx, init_value);

Expand Down Expand Up @@ -798,6 +805,7 @@ inline void Cluster::do_erase_mixed(size_t ndx, ColKey col_key, ObjKey key, Casc

ArrayMixed values(m_alloc);
values.set_parent(this, col_ndx.val + s_first_col_index);
set_string_interner(values, col_key);
values.init_from_parent();

Mixed value = values.get(ndx);
Expand Down Expand Up @@ -1447,6 +1455,7 @@ void Cluster::dump_objects(int64_t key_offset, std::string lead) const
}
case col_type_Mixed: {
ArrayMixed arr(m_alloc);
set_string_interner(arr, col);
ref_type ref = Array::get_as_ref(j);
arr.init_from_ref(ref);
std::cout << ", " << arr.get(i);
Expand Down Expand Up @@ -1651,32 +1660,8 @@ ref_type Cluster::typed_write(ref_type ref, _impl::ArrayWriterBase& out) const
else {
// Columns
auto col_key = out.table->m_leaf_ndx2colkey[j - 1];
out.col_key = col_key;
auto col_type = col_key.get_type();
// String columns are interned at this point
if (out.compress && col_type == col_type_String && !col_key.is_collection()) {
ArrayRef leaf(m_alloc);
leaf.init_from_ref(ref);
auto header = leaf.get_header();
if (NodeHeader::get_hasrefs_from_header(header) ||
NodeHeader::get_wtype_from_header(header) == wtype_Multiply) {
// We're interning these strings
ArrayString as(m_alloc);
as.init_from_ref(leaf_rot.get_as_ref());
written_cluster.set_as_ref(j, as.write(out, out.table->get_string_interner(col_key)));
// in a transactional setting:
// Destroy all sub-arrays if present, in order to release memory in file
// This is contrary to the rest of the handling in this function, but needed
// here since sub-arrays may not have been COW'ed and therefore not freed in file.
// We rely on 'only_modified' to indicate that we're in a transactional setting.
if (only_modified)
leaf.destroy_deep(true);
continue;
}
// whether it's the old enum strings or the new interned strings,
// just write out the array using integer leaf compression
written_cluster.set_as_ref(j, leaf.write(out, false, false, false));
continue;
}
if (col_key.is_collection()) {
ArrayRef arr_ref(m_alloc);
arr_ref.init_from_ref(ref);
Expand Down
2 changes: 1 addition & 1 deletion src/realm/cluster_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1140,7 +1140,7 @@ void ClusterTree::set_string_interner(ArrayPayload& arr, ColKey col_key) const
// Check for owner. This function may be called in context of DictionaryClusterTree
// in which case m_owner is null (and spec never needed).
if (m_owner) {
arr.set_string_interner(_impl::TableFriend::get_string_interner(*m_owner, col_key));
arr.set_string_interner(m_owner->get_string_interner(col_key));
}
}

Expand Down
6 changes: 4 additions & 2 deletions src/realm/collection.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ struct CollectionIterator;
// Used in Cluster when removing owning object
class DummyParent : public CollectionParent {
public:
DummyParent(TableRef t, ref_type ref)
DummyParent(TableRef t, ref_type ref, ColKey ck)
: m_obj(t, MemRef(), ObjKey(), 0)
, m_ref(ref)
, m_col_key(ck)
{
}
FullPath get_path() const noexcept final
Expand All @@ -37,7 +38,7 @@ class DummyParent : public CollectionParent {
}
ColKey get_col_key() const noexcept final
{
return {};
return m_col_key;
}
// Empty implementation: both arguments are ignored and nothing is done.
void add_index(Path&, const Index&) const noexcept final {}
size_t find_index(const Index&) const noexcept final
Expand All @@ -62,6 +63,7 @@ class DummyParent : public CollectionParent {
protected:
Obj m_obj;
ref_type m_ref;
ColKey m_col_key;
UpdateStatus update_if_needed() const final
{
return UpdateStatus::Updated;
Expand Down
Loading

0 comments on commit 731b7d2

Please sign in to comment.