diff --git a/src/realm/CMakeLists.txt b/src/realm/CMakeLists.txt
index 18583f3549a..5a67cdabc15 100644
--- a/src/realm/CMakeLists.txt
+++ b/src/realm/CMakeLists.txt
@@ -62,6 +62,8 @@ set(REALM_SOURCES
     table.cpp
     table_ref.cpp
     obj_list.cpp
+    string_interner.cpp
+    string_compressor.cpp
     object_id.cpp
     table_view.cpp
     tokenizer.cpp
@@ -178,6 +180,8 @@ set(REALM_INSTALL_HEADERS
     null.hpp
     obj.hpp
     obj_list.hpp
+    string_interner.hpp
+    string_compressor.hpp
     object_id.hpp
     path.hpp
     owned_data.hpp
diff --git a/src/realm/array.cpp b/src/realm/array.cpp
index be70388bb2b..b95d081f4d5 100644
--- a/src/realm/array.cpp
+++ b/src/realm/array.cpp
@@ -294,7 +294,7 @@ void Array::set_type(Type type)
     set_hasrefs_in_header(init_has_refs, header);
 }
 
-void Array::destroy_children(size_t offset) noexcept
+void Array::destroy_children(size_t offset, bool ro_only) noexcept
 {
     for (size_t i = offset; i != m_size; ++i) {
         int64_t value = get(i);
@@ -310,7 +310,7 @@ void Array::destroy_children(size_t offset) noexcept
             continue;
 
         ref_type ref = to_ref(value);
-        destroy_deep(ref, m_alloc);
+        destroy_deep(ref, m_alloc, ro_only);
     }
 }
 
@@ -607,6 +607,14 @@ void Array::do_ensure_minimum_width(int_fast64_t value)
     }
 }
 
+size_t Array::size() const noexcept
+{
+    // In case the array is in compressed format, never read the size directly
+    // from the header, since that would very likely cause a cache miss.
+    // For compressed arrays, m_size is always kept up to date by init_from_mem().
+    return m_size;
+}
+
 bool Array::compress_array(Array& arr) const
 {
     if (m_integer_compressor.get_encoding() == NodeHeader::Encoding::WTypBits) {
diff --git a/src/realm/array.hpp b/src/realm/array.hpp
index 47984bfe959..0611068bd12 100644
--- a/src/realm/array.hpp
+++ b/src/realm/array.hpp
@@ -117,7 +117,7 @@ class Array : public Node, public ArrayParent {
     /// pointer.
     void init_from_mem(MemRef) noexcept;
 
-    /// Same as `init_from_ref(get_ref_from_parent())`.
+    /// Same as `init_from_ref(ref_from_parent())`.
     void init_from_parent() noexcept
     {
         ref_type ref = get_ref_from_parent();
@@ -210,6 +210,8 @@ class Array : public Node, public ArrayParent {
         update_width_cache_from_header();
     }
 
+    size_t size() const noexcept;
+
     bool is_empty() const noexcept
     {
         return size() == 0;
@@ -362,7 +364,8 @@ class Array : public Node, public ArrayParent {
     /// state (as if calling detach()), then free the allocated memory. If this
     /// accessor is already in the detached state, this function has no effect
     /// (idempotency).
-    void destroy_deep() noexcept;
+    /// If 'ro_only', only free space in read-only memory (the file).
+    void destroy_deep(bool ro_only = false) noexcept;
 
     /// check if the array is encoded (in B format)
     inline bool is_compressed() const;
@@ -377,13 +380,13 @@ class Array : public Node, public ArrayParent {
     bool try_decompress();
 
     /// Shorthand for `destroy_deep(MemRef(ref, alloc), alloc)`.
-    static void destroy_deep(ref_type ref, Allocator& alloc) noexcept;
+    static void destroy_deep(ref_type ref, Allocator& alloc, bool ro_only = false) noexcept;
 
     /// Destroy the specified array node and all of its children, recursively.
     ///
     /// This is done by freeing the specified array node after calling
     /// destroy_deep() for every contained 'ref' element.
-    static void destroy_deep(MemRef, Allocator&) noexcept;
+    static void destroy_deep(MemRef, Allocator&, bool ro_only = false) noexcept;
 
     // Clone deep
     static MemRef clone(MemRef, Allocator& from_alloc, Allocator& target_alloc);
@@ -546,7 +549,7 @@ class Array : public Node, public ArrayParent {
     // Overriding method in ArrayParent
     ref_type get_child_ref(size_t) const noexcept override;
 
-    void destroy_children(size_t offset = 0) noexcept;
+    void destroy_children(size_t offset = 0, bool ro_only = false) noexcept;
 
 protected:
     // Getters and Setters for adaptive-packed arrays
@@ -918,16 +921,17 @@ inline void Array::set_context_flag(bool value) noexcept
     }
 }
 
-inline void Array::destroy_deep() noexcept
+inline void Array::destroy_deep(bool ro_only) noexcept
 {
     if (!is_attached())
         return;
 
     if (m_has_refs)
-        destroy_children();
+        destroy_children(0, ro_only);
 
     char* header = get_header_from_data(m_data);
-    m_alloc.free_(m_ref, header);
+    if (!ro_only || is_read_only())
+        m_alloc.free_(m_ref, header);
     m_data = nullptr;
 }
 
@@ -970,20 +974,21 @@ inline void Array::clear_and_destroy_children()
     truncate_and_destroy_children(0);
 }
 
-inline void Array::destroy_deep(ref_type ref, Allocator& alloc) noexcept
+inline void Array::destroy_deep(ref_type ref, Allocator& alloc, bool ro_only) noexcept
 {
-    destroy_deep(MemRef(ref, alloc), alloc);
+    destroy_deep(MemRef(ref, alloc), alloc, ro_only);
 }
 
-inline void Array::destroy_deep(MemRef mem, Allocator& alloc) noexcept
+inline void Array::destroy_deep(MemRef mem, Allocator& alloc, bool ro_only) noexcept
 {
     if (!get_hasrefs_from_header(mem.get_addr())) {
-        alloc.free_(mem);
+        if (!ro_only || alloc.is_read_only(mem.get_ref()))
+            alloc.free_(mem);
         return;
     }
 
     Array array(alloc);
     array.init_from_mem(mem);
-    array.destroy_deep();
+    array.destroy_deep(ro_only);
 }
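// [Editor's sketch - not part of the patch] The `ro_only` flavour of
// destroy_deep() above supports commit-time cleanup: nodes that still live
// in read-only (file) memory must have their file space released, while
// writable (COW'ed) nodes were already freed when they were copied. A
// minimal stand-alone illustration; NodeSketch and FileAllocatorSketch are
// hypothetical stand-ins for Realm's array nodes and allocator.
#include <vector>

struct NodeSketch {
    bool read_only = true;              // still lives in the mmap'ed file
    std::vector<NodeSketch*> children;
};

struct FileAllocatorSketch {
    void free_node(NodeSketch*) {}      // would return the node's space to the freelist
};

inline void destroy_deep_sketch(NodeSketch* n, FileAllocatorSketch& alloc, bool ro_only)
{
    for (NodeSketch* child : n->children)
        destroy_deep_sketch(child, alloc, ro_only);
    if (!ro_only || n->read_only)       // mirrors `!ro_only || is_read_only()` above
        alloc.free_node(n);
}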
diff --git a/src/realm/array_integer.cpp b/src/realm/array_integer.cpp
index f86871c3225..b39ade6e940 100644
--- a/src/realm/array_integer.cpp
+++ b/src/realm/array_integer.cpp
@@ -22,6 +22,8 @@
 #include
 #include
 
+#include
+
 using namespace realm;
 
 ArrayInteger::ArrayInteger(Allocator& allocator) noexcept
diff --git a/src/realm/array_integer.hpp b/src/realm/array_integer.hpp
index b8739414091..22d729e2e29 100644
--- a/src/realm/array_integer.hpp
+++ b/src/realm/array_integer.hpp
@@ -174,6 +174,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {}
 
 inline size_t ArrayIntNull::size() const noexcept
 {
+    // this cannot be right, what if size is 0
     return Array::size() - 1;
 }
 
diff --git a/src/realm/array_string.cpp b/src/realm/array_string.cpp
index 636a60a2865..cb2aa6fb3f5 100644
--- a/src/realm/array_string.cpp
+++ b/src/realm/array_string.cpp
@@ -17,6 +17,7 @@
 **************************************************************************/
 
 #include
+#include
 
 #include
 #include
 
@@ -52,14 +53,24 @@ void ArrayString::init_from_mem(MemRef mem) noexcept
     else {
         auto arr = new (&m_storage) Array(m_alloc);
         arr->init_from_mem(mem);
-        m_string_enum_values = std::make_unique(m_alloc);
-        ArrayParent* p;
-        REALM_ASSERT(m_spec != nullptr);
-        REALM_ASSERT(m_col_ndx != realm::npos);
-        ref_type r = m_spec->get_enumkeys_ref(m_col_ndx, p);
-        m_string_enum_values->init_from_ref(r);
-        m_string_enum_values->set_parent(p, m_col_ndx);
-        m_type = Type::enum_strings;
+        // The context flag is used to indicate interned strings vs old enum strings
+        // (in conjunction with has_refs() == false)
+        if (arr->get_context_flag_from_header(arr->get_header())) {
+            // init for new interned strings (replacing old enum strings)
+            m_type = Type::interned_strings;
+            // consider if we want this invariant: REALM_ASSERT_DEBUG(m_string_interner);
+        }
+        else {
+            // init for old enum strings
+            m_string_enum_values = std::make_unique(m_alloc);
+            ArrayParent* p;
+            REALM_ASSERT(m_spec != nullptr);
+            REALM_ASSERT(m_col_ndx != realm::npos);
+            ref_type r = m_spec->get_enumkeys_ref(m_col_ndx, p);
+            m_string_enum_values->init_from_ref(r);
+            m_string_enum_values->set_parent(p, m_col_ndx);
+            m_type = Type::enum_strings;
+        }
         }
     }
     else {
@@ -111,6 +122,7 @@ size_t ArrayString::size() const
         case Type::big_strings:
             return static_cast(m_arr)->size();
         case Type::enum_strings:
+        case Type::interned_strings:
            return static_cast(m_arr)->size();
     }
     return {};
@@ -128,7 +140,8 @@ void ArrayString::add(StringData value)
         case Type::big_strings:
             static_cast(m_arr)->add_string(value);
             break;
-        case Type::enum_strings: {
+        case Type::enum_strings:
+        case Type::interned_strings: {
             auto a = static_cast(m_arr);
             size_t ndx = a->size();
             a->add(0);
@@ -150,6 +163,11 @@ void ArrayString::set(size_t ndx, StringData value)
         case Type::big_strings:
             static_cast(m_arr)->set_string(ndx, value);
             break;
+        case Type::interned_strings: {
+            auto id = m_string_interner->intern(value);
+            static_cast(m_arr)->set(ndx, id);
+            break;
+        }
         case Type::enum_strings: {
             size_t sz = m_string_enum_values->size();
             size_t res = m_string_enum_values->find_first(value, 0, sz);
@@ -178,6 +196,12 @@ void ArrayString::insert(size_t ndx, StringData value)
         case Type::enum_strings: {
             static_cast(m_arr)->insert(ndx, 0);
             set(ndx, value);
+            break;
+        }
+        case Type::interned_strings: {
+            static_cast(m_arr)->insert(ndx, 0);
+            set(ndx, value);
+            break;
         }
     }
 }
@@ -195,6 +219,10 @@ StringData ArrayString::get(size_t ndx) const
             size_t index = size_t(static_cast(m_arr)->get(ndx));
             return m_string_enum_values->get(index);
         }
+        case Type::interned_strings: {
+            size_t id = size_t(static_cast(m_arr)->get(ndx));
+            return m_string_interner->get(id);
+        }
     }
     return {};
 }
@@ -212,6 +240,10 @@ StringData ArrayString::get_legacy(size_t ndx) const
             size_t index = size_t(static_cast(m_arr)->get(ndx));
             return m_string_enum_values->get(index);
         }
+        case Type::interned_strings: {
+            size_t id = size_t(static_cast(m_arr)->get(ndx));
+            return m_string_interner->get(id);
+        }
     }
     return {};
 }
@@ -231,8 +263,12 @@ bool ArrayString::is_null(size_t ndx) const
         case Type::big_strings:
             return static_cast(m_arr)->is_null(ndx);
         case Type::enum_strings: {
-            size_t index = size_t(static_cast(m_arr)->get(ndx));
-            return m_string_enum_values->is_null(index);
+            size_t id = size_t(static_cast(m_arr)->get(ndx));
+            return m_string_enum_values->is_null(id);
+        }
+        case Type::interned_strings: {
+            size_t id = size_t(static_cast(m_arr)->get(ndx));
+            return id == 0;
         }
     }
     return {};
@@ -250,6 +286,7 @@ void ArrayString::erase(size_t ndx)
         case Type::big_strings:
             static_cast(m_arr)->erase(ndx);
             break;
+        case Type::interned_strings:
         case Type::enum_strings:
             static_cast(m_arr)->erase(ndx);
             break;
@@ -277,6 +314,9 @@ void ArrayString::move(ArrayString& dst, size_t ndx)
             // this operation will never be called for enumerated columns
             REALM_UNREACHABLE();
             break;
+        case Type::interned_strings:
+            m_arr->truncate(ndx);
+            break;
     }
 }
 
@@ -293,6 +333,7 @@ void ArrayString::clear()
             static_cast(m_arr)->clear();
             break;
         case Type::enum_strings:
+        case Type::interned_strings:
             static_cast(m_arr)->clear();
             break;
     }
@@ -321,6 +362,15 @@ size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const
             }
             break;
         }
+        case Type::interned_strings: {
+            // We need a way to avoid this lookup for each leaf array. The lookup must
+            // be made higher up the call stack and the result passed down.
+            auto id = m_string_interner->lookup(value);
+            if (id) {
+                return static_cast(m_arr)->find_first(*id, begin, end);
+            }
+            break;
+        }
     }
     return not_found;
 }
@@ -371,6 +421,9 @@ size_t ArrayString::lower_bound(StringData value)
             return lower_bound_string(static_cast(m_arr), value);
         case Type::enum_strings:
             break;
+        case Type::interned_strings:
+            REALM_UNREACHABLE();
+            break;
     }
     return realm::npos;
 }
@@ -383,6 +436,9 @@ ArrayString::Type ArrayString::upgrade_leaf(size_t value_size)
     if (m_type == Type::enum_strings)
         return Type::enum_strings;
 
+    if (m_type == Type::interned_strings)
+        return Type::interned_strings;
+
     if (m_type == Type::medium_strings) {
         if (value_size <= medium_string_max_size)
             return Type::medium_strings;
@@ -473,8 +529,25 @@ void ArrayString::verify() const
             static_cast(m_arr)->verify();
             break;
         case Type::enum_strings:
+        case Type::interned_strings:
             static_cast(m_arr)->verify();
             break;
     }
 #endif
 }
+
+ref_type ArrayString::write(_impl::ArrayWriterBase& out, StringInterner* interner)
+{
+    REALM_ASSERT(interner);
+    // We have to write out all entries, modified or not, to match the total cleanup.
+    Array interned(Allocator::get_default());
+    auto sz = size();
+    interned.create(NodeHeader::type_Normal, true, sz);
+    for (size_t i = 0; i < sz; ++i) {
+        interned.set(i, interner->intern(get(i)));
+    }
+    auto retval = interned.write(out, false, false, out.compress);
+    interned.destroy();
+    return retval;
+    // return m_arr->write(out, true, false, false);
+}
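// [Editor's sketch - not part of the patch] What the new ArrayString::write()
// above boils down to, with Realm types replaced by standard ones: every
// string in the leaf is swapped for its interned ID, so on disk the column
// becomes an integer array (ID 0 is reserved for null) and the interner owns
// the actual bytes. The linear-scan interner below is a hypothetical toy.
#include <cstdint>
#include <optional>
#include <string>
#include <vector>

struct ToyInterner {
    std::vector<std::string> strings;            // ID-1 -> string
    uint64_t intern(const std::string& s)
    {
        for (size_t i = 0; i < strings.size(); ++i)
            if (strings[i] == s)
                return i + 1;                    // IDs start at 1; 0 means null
        strings.push_back(s);
        return strings.size();
    }
};

inline std::vector<uint64_t> write_leaf_sketch(const std::vector<std::optional<std::string>>& leaf,
                                               ToyInterner& interner)
{
    std::vector<uint64_t> out;
    out.reserve(leaf.size());
    for (const auto& v : leaf)
        out.push_back(v ? interner.intern(*v) : 0);
    return out;                                  // this is what ends up on disk
}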
diff --git a/src/realm/array_string.hpp b/src/realm/array_string.hpp
index 4dc96646378..df121c50b2c 100644
--- a/src/realm/array_string.hpp
+++ b/src/realm/array_string.hpp
@@ -66,6 +66,14 @@ class ArrayString : public ArrayPayload {
     {
         m_arr->set_parent(p, n);
     }
+    bool need_string_interner() const override
+    {
+        return true;
+    }
+    void set_string_interner(StringInterner* string_interner) const override
+    {
+        m_string_interner = string_interner;
+    }
     bool need_spec() const override
     {
         return true;
@@ -118,6 +126,10 @@ class ArrayString : public ArrayPayload {
     static StringData get(const char* header, size_t ndx, Allocator& alloc) noexcept;
 
     void verify() const;
+    // Write to 'out', if needed using 'interner' to intern any strings.
+    // A null interner disables interning. Interned values may be further
+    // compressed using leaf compression for integer arrays.
+    ref_type write(_impl::ArrayWriterBase& out, StringInterner* interner);
 
 private:
     static constexpr size_t small_string_max_size = 15; // ArrayStringShort
@@ -127,18 +139,18 @@ class ArrayString : public ArrayPayload {
     static constexpr size_t storage_size =
         std::max({sizeof(ArrayStringShort), sizeof(ArraySmallBlobs), sizeof(ArrayBigBlobs), sizeof(Array)});
 
-    enum class Type { small_strings, medium_strings, big_strings, enum_strings };
+    enum class Type { small_strings, medium_strings, big_strings, enum_strings, interned_strings };
 
     Type m_type = Type::small_strings;
 
     Allocator& m_alloc;
     alignas(storage_alignment) std::byte m_storage[storage_size];
     Array* m_arr;
+    bool m_nullable = true;
     mutable Spec* m_spec = nullptr;
     mutable size_t m_col_ndx = realm::npos;
-    bool m_nullable = true;
 
     std::unique_ptr m_string_enum_values;
+    mutable StringInterner* m_string_interner = nullptr;
 
     Type upgrade_leaf(size_t value_size);
 };
diff --git a/src/realm/array_timestamp.hpp b/src/realm/array_timestamp.hpp
index 1fad36144f0..cfa4268cd11 100644
--- a/src/realm/array_timestamp.hpp
+++ b/src/realm/array_timestamp.hpp
@@ -76,7 +76,8 @@ class ArrayTimestamp : public ArrayPayload, private Array {
     Timestamp get(size_t ndx) const
     {
         util::Optional seconds = m_seconds.get(ndx);
-        return seconds ? Timestamp(*seconds, int32_t(m_nanoseconds.get(ndx))) : Timestamp{};
+        int32_t nano = (int32_t)m_nanoseconds.get(ndx);
+        return seconds ? Timestamp(*seconds, nano) : Timestamp{};
     }
     Mixed get_any(size_t ndx) const final
     {
diff --git a/src/realm/array_unsigned.cpp b/src/realm/array_unsigned.cpp
index 938fe5aece8..55f030522b9 100644
--- a/src/realm/array_unsigned.cpp
+++ b/src/realm/array_unsigned.cpp
@@ -92,23 +92,25 @@ void ArrayUnsigned::update_from_parent() noexcept
 
 size_t ArrayUnsigned::lower_bound(uint64_t value) const noexcept
 {
-    if (m_width == 8) {
+    auto width = get_width_from_header(get_header());
+
+    if (width == 8) {
         uint8_t* arr = reinterpret_cast<uint8_t*>(m_data);
         uint8_t* pos = std::lower_bound(arr, arr + m_size, value);
         return pos - arr;
     }
-    else if (m_width == 16) {
+    else if (width == 16) {
         uint16_t* arr = reinterpret_cast<uint16_t*>(m_data);
         uint16_t* pos = std::lower_bound(arr, arr + m_size, value);
         return pos - arr;
     }
-    else if (m_width == 32) {
+    else if (width == 32) {
         uint32_t* arr = reinterpret_cast<uint32_t*>(m_data);
         uint32_t* pos = std::lower_bound(arr, arr + m_size, value);
         return pos - arr;
     }
-    else if (m_width < 8) {
-        switch (m_width) {
+    else if (width < 8) {
+        switch (width) {
             case 0:
                 return realm::lower_bound<0>(m_data, m_size, value);
             case 1:
@@ -130,23 +132,25 @@ size_t ArrayUnsigned::lower_bound(uint64_t value) const noexcept
 
 size_t ArrayUnsigned::upper_bound(uint64_t value) const noexcept
 {
-    if (m_width == 8) {
+    auto width = get_width_from_header(get_header());
+
+    if (width == 8) {
         uint8_t* arr = reinterpret_cast<uint8_t*>(m_data);
         uint8_t* pos = std::upper_bound(arr, arr + m_size, value);
         return pos - arr;
     }
-    else if (m_width == 16) {
+    else if (width == 16) {
         uint16_t* arr = reinterpret_cast<uint16_t*>(m_data);
         uint16_t* pos = std::upper_bound(arr, arr + m_size, value);
         return pos - arr;
     }
-    else if (m_width == 32) {
+    else if (width == 32) {
         uint32_t* arr = reinterpret_cast<uint32_t*>(m_data);
         uint32_t* pos = std::upper_bound(arr, arr + m_size, value);
         return pos - arr;
     }
-    else if (m_width < 8) {
-        switch (m_width) {
+    else if (width < 8) {
+        switch (width) {
             case 0:
                 return realm::upper_bound<0>(m_data, m_size, value);
             case 1:
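// [Editor's sketch - not part of the patch] The width dispatch used by
// lower_bound()/upper_bound() above, reduced to standard C++: a packed leaf
// stores fixed-width unsigned integers, and the search casts the payload to
// the matching element type. The patch re-reads the width from the header on
// every call because a compressed leaf may not keep the cached m_width in sync.
#include <algorithm>
#include <cstddef>
#include <cstdint>

inline size_t lower_bound_sketch(const char* data, size_t size, int width_in_bits, uint64_t value)
{
    switch (width_in_bits) {
        case 8: {
            auto p = reinterpret_cast<const uint8_t*>(data);
            return std::lower_bound(p, p + size, value) - p;
        }
        case 16: {
            auto p = reinterpret_cast<const uint16_t*>(data);
            return std::lower_bound(p, p + size, value) - p;
        }
        case 32: {
            auto p = reinterpret_cast<const uint32_t*>(data);
            return std::lower_bound(p, p + size, value) - p;
        }
        default:
            return size; // sub-byte widths need bit-level access, omitted here
    }
}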
diff --git a/src/realm/cluster.cpp b/src/realm/cluster.cpp
index 4922c54f9b2..75deb0707c2 100644
--- a/src/realm/cluster.cpp
+++ b/src/realm/cluster.cpp
@@ -250,6 +250,17 @@ size_t Cluster::node_size_from_header(Allocator& alloc, const char* header)
     }
 }
 
+template <class T>
+inline void Cluster::set_string_interner(T&, ColKey) const
+{
+}
+
+template <>
+inline void Cluster::set_string_interner(ArrayString& arr, ColKey col_key) const
+{
+    m_tree_top.set_string_interner(arr, col_key);
+}
+
 template <class T>
 inline void Cluster::set_spec(T&, ColKey::Idx) const
 {
@@ -270,6 +281,7 @@ inline void Cluster::do_insert_row(size_t ndx, ColKey col, Mixed init_val, bool
     auto col_ndx = col.get_index();
     arr.set_parent(this, col_ndx.val + s_first_col_index);
     set_spec(arr, col_ndx);
+    set_string_interner(arr, col);
     arr.init_from_parent();
     if (init_val.is_null()) {
         arr.insert(ndx, T::default_value(nullable));
@@ -446,10 +458,12 @@ inline void Cluster::do_move(size_t ndx, ColKey col_key, Cluster* to)
     T src(m_alloc);
     src.set_parent(this, col_ndx);
     src.init_from_parent();
+    set_string_interner(src, col_key);
 
     T dst(m_alloc);
     dst.set_parent(to, col_ndx);
     dst.init_from_parent();
+    set_string_interner(dst, col_key);
 
     src.move(dst, ndx);
 }
@@ -760,6 +774,7 @@ inline void Cluster::do_erase(size_t ndx, ColKey col_key)
     T values(m_alloc);
     values.set_parent(this, col_ndx.val + s_first_col_index);
     set_spec(values, col_ndx);
+    set_string_interner(values, col_key);
     values.init_from_parent();
     if constexpr (std::is_same_v) {
         if (ObjLink link = values.get(ndx)) {
@@ -1031,6 +1046,7 @@ void Cluster::upgrade_string_to_enum(ColKey col_key, ArrayString& keys)
     indexes.create(Array::type_Normal, false);
     ArrayString values(m_alloc);
     ref_type ref = Array::get_as_ref(col_ndx.val + s_first_col_index);
+    set_string_interner(values, col_key);
     values.init_from_ref(ref);
     size_t sz = values.size();
     for (size_t i = 0; i < sz; i++) {
@@ -1052,6 +1068,9 @@ void Cluster::init_leaf(ColKey col_key, ArrayPayload* leaf) const
     if (auto t = m_tree_top.get_owning_table())
         t->check_column(col_key);
     ref_type ref = to_ref(Array::get(col_ndx.val + 1));
+    if (leaf->need_string_interner()) {
+        m_tree_top.set_string_interner(*leaf, col_key);
+    }
     if (leaf->need_spec()) {
         m_tree_top.set_spec(*leaf, col_ndx);
     }
@@ -1071,6 +1090,10 @@ void Cluster::verify(ref_type ref, size_t index, util::Optional& sz) const
 {
     ArrayType arr(get_alloc());
     set_spec(arr, ColKey::Idx{unsigned(index) - 1});
+    auto table = get_owning_table();
+    REALM_ASSERT(index <= table->m_leaf_ndx2colkey.size());
+    auto col_key = table->m_leaf_ndx2colkey[index - 1];
+    set_string_interner(arr, col_key);
     arr.set_parent(const_cast(this), index);
     arr.init_from_ref(ref);
     arr.verify();
@@ -1409,6 +1432,7 @@ void Cluster::dump_objects(int64_t key_offset, std::string lead) const
                 case col_type_String: {
                     ArrayString arr(m_alloc);
                     set_spec(arr, col.get_index());
+                    set_string_interner(arr, col);
                     ref_type ref = Array::get_as_ref(j);
                     arr.init_from_ref(ref);
                     std::cout << ", " << arr.get(i);
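// [Editor's sketch - not part of the patch] The set_string_interner<T>()
// helpers above use a common Realm pattern: a no-op primary template plus a
// specialization for the one leaf type that cares, so generic column code can
// call the hook unconditionally and every other leaf type compiles to
// nothing. Reduced to stand-alone form (the stub types are hypothetical):
struct StringInternerStub {};

struct StringLeafStub {
    const StringInternerStub* interner = nullptr;
};

template <class T>
inline void set_interner_stub(T&, const StringInternerStub*)
{
    // default: this leaf type does not use an interner
}

template <>
inline void set_interner_stub<StringLeafStub>(StringLeafStub& leaf, const StringInternerStub* interner)
{
    leaf.interner = interner;
}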
@@ -1628,6 +1652,31 @@ ref_type Cluster::typed_write(ref_type ref, _impl::ArrayWriterBase& out) const
             // Columns
             auto col_key = out.table->m_leaf_ndx2colkey[j - 1];
             auto col_type = col_key.get_type();
+            // String columns are interned at this point
+            if (out.compress && col_type == col_type_String && !col_key.is_collection()) {
+                ArrayRef leaf(m_alloc);
+                leaf.init_from_ref(ref);
+                auto header = leaf.get_header();
+                if (NodeHeader::get_hasrefs_from_header(header) ||
+                    NodeHeader::get_wtype_from_header(header) == wtype_Multiply) {
+                    // We're interning these strings
+                    ArrayString as(m_alloc);
+                    as.init_from_ref(leaf_rot.get_as_ref());
+                    written_cluster.set_as_ref(j, as.write(out, out.table->get_string_interner(col_key)));
+                    // In a transactional setting:
+                    // Destroy all sub-arrays if present, in order to release memory in the file.
+                    // This is contrary to the rest of the handling in this function, but needed
+                    // here since sub-arrays may not have been COW'ed and therefore not freed in the file.
+                    // We rely on 'only_modified' to indicate that we're in a transactional setting.
+                    if (only_modified)
+                        leaf.destroy_deep(true);
+                    continue;
+                }
+                // Whether it's the old enum strings or the new interned strings,
+                // just write out the array using integer leaf compression.
+                written_cluster.set_as_ref(j, leaf.write(out, false, false, false));
+                continue;
+            }
             if (col_key.is_collection()) {
                 ArrayRef arr_ref(m_alloc);
                 arr_ref.init_from_ref(ref);
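// [Editor's sketch - not part of the patch] The header test in typed_write()
// above decides which string leaves take the interning path: leaves holding
// actual string payload (has_refs set for long strings, or a 'Multiply'
// wtype for short strings) are rewritten as interned IDs, while leaves that
// are already integer arrays (enum offsets or interned IDs) are written out
// unchanged. A stand-alone restatement, with hypothetical names:
struct HeaderBitsSketch {
    bool has_refs; // cf. NodeHeader::get_hasrefs_from_header()
    int wtype;     // cf. NodeHeader::get_wtype_from_header()
};

constexpr int wtype_multiply_sketch = 1; // assumed value, for illustration only

inline bool needs_interning_sketch(const HeaderBitsSketch& h)
{
    return h.has_refs || h.wtype == wtype_multiply_sketch;
}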
diff --git a/src/realm/cluster.hpp b/src/realm/cluster.hpp
index 9b106f436ea..365ad3a8634 100644
--- a/src/realm/cluster.hpp
+++ b/src/realm/cluster.hpp
@@ -365,6 +365,8 @@ class Cluster : public ClusterNode {
     void do_insert_mixed(size_t ndx, ColKey col_key, Mixed init_value, ObjKey origin_key);
     template <class T>
     void set_spec(T&, ColKey::Idx) const;
+    template <class T>
+    void set_string_interner(T&, ColKey) const;
     template <class ArrayType>
     void verify(ref_type ref, size_t index, util::Optional& sz) const;
 };
diff --git a/src/realm/cluster_tree.cpp b/src/realm/cluster_tree.cpp
index 29d5f52ce84..3021f684911 100644
--- a/src/realm/cluster_tree.cpp
+++ b/src/realm/cluster_tree.cpp
@@ -1135,6 +1135,15 @@ void ClusterTree::update(UpdateFunction func)
     }
 }
 
+void ClusterTree::set_string_interner(ArrayPayload& arr, ColKey col_key) const
+{
+    // Check for owner. This function may be called in context of DictionaryClusterTree
+    // in which case m_owner is null (and spec never needed).
+    if (m_owner) {
+        arr.set_string_interner(_impl::TableFriend::get_string_interner(*m_owner, col_key));
+    }
+}
+
 void ClusterTree::set_spec(ArrayPayload& arr, ColKey::Idx col_ndx) const
 {
     // Check for owner. This function may be called in context of DictionaryClusterTree
diff --git a/src/realm/cluster_tree.hpp b/src/realm/cluster_tree.hpp
index 43d796c995e..15829f991bc 100644
--- a/src/realm/cluster_tree.hpp
+++ b/src/realm/cluster_tree.hpp
@@ -181,6 +181,7 @@ class ClusterTree {
     void update(UpdateFunction func);
 
     void set_spec(ArrayPayload& arr, ColKey::Idx col_ndx) const;
+    void set_string_interner(ArrayPayload& arr, ColKey col_key) const;
 
     virtual std::unique_ptr get_root_from_parent();
 
diff --git a/src/realm/db.hpp b/src/realm/db.hpp
index e46ba6742c3..7613a4c367b 100644
--- a/src/realm/db.hpp
+++ b/src/realm/db.hpp
@@ -510,6 +510,8 @@ class DB : public std::enable_shared_from_this {
     std::shared_ptr m_logger;
    std::mutex m_commit_listener_mutex;
     std::vector m_commit_listeners;
+    std::unordered_map*> m_string_interners;
+    std::mutex m_string_interners_mutex;
     bool m_is_sync_agent = false;
     // Id for this DB to be used in logging. We will just use some bits from the pointer.
     // The path cannot be used as this would not allow us to distinguish between two DBs opening
diff --git a/src/realm/group.cpp b/src/realm/group.cpp
index 90b7d690b26..70de9a71ae2 100644
--- a/src/realm/group.cpp
+++ b/src/realm/group.cpp
@@ -1368,7 +1368,7 @@ void Group::flush_accessors_for_commit()
         acc->flush_for_commit();
 }
 
-void Group::refresh_dirty_accessors()
+void Group::refresh_dirty_accessors(bool writable)
 {
     if (!m_tables.is_attached()) {
         m_table_accessors.clear();
@@ -1398,7 +1398,7 @@ void Group::refresh_dirty_accessors()
             same_table = true;
         }
         if (same_table) {
-            table_accessor->refresh_accessor_tree();
+            table_accessor->refresh_accessor_tree(writable);
         }
         else {
             table_accessor->detach(Table::cookie_removed);
@@ -1456,7 +1456,7 @@ void Group::advance_transact(ref_type new_top_ref, util::InputStream* in, bool w
     m_top.detach(); // Soft detach
     bool create_group_when_missing = false;                   // See Group::attach_shared().
     attach(new_top_ref, writable, create_group_when_missing); // Throws
-    refresh_dirty_accessors();                                // Throws
+    refresh_dirty_accessors(writable);                        // Throws
 
     if (schema_changed)
         send_schema_change_notification();
diff --git a/src/realm/group.hpp b/src/realm/group.hpp
index 08ddd9acd44..7204f26b258 100644
--- a/src/realm/group.hpp
+++ b/src/realm/group.hpp
@@ -681,7 +681,7 @@ class Group : public ArrayParent {
     /// Memory mappings must have been updated to reflect any growth in filesize before
     /// calling advance_transact()
     void advance_transact(ref_type new_top_ref, util::InputStream*, bool writable);
-    void refresh_dirty_accessors();
+    void refresh_dirty_accessors(bool writable);
     void flush_accessors_for_commit();
 
     /// \brief The version of the format of the node structure (in file or in
diff --git a/src/realm/group_writer.cpp b/src/realm/group_writer.cpp
index 22ce7db93ac..533565f39d2 100644
--- a/src/realm/group_writer.cpp
+++ b/src/realm/group_writer.cpp
@@ -647,6 +647,7 @@ ref_type GroupWriter::write_group()
 {
     ALLOC_DBG_COUT("Commit nr " << m_current_version << " ( from " << m_oldest_reachable_version << " )"
                                << std::endl);
+    // m_group.typed_print("");
     read_in_freelist();
 
     // Now, 'm_size_map' holds all free elements candidate for recycling
@@ -710,7 +711,7 @@ ref_type GroupWriter::write_group()
             top.set_as_ref(Group::s_evacuation_point_ndx, ref);
         }
         else if (ref) {
-            Array::destroy(ref, m_alloc);
+            Array::destroy(ref_type(ref), m_alloc);
             top.set(Group::s_evacuation_point_ndx, 0);
         }
     }
@@ -788,7 +789,9 @@ ref_type GroupWriter::write_group()
         top.set(Group::s_file_size_ndx, RefOrTagged::make_tagged(m_logical_size));
         auto ref = top.get_as_ref(Group::s_evacuation_point_ndx);
         REALM_ASSERT(ref);
-        Array::destroy(ref, m_alloc);
+        Array destroy_array(m_alloc);
+        destroy_array.init_from_ref(ref);
+        destroy_array.destroy();
         top.set(Group::s_evacuation_point_ndx, 0);
         m_evacuation_limit = 0;
diff --git a/src/realm/node.hpp b/src/realm/node.hpp
index 8a4b862a701..21ee61eddde 100644
--- a/src/realm/node.hpp
+++ b/src/realm/node.hpp
@@ -21,6 +21,7 @@
 #include
 #include
+#include
 
 #include
 
@@ -357,6 +358,11 @@ class ArrayPayload {
     virtual void init_from_ref(ref_type) noexcept = 0;
     virtual void set_parent(ArrayParent* parent, size_t ndx_in_parent) noexcept = 0;
     virtual Mixed get_any(size_t ndx) const = 0;
+    virtual bool need_string_interner() const
+    {
+        return false;
+    }
+    virtual void set_string_interner(StringInterner*) const {}
     virtual bool need_spec() const
     {
         return false;
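// [Editor's sketch - not part of the patch] Why refresh_dirty_accessors()
// gains a `writable` flag above: accessor refresh may now have to (re)build
// interner backing state, and creating the backing arrays is only legal
// inside a write transaction, so the flag is threaded from advance_transact()
// down to the table accessors. A hypothetical restatement of the contract:
#include <memory>

struct InternerStateSketch { /* top array, data leaves, hash map ... */ };

struct ColumnAccessorSketch {
    std::unique_ptr<InternerStateSketch> state;
    bool backing_exists = false; // does the file already contain the structure?

    void refresh(bool writable)
    {
        if (backing_exists)
            state = std::make_unique<InternerStateSketch>(); // attach to existing data
        else if (writable)
            state = std::make_unique<InternerStateSketch>(); // create it on demand
        else
            state.reset(); // "dead" mode: reads must tolerate a missing structure
    }
};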
diff --git a/src/realm/obj.cpp b/src/realm/obj.cpp
index eb8138dd8f5..fc34b755d57 100644
--- a/src/realm/obj.cpp
+++ b/src/realm/obj.cpp
@@ -613,7 +613,11 @@ StringData Obj::_get(ColKey::Idx col_ndx) const
         return values.get(m_row_ndx);
     }
     else {
-        return ArrayString::get(alloc.translate(ref), m_row_ndx, alloc);
+        ArrayString values(get_alloc());
+        auto col_key = m_table->leaf_ndx2colkey(col_ndx);
+        values.set_string_interner(m_table->get_string_interner(col_key));
+        values.init_from_ref(ref);
+        return values.get(m_row_ndx);
     }
 }
 
@@ -738,9 +742,12 @@ inline bool Obj::do_is_null(ColKey::Idx col_ndx) const
 template <>
 inline bool Obj::do_is_null(ColKey::Idx col_ndx) const
 {
+    REALM_ASSERT(false); // Don't come here, you're falling from a cliff....
     ArrayString values(get_alloc());
     ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1));
     values.set_spec(const_cast(&get_spec()), m_table->leaf_ndx2spec_ndx(col_ndx));
+    // TODO: Set string interner if needed
+    // values.set_string_interner(m_table->get_string_interner(col_key));
     values.init_from_ref(ref);
     return values.is_null(m_row_ndx);
 }
@@ -765,8 +772,16 @@ bool Obj::is_null(ColKey col_key) const
             return do_is_null(col_ndx);
         case col_type_Double:
             return do_is_null(col_ndx);
-        case col_type_String:
-            return do_is_null(col_ndx);
+        case col_type_String: {
+            ArrayString values(get_alloc());
+            ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1));
+            values.set_spec(const_cast(&get_spec()), m_table->leaf_ndx2spec_ndx(col_ndx));
+            // TODO: Set string interner if needed
+            values.set_string_interner(m_table->get_string_interner(col_key));
+            values.init_from_ref(ref);
+            return values.is_null(m_row_ndx);
+        }
+            // return do_is_null(col_ndx);
         case col_type_Binary:
             return do_is_null(col_ndx);
         case col_type_Mixed:
@@ -1588,6 +1603,17 @@ inline void check_range(const BinaryData& val)
 }
 } // namespace
 
+// helper functions for filtering out calls to set_string_interner()
+template <class T>
+inline void Obj::set_string_interner(T&, ColKey)
+{
+}
+template <>
+inline void Obj::set_string_interner(ArrayString& values, ColKey col_key)
+{
+    values.set_string_interner(m_table->get_string_interner(col_key));
+}
+
 // helper functions for filtering out calls to set_spec()
 template <class T>
 inline void Obj::set_spec(T&, ColKey)
@@ -1685,6 +1711,7 @@ Obj& Obj::set(ColKey col_key, T value, bool is_default)
     LeafType values(alloc);
     values.set_parent(&fields, col_ndx.val + 1);
     set_spec(values, col_key);
+    set_string_interner(values, col_key);
     values.init_from_parent();
     values.set(m_row_ndx, value);
 
@@ -2296,6 +2323,7 @@ inline void Obj::do_set_null(ColKey col_key)
     ArrayString values(alloc);
     values.set_parent(&fields, col_ndx.val + 1);
     values.set_spec(const_cast(&get_spec()), spec_ndx);
+    values.set_string_interner(m_table->get_string_interner(col_key));
     values.init_from_parent();
 
     values.set_null(m_row_ndx);
diff --git a/src/realm/obj.hpp b/src/realm/obj.hpp
index 67c82a0cada..8711e590dac 100644
--- a/src/realm/obj.hpp
+++ b/src/realm/obj.hpp
@@ -392,6 +392,8 @@ class Obj {
     void nullify_link(ColKey origin_col, ObjLink target_key) &&;
     template <class T>
     inline void set_spec(T&, ColKey);
+    template <class T>
+    inline void set_string_interner(T&, ColKey);
 
     template <class ValueType>
     inline void nullify_single_link(ColKey col, ValueType target);
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **************************************************************************/ + +#include +#include + +#include + +#include +namespace realm { + +StringCompressor::StringCompressor(Allocator& alloc, Array& parent, size_t index, bool writable) +{ + m_compression_map.resize(16); // start with a very small compression map + m_symbols.reserve(65536); + m_data = std::make_unique(alloc); + m_data->set_parent(&parent, index); + refresh(writable); +} + +void StringCompressor::refresh(bool writable) +{ + // we assume that compressors are only created from a valid parent. + // String interners in 'dead' mode should never instantiate a string compressor. + if (m_data->get_ref_from_parent() == 0) { + REALM_ASSERT(writable); + m_data->create(0, 65535); + m_data->update_parent(); + } + else { + if (m_data->is_attached()) + m_data->update_from_parent(); + else + m_data->init_from_ref(m_data->get_ref_from_parent()); + } + rebuild_internal(); +} + +static size_t symbol_pair_hash(CompressionSymbol a, CompressionSymbol b) +{ + // range of return value must match size of encoding table + uint32_t tmp = a + 3; + tmp *= b + 7; + return (tmp ^ (tmp >> 16)) & 0xFFFF; +} + +void StringCompressor::add_expansion(SymbolDef def) +{ + // compute expansion size: + size_t exp_size = 0; + if (def.expansion_a < 256) + exp_size = 1; + else + exp_size = m_symbols[def.expansion_a - 256].expansion.size(); + if (def.expansion_b < 256) + exp_size += 1; + else + exp_size += m_symbols[def.expansion_b - 256].expansion.size(); + // make sure there is room in active storage chunk: + if (m_expansion_storage.size() == 0 || m_expansion_storage.back().size() + exp_size + 1 >= storage_chunk_size) { + m_expansion_storage.push_back({}); + m_expansion_storage.back().reserve(storage_chunk_size); + } + // construct expansion at end of chunk: + auto& chunk = m_expansion_storage.back(); + auto start_index = (uint32_t)chunk.size(); + if (def.expansion_a < 256) + chunk.push_back((char)def.expansion_a); + else + chunk.append(m_symbols[def.expansion_a - 256].expansion); + if (def.expansion_b < 256) + chunk.push_back((char)def.expansion_b); + else + chunk.append(m_symbols[def.expansion_b - 256].expansion); + std::string_view expansion(chunk.data() + start_index, exp_size); + m_symbols.push_back({def, expansion, (uint32_t)m_expansion_storage.size() - 1, start_index}); +} + +void StringCompressor::expand_compression_map() +{ + size_t old_size = m_compression_map.size(); + REALM_ASSERT(old_size <= 16384); + size_t new_size = 4 * old_size; + std::vector map(new_size); + for (size_t i = 0; i < m_compression_map.size(); ++i) { + auto& entry = m_compression_map[i]; + if (entry.id == 0) + continue; + auto hash = symbol_pair_hash(entry.expansion_a, entry.expansion_b); + auto new_hash = hash & (new_size - 1); + REALM_ASSERT(map[new_hash].id == 0); + map[new_hash] = entry; + } + m_compression_map.swap(map); +} + +void StringCompressor::rebuild_internal() +{ + auto num_symbols = m_data->size(); + if 
+
+void StringCompressor::rebuild_internal()
+{
+    auto num_symbols = m_data->size();
+    if (num_symbols == m_symbols.size())
+        return;
+    if (num_symbols < m_symbols.size()) {
+        // fewer symbols (likely a rollback) -- remove last ones added
+        while (num_symbols < m_symbols.size()) {
+            auto& symbol = m_symbols.back();
+            auto hash = symbol_pair_hash(symbol.def.expansion_a, symbol.def.expansion_b);
+            hash &= m_compression_map.size() - 1;
+            REALM_ASSERT(m_compression_map[hash].id == symbol.def.id);
+            m_compression_map[hash] = {0, 0, 0};
+            if (symbol.storage_index < m_expansion_storage.size() - 1) {
+                m_expansion_storage.resize(symbol.storage_index + 1);
+            }
+            m_expansion_storage[symbol.storage_index].resize(symbol.storage_offset);
+            m_symbols.pop_back();
+        }
+        return;
+    }
+    // we have new symbols to add
+    for (size_t i = m_symbols.size(); i < num_symbols; ++i) {
+        auto pair = m_data->get(i);
+        SymbolDef def;
+        def.id = (CompressionSymbol)(i + 256);
+        def.expansion_a = 0xFFFF & (pair >> 16);
+        def.expansion_b = 0xFFFF & pair;
+        auto hash = symbol_pair_hash(def.expansion_a, def.expansion_b);
+        while (m_compression_map[hash & (m_compression_map.size() - 1)].id) {
+            expand_compression_map();
+        }
+        // REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0);
+        m_compression_map[hash & (m_compression_map.size() - 1)] = def;
+        add_expansion(def);
+    }
+}
+
+StringCompressor::~StringCompressor() {}
+
+CompressedString StringCompressor::compress(StringData sd, bool learn)
+{
+    CompressedString result(sd.size());
+    // expand string into array of symbols
+    const char* d = sd.data();
+    const size_t limit = sd.size();
+    if (limit == 0)
+        return {};
+    size_t i = 0;
+    while (i < limit) {
+        result[i++] = 0xFF & *d++;
+    }
+    // Iteratively compress the array of symbols. Each run compresses pairs into single symbols.
+    // 6 runs give a max compression of 64x - on average it will be much less :-)
+    constexpr int run_limit = 6;
+    CompressionSymbol* to;
+    for (int run = 0; run < run_limit; ++run) {
+        CompressionSymbol* from = to = result.data();
+        CompressionSymbol* limit = from + result.size() - 1;
+        while (from < limit) {
+            auto hash = symbol_pair_hash(from[0], from[1]);
+            hash &= m_compression_map.size() - 1;
+            auto& def = m_compression_map[hash];
+            if (def.id) {
+                // existing symbol
+                if (def.expansion_a == from[0] && def.expansion_b == from[1]) {
+                    // matching symbol
+                    *to++ = def.id;
+                    from += 2;
+                }
+                else if (m_compression_map.size() < 65536) {
+                    // Conflict: some other symbol is defined here - but we can expand the compression map
+                    // and hope to find room!
+                    expand_compression_map();
+                    // simply retry:
+                    continue;
+                }
+                else {
+                    // also a conflict: some other symbol is defined here, so we can't compress.
+                    *to++ = *from++;
+                    // In a normal hash table we'd have buckets and add a translation
+                    // to a bucket. This is slower generally, but yields better compression.
+                }
+            }
+            else {
+                // free entry we can use for a new symbol (and we're learning)
+                if (m_symbols.size() < (65536 - 256) && learn) {
+                    // define a new symbol for this entry and use it.
+                    REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0);
+                    REALM_ASSERT_DEBUG(m_symbols.size() == m_data->size());
+                    REALM_ASSERT_DEBUG(m_data->is_attached());
+                    CompressionSymbol id = (CompressionSymbol)(256 + m_symbols.size());
+                    SymbolDef def{id, from[0], from[1]};
+                    m_compression_map[hash] = def;
+                    add_expansion(def);
+                    m_data->add(((uint64_t)from[0]) << 16 | from[1]);
+                    // std::cerr << id << " = {" << from[0] << ", " << from[1] << "}" << std::endl;
+                    *to++ = id;
+                    from += 2;
+                }
+                else {
+                    // no more symbol space, so we can't compress
+                    *to++ = *from++;
+                }
+            }
+        }
+        if (from == limit) {
+            // copy over trailing symbol
+            *to++ = *from++;
+        }
+        REALM_ASSERT_DEBUG(to > result.data());
+        size_t sz = to - result.data();
+        REALM_ASSERT_DEBUG(sz <= sd.size());
+        result.resize(sz);
+        if (from == to) // no compression took place in last iteration
+            break;
+    }
+    return result;
+}
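// [Editor's note - worked example, not part of the patch] compress() above is
// a byte-pair-encoding variant: each pass replaces known adjacent symbol
// pairs with a single 16-bit symbol, and with `learn` set, unknown pairs can
// be added to the dictionary on the fly. A toy trace with learning disabled
// and a dictionary containing 256 = {'a','b'} and 257 = {256, 256}:
//
//   input:   'a' 'b' 'a' 'b' 'a' 'b'      (6 symbols)
//   pass 1:  256 256 256                  (every ('a','b') pair -> 256)
//   pass 2:  257 256                      ((256,256) -> 257; trailing 256 copied)
//   pass 3:  257 256                      (no pair matches; from == to, loop exits)
//
// The run_limit of 6 bounds the work per string and caps the best case at the
// 64x compression mentioned in the comment above.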
+
+std::string StringCompressor::decompress(CompressedStringView& c_str)
+{
+    CompressionSymbol* ptr = c_str.data;
+    CompressionSymbol* limit = ptr + c_str.size;
+    // compute size of decompressed string first to avoid allocations as the string grows
+    size_t result_size = 0;
+    while (ptr < limit) {
+        if (*ptr < 256)
+            result_size += 1;
+        else
+            result_size += m_symbols[*ptr - 256].expansion.size();
+        ++ptr;
+    }
+    std::string result2;
+    result2.reserve(result_size);
+    // generate result
+    ptr = c_str.data;
+    while (ptr < limit) {
+        if (*ptr < 256)
+            result2.push_back((char)*ptr);
+        else
+            result2.append(m_symbols[*ptr - 256].expansion);
+        ptr++;
+    }
+#ifdef REALM_DEBUG
+    std::string result;
+    {
+        auto decompress = [&](CompressionSymbol symbol, auto& decompress) -> void {
+            if (symbol < 256) {
+                result.push_back((char)symbol);
+            }
+            else {
+                auto& s = m_symbols[symbol - 256];
+                decompress(s.def.expansion_a, decompress);
+                decompress(s.def.expansion_b, decompress);
+            }
+        };
+
+        CompressionSymbol* ptr = c_str.data;
+        CompressionSymbol* limit = ptr + c_str.size;
+        while (ptr < limit) {
+            decompress(*ptr, decompress);
+            ++ptr;
+        }
+    }
+    REALM_ASSERT_DEBUG(result == result2);
+#endif
+    return result2;
+}
+
+int StringCompressor::compare(CompressedStringView& A, CompressedStringView& B)
+{
+    auto A_ptr = A.data;
+    auto A_limit = A_ptr + A.size;
+    auto B_ptr = B.data;
+    auto B_limit = B_ptr + B.size;
+    while (A_ptr < A_limit && B_ptr < B_limit) {
+        auto code_A = *A_ptr++;
+        auto code_B = *B_ptr++;
+        if (code_A == code_B)
+            continue;
+        // symbols did not match:
+        // 1. both symbols are single characters
+        if (code_A < 256 && code_B < 256)
+            return code_B - code_A;
+        std::string a_str(1, (char)code_A);
+        auto str_A = std::string_view(code_A < 256 ? a_str : m_symbols[code_A - 256].expansion);
+        std::string b_str(1, (char)code_B);
+        auto str_B = std::string_view(code_B < 256 ? b_str : m_symbols[code_B - 256].expansion);
+        // to ensure comparison as StringData we need to convert the string views
+        StringData sd_a(str_A.data(), str_A.size());
+        StringData sd_b(str_B.data(), str_B.size());
+        REALM_ASSERT_DEBUG(sd_a != sd_b);
+        if (sd_a < sd_b)
+            return 1;
+        else
+            return -1;
+    }
+    // The compressed strings are identical or one is a prefix of the other
+    return B.size - A.size;
+    // ^ a faster way of producing the same positive / negative / zero as:
+    // if (A.size() < B.size())
+    //     return 1;
+    // if (A.size() > B.size())
+    //     return -1;
+    // return 0;
+}
+
+int StringCompressor::compare(StringData sd, CompressedStringView& B)
+{
+    auto B_size = B.size;
+    // make sure comparisons are unsigned, even though StringData does not specify signedness
+    const unsigned char* A_ptr = reinterpret_cast<const unsigned char*>(sd.data());
+    auto A_limit = A_ptr + sd.size();
+    for (size_t i = 0; i < B_size; ++i) {
+        if (A_ptr == A_limit) {
+            // sd ended first, so B is bigger
+            return -1;
+        }
+        auto code = B.data[i];
+        if (code < 256) {
+            if (code < *A_ptr)
+                return 1;
+            if (code > *A_ptr)
+                return -1;
+            ++A_ptr;
+            continue;
+        }
+        auto& expansion = m_symbols[code - 256];
+        for (size_t disp = 0; disp < expansion.expansion.size(); ++disp) {
+            uint8_t c = expansion.expansion[disp];
+            if (c < *A_ptr)
+                return 1;
+            if (c > *A_ptr)
+                return -1;
+            ++A_ptr;
+        }
+    }
+    // if sd is longer than B, sd is the biggest string
+    if (A_ptr < A_limit)
+        return 1;
+    return 0;
+}
+
+
+} // namespace realm
diff --git a/src/realm/string_compressor.hpp b/src/realm/string_compressor.hpp
new file mode 100644
index 00000000000..2c866ecb781
--- /dev/null
+++ b/src/realm/string_compressor.hpp
@@ -0,0 +1,100 @@
+/*************************************************************************
+ *
+ * Copyright 2016 Realm Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **************************************************************************/
+
+#ifndef REALM_STRING_COMPRESSOR_HPP
+#define REALM_STRING_COMPRESSOR_HPP
+
+#include
+#include
+
+using CompressionSymbol = uint16_t;
+using CompressedString = std::vector<CompressionSymbol>;
+struct CompressedStringView {
+    CompressionSymbol* data = 0;
+    uint32_t size = 0;
+    CompressedStringView() = default;
+    CompressedStringView(CompressionSymbol* c_ptr, size_t s)
+        : data(c_ptr)
+        , size(uint32_t(s))
+    {
+    }
+    explicit CompressedStringView(CompressedString& cs)
+        : data(cs.data())
+        , size(uint32_t(cs.size()))
+    {
+    }
+    bool operator==(CompressedStringView& other)
+    {
+        if (size != other.size)
+            return false;
+        for (size_t i = 0; i < size; ++i) {
+            if (data[i] != other.data[i])
+                return false;
+        }
+        return true;
+    }
+};
+
+namespace realm {
+
+class ArrayUnsigned;
+class Array;
+class Allocator;
+
+ * + **************************************************************************/ + +#ifndef REALM_STRING_COMPRESSOR_HPP +#define REALM_STRING_COMPRESSOR_HPP + +#include +#include + +using CompressionSymbol = uint16_t; +using CompressedString = std::vector; +struct CompressedStringView { + CompressionSymbol* data = 0; + uint32_t size = 0; + CompressedStringView() = default; + CompressedStringView(CompressionSymbol* c_ptr, size_t s) + : data(c_ptr) + , size(uint32_t(s)) + { + } + explicit CompressedStringView(CompressedString& cs) + : data(cs.data()) + , size(uint32_t(cs.size())) + { + } + bool operator==(CompressedStringView& other) + { + if (size != other.size) + return false; + for (size_t i = 0; i < size; ++i) { + if (data[i] != other.data[i]) + return false; + } + return true; + } +}; + +namespace realm { + +class ArrayUnsigned; +class Array; +class Allocator; + +class StringCompressor { +public: + StringCompressor(Allocator& alloc, Array& parent, size_t index, bool writable); + void refresh(bool writable); + ~StringCompressor(); + + int compare(CompressedStringView& A, CompressedStringView& B); + int compare(StringData sd, CompressedStringView& B); + + CompressedString compress(StringData, bool learn); + std::string decompress(CompressedStringView& c_str); + +private: + struct SymbolDef { + CompressionSymbol id = 0; + CompressionSymbol expansion_a = 0; + CompressionSymbol expansion_b = 0; + }; + + struct ExpandedSymbolDef { + SymbolDef def; + std::string_view expansion; + // ^ points into storage managed by m_expansion_storage + // we need the following 2 values to facilitate rollback of allocated storage + uint32_t storage_index; // index into m_expansion_storage + uint32_t storage_offset; // offset into block. + }; + + void rebuild_internal(); + void expand_compression_map(); + void add_expansion(SymbolDef def); + std::vector m_symbols; // map from symbol -> symbolpair, 2 elements pr entry + std::vector m_compression_map; // perfect hash from symbolpair to its symbol + + std::unique_ptr m_data; + constexpr static size_t storage_chunk_size = 4096; + std::vector m_expansion_storage; +}; + +} // namespace realm + +#endif diff --git a/src/realm/string_interner.cpp b/src/realm/string_interner.cpp new file mode 100644 index 00000000000..fb801b1fd6a --- /dev/null +++ b/src/realm/string_interner.cpp @@ -0,0 +1,681 @@ +/************************************************************************* + * + * Copyright 2016 Realm Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **************************************************************************/ + +#include +#include + +#include +#include + +namespace realm { + +// Fast mapping of strings (or rather hash of strings) to string IDs. +// +// We use a tree where: +// * All interior nodes are radix nodes with a fan-out of 256. +// * Leaf nodes with up to 16 entries are just lists, searched linearly +// * Leaf nodes with more than 16 entries and less than 1K are hash tables. 
diff --git a/src/realm/string_interner.cpp b/src/realm/string_interner.cpp
new file mode 100644
index 00000000000..fb801b1fd6a
--- /dev/null
+++ b/src/realm/string_interner.cpp
@@ -0,0 +1,681 @@
+/*************************************************************************
+ *
+ * Copyright 2016 Realm Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **************************************************************************/
+
+#include
+#include
+
+#include
+#include
+
+namespace realm {
+
+// Fast mapping of strings (or rather hash of strings) to string IDs.
+//
+// We use a tree where:
+// * All interior nodes are radix nodes with a fan-out of 256.
+// * Leaf nodes with up to 16 entries are just lists, searched linearly.
+// * Leaf nodes with more than 16 entries and less than 1K are hash tables.
+//   Hash tables use linear search starting from the entry found by hashing.
+//
+constexpr static size_t linear_search_limit = 16;
+constexpr static size_t hash_node_min_size = 32;
+constexpr static size_t hash_node_max_size = 1024;
+constexpr static size_t radix_node_consumes_bits = 8;
+constexpr static size_t radix_node_size = 1ULL << radix_node_consumes_bits;
+
+// helpers
+struct HashMapIter {
+    Array& m_array;
+    uint32_t hash_filter;
+    uint16_t index;
+    uint16_t left_to_search;
+    uint8_t hash_size;
+    HashMapIter(Array& array, uint32_t hash, uint8_t hash_size)
+        : m_array(array)
+        , hash_filter(hash)
+        , hash_size(hash_size)
+    {
+        set_index(0);
+    }
+    HashMapIter(Array& dummy)
+        : m_array(dummy)
+    {
+        left_to_search = 0;
+    }
+    inline uint32_t get()
+    {
+        return (uint32_t)(m_array.get(index) >> hash_size);
+    }
+    inline bool empty()
+    {
+        auto element = m_array.get(index);
+        return (element >> hash_size) == 0;
+    }
+    inline void set(uint64_t element)
+    {
+        m_array.set(index, element);
+    }
+    inline bool matches()
+    {
+        auto mask = 0xFFFFFFFFUL >> (32 - hash_size);
+        auto element = m_array.get(index);
+        return ((element & mask) == hash_filter) && (element >> hash_size);
+    }
+    inline bool is_valid()
+    {
+        return left_to_search != 0;
+    }
+    inline void set_index(size_t i, size_t search_limit = linear_search_limit)
+    {
+        index = (uint16_t)i;
+        left_to_search = (uint16_t)std::min(m_array.size(), (size_t)search_limit);
+    }
+    void operator++()
+    {
+        if (is_valid()) {
+            left_to_search--;
+            index++;
+            if (index == m_array.size()) {
+                index = 0;
+            }
+        }
+    }
+};
+
+// Attempt to build a hash leaf from a smaller hash leaf or a non-hash leaf.
+static bool rehash(Array& from, Array& to, uint8_t hash_size)
+{
+    REALM_ASSERT_DEBUG(from.size() * 2 == to.size());
+
+    for (size_t i = 0; i < from.size(); ++i) {
+        auto entry = (size_t)from.get(i);
+        if ((entry >> hash_size) == 0)
+            continue;
+        size_t starting_index = entry & (to.size() - 1);
+        HashMapIter it(to, 0, hash_size);
+        it.set_index(starting_index);
+        while (it.is_valid() && !it.empty()) {
+            ++it;
+        }
+        if (!it.is_valid()) {
+            // abort rehashing, we need a larger to-space
+            return false;
+        }
+        REALM_ASSERT(it.empty());
+        it.set(entry);
+    }
+    return true;
+}
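// [Editor's sketch - not part of the patch] How one leaf entry packs a
// binding, per the code above: the low `hash_size` bits hold (what is left
// of) the string's hash, the high bits hold the string ID, and ID 0 marks an
// empty slot. Each radix level consumes 8 bits of hash before recursing.
#include <cstdint>

inline uint64_t pack_entry(uint64_t id, uint32_t hash, uint8_t hash_size)
{
    return (id << hash_size) | hash;
}
inline uint64_t entry_id(uint64_t entry, uint8_t hash_size)
{
    return entry >> hash_size; // 0 means "empty slot"
}
inline uint32_t entry_hash(uint64_t entry, uint8_t hash_size)
{
    return uint32_t(entry & (0xFFFFFFFFull >> (32 - hash_size)));
}
inline uint32_t hash_for_subtree(uint32_t hash)
{
    return hash >> 8; // radix_node_consumes_bits
}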
+
+// Add a binding from hash value to id.
+static void add_to_hash_map(Array& node, uint64_t hash, uint64_t id, uint8_t hash_size)
+{
+    REALM_ASSERT(node.is_attached());
+    if (!node.has_refs()) {
+        // it's a leaf.
+        if (node.size() < linear_search_limit) {
+            // it's a list with room to grow
+            node.add(((uint64_t)id << hash_size) | hash);
+            return;
+        }
+        if (node.size() == linear_search_limit) {
+            // it's a full list, must be converted to a hash table
+            Array new_node(node.get_alloc());
+            new_node.create(NodeHeader::type_Normal, false, hash_node_min_size, 0);
+            new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
+            new_node.update_parent();
+            // transform existing list into hash table
+            rehash(node, new_node, hash_size);
+            node.destroy();
+            node.init_from_parent();
+        }
+        // it's a hash table. Grow if needed, up to 'hash_node_max_size' entries.
+        while (node.size() < hash_node_max_size) {
+            auto size = node.size();
+            size_t start_index = hash & (size - 1);
+            HashMapIter it(node, 0, hash_size);
+            it.set_index(start_index);
+            while (it.is_valid() && !it.empty()) {
+                ++it;
+            }
+            if (it.is_valid()) {
+                // found an empty spot within search range
+                it.set(((uint64_t)id << hash_size) | hash);
+                return;
+            }
+            if (node.size() >= hash_node_max_size)
+                break;
+            // No free spot found - rehash into bigger and bigger tables
+            auto new_size = node.size();
+            bool need_to_rehash = true;
+            Array new_node(node.get_alloc());
+            while (need_to_rehash && new_size < hash_node_max_size) {
+                new_size *= 2;
+                new_node.create(NodeHeader::type_Normal, false, new_size, 0);
+                need_to_rehash = !rehash(node, new_node, hash_size);
+                if (need_to_rehash) { // we failed, try again - or shift to radix
+                    // It may seem counter-intuitive, but it CAN happen.
+                    new_node.destroy();
+                }
+            }
+            if (need_to_rehash)
+                break;
+            new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
+            new_node.update_parent();
+            node.destroy();
+            node.init_from_parent();
+        }
+        // we ran out of space. Rewrite as a radix node with subtrees
+        Array new_node(node.get_alloc());
+        new_node.create(NodeHeader::type_HasRefs, false, radix_node_size, 0);
+        new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
+        new_node.update_parent();
+        for (size_t index = 0; index < node.size(); ++index) {
+            auto element = node.get(index);
+            auto hash = element & (0xFFFFFFFF >> (32 - hash_size));
+            auto string_id = element >> hash_size;
+            if (string_id == 0)
+                continue;
+            auto remaining_hash = hash >> radix_node_consumes_bits;
+            add_to_hash_map(new_node, remaining_hash, string_id, hash_size - 8);
+        }
+        node.destroy();
+        node.init_from_parent();
+    }
+    // We have a radix node and need to insert the new binding into the proper subtree
+    size_t index = hash & (radix_node_size - 1);
+    auto rot = node.get_as_ref_or_tagged(index);
+    REALM_ASSERT(!rot.is_tagged());
+    Array subtree(node.get_alloc());
+    if (rot.get_as_ref() == 0) {
+        // no subtree present, create an empty one
+        subtree.set_parent(&node, index);
+        subtree.create(NodeHeader::type_Normal);
+        subtree.update_parent();
+    }
+    else {
+        // subtree already present
+        subtree.set_parent(&node, index);
+        subtree.init_from_parent();
+    }
+    // recurse into subtree
+    add_to_hash_map(subtree, hash >> radix_node_consumes_bits, id, hash_size - radix_node_consumes_bits);
+}
+
+static std::vector hash_to_id(Array& node, uint32_t hash, uint8_t hash_size)
+{
+    std::vector result;
+    REALM_ASSERT(node.is_attached());
+    if (!node.has_refs()) {
+        // it's a leaf - default is a list, search starts from index 0.
+        HashMapIter it(node, hash, hash_size);
+        if (node.size() > hash_node_min_size) {
+            // it is a hash table, so use the hash to select the index to start searching
+            // (table size must be a power of two!)
+            size_t index = hash & (node.size() - 1);
+            it.set_index(index);
+        }
+        // collect all matching values within allowed range
+        while (it.is_valid()) {
+            if (it.matches()) {
+                result.push_back(it.get());
+            }
+            ++it;
+        }
+        return result;
+    }
+    else {
+        // it's a radix node
+        size_t index = hash & (node.size() - 1);
+        auto rot = node.get_as_ref_or_tagged(index);
+        REALM_ASSERT(rot.is_ref());
+        if (rot.get_as_ref() == 0) {
+            // no subtree, return empty vector
+            return result;
+        }
+        // descend into subtree
+        Array subtree(node.get_alloc());
+        subtree.set_parent(&node, index);
+        subtree.init_from_parent();
+        return hash_to_id(subtree, hash >> radix_node_consumes_bits, hash_size - radix_node_consumes_bits);
+    }
+}
+
+
+enum positions { Pos_Version, Pos_ColKey, Pos_Size, Pos_Compressor, Pos_Data, Pos_Map, Top_Size };
+
+struct StringInterner::DataLeaf {
+    std::vector<CompressedStringView> m_compressed;
+    ref_type m_leaf_ref = 0;
+    bool m_is_loaded = false;
+    DataLeaf() {}
+    DataLeaf(ref_type ref)
+        : m_leaf_ref(ref)
+    {
+    }
+};
+
+StringInterner::StringInterner(Allocator& alloc, Array& parent, ColKey col_key, bool writable)
+    : m_parent(parent)
+{
+    REALM_ASSERT_DEBUG(col_key != ColKey());
+    size_t index = col_key.get_index().val;
+    // ensure that m_top and m_data are well defined and reflect any existing data
+    // We'll have to extend this to handle no defined backing
+    m_top = std::make_unique(alloc);
+    m_top->set_parent(&parent, index);
+    m_data = std::make_unique(alloc);
+    m_data->set_parent(m_top.get(), Pos_Data);
+    m_hash_map = std::make_unique(alloc);
+    m_hash_map->set_parent(m_top.get(), Pos_Map);
+    m_current_string_leaf = std::make_unique(alloc);
+    m_col_key = col_key;
+    update_from_parent(writable);
+}
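// [Editor's sketch - not part of the patch] The interner's top array layout
// (see `enum positions` above) uses Realm's ref-or-tagged convention when
// update_from_parent() below fills the scalar slots: a slot stores
// (value << 1) | 1, so the low bit distinguishes tagged integers from refs,
// which are always even. Hence `(1 << 1) + 1` for version 1, and the
// adjust-by-2 in intern() when the size grows by one.
#include <cstdint>

inline int64_t make_tagged(int64_t value)
{
    return (value << 1) | 1;
}
inline int64_t tagged_value(int64_t slot)
{
    return slot >> 1;
}
inline bool is_ref(int64_t slot)
{
    return (slot & 1) == 0;
}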
+
+void StringInterner::update_from_parent(bool writable)
+{
+    auto parent_idx = m_top->get_ndx_in_parent();
+    bool valid_top_ref_spot = m_parent.is_attached() && parent_idx < m_parent.size();
+    bool valid_top = valid_top_ref_spot && m_parent.get_as_ref(parent_idx);
+    if (valid_top) {
+        m_top->update_from_parent();
+        m_data->update_from_parent();
+        m_hash_map->update_from_parent();
+    }
+    else if (writable && valid_top_ref_spot) {
+        m_top->create(NodeHeader::type_HasRefs, false, Top_Size, 0);
+        m_top->set(Pos_Version, (1 << 1) + 1); // version number 1.
+        m_top->set(Pos_Size, (0 << 1) + 1);    // total size 0
+        m_top->set(Pos_ColKey, (m_col_key.value << 1) + 1);
+        m_top->set(Pos_Compressor, 0);
+        // create first level of data tree here (to simplify other stuff)
+        m_data = std::make_unique(m_parent.get_alloc());
+        m_data->set_parent(m_top.get(), Pos_Data);
+        m_data->create(NodeHeader::type_HasRefs, false, 0);
+        m_data->update_parent();
+        m_hash_map = std::make_unique(m_parent.get_alloc());
+        m_hash_map->set_parent(m_top.get(), Pos_Map);
+        m_hash_map->create(NodeHeader::type_Normal);
+        m_hash_map->update_parent();
+        m_top->update_parent();
+        valid_top = true;
+    }
+    if (!valid_top) {
+        // We're lacking part of underlying data and not allowed to create it, so enter "dead" mode
+        m_compressor.reset();
+        m_compressed_leafs.clear();
+        // m_compressed_string_map.clear();
+        m_top->detach(); // <-- indicates "dead" mode
+        m_data->detach();
+        m_hash_map->detach();
+        m_compressor.reset();
+        return;
+    }
+    // validate we're accessing data for the correct column. A combination of column erase
+    // and insert could lead to an interner being paired with wrong data in the file.
+    // If so, we clear internal data, forcing rebuild_internal() to rebuild from scratch.
+    int64_t data_colkey = m_top->get_as_ref_or_tagged(Pos_ColKey).get_as_int();
+    if (m_col_key.value != data_colkey) {
+        // new column, new data
+        m_compressor.reset();
+        m_decompressed_strings.clear();
+    }
+    if (!m_compressor)
+        m_compressor = std::make_unique(m_top->get_alloc(), *m_top, Pos_Compressor, writable);
+    else
+        m_compressor->refresh(writable);
+    if (m_data->size()) {
+        auto ref_to_write_buffer = m_data->get_as_ref(m_data->size() - 1);
+        const char* header = m_top->get_alloc().translate(ref_to_write_buffer);
+        bool is_array_of_cprs = NodeHeader::get_hasrefs_from_header(header);
+        if (is_array_of_cprs) {
+            m_current_long_string_node = std::make_unique(m_top->get_alloc());
+            m_current_long_string_node->set_parent(m_data.get(), m_data->size() - 1);
+            m_current_long_string_node->update_from_parent();
+        }
+        else {
+            m_current_long_string_node.reset();
+        }
+    }
+    else
+        m_current_long_string_node.reset(); // just in case...
+
+    // rebuild internal structures......
+    rebuild_internal();
+    m_current_string_leaf->detach();
+}
+
+void StringInterner::rebuild_internal()
+{
+    std::lock_guard lock(m_mutex);
+    // release old decompressed strings
+    for (size_t idx = 0; idx < m_in_memory_strings.size(); ++idx) {
+        StringID id = m_in_memory_strings[idx];
+        if (id > m_decompressed_strings.size()) {
+            m_in_memory_strings[idx] = m_in_memory_strings.back();
+            m_in_memory_strings.pop_back();
+            continue;
+        }
+        if (auto& w = m_decompressed_strings[id - 1].m_weight) {
+            w >>= 1;
+        }
+        else {
+            m_decompressed_strings[id - 1].m_decompressed.reset();
+            m_in_memory_strings[idx] = m_in_memory_strings.back();
+            m_in_memory_strings.pop_back();
+            continue;
+        }
+    }
+
+    size_t target_size = (size_t)m_top->get_as_ref_or_tagged(Pos_Size).get_as_int();
+    m_decompressed_strings.resize(target_size);
+    if (m_data->size() != m_compressed_leafs.size()) {
+        m_compressed_leafs.resize(m_data->size());
+    }
+    // always force new setup of all leaves:
+    // update m_compressed_leafs to reflect m_data
+    for (size_t idx = 0; idx < m_compressed_leafs.size(); ++idx) {
+        auto ref = m_data->get_as_ref(idx);
+        auto& leaf_meta = m_compressed_leafs[idx];
+        // if (ref != leaf_meta.m_leaf_ref) {
+        leaf_meta.m_is_loaded = false;
+        leaf_meta.m_compressed.clear();
+        leaf_meta.m_leaf_ref = ref;
+        //}
+    }
+}
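// [Editor's sketch - not part of the patch] The cache-eviction policy in
// rebuild_internal() above: every in-memory decompressed string carries a
// weight (intern() starts it at 64); each rebuild halves it, and a string
// whose weight has reached zero is dropped. A lookup (not shown in this
// excerpt) bumps the weight back up, so frequently used strings stay resident.
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

struct CachedStringSketch {
    uint32_t weight = 0;
    std::unique_ptr<std::string> decompressed;
};

inline void age_cache_sketch(std::vector<CachedStringSketch>& cache)
{
    for (auto& entry : cache) {
        if (entry.weight)
            entry.weight >>= 1;         // exponential decay per rebuild
        else
            entry.decompressed.reset(); // cold: drop the bytes, keep the slot
    }
}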
+            m_current_long_string_node = std::make_unique<Array>(m_top->get_alloc());
+            m_current_long_string_node->set_parent(m_data.get(), m_data->size() - 1);
+            m_current_long_string_node->create(NodeHeader::type_HasRefs);
+            m_current_long_string_node->update_parent();
+            REALM_ASSERT_DEBUG(!m_current_string_leaf->is_attached() || m_current_string_leaf->size() == 0);
+            m_current_string_leaf->detach();
+        }
+        else {
+            // we have been building an existing leaf and need to shift representation,
+            // but first we need to update the accessor for the existing leaf
+            if (m_current_string_leaf->is_attached()) {
+                m_current_string_leaf->update_from_parent();
+            }
+            else {
+                m_current_string_leaf->init_from_ref(m_current_string_leaf->get_ref_from_parent());
+            }
+            REALM_ASSERT_DEBUG(m_current_string_leaf->size() > 0);
+            m_current_long_string_node = std::make_unique<Array>(m_top->get_alloc());
+            m_current_long_string_node->set_parent(m_data.get(), m_data->size() - 1);
+            m_current_long_string_node->create(NodeHeader::type_HasRefs);
+            m_current_long_string_node->update_parent();
+            // convert the current leaf into a long string node (each compressed string
+            // placed in its own separate array)
+            for (auto s : m_compressed_leafs.back().m_compressed) {
+                ArrayUnsigned arr(m_top->get_alloc());
+                arr.create(s.size, 65535);
+                unsigned short* dest = reinterpret_cast<unsigned short*>(arr.m_data);
+                std::copy_n(s.data, s.size, dest);
+                m_current_long_string_node->add(arr.get_ref());
+            }
+            m_current_string_leaf->destroy();
+            m_current_string_leaf->detach();
+            // force later reload of leaf
+            m_compressed_leafs.back().m_is_loaded = false;
+        }
+    }
+    if (m_current_long_string_node) {
+        ArrayUnsigned arr(m_top->get_alloc());
+        arr.create(c_str.size(), 65535);
+        unsigned short* begin = c_str.data();
+        if (begin) {
+            // if the compressed string is empty, 'begin' is zero and we don't copy
+            size_t n = c_str.size();
+            unsigned short* dest = reinterpret_cast<unsigned short*>(arr.m_data);
+            std::copy_n(begin, n, dest);
+        }
+        m_current_long_string_node->add(arr.get_ref());
+        m_current_long_string_node->update_parent();
+        if (m_current_long_string_node->size() == 256) {
+            // exit from "long string mode"
+            m_current_long_string_node.reset();
+        }
+        CompressionSymbol* p_start = reinterpret_cast<CompressionSymbol*>(arr.m_data);
+        m_compressed_leafs.back().m_compressed.push_back({p_start, arr.size()});
+    }
+    else {
+        // Append to a leaf with up to 256 entries.
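+        // Layout aside: within such a leaf the compressed strings sit back to back as
+        // 16-bit symbols, each string prefixed by its length (assuming CompressionSymbol
+        // is a 16-bit integer, as the width-16 asserts elsewhere in this file suggest):
+        //
+        //     [len0][sym..sym][len1][sym..sym]...    // at most 256 strings per leaf
+        //
+        // so load_leaf_if_needed() can scan it by alternately reading a length and
+        // skipping that many symbols.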
+        // First create a new leaf if needed (limit the number of entries to 256 per leaf)
+        bool need_leaf_update = !m_current_string_leaf->is_attached() || (index & 0xFF) == 0;
+        if (need_leaf_update) {
+            m_current_string_leaf->set_parent(m_data.get(), index >> 8);
+            if ((index & 0xFF) == 0) {
+                // create new leaf
+                m_current_string_leaf->create(0, 65535);
+                m_data->add(m_current_string_leaf->get_ref());
+                m_compressed_leafs.push_back({});
+            }
+            else {
+                // just set up the leaf accessor
+                if (m_current_string_leaf->is_attached()) {
+                    m_current_string_leaf->update_from_parent();
+                }
+                else {
+                    m_current_string_leaf->init_from_ref(m_current_string_leaf->get_ref_from_parent());
+                }
+            }
+        }
+        REALM_ASSERT(c_str.size() < 65536); // anything larger goes in a long string node
+        // Add compressed string at end of leaf
+        m_current_string_leaf->add(c_str.size());
+        for (auto c : c_str) {
+            m_current_string_leaf->add(c);
+        }
+        REALM_ASSERT_DEBUG(m_compressed_leafs.size());
+        CompressionSymbol* p = reinterpret_cast<CompressionSymbol*>(m_current_string_leaf->m_data);
+        auto p_limit = p + m_current_string_leaf->size();
+        auto p_start = p_limit - c_str.size();
+        m_compressed_leafs.back().m_compressed.push_back({p_start, c_str.size()});
+        REALM_ASSERT(m_compressed_leafs.back().m_compressed.size() <= 256);
+    }
+    m_top->adjust(Pos_Size, 2); // Pos_Size is a tagged integer, so adding 2 increments the logical size by 1
+    load_leaf_if_new_ref(m_compressed_leafs.back(), m_data->get_as_ref(m_data->size() - 1));
+#ifdef REALM_DEBUG
+    auto csv = get_compressed(id);
+    CompressedStringView csv2(c_str);
+    REALM_ASSERT(csv == csv2);
+#endif
+    return id;
+}
+
+bool StringInterner::load_leaf_if_needed(DataLeaf& leaf)
+{
+    if (!leaf.m_is_loaded) {
+        // start with an empty leaf:
+        leaf.m_compressed.clear();
+        leaf.m_compressed.reserve(256);
+
+        // must interpret the leaf first - it is either a single array holding all strings,
+        // or an array with each (compressed) string placed in its own array.
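+        // The two representations are told apart by the has_refs bit in the node header:
+        // clear for a single packed 16-bit array, set for an array of refs with one
+        // subarray per compressed string (used once a string reaches 64k symbols).
+        // Sketch of the discrimination performed below:
+        //
+        //     bool refs = NodeHeader::get_hasrefs_from_header(alloc.translate(leaf_ref));
+        //     // refs == false -> [len][syms...][len][syms...] in one array
+        //     // refs == true  -> one ref per string, each to its own 16-bit array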
+        const char* header = m_top->get_alloc().translate(leaf.m_leaf_ref);
+        bool is_single_array = !NodeHeader::get_hasrefs_from_header(header);
+        if (is_single_array) {
+            size_t leaf_offset = 0;
+            ArrayUnsigned leaf_array(m_top->get_alloc());
+            leaf_array.init_from_ref(leaf.m_leaf_ref);
+            REALM_ASSERT(NodeHeader::get_encoding(leaf_array.get_header()) == NodeHeader::Encoding::WTypBits);
+            REALM_ASSERT(NodeHeader::get_width_from_header(leaf_array.get_header()) == 16);
+            // This is dangerous if the leaf is for some reason not in the assumed format
+            CompressionSymbol* c = reinterpret_cast<CompressionSymbol*>(leaf_array.m_data);
+            auto leaf_size = leaf_array.size();
+            while (leaf_offset < leaf_size) {
+                size_t length = c[leaf_offset];
+                REALM_ASSERT_DEBUG(length == leaf_array.get(leaf_offset));
+                leaf_offset++;
+                leaf.m_compressed.push_back({c + leaf_offset, length});
+                REALM_ASSERT_DEBUG(leaf.m_compressed.size() <= 256);
+                leaf_offset += length;
+            }
+        }
+        else {
+            // Not a single leaf - instead an array of strings
+            Array arr(m_top->get_alloc());
+            arr.init_from_ref(leaf.m_leaf_ref);
+            for (size_t idx = 0; idx < arr.size(); ++idx) {
+                ArrayUnsigned str_array(m_top->get_alloc());
+                ref_type ref = arr.get_as_ref(idx);
+                str_array.init_from_ref(ref);
+                REALM_ASSERT(NodeHeader::get_encoding(str_array.get_header()) == NodeHeader::Encoding::WTypBits);
+                REALM_ASSERT(NodeHeader::get_width_from_header(str_array.get_header()) == 16);
+                CompressionSymbol* c = reinterpret_cast<CompressionSymbol*>(str_array.m_data);
+                leaf.m_compressed.push_back({c, str_array.size()});
+            }
+        }
+        leaf.m_is_loaded = true;
+        return true;
+    }
+    return false;
+}
+
+// Danger: Only to be used if you know that a change in content ==> different ref
+bool StringInterner::load_leaf_if_new_ref(DataLeaf& leaf, ref_type new_ref)
+{
+    if (leaf.m_leaf_ref != new_ref) {
+        leaf.m_leaf_ref = new_ref;
+        leaf.m_is_loaded = false;
+        leaf.m_compressed.resize(0);
+    }
+    return load_leaf_if_needed(leaf);
+}
+
+CompressedStringView& StringInterner::get_compressed(StringID id)
+{
+    auto index = id - 1; // 0 represents null
+    auto hi = index >> 8;
+    auto lo = index & 0xFFUL;
+    DataLeaf& leaf = m_compressed_leafs[hi];
+    load_leaf_if_needed(leaf);
+    REALM_ASSERT_DEBUG(lo < leaf.m_compressed.size());
+    return leaf.m_compressed[lo];
+}
+
+std::optional<StringID> StringInterner::lookup(StringData sd)
+{
+    if (!m_top->is_attached()) {
+        // "dead" mode
+        return {};
+    }
+    std::lock_guard lock(m_mutex);
+    if (sd.data() == nullptr)
+        return 0;
+    uint32_t h = (uint32_t)sd.hash();
+    auto candidates = hash_to_id(*m_hash_map.get(), h, 32);
+    for (auto& candidate : candidates) {
+        auto candidate_cpr = get_compressed(candidate);
+        if (m_compressor->compare(sd, candidate_cpr) == 0)
+            return candidate;
+    }
+    return {};
+}
+
+int StringInterner::compare(StringID A, StringID B)
+{
+    std::lock_guard lock(m_mutex);
+    // ids are 1-based, so the largest valid id equals the vector size
+    REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size());
+    REALM_ASSERT_DEBUG(B <= m_decompressed_strings.size());
+    // comparisons against null
+    if (A == B && A == 0)
+        return 0;
+    if (A == 0)
+        return -1;
+    if (B == 0)
+        return 1;
+    // ok, no nulls
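+    // Both ids refer to interned strings, so the comparison below runs directly on the
+    // compressed symbol streams - no decompression involved. This presumes (as the
+    // compressor's contract appears to be) that symbol-stream ordering is consistent
+    // with the ordering of the original strings, e.g. compare(id("abc"), id("abd")) < 0.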
+    REALM_ASSERT(m_compressor);
+    return m_compressor->compare(get_compressed(A), get_compressed(B));
+}
+
+int StringInterner::compare(StringData s, StringID A)
+{
+    std::lock_guard lock(m_mutex);
+    REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size());
+    // comparisons against null
+    if (s.data() == nullptr && A == 0)
+        return 0;
+    if (s.data() == nullptr)
+        return 1;
+    if (A == 0)
+        return -1;
+    // ok, no nulls
+    REALM_ASSERT(m_compressor);
+    return m_compressor->compare(s, get_compressed(A));
+}
+
+
+StringData StringInterner::get(StringID id)
+{
+    REALM_ASSERT(m_compressor);
+    std::lock_guard lock(m_mutex);
+    if (id == 0)
+        return StringData{nullptr};
+    REALM_ASSERT_DEBUG(id <= m_decompressed_strings.size());
+    CachedString& cs = m_decompressed_strings[id - 1];
+    if (cs.m_decompressed) {
+        std::string* ref_str = cs.m_decompressed.get();
+        if (cs.m_weight < 128)
+            cs.m_weight += 64;
+        return {ref_str->c_str(), ref_str->size()};
+    }
+    cs.m_weight = 64;
+    cs.m_decompressed = std::make_unique<std::string>(m_compressor->decompress(get_compressed(id)));
+    m_in_memory_strings.push_back(id);
+    return {cs.m_decompressed->c_str(), cs.m_decompressed->size()};
+}
+
+} // namespace realm
diff --git a/src/realm/string_interner.hpp b/src/realm/string_interner.hpp
new file mode 100644
index 00000000000..2a36c9e38dc
--- /dev/null
+++ b/src/realm/string_interner.hpp
@@ -0,0 +1,96 @@
+/*************************************************************************
+ *
+ * Copyright 2016 Realm Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **************************************************************************/
+
+#ifndef REALM_STRING_INTERNER_HPP
+#define REALM_STRING_INTERNER_HPP
+
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <vector>
+
+#include <realm/keys.hpp>
+#include <realm/string_data.hpp>
+#include <realm/string_compressor.hpp>
+
+
+namespace realm {
+
+
+using StringID = size_t;
+
+class Array;
+class ArrayUnsigned;
+class Allocator;
+struct CachedString {
+    uint8_t m_weight = 0;
+    std::unique_ptr<std::string> m_decompressed;
+};
+
+class StringInterner {
+public:
+    // To be used exclusively from Table
+    StringInterner(Allocator& alloc, Array& parent, ColKey col_key, bool writable);
+    void update_from_parent(bool writable);
+    ~StringInterner();
+
+    // To be used from Obj and for searching
+    StringID intern(StringData);
+    std::optional<StringID> lookup(StringData);
+    int compare(StringID A, StringID B);
+    int compare(StringData, StringID A);
+    StringData get(StringID);
+
+private:
+    Array& m_parent; // need to be able to check if this is attached or not
+    std::unique_ptr<Array> m_top;
+    // Compressed strings are stored in blocks of 256.
+    // One array holds refs to all blocks:
+    std::unique_ptr<Array> m_data;
+    // In-memory representation of a block. Either only the ref to it,
+    // or a full vector of views into the block.
+    struct DataLeaf;
+    // in-memory metadata for faster access to compressed strings. Mirrors m_data.
+    std::vector<DataLeaf> m_compressed_leafs;
+    // 'm_hash_map' is used for mapping the hash of an uncompressed string to its string id.
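+    // As implied by hash_to_id()/add_to_hash_map() in the .cpp file, the map is a small
+    // radix tree over the 32-bit hash: interior nodes hold refs to subtrees, leaves hold
+    // tagged string ids, and each level consumes radix_node_consumes_bits of the hash.
+    // Hash collisions are tolerated: a probe returns all candidate ids, and the callers
+    // (intern()/lookup()) verify each candidate by an actual string comparison.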
+    std::unique_ptr<Array> m_hash_map;
+    // the block of compressed strings we're currently appending to:
+    std::unique_ptr<ArrayUnsigned> m_current_string_leaf;
+    // an array of strings we're currently appending to. This is used instead
+    // whenever we meet a string too large to be placed inline.
+    std::unique_ptr<Array> m_current_long_string_node;
+    void rebuild_internal();
+    CompressedStringView& get_compressed(StringID id);
+    // return true if the leaf was reloaded
+    bool load_leaf_if_needed(DataLeaf& leaf);
+    // return 'true' if the new ref was different and forced a reload
+    bool load_leaf_if_new_ref(DataLeaf& leaf, ref_type new_ref);
+    ColKey m_col_key; // for validation
+    std::unique_ptr<StringCompressor> m_compressor;
+    // At the moment we need to keep decompressed strings around if they've been
+    // returned to the caller, since we're handing out StringData references to
+    // their storage. This is a temporary solution.
+    std::vector<CachedString> m_decompressed_strings;
+    std::vector<StringID> m_in_memory_strings;
+    // Mutual exclusion is needed for frozen transactions only. Live objects are
+    // only used in single-threaded contexts, so they don't need it. For now, just lock always.
+    std::mutex m_mutex;
+};
+} // namespace realm
+
+#endif
diff --git a/src/realm/table.cpp b/src/realm/table.cpp
index ad9435f45ca..977339ade0d 100644
--- a/src/realm/table.cpp
+++ b/src/realm/table.cpp
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include <realm/string_interner.hpp>
 #include
 #include
 #include
@@ -356,6 +357,7 @@ Table::Table(Allocator& alloc)
     , m_index_refs(m_alloc)
     , m_opposite_table(m_alloc)
     , m_opposite_column(m_alloc)
+    , m_interner_data(m_alloc)
     , m_repl(&g_dummy_replication)
     , m_own_ref(this, alloc.get_instance_version())
 {
@@ -363,7 +365,7 @@ Table::Table(Allocator& alloc)
     m_index_refs.set_parent(&m_top, top_position_for_search_indexes);
     m_opposite_table.set_parent(&m_top, top_position_for_opposite_table);
     m_opposite_column.set_parent(&m_top, top_position_for_opposite_column);
-
+    m_interner_data.set_parent(&m_top, top_position_for_interners);
     ref_type ref = create_empty_table(m_alloc); // Throws
     ArrayParent* parent = nullptr;
     size_t ndx_in_parent = 0;
@@ -378,6 +380,7 @@ Table::Table(Replication* const* repl, Allocator& alloc)
     , m_index_refs(m_alloc)
     , m_opposite_table(m_alloc)
     , m_opposite_column(m_alloc)
+    , m_interner_data(m_alloc)
    , m_repl(repl)
     , m_own_ref(this, alloc.get_instance_version())
 {
@@ -385,6 +388,7 @@ Table::Table(Replication* const* repl, Allocator& alloc)
     m_index_refs.set_parent(&m_top, top_position_for_search_indexes);
     m_opposite_table.set_parent(&m_top, top_position_for_opposite_table);
     m_opposite_column.set_parent(&m_top, top_position_for_opposite_column);
+    m_interner_data.set_parent(&m_top, top_position_for_interners);
     m_cookie = cookie_created;
 }
@@ -535,6 +540,9 @@ void Table::remove_column(ColKey col_key)
     erase_root_column(col_key); // Throws
     m_has_any_embedded_objects.reset();
+    auto i = col_key.get_index().val;
+    if (i < m_string_interners.size() && m_string_interners[i])
+        m_string_interners[i].reset();
 }
 
@@ -653,6 +661,14 @@ void Table::init(ref_type top_ref, ArrayParent* parent, size_t ndx_in_parent, bo
     else {
         m_tombstones = nullptr;
     }
+    if (m_top.size() > top_position_for_interners && m_top.get_as_ref(top_position_for_interners)) {
+        // Interner data exists
+        m_interner_data.init_from_parent();
+    }
+    else {
+        REALM_ASSERT_DEBUG(!m_interner_data.is_attached());
+    }
+    refresh_string_interners(is_writable);
     m_cookie = cookie_initialized;
 }
@@ -1054,7 +1070,19 @@ ColKey Table::do_insert_root_column(ColKey col_key, ColumnType type, StringData
     if (m_tombstones) {
         m_tombstones->insert_column(col_key);
     }
-
+    // create the string interner's internal rep as well as its data area
+    REALM_ASSERT_DEBUG(m_interner_data.is_attached());
+    while (col_ndx >= m_string_interners.size()) {
+        m_string_interners.push_back({});
+    }
+    while (col_ndx >= m_interner_data.size()) {
+        m_interner_data.add(0);
+    }
+    REALM_ASSERT(!m_string_interners[col_ndx]);
+    // FIXME: Limit creation of interners to exactly the columns where they can be relevant:
+    // if (col_key.get_type() == col_type_String)
+    m_string_interners[col_ndx] = std::make_unique<StringInterner>(m_alloc, m_interner_data, col_key, true);
     bump_storage_version();
 
     return col_key;
@@ -1086,6 +1114,17 @@ void Table::do_erase_root_column(ColKey col_key)
         REALM_ASSERT(m_index_accessors.back() == nullptr);
         m_index_accessors.pop_back();
     }
+    REALM_ASSERT_DEBUG(col_ndx < m_string_interners.size());
+    if (m_string_interners[col_ndx]) {
+        REALM_ASSERT_DEBUG(m_interner_data.is_attached());
+        REALM_ASSERT_DEBUG(col_ndx < m_interner_data.size());
+        auto data_ref = m_interner_data.get_as_ref(col_ndx);
+        if (data_ref)
+            Array::destroy_deep(data_ref, m_alloc);
+        m_interner_data.set(col_ndx, 0);
+        m_string_interners[col_ndx].reset();
+    }
     bump_content_version();
     bump_storage_version();
 }
@@ -1239,6 +1278,9 @@ void Table::detach(LifeCycleCookie cookie) noexcept
 {
     m_cookie = cookie;
     m_alloc.bump_instance_version();
+    // release string interners
+    m_string_interners.clear();
+    m_interner_data.detach();
 }
 
 void Table::fully_detach() noexcept
@@ -1249,6 +1291,7 @@ void Table::fully_detach() noexcept
     m_opposite_table.detach();
     m_opposite_column.detach();
     m_index_accessors.clear();
+    m_string_interners.clear();
 }
 
@@ -1465,6 +1508,7 @@ ref_type Table::create_empty_table(Allocator& alloc, TableKey key)
     top.add(0); // pk col key
     top.add(0); // flags
     top.add(0); // tombstones
+    top.add(0); // string interners
 
     REALM_ASSERT(top.size() == top_array_size);
@@ -1976,6 +2020,13 @@ void Table::update_from_parent() noexcept
         refresh_content_version();
         m_has_any_embedded_objects.reset();
+        if (m_top.size() > top_position_for_interners) {
+            if (m_top.get_as_ref(top_position_for_interners))
+                m_interner_data.update_from_parent();
+            else
+                m_interner_data.detach();
+        }
+        refresh_string_interners(false);
     }
     m_alloc.bump_storage_version();
 }
@@ -2104,7 +2155,7 @@ void Table::refresh_content_version()
 
 // Called when Group is moved to another version - either a rollback or an advance.
 // The content of the table is potentially different, so make no assumptions.
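+// Note: refresh_accessor_tree() now takes a 'writable' flag. String interners may have to
+// create their backing arrays on first use, and that is only legal inside a write
+// transaction; a read-only refresh must instead leave missing interner data detached.
+// Condensed form of the pattern introduced below (sketch, not verbatim):
+//
+//     if (writable && !m_interner_data.is_attached()) {
+//         m_interner_data.create(NodeHeader::type_HasRefs); // write transactions only
+//         m_interner_data.update_parent();
+//     }
+//     // read-only path: leave m_interner_data detached when the slot is absent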
-void Table::refresh_accessor_tree()
+void Table::refresh_accessor_tree(bool writable)
 {
     REALM_ASSERT(m_cookie == cookie_initialized);
     REALM_ASSERT(m_top.is_attached());
@@ -2134,12 +2185,78 @@
     else {
         m_tombstones = nullptr;
     }
+    if (writable) {
+        while (m_top.size() < top_position_for_interners)
+            m_top.add(0);
+    }
+    if (m_top.size() > top_position_for_interners) {
+        if (m_top.get_as_ref(top_position_for_interners))
+            m_interner_data.init_from_parent();
+        else
+            m_interner_data.detach();
+    }
     refresh_content_version();
     bump_storage_version();
     build_column_mapping();
+    refresh_string_interners(writable);
     refresh_index_accessors();
 }
 
+void Table::refresh_string_interners(bool writable)
+{
+    if (writable) {
+        // in a write transaction, make sure the interner arrays exist; this allows the
+        // string interners to extend their own backing data when "learning"
+        while (m_top.size() <= top_position_for_interners) {
+            m_top.add(0);
+        }
+    }
+    if (m_top.size() > top_position_for_interners && m_top.get_as_ref(top_position_for_interners))
+        m_interner_data.update_from_parent();
+    else
+        m_interner_data.detach();
+    if (writable) {
+        if (!m_interner_data.is_attached()) {
+            m_interner_data.create(NodeHeader::type_HasRefs);
+            m_interner_data.update_parent();
+        }
+    }
+    // bring the string interners in line with the underlying data.
+    // Precondition: we rely on the col keys in m_leaf_ndx2colkey[] being up to date.
+    for (size_t idx = 0; idx < m_leaf_ndx2colkey.size(); ++idx) {
+        auto col_key = m_leaf_ndx2colkey[idx];
+        if (col_key == ColKey()) {
+            // deleted column - we really don't want a string interner for this
+            if (idx < m_string_interners.size() && m_string_interners[idx])
+                m_string_interners[idx].reset();
+            continue;
+        }
+        REALM_ASSERT_DEBUG(col_key.get_index().val == idx);
+        // maintain sufficient size of the interner arrays to cover all columns
+        while (idx >= m_string_interners.size()) {
+            m_string_interners.push_back({});
+        }
+        while (writable && idx >= m_interner_data.size()) { // m_interner_data.is_attached() per above
+            m_interner_data.add(0);
+        }
+        if (m_string_interners[idx]) {
+            // existing interner
+            m_string_interners[idx]->update_from_parent(writable);
+        }
+        else {
+            // new interner. Note: if not in a writable state, the interner will not have a valid
+            // underlying data array. The interner is then set in a state where it cannot "learn",
+            // and searches will not find any matching interned strings.
+            m_string_interners[idx] = std::make_unique<StringInterner>(m_alloc, m_interner_data, col_key, writable);
+        }
+    }
+    if (m_string_interners.size() > m_leaf_ndx2colkey.size()) {
+        // remove any string interners which are no longer reachable, e.g. after a rollback
+        m_string_interners.resize(m_leaf_ndx2colkey.size());
+    }
+}
+
 void Table::refresh_index_accessors()
 {
     // Refresh search index accessors
@@ -3407,3 +3524,12 @@ void Table::typed_print(std::string prefix, ref_type ref) const
     }
     std::cout << prefix << "}" << std::endl;
 }
+
+StringInterner* Table::get_string_interner(ColKey col_key) const
+{
+    auto idx = col_key.get_index().val;
+    REALM_ASSERT(idx < m_string_interners.size());
+    auto interner = m_string_interners[idx].get();
+    REALM_ASSERT(interner);
+    return interner;
+}
diff --git a/src/realm/table.hpp b/src/realm/table.hpp
index 0830d7c733f..1f02e0540ac 100644
--- a/src/realm/table.hpp
+++ b/src/realm/table.hpp
@@ -573,7 +573,7 @@ class Table {
     ColKey::Idx spec_ndx2leaf_ndx(size_t idx) const;
     ColKey leaf_ndx2colkey(ColKey::Idx idx) const;
     ColKey spec_ndx2colkey(size_t ndx) const;
-
+    StringInterner* get_string_interner(ColKey col_key) const;
     // Queries
     // Using where(tv) is the new method to perform queries on TableView. The 'tv' can have any order; it does not
     // need to be sorted, and, resulting view retains its order.
@@ -737,6 +737,7 @@ class Table {
     Array m_index_refs;      // 5th slot in m_top
     Array m_opposite_table;  // 7th slot in m_top
     Array m_opposite_column; // 8th slot in m_top
+    Array m_interner_data;   // 14th slot in m_top
     std::vector<std::unique_ptr<SearchIndex>> m_index_accessors;
     ColKey m_primary_key_col;
     Replication* const* m_repl;
@@ -848,8 +849,9 @@ class Table {
     /// Refresh the part of the accessor tree that is rooted at this
     /// table.
-    void refresh_accessor_tree();
+    void refresh_accessor_tree(bool writable);
     void refresh_index_accessors();
+    void refresh_string_interners(bool writable);
     void refresh_content_version();
     void flush_for_commit();
@@ -861,6 +863,7 @@ class Table {
     std::vector<ColKey> m_leaf_ndx2colkey;
     std::vector<ColKey::Idx> m_spec_ndx2leaf_ndx;
     std::vector<size_t> m_leaf_ndx2spec_ndx;
+    mutable std::vector<std::unique_ptr<StringInterner>> m_string_interners;
     Type m_table_type = Type::TopLevel;
     uint64_t m_in_file_version_at_transaction_boundary = 0;
     AtomicLifeCycleCookie m_cookie;
@@ -880,7 +883,8 @@ class Table {
     static constexpr int top_position_for_flags = 12;
     // flags contents: bit 0-1 - table type
     static constexpr int top_position_for_tombstones = 13;
-    static constexpr int top_array_size = 14;
+    static constexpr int top_position_for_interners = 14;
+    static constexpr int top_array_size = 15;
 
     enum { s_collision_map_lo = 0, s_collision_map_hi = 1, s_collision_map_local_id = 2, s_collision_map_num_slots };
@@ -1413,6 +1417,11 @@ class _impl::TableFriend {
         return table.m_spec;
     }
 
+    static StringInterner* get_string_interner(const Table& table, ColKey col_key)
+    {
+        return table.get_string_interner(col_key);
+    }
+
     static TableRef get_opposite_link_table(const Table& table, ColKey col_key);
 
     static Group* get_parent_group(const Table& table) noexcept
diff --git a/src/realm/transaction.hpp b/src/realm/transaction.hpp
index 4da316c0d2e..e4db3c8a586 100644
--- a/src/realm/transaction.hpp
+++ b/src/realm/transaction.hpp
@@ -217,6 +217,7 @@ class Transaction : public Group {
 
     friend class DB;
     friend class DisableReplication;
+    friend class Table;
 };
 
 /*
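TableFriend::get_string_interner() above is the hook the object layer is expected to use when reading or writing interned strings. A minimal usage sketch under that assumption (the call site and the `table`/`col` variables are hypothetical, not part of this patch):

    // hypothetical call site in the object layer
    StringInterner* interner = _impl::TableFriend::get_string_interner(*table, col);
    StringID id = interner->intern("some value");    // same string => same id
    REALM_ASSERT(interner->get(id) == "some value"); // round-trips the bytes
    if (auto hit = interner->lookup("some value"))   // read-only probe, never "learns"
        REALM_ASSERT(*hit == id);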
diff --git a/src/realm/utilities.hpp b/src/realm/utilities.hpp
index 2125fe2c2fa..badc4d772b6 100644
--- a/src/realm/utilities.hpp
+++ b/src/realm/utilities.hpp
@@ -69,8 +69,8 @@ typedef SSIZE_T ssize_t;
 #endif
 
 #if defined(REALM_PTR_64) && defined(REALM_X86_OR_X64) && !REALM_WATCHOS
-#define REALM_COMPILER_SSE // Compiler supports SSE 4.2 through __builtin_ accessors or back-end assembler
-#define REALM_COMPILER_AVX
+// #define REALM_COMPILER_SSE // Compiler supports SSE 4.2 through __builtin_ accessors or back-end assembler
+// #define REALM_COMPILER_AVX
 #endif
 
 namespace realm {
diff --git a/test/test_shared.cpp b/test/test_shared.cpp
index 261a0cc70fc..5fe1f91a194 100644
--- a/test/test_shared.cpp
+++ b/test/test_shared.cpp
@@ -2288,6 +2288,86 @@
 #endif // REALM_ENABLE_ENCRYPTION
 
+TEST(Shared_MaxStrings)
+{
+    SHARED_GROUP_TEST_PATH(path);
+    DBRef sg = get_test_db(path);
+    auto trans = sg->start_write();
+    auto t = trans->add_table("MyTable");
+    ColKey ck = t->add_column(type_String, "MyStrings");
+    std::string str_a(16 * 1024 * 1024 - 257, 'a');
+    std::string str_b(16 * 1024 * 1024 - 257, 'b');
+    // make the strings harder to compress:
+    for (auto& e : str_a) {
+        e = std::rand() % 256;
+    }
+    for (auto& e : str_b) {
+        e = std::rand() % 256;
+    }
+    auto o = t->create_object();
+    o.set(ck, str_a);
+    trans->commit_and_continue_as_read();
+    auto v = o.get<StringData>(ck);
+    CHECK_EQUAL(str_a, v);
+    trans->promote_to_write();
+    auto o2 = t->create_object();
+    o2.set(ck, str_b);
+    trans->commit_and_continue_as_read();
+    v = o.get<StringData>(ck);
+    auto v2 = o2.get<StringData>(ck);
+    CHECK_EQUAL(v, str_a);
+    CHECK_EQUAL(v2, str_b);
+    trans->close();
+    sg.reset();
+}
+
+TEST(Shared_RandomMaxStrings)
+{
+    SHARED_GROUP_TEST_PATH(path);
+    DBRef sg = get_test_db(path);
+    auto trans = sg->start_write();
+    auto t = trans->add_table("MyTable");
+    ColKey ck = t->add_column(type_String, "MyStrings");
+    trans->commit_and_continue_as_read();
+    for (int run = 0; run < 10; ++run) {
+        trans->promote_to_write();
+        size_t str_length = std::rand() % (16 * 1024 * 1024 - 257);
+        std::string str(str_length, 'X');
+        for (auto& e : str) {
+            e = std::rand() % 256;
+        }
+        auto o = t->create_object();
+        o.set(ck, str);
+        trans->commit_and_continue_as_read();
+    }
+    trans->close();
+}
+
+TEST(Shared_RandomSmallStrings)
+{
+    SHARED_GROUP_TEST_PATH(path);
+    DBRef sg = get_test_db(path);
+    auto trans = sg->start_write();
+    auto t = trans->add_table("MyTable");
+    ColKey ck = t->add_column(type_String, "MyStrings");
+    trans->commit_and_continue_as_read();
+    std::string str(500, 'X');
+    // insert 100,000 objects, mutating a single character of the base string for
+    // each, so the column holds many long, highly similar strings
+    for (int run = 0; run < 100; ++run) {
+        trans->promote_to_write();
+        for (int i = 0; i < 1000; ++i) {
+            size_t offset = std::rand() % str.size();
+            str[offset] = 'a' + (std::rand() & 0x7);
+            auto o = t->create_object();
+            o.set(ck, str);
+        }
+        trans->commit_and_continue_as_read();
+    }
+    trans->close();
+}
+
 TEST(Shared_VersionCount)
 {
     SHARED_GROUP_TEST_PATH(path);
@@ -2468,6 +2551,7 @@ TEST(Shared_MovingSearchIndex)
     // Remove the padding column to shift the indexed columns
     {
         WriteTransaction wt(sg);
+        wt.get_group().verify();
         TableRef table = wt.get_table("foo");
 
         CHECK(table->has_search_index(int_col));
diff --git a/test/test_unresolved_links.cpp b/test/test_unresolved_links.cpp
index 60f50ee3488..b47c68fa313 100644
--- a/test/test_unresolved_links.cpp
+++ b/test/test_unresolved_links.cpp
@@ -870,6 +870,7 @@ TEST(Unresolved_PerformanceLinkList)
     tr->commit_and_continue_as_read();
     CHECK(t2 > t1);
     tr->promote_to_write();
+    // fails in compressed format because of unsigned/signed interpretation:
     tr->verify();
 }
diff --git a/test/test_upgrade_database.cpp b/test/test_upgrade_database.cpp
index ae95d1a02da..04bf2e533b4 100644
--- a/test/test_upgrade_database.cpp
+++ b/test/test_upgrade_database.cpp
@@ -166,6 +166,7 @@ TEST(Upgrade_Disabled)
 
 TEST(Upgrade_DatabaseWithUnsupportedOldFileFormat)
 {
+    // Not core 6, thus 'kind' is not set, and an assertion is triggered.
     std::string path = test_util::get_test_resource_path() + "test_upgrade_database_1000_1.realm";
     CHECK_OR_RETURN(File::exists(path));
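The Shared_*Strings tests above exercise both interner storage layouts (packed 16-bit leaves for ordinary strings, long-string nodes for multi-megabyte ones) across commit boundaries, but none of them pins down the dedup guarantee directly. A sketch of a complementary check in the same test idiom; this test is illustrative only and assumes the get_test_db() helper used by the tests above:

    TEST(Shared_InternedStringRoundTrip)
    {
        SHARED_GROUP_TEST_PATH(path);
        DBRef sg = get_test_db(path);
        auto trans = sg->start_write();
        auto t = trans->add_table("MyTable");
        ColKey ck = t->add_column(type_String, "MyStrings");
        // two objects sharing one value: the interner should hand out a single id,
        // and reads after commit must still observe the original bytes
        auto o1 = t->create_object();
        auto o2 = t->create_object();
        o1.set(ck, "shared value");
        o2.set(ck, "shared value");
        trans->commit_and_continue_as_read();
        CHECK_EQUAL(o1.get<StringData>(ck), "shared value");
        CHECK_EQUAL(o1.get<StringData>(ck), o2.get<StringData>(ck));
        trans->close();
    }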