Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RCORE-2064 String EQ/NEQ optimisations for compressed strings #7820

Merged
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
71a2318
No unique ptrs for string interner + limit number of interners
nicola-cab Jun 12, 2024
7265c53
point fix client-reset test
nicola-cab Jun 13, 2024
219dbef
minor stuff
nicola-cab Jun 13, 2024
aec41a1
string compression tests
nicola-cab Jun 14, 2024
59e1a5a
Merge branch 'feature/string-compression' of github.com:realm/realm-c…
nicola-cab Jun 17, 2024
2e7f996
Merge branch 'feature/string-compression' of github.com:realm/realm-c…
nicola-cab Jun 17, 2024
2c45256
point fix check string ids + more tests
nicola-cab Jun 17, 2024
b27b52e
new node addition failure for string interner
nicola-cab Jun 17, 2024
7b8ffe7
point fix lookup compressed string id when rehashing
nicola-cab Jun 18, 2024
4d573e1
find_first optimization for compressed strings
nicola-cab Jun 18, 2024
ad2cfa4
Merge branch 'nc/string_interner_tests' into nc/eq_neq_string_compres…
nicola-cab Jun 18, 2024
50a9287
Merge branch 'feature/string-compression' of github.com:realm/realm-c…
nicola-cab Jun 19, 2024
71b34ed
core test passing
nicola-cab Jun 18, 2024
264aae4
compression tests for collection of strings
nicola-cab Jun 19, 2024
050fb75
Merge branch 'nc/string_interner_tests' into nc/eq_neq_string_compres…
nicola-cab Jun 19, 2024
508bbb7
code review
nicola-cab Jul 5, 2024
eedfa69
Merge branch 'feature/string-compression' of github.com:realm/realm-c…
nicola-cab Jul 8, 2024
9653f0f
Fixes (#7872)
jedelbo Jul 10, 2024
057a46c
Merge branch 'feature/string-compression' into nc/eq_neq_string_compr…
jedelbo Jul 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions src/realm/array_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,16 @@ void ArrayString::clear()
}

size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const noexcept
{
// This should only be called if we don't have a string id for this particular array (aka no string interner)
std::optional<StringID> id;
if (m_type == Type::interned_strings)
id = m_string_interner->lookup(value);

return find_first(value, begin, end, id);
}

size_t ArrayString::find_first(StringData value, size_t begin, size_t end, std::optional<StringID> id) const noexcept
{
switch (m_type) {
case Type::small_strings:
Expand All @@ -364,14 +374,13 @@ size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const
break;
}
case Type::interned_strings: {
// we need a way to avoid this lookup for each leaf array. The lookup must appear
// higher up the call stack and passed down.
auto id = m_string_interner->lookup(value);
if (id) {
return static_cast<Array*>(m_arr)->find_first(*id, begin, end);
}
nicola-cab marked this conversation as resolved.
Show resolved Hide resolved
break;
}
default:
break;
}
return not_found;
}
Expand Down
4 changes: 4 additions & 0 deletions src/realm/array_string.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <realm/array_string_short.hpp>
#include <realm/array_blobs_small.hpp>
#include <realm/array_blobs_big.hpp>
#include <realm/string_interner.hpp>

namespace realm {

Expand Down Expand Up @@ -117,6 +118,9 @@ class ArrayString : public ArrayPayload {

size_t find_first(StringData value, size_t begin, size_t end) const noexcept;

/// Special version for searching in an array or compressed strings.
size_t find_first(StringData value, size_t begin, size_t end, std::optional<StringID>) const noexcept;

size_t lower_bound(StringData value);

/// Get the specified element without the cost of constructing an
Expand Down
1 change: 0 additions & 1 deletion src/realm/obj.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -614,7 +614,6 @@ StringData Obj::_get<StringData>(ColKey::Idx col_ndx) const
ArrayString values(get_alloc());
values.set_spec(const_cast<Spec*>(&spec), spec_ndx);
values.init_from_ref(ref);

return values.get(m_row_ndx);
}
else {
Expand Down
1 change: 0 additions & 1 deletion src/realm/query_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,6 @@ StringNodeFulltext::StringNodeFulltext(StringData v, ColKey column, std::unique_

void StringNodeFulltext::table_changed()
{
StringNodeEqualBase::table_changed();
m_link_map->set_base_table(m_table);
}

Expand Down
14 changes: 13 additions & 1 deletion src/realm/query_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1590,6 +1590,8 @@ class StringNodeBase : public ParentNode {
void table_changed() override
{
m_is_string_enum = m_table.unchecked_ptr()->is_enumerated(m_condition_column_key);
m_string_interner = m_table.unchecked_ptr()->get_string_interner(m_condition_column_key);
m_interned_string = m_string_interner->lookup(m_value);
}

void cluster_changed() override
Expand Down Expand Up @@ -1633,6 +1635,8 @@ class StringNodeBase : public ParentNode {
std::optional<std::string> m_value;
std::optional<ArrayString> m_leaf;
StringData m_string_value;
StringInterner* m_string_interner = nullptr;
std::optional<StringID> m_interned_string;

bool m_is_string_enum = false;

Expand Down Expand Up @@ -1680,8 +1684,17 @@ class StringNode : public StringNodeBase {
TConditionFunction cond;

for (size_t s = start; s < end; ++s) {

StringData t = get_string(s);

if constexpr (std::is_same_v<TConditionFunction, NotEqual>) {
if (m_interned_string) {
const auto id = m_string_interner->lookup(get_string(s));
nicola-cab marked this conversation as resolved.
Show resolved Hide resolved
if (id && m_string_interner->compare(*m_interned_string, *id))
return s;
}
}

if constexpr (case_sensitive_comparison) {
// case insensitive not implemented for: >, >=, <, <=
if (cond(t, m_string_value))
Expand Down Expand Up @@ -2009,7 +2022,6 @@ class StringNode<EqualIns> : public StringNodeEqualBase {
size_t _find_first_local(size_t start, size_t end) override;
};


class StringNodeFulltext : public StringNodeEqualBase {
public:
StringNodeFulltext(StringData v, ColKey column, std::unique_ptr<LinkMap> lm = {});
Expand Down
9 changes: 5 additions & 4 deletions src/realm/string_interner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ static std::vector<uint32_t> hash_to_id(Array& node, uint32_t hash, uint8_t hash
if (!node.has_refs()) {
// it's a leaf - default is a list, search starts from index 0.
HashMapIter it(node, hash, hash_size);
if (node.size() > hash_node_min_size) {
if (node.size() >= hash_node_min_size) {
// it is a hash table, so use hash to select index to start searching
// table size must be power of two!
size_t index = hash & (node.size() - 1);
Expand Down Expand Up @@ -618,8 +618,9 @@ std::optional<StringID> StringInterner::lookup(StringData sd)
int StringInterner::compare(StringID A, StringID B)
{
std::lock_guard lock(m_mutex);
REALM_ASSERT_DEBUG(A < m_decompressed_strings.size());
REALM_ASSERT_DEBUG(B < m_decompressed_strings.size());
// 0 is null, the first index starts from 1.
REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size());
REALM_ASSERT_DEBUG(B <= m_decompressed_strings.size());
// comparisons against null
if (A == B && A == 0)
return 0;
Expand All @@ -635,7 +636,7 @@ int StringInterner::compare(StringID A, StringID B)
int StringInterner::compare(StringData s, StringID A)
{
std::lock_guard lock(m_mutex);
REALM_ASSERT_DEBUG(A < m_decompressed_strings.size());
REALM_ASSERT_DEBUG((A - 1) < m_decompressed_strings.size());
// comparisons against null
if (s.data() == nullptr && A == 0)
return 0;
Expand Down
1 change: 1 addition & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ set(CORE_TEST_SOURCES
test_shared.cpp
test_status.cpp
test_string_data.cpp
test_string_compression.cpp
test_table_view.cpp
test_thread.cpp
test_transactions.cpp
Expand Down
79 changes: 79 additions & 0 deletions test/test_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2508,5 +2508,84 @@ TEST(Group_ArrayCompression_Correctness_Random_Input)
#endif
}

TEST(Group_ArrayCompression_Strings)
{
GROUP_TEST_PATH(path);

// create a bunch of string related properties that are going to be compressed and verify write/read machinery
// and string correctness.
Group to_disk;
TableRef table = to_disk.add_table("test");
auto col_key_string = table->add_column(type_String, "string");
auto col_key_list_string = table->add_column_list(type_String, "list_strings");
auto col_key_set_string = table->add_column_set(type_String, "set_strings");
auto col_key_dict_string = table->add_column_dictionary(type_String, "dict_strings");
auto obj = table->create_object();


obj.set_any(col_key_string, {"Test"});
auto list_s = obj.get_list<String>(col_key_list_string);
auto set_s = obj.get_set<String>(col_key_set_string);
auto dictionary_s = obj.get_dictionary(col_key_dict_string);

std::string tmp{"aabbbcccaaaaddfwregfgklnjytojfs"};
for (size_t i = 0; i < 10; ++i) {
list_s.add({tmp + std::to_string(i)});
}
for (size_t i = 0; i < 10; ++i) {
set_s.insert({tmp + std::to_string(i)});
}
for (size_t i = 0; i < 10; ++i) {
const auto key_value = tmp + std::to_string(i);
dictionary_s.insert({key_value}, {key_value});
}

CHECK(list_s.size() == 10);
CHECK(set_s.size() == 10);
CHECK(dictionary_s.size() == 10);

// Serialize to disk (compression should happen when the proper leaf array is serialized to disk)
to_disk.write(path, crypt_key());

#ifdef REALM_DEBUG
to_disk.verify();
#endif

// Load the tables
Group from_disk(path, crypt_key());
TableRef read_table = from_disk.get_table("test");
auto obj1 = read_table->get_object(0);

auto list_s1 = obj.get_list<String>("list_strings");
auto set_s1 = obj.get_set<String>("set_strings");
auto dictionary_s1 = obj.get_dictionary("dict_strings");

CHECK(obj1.get_any("string") == obj.get_any("string"));


CHECK(list_s1.size() == list_s.size());
CHECK(set_s1.size() == set_s.size());
CHECK(dictionary_s1.size() == dictionary_s.size());

CHECK(*read_table == *table);

for (size_t i = 0; i < list_s1.size(); ++i) {
CHECK_EQUAL(list_s1.get_any(i), list_s.get_any(i));
}

for (size_t i = 0; i < set_s1.size(); ++i) {
CHECK_EQUAL(set_s1.get_any(i), set_s.get_any(i));
}

for (size_t i = 0; i < dictionary_s1.size(); ++i) {
CHECK_EQUAL(dictionary_s1.get_key(i), dictionary_s.get_key(i));
CHECK_EQUAL(dictionary_s1.get_any(i), dictionary_s.get_any(i));
}

#ifdef REALM_DEBUG
from_disk.verify();
#endif
}


#endif // TEST_GROUP
Loading
Loading