Skip to content

Commit

Permalink
compression tests for collection of strings
Browse files Browse the repository at this point in the history
  • Loading branch information
nicola-cab committed Jun 19, 2024
1 parent 50a9287 commit 264aae4
Show file tree
Hide file tree
Showing 3 changed files with 196 additions and 2 deletions.
5 changes: 3 additions & 2 deletions src/realm/string_interner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -618,8 +618,9 @@ std::optional<StringID> StringInterner::lookup(StringData sd)
int StringInterner::compare(StringID A, StringID B)
{
std::lock_guard lock(m_mutex);
REALM_ASSERT_DEBUG(A - 1 < m_decompressed_strings.size());
REALM_ASSERT_DEBUG(B - 1 < m_decompressed_strings.size());
// 0 is null, the first index starts from 1.
REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size());
REALM_ASSERT_DEBUG(B <= m_decompressed_strings.size());
// comparisons against null
if (A == B && A == 0)
return 0;
Expand Down
79 changes: 79 additions & 0 deletions test/test_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2508,5 +2508,84 @@ TEST(Group_ArrayCompression_Correctness_Random_Input)
#endif
}

// Writes a group holding a string column plus list/set/dictionary-of-string columns to a file,
// reads it back into a second group, and checks that everything read from disk matches what was
// written.
TEST(Group_ArrayCompression_Strings)
{
    GROUP_TEST_PATH(path);

    // create a bunch of string related properties that are going to be compressed and verify write/read machinery
    // and string correctness.
    Group to_disk;
    TableRef table = to_disk.add_table("test");
    auto col_key_string = table->add_column(type_String, "string");
    auto col_key_list_string = table->add_column_list(type_String, "list_strings");
    auto col_key_set_string = table->add_column_set(type_String, "set_strings");
    auto col_key_dict_string = table->add_column_dictionary(type_String, "dict_strings");
    auto obj = table->create_object();

    obj.set_any(col_key_string, {"Test"});
    auto list_s = obj.get_list<String>(col_key_list_string);
    auto set_s = obj.get_set<String>(col_key_set_string);
    auto dictionary_s = obj.get_dictionary(col_key_dict_string);

    // Populate every collection with 10 distinct strings sharing a common prefix.
    std::string tmp{"aabbbcccaaaaddfwregfgklnjytojfs"};
    for (size_t i = 0; i < 10; ++i) {
        list_s.add({tmp + std::to_string(i)});
    }
    for (size_t i = 0; i < 10; ++i) {
        set_s.insert({tmp + std::to_string(i)});
    }
    for (size_t i = 0; i < 10; ++i) {
        const auto key_value = tmp + std::to_string(i);
        dictionary_s.insert({key_value}, {key_value});
    }

    CHECK(list_s.size() == 10);
    CHECK(set_s.size() == 10);
    CHECK(dictionary_s.size() == 10);

    // Serialize to disk (compression should happen when the proper leaf array is serialized to disk)
    to_disk.write(path, crypt_key());

#ifdef REALM_DEBUG
    to_disk.verify();
#endif

    // Load the tables
    Group from_disk(path, crypt_key());
    TableRef read_table = from_disk.get_table("test");
    auto obj1 = read_table->get_object(0);

    // The collections must come from the object that was read back (obj1); fetching them from
    // `obj` would compare the in-memory data with itself and make the checks below vacuous.
    auto list_s1 = obj1.get_list<String>("list_strings");
    auto set_s1 = obj1.get_set<String>("set_strings");
    auto dictionary_s1 = obj1.get_dictionary("dict_strings");

    CHECK(obj1.get_any("string") == obj.get_any("string"));

    CHECK(list_s1.size() == list_s.size());
    CHECK(set_s1.size() == set_s.size());
    CHECK(dictionary_s1.size() == dictionary_s.size());

    CHECK(*read_table == *table);

    // Element-by-element comparison of the loaded collections against the originals.
    for (size_t i = 0; i < list_s1.size(); ++i) {
        CHECK_EQUAL(list_s1.get_any(i), list_s.get_any(i));
    }

    for (size_t i = 0; i < set_s1.size(); ++i) {
        CHECK_EQUAL(set_s1.get_any(i), set_s.get_any(i));
    }

    for (size_t i = 0; i < dictionary_s1.size(); ++i) {
        CHECK_EQUAL(dictionary_s1.get_key(i), dictionary_s.get_key(i));
        CHECK_EQUAL(dictionary_s1.get_any(i), dictionary_s.get_any(i));
    }

#ifdef REALM_DEBUG
    from_disk.verify();
#endif
}


#endif // TEST_GROUP
114 changes: 114 additions & 0 deletions test/test_string_compression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,117 @@ TEST(StringInterner_Creation_Multiple_String_ColKey)
CHECK(interners[i]->compare(*stored_id, id) == 0); // compare agaist self.
}
}

// Interns every element of a list-of-strings column through the column's string interner, then
// checks that each id resolves to its own element and compares unequal to a randomly chosen
// different element.
TEST(StringInterner_Creation_List_Strings)
{
    constexpr size_t string_count = 10;

    Group group;
    TableRef table = group.add_table("test");
    const auto col_key = table->add_column_list(type_String, "list_strings");
    auto object = table->create_object();
    auto strings = object.get_list<String>(col_key);
    auto interner = table->get_string_interner(col_key);

    // Each element extends the previous one by one more digit, so all elements are distinct.
    std::string value = "testtesttest";
    for (size_t n = 0; n < string_count; ++n) {
        value += std::to_string(n);
        strings.add({value});
    }

    std::vector<StringID> ids;
    for (size_t n = 0; n < string_count; ++n) {
        ids.push_back(interner->intern(strings.get(n)));
    }

    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::uniform_int_distribution<std::mt19937::result_type> pick(
        0, static_cast<std::mt19937::result_type>(string_count - 1));
    for (size_t n = 0; n < string_count; ++n) {
        const auto resolved = interner->get(ids[n]);
        CHECK(resolved == strings[n]);
        CHECK(interner->compare(strings[n], ids[n]) == 0);

        // pick a random string other than the current one and verify that it does not match
        auto other = pick(engine);
        while (other == n) {
            other = pick(engine);
        }
        CHECK(interner->compare(strings[other], ids[n]) != 0);
    }
}

// Interns every element of a set-of-strings column through the column's string interner, then
// checks that each id resolves to its own element and compares unequal to a randomly chosen
// different element.
TEST(StringInterner_Creation_Set_String)
{
    constexpr size_t string_count = 10;

    Group group;
    TableRef table = group.add_table("test");
    const auto col_key = table->add_column_set(type_String, "set_strings");
    auto object = table->create_object();
    auto strings = object.get_set<String>(col_key);
    auto interner = table->get_string_interner(col_key);

    // Each inserted value extends the previous one by one more digit, so all values are distinct.
    std::string value = "testtesttest";
    for (size_t n = 0; n < string_count; ++n) {
        value += std::to_string(n);
        strings.insert({value});
    }

    std::vector<StringID> ids;
    for (size_t n = 0; n < string_count; ++n) {
        ids.push_back(interner->intern(strings.get(n)));
    }

    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::uniform_int_distribution<std::mt19937::result_type> pick(
        0, static_cast<std::mt19937::result_type>(string_count - 1));
    for (size_t n = 0; n < string_count; ++n) {
        const auto resolved = interner->get(ids[n]);
        CHECK(resolved == strings.get(n));
        CHECK(interner->compare(strings.get(n), ids[n]) == 0);

        // pick a random string other than the current one and verify that it does not match
        auto other = pick(engine);
        while (other == n) {
            other = pick(engine);
        }
        CHECK(interner->compare(strings.get(other), ids[n]) != 0);
    }
}

// Interns every value of a dictionary-of-strings column through the column's string interner,
// then checks that each id resolves to its own value and compares unequal to a randomly chosen
// different value.
TEST(StringInterner_Creation_Dictionary_String)
{
    constexpr size_t string_count = 10;

    Group group;
    TableRef table = group.add_table("test");
    const auto col_key = table->add_column_dictionary(type_String, "dict_string");
    auto object = table->create_object();
    auto dict = object.get_dictionary(col_key);
    auto interner = table->get_string_interner(col_key);

    // The same growing string is used as both key and value; every entry is distinct.
    std::string value = "testtesttest";
    for (size_t n = 0; n < string_count; ++n) {
        value += std::to_string(n);
        dict.insert({value}, {value});
    }

    std::vector<StringID> ids;
    for (size_t n = 0; n < string_count; ++n) {
        ids.push_back(interner->intern(dict.get_any(n).get_string()));
    }

    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::uniform_int_distribution<std::mt19937::result_type> pick(
        0, static_cast<std::mt19937::result_type>(string_count - 1));
    for (size_t n = 0; n < string_count; ++n) {
        const auto resolved = interner->get(ids[n]);
        CHECK(resolved == dict.get_any(n).get_string());
        CHECK(interner->compare(dict.get_any(n).get_string(), ids[n]) == 0);

        // pick a random string other than the current one and verify that it does not match
        auto other = pick(engine);
        while (other == n) {
            other = pick(engine);
        }
        CHECK(interner->compare(dict.get_any(other).get_string(), ids[n]) != 0);
    }
}

0 comments on commit 264aae4

Please sign in to comment.