Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Index on list of strings #7142

Merged
merged 3 commits into from
Nov 18, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

### Enhancements
* <New feature description> (PR [#????](https://github.com/realm/realm-core/pull/????))
* None.
* Index on list of strings property now supported (PR [#7142](https://github.com/realm/realm-core/pull/7142))

### Fixed
* <How do the end-user experience this issue? what was the impact?> ([#????](https://github.com/realm/realm-core/issues/????), since v?.?.?)
Expand Down
95 changes: 71 additions & 24 deletions src/realm/index_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <realm/exceptions.hpp>
#include <realm/index_string.hpp>
#include <realm/table.hpp>
#include <realm/list.hpp>
#include <realm/timestamp.hpp>
#include <realm/column_integer.hpp>
#include <realm/unicode.hpp>
Expand Down Expand Up @@ -82,6 +83,12 @@ Mixed ClusterColumn::get_value(ObjKey key) const
return obj.get_any(m_column_key);
}

/// Return the list-of-strings accessor stored in this column for the object
/// identified by `key`.
Lst<String> ClusterColumn::get_list(ObjKey key) const
{
    // Look the object up in the cluster tree, then ask it for the list that
    // lives in the indexed column.
    const Obj owner{m_cluster_tree->get(key)};
    return owner.get_list<String>(m_column_key);
}

std::vector<ObjKey> ClusterColumn::get_all_keys() const
{
std::vector<ObjKey> ret;
Expand Down Expand Up @@ -253,8 +260,8 @@ int64_t IndexArray::index_string(Mixed value, InternalFindResult& result_ref, co
if (ref & 1) {
int64_t key_value = int64_t(ref >> 1);

Mixed a = column.is_fulltext() ? reconstruct_string(stringoffset, key, index_data)
: column.get_value(ObjKey(key_value));
Mixed a = column.full_word() ? reconstruct_string(stringoffset, key, index_data)
: column.get_value(ObjKey(key_value));
if (a == value) {
result_ref.payload = key_value;
return first ? key_value : get_count ? 1 : FindRes_single;
Expand All @@ -268,7 +275,7 @@ int64_t IndexArray::index_string(Mixed value, InternalFindResult& result_ref, co
// List of row indices with common prefix up to this point, in sorted order.
if (!sub_isindex) {
const IntegerColumn sub(m_alloc, ref_type(ref));
if (column.is_fulltext()) {
if (column.full_word()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good idea to share this logic with the fulltext index 👍

result_ref.payload = ref;
result_ref.start_ndx = 0;
result_ref.end_ndx = sub.size();
Expand Down Expand Up @@ -335,6 +342,15 @@ void IndexArray::from_list_all_ins(StringData upper_value, std::vector<ObjKey>&
void IndexArray::from_list_all(Mixed value, std::vector<ObjKey>& result, const IntegerColumn& rows,
const ClusterColumn& column) const
{
if (column.full_word()) {
result.reserve(rows.size());
for (IntegerColumn::const_iterator it = rows.cbegin(); it != rows.cend(); ++it) {
result.push_back(ObjKey(*it));
}

return;
}

SortedListComparator slc(column);

IntegerColumn::const_iterator it_end = rows.cend();
Expand All @@ -356,8 +372,6 @@ void IndexArray::from_list_all(Mixed value, std::vector<ObjKey>& result, const I
for (IntegerColumn::const_iterator it = lower; it != upper; ++it) {
result.push_back(ObjKey(*it));
}

return;
}


Expand Down Expand Up @@ -592,8 +606,7 @@ void IndexArray::index_string_all(Mixed value, std::vector<ObjKey>& result, cons
if (ref & 1) {
ObjKey k(int64_t(ref >> 1));

Mixed a = column.get_value(k);
if (a == value) {
if (column.full_word() || column.get_value(k) == value) {
result.push_back(k);
return;
}
Expand Down Expand Up @@ -802,11 +815,18 @@ void StringIndex::insert_with_offset(ObjKey obj_key, StringData index_data, cons
void StringIndex::insert_to_existing_list_at_lower(ObjKey key, Mixed value, IntegerColumn& list,
const IntegerColumnIterator& lower)
{
SortedListComparator slc(m_target_column);
// At this point there exist duplicates of this value, so we need to
// insert the value beside its duplicates so that rows are also sorted
// in ascending order.
IntegerColumn::const_iterator upper = std::upper_bound(lower, list.cend(), value, slc);
IntegerColumn::const_iterator upper = [&]() {
if (m_target_column.full_word()) {
return list.cend();
}
else {
SortedListComparator slc(m_target_column);
return std::upper_bound(lower, list.cend(), value, slc);
}
}();
// find insert position (the list has to be kept in sorted order)
// In most cases the refs will be added to the end. So we test for that
// first to see if we can avoid the binary search for insert position
Expand Down Expand Up @@ -1120,7 +1140,7 @@ bool StringIndex::leaf_insert(ObjKey obj_key, key_type key, size_t offset, Strin

// When key is outside current range, we can just add it
keys.add(key);
if (!m_target_column.is_fulltext() || is_at_string_end) {
if (!m_target_column.full_word() || is_at_string_end) {
int64_t shifted = int64_t((uint64_t(obj_key.value) << 1) + 1); // shift to indicate literal
m_array->add(shifted);
}
Expand All @@ -1141,7 +1161,7 @@ bool StringIndex::leaf_insert(ObjKey obj_key, key_type key, size_t offset, Strin
return false;

keys.insert(ins_pos, key);
if (!m_target_column.is_fulltext() || is_at_string_end) {
if (!m_target_column.full_word() || is_at_string_end) {
int64_t shifted = int64_t((uint64_t(obj_key.value) << 1) + 1); // shift to indicate literal
m_array->insert(ins_pos_refs, shifted);
}
Expand All @@ -1162,7 +1182,7 @@ bool StringIndex::leaf_insert(ObjKey obj_key, key_type key, size_t offset, Strin
// Single match (lowest bit set indicates literal row_ndx)
if ((slot_value & 1) != 0) {
ObjKey obj_key2 = ObjKey(int64_t(slot_value >> 1));
Mixed v2 = m_target_column.is_fulltext() ? reconstruct_string(offset, key, index_data) : get(obj_key2);
Mixed v2 = m_target_column.full_word() ? reconstruct_string(offset, key, index_data) : get(obj_key2);
if (v2 == value) {
// Strings are equal but this is not a list.
// Create a list and add both rows.
Expand Down Expand Up @@ -1213,7 +1233,8 @@ bool StringIndex::leaf_insert(ObjKey obj_key, key_type key, size_t offset, Strin
IntegerColumn::const_iterator lower = it_end;

auto value_exists_in_list = [&]() {
if (m_target_column.is_fulltext()) {
if (m_target_column.full_word()) {
lower = sub.cbegin();
return reconstruct_string(offset, key, index_data) == value.get_string();
}
SortedListComparator slc(m_target_column);
Expand All @@ -1240,15 +1261,15 @@ bool StringIndex::leaf_insert(ObjKey obj_key, key_type key, size_t offset, Strin
// point and insert into the existing list.
ObjKey key_of_any_dup = ObjKey(sub.get(0));
StringConversionBuffer buffer;
StringData index_data_2 = m_target_column.is_fulltext() ? reconstruct_string(offset, key, index_data)
: get(key_of_any_dup).get_index_data(buffer);
StringData index_data_2 = m_target_column.full_word() ? reconstruct_string(offset, key, index_data)
: get(key_of_any_dup).get_index_data(buffer);
if (index_data == index_data_2 || suboffset > s_max_offset) {
insert_to_existing_list(obj_key, value, sub);
}
else {
#ifdef REALM_DEBUG
bool contains_only_duplicates = true;
if (!m_target_column.is_fulltext() && sub.size() > 1) {
if (!m_target_column.full_word() && sub.size() > 1) {
ObjKey first_key = ObjKey(sub.get(0));
ObjKey last_key = ObjKey(sub.back());
auto first = get(first_key);
Expand Down Expand Up @@ -1287,15 +1308,37 @@ Mixed StringIndex::get(ObjKey key) const
void StringIndex::erase(ObjKey key)
{
StringConversionBuffer buffer;
std::string_view value{(get(key).get_index_data(buffer))};
if (m_target_column.is_fulltext()) {
auto words = Tokenizer::get_instance()->reset(value).get_all_tokens();
for (auto& w : words) {
erase_string(key, w);
if (m_target_column.full_word()) {
if (m_target_column.tokenize()) {
// This is a full text index
auto index_data(get(key).get_index_data(buffer));
auto words = Tokenizer::get_instance()->reset(std::string_view(index_data)).get_all_tokens();
for (auto& w : words) {
erase_string(key, w);
}
}
else {
// This is a list (of strings)
erase_list(key);
}
}
else {
erase_string(key, value);
erase_string(key, get(key).get_index_data(buffer));
}
}

/// Remove from the index every distinct string currently held by the string
/// list of the object identified by `key`.
///
/// The list is read through the target column, its values are de-duplicated,
/// and `erase_string` is then called exactly once per distinct value.
void StringIndex::erase_list(ObjKey key)
{
    auto list = m_target_column.get_list(key);
    std::vector<StringData> strings;
    strings.reserve(list.size());
    for (auto& val : list) {
        strings.push_back(val);
    }

    // std::unique only collapses *adjacent* equal elements, so the values
    // must be sorted first. Without the sort, a list such as {"a", "b", "a"}
    // would hand "a" to erase_string a second time, after its entry for this
    // key has already been removed.
    std::sort(strings.begin(), strings.end());
    auto last = std::unique(strings.begin(), strings.end());
    for (auto it = strings.begin(); it != last; ++it) {
        erase_string(key, *it);
    }
}

Expand Down Expand Up @@ -1659,7 +1702,7 @@ void StringIndex::insert<StringData>(ObjKey key, StringData value)
{
StringConversionBuffer buffer;

if (this->m_target_column.is_fulltext()) {
if (this->m_target_column.tokenize()) {
auto words = Tokenizer::get_instance()->reset(std::string_view(value)).get_all_tokens();

for (auto& word : words) {
Expand All @@ -1680,7 +1723,7 @@ void StringIndex::set<StringData>(ObjKey key, StringData new_value)
Mixed old_value = get(key);
Mixed new_value2 = Mixed(new_value);

if (this->m_target_column.is_fulltext()) {
if (this->m_target_column.tokenize()) {
auto tokenizer = Tokenizer::get_instance();
StringData old_string = old_value.get_index_data(buffer);
std::set<std::string> old_words;
Expand Down Expand Up @@ -1966,6 +2009,10 @@ void StringIndex::dump_node_structure(const Array& node, std::ostream& out, int
}
}

/// Debug convenience overload: write this index's node structure to standard
/// output, starting at level 0.
void StringIndex::dump_node_structure() const
{
    std::ostream& sink = std::cout;
    const int top_level = 0;
    do_dump_node_structure(sink, top_level);
}

void StringIndex::do_dump_node_structure(std::ostream& out, int level) const
{
Expand Down
25 changes: 17 additions & 8 deletions src/realm/index_string.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,8 @@ class ClusterColumn {
ClusterColumn(const ClusterTree* cluster_tree, ColKey column_key, IndexType type)
: m_cluster_tree(cluster_tree)
, m_column_key(column_key)
, m_type(type)
, m_tokenize(type == IndexType::Fulltext)
, m_full_word(m_tokenize | column_key.is_collection())
{
}
size_t size() const
Expand All @@ -152,17 +153,23 @@ class ClusterColumn {
{
return m_column_key.is_nullable();
}
bool is_fulltext() const
bool tokenize() const
{
return m_type == IndexType::Fulltext;
return m_tokenize;
}
bool full_word() const
{
return m_full_word;
}
Mixed get_value(ObjKey key) const;
Lst<String> get_list(ObjKey key) const;
std::vector<ObjKey> get_all_keys() const;

private:
const ClusterTree* m_cluster_tree;
ColKey m_column_key;
IndexType m_type;
bool m_tokenize;
bool m_full_word;
};

class StringIndex {
Expand Down Expand Up @@ -203,7 +210,7 @@ class StringIndex {
bool is_empty() const;
bool is_fulltext_index() const
{
return this->m_target_column.is_fulltext();
return this->m_target_column.tokenize();
}

template <class T>
Expand All @@ -217,6 +224,10 @@ class StringIndex {
void set(ObjKey key, util::Optional<T> new_value);

void erase(ObjKey key);
void erase_list(ObjKey key);
// Erase without getting value from parent column (useful when string stored
// does not directly match string in parent, like with full-text indexing)
void erase_string(ObjKey key, StringData value);

template <class T>
ObjKey find_first(T value) const;
Expand All @@ -237,6 +248,7 @@ class StringIndex {
#ifdef REALM_DEBUG
template <class T>
void verify_entries(const ClusterColumn& column) const;
void dump_node_structure() const;
void do_dump_node_structure(std::ostream&, int) const;
#endif

Expand Down Expand Up @@ -316,9 +328,6 @@ class StringIndex {
bool noextend = false);
void node_insert_split(size_t ndx, size_t new_ref);
void node_insert(size_t ndx, size_t ref);
// Erase without getting value from parent column (useful when string stored
// does not directly match string in parent, like with full-text indexing)
void erase_string(ObjKey key, StringData value);
void do_delete(ObjKey key, StringData, size_t offset);

Mixed get(ObjKey key) const;
Expand Down
72 changes: 72 additions & 0 deletions src/realm/list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "realm/table_view.hpp"
#include "realm/group.hpp"
#include "realm/replication.hpp"
#include "realm/index_string.hpp"

namespace realm {

Expand Down Expand Up @@ -192,6 +193,77 @@ void Lst<T>::distinct(std::vector<size_t>& indices, util::Optional<bool> sort_or
}
}

/***************************** Lst<StringData> ******************************/

template <>
void Lst<StringData>::do_insert(size_t ndx, StringData value)
{
if (auto index = m_obj.get_table()->get_search_index(m_col_key)) {
if (m_tree->find_first(value) == realm::not_found) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose the idea behind handling duplicates at this level is to optimize the find operation. But the cost of O(N) on insert/set/remove seems very high. What would you think about allowing duplicates into the index to make these operations faster and less error prone? Then we would have a small sort/unique cost applied in StringIndex::from_list_all to remove duplicates from the results. Or perhaps we could even return duplicates there? Then users could see how many items in the list match which could be a useful feature?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll give it a try.

Copy link
Contributor Author

@jedelbo jedelbo Nov 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am a bit wary of changing the invariant that there are no duplicate object keys in the index. I have now changed the code so that inserting a value twice is idempotent, so at least we can avoid the O(N) behavior on insertions. Would that be an acceptable compromise?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm good with that. Thanks!

// Value not present - insert
index->insert(m_obj.get_key(), value);
}
}
m_tree->insert(ndx, value);
}

/// Replace the element at position `ndx` with `value`, keeping the column's
/// search index (when there is one) in step with the list contents.
///
/// The old string is erased from the index only when the overwritten element
/// is its sole occurrence in the list, and the new string is inserted only
/// when the list does not already contain it.
template <>
void Lst<StringData>::do_set(size_t ndx, StringData value)
{
    if (auto index = m_obj.get_table()->get_search_index(m_col_key)) {
        auto old_value = m_tree->get(ndx);
        // Overwriting an element with an equal string leaves the set of
        // distinct strings unchanged, so the index must not be touched.
        // Without this guard a sole occurrence would be erased (nb_old == 1)
        // and never re-inserted (nb_new != 0), leaving the value in the list
        // but missing from the index.
        // NOTE(review): callers may already filter out equal values - confirm.
        if (!(old_value == value)) {
            size_t nb_old = 0;
            size_t nb_new = 0;
            m_tree->for_all([&](StringData val) {
                if (val == old_value) {
                    nb_old++;
                }
                if (val == value) {
                    nb_new++;
                }
                // Both questions are settled once the new value has been seen
                // and the old value has been seen more than once.
                return !(nb_new && nb_old > 1);
            });

            if (nb_old == 1) {
                // The overwritten element is the only occurrence - erase
                index->erase_string(m_obj.get_key(), old_value);
            }
            if (!nb_new) {
                // Value not present - insert
                index->insert(m_obj.get_key(), value);
            }
        }
    }
    m_tree->set(ndx, value);
}

/// Erase the element at position `ndx`. When the column has a search index
/// and the erased string occurs exactly once in the list, the string is
/// removed from the index as well.
template <>
inline void Lst<StringData>::do_remove(size_t ndx)
{
    if (auto search_index = m_obj.get_table()->get_search_index(m_col_key)) {
        const auto doomed = m_tree->get(ndx);
        size_t occurrences = 0;
        // Count how often the doomed string appears. A second hit already
        // settles the question, so the callback returns false from then on
        // (presumably this makes for_all stop early - confirm).
        m_tree->for_all([&occurrences, &doomed](StringData candidate) {
            if (candidate == doomed) {
                ++occurrences;
            }
            return occurrences <= 1;
        });

        const bool was_sole_occurrence = (occurrences == 1);
        if (was_sole_occurrence) {
            search_index->erase_string(m_obj.get_key(), doomed);
        }
    }
    m_tree->erase(ndx);
}

/// Remove every element from the list. When the column has a search index,
/// the owning object's list entries are erased from the index first, while
/// the list still holds its values.
template <>
inline void Lst<StringData>::do_clear()
{
    if (auto search_index = m_obj.get_table()->get_search_index(m_col_key)) {
        const auto owner_key = m_obj.get_key();
        search_index->erase_list(owner_key);
    }
    m_tree->clear();
}

/********************************* Lst<Key> *********************************/

template <>
Expand Down
Loading