Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Getting rid of some glue code #7748

Merged
merged 1 commit into from
May 30, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 29 additions & 144 deletions src/realm/array_direct.hpp
Original file line number Diff line number Diff line change
@@ -194,43 +194,47 @@ class UnalignedWordIter {
}
// 'num_bits' number of bits which must be read
// WARNING returned word may be garbage above the first 'num_bits' bits.
uint64_t get(size_t num_bits)
uint64_t consume(size_t num_bits)
{
auto first_word = m_word_ptr[0];
uint64_t result = first_word >> m_in_word_offset;
// note: above shifts in zeroes
if (m_in_word_offset + num_bits <= 64)
return result;
// if we're here, in_word_offset > 0
auto first_word_size = 64 - m_in_word_offset;
auto second_word = m_word_ptr[1];
result |= second_word << first_word_size;
// note: above shifts in zeroes below the bits we want
if (m_in_word_offset + num_bits > 64) {
// if we're here, in_word_offset > 0
auto first_word_size = 64 - m_in_word_offset;
auto second_word = m_word_ptr[1];
result |= second_word << first_word_size;
// note: above shifts in zeroes below the bits we want
}
_bump(num_bits);
return result;
}
uint64_t get_with_unsafe_prefetch(size_t num_bits)
uint64_t consume_with_unsafe_prefetch(size_t num_bits)
{
auto first_word = m_word_ptr[0];
uint64_t result = first_word >> m_in_word_offset;
// note: above shifts in zeroes
auto first_word_size = 64 - m_in_word_offset;
auto second_word = m_word_ptr[1];
REALM_ASSERT_DEBUG(num_bits <= 64);
result |= (m_in_word_offset + num_bits > 64) ? (second_word << first_word_size) : 0;
if (num_bits > first_word_size)
result |= second_word << first_word_size;
// note: above shifts in zeroes below the bits we want
_bump(num_bits);
return result;
}

private:
const uint64_t* m_word_ptr;
unsigned m_in_word_offset;

// bump the iterator the specified number of bits
void bump(size_t num_bits)
void _bump(size_t num_bits)
{
auto total_offset = m_in_word_offset + num_bits;
m_word_ptr += total_offset >> 6;
m_in_word_offset = total_offset & 0x3F;
}

private:
const uint64_t* m_word_ptr;
unsigned m_in_word_offset;
};

// Read a bit field of up to 64 bits.
@@ -524,127 +528,6 @@ constexpr uint64_t field_sign_bit(int width)
return populate(width, 1ULL << (width - 1));
}

/* Unsigned LT.

This can be determined by trial subtraction. However, some care must be exercised
since simply subtracting one vector from another will allow carries from one
bitfield to flow into the next one. To avoid this, we isolate bitfields by clamping
the MSBs to 1 in A and 0 in B before subtraction. After the subtraction the MSBs in
the result indicate borrows from the MSB. We then compute overflow (borrow OUT of MSB)
using boolean logic as described below.

Unsigned LT is also used to find all zero fields or all non-zero fields, so it is
the backbone of all comparisons returning vectors.
*/

// compute the overflows in unsigned trial subtraction A-B. The overflows
// will be marked by 1 in the sign bit of each field in the result. Other
// bits in the result are zero.
// Overflow are detected for each field pair where A is less than B.
inline uint64_t unsigned_LT_vector(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // Flag, in the sign bit of every bitfield, the field pairs where A < B
    // (unsigned). All non-sign bits of the result are zero.
    //
    // Step 1: trial subtraction without cross-field carries. Clamp each
    // field's sign bit to 1 in A and to 0 in B, so a borrow can never
    // propagate out of a field into its neighbour.
    const auto a_clamped = A | MSBs;  // 1 op
    const auto b_clamped = B & ~MSBs; // 2 ops
    // A set sign bit here marks a borrow INTO that field's MSB position.
    const auto borrow_in = ~(a_clamped - b_clamped); // 2 ops (total latency 4)

    // Step 2: finish the subtraction at the sign-bit position itself.
    //   A  B  borrow_in : (A - B - borrow_in)
    //   0  0  0         : 0
    //   0  0  1         : 1 + borrow-out
    //   0  1  0         : 1 + borrow-out
    //   0  1  1         : 0 + borrow-out
    //   1  0  0         : 1
    //   1  0  1         : 0
    //   1  1  0         : 0
    //   1  1  1         : 1 + borrow-out
    // borrow-out = (~A & B) | (~A & borrow_in) | (A & B & borrow_in)
    //            = (~A & (B | borrow_in)) | (A & B & borrow_in)
    // A borrow out of the MSB is exactly the unsigned-LT overflow.
    const auto borrow_out = (~A & (B | borrow_in)) | (A & B & borrow_in);
    // Keep only the sign-bit positions. Roughly 12 ops, latency ~7; several
    // of them can execute in parallel on a wide core.
    return borrow_out & MSBs;
}

inline uint64_t find_all_fields_unsigned_LT(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // Direct alias: the trial-subtraction kernel already computes the
    // unsigned less-than vector.
    return unsigned_LT_vector(MSBs, A, B);
}

inline uint64_t find_all_fields_NE(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // A field differs from its counterpart iff (A ^ B) is non-zero there,
    // which is exactly when the unsigned trial subtraction 0 - (A ^ B)
    // borrows out of that field.
    const auto diff = A ^ B;
    return unsigned_LT_vector(MSBs, 0, diff);
}

inline uint64_t find_all_fields_EQ(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // Equal fields are exactly those NOT flagged by the NE scan. Negating
    // the NE vector flips every bit, so mask back down to the sign-bit
    // positions only.
    return MSBs & ~find_all_fields_NE(MSBs, A, B);
}

inline uint64_t find_all_fields_unsigned_LE(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // A <= B is the complement of A > B, i.e. of B < A. Compute the GT
    // vector with the operands swapped, negate it (which flips every bit),
    // and keep only the sign-bit positions.
    const auto gt_vector = unsigned_LT_vector(MSBs, B, A);
    return MSBs & ~gt_vector;
}

inline uint64_t find_all_fields_unsigned_GE(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // A >= B holds exactly when B <= A, so reuse the LE scan with the
    // operands swapped.
    return find_all_fields_unsigned_LE(MSBs, B, A);
}

inline uint64_t find_all_fields_unsigned_GT(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // A > B holds exactly when B < A, so reuse the LT scan with the
    // operands swapped.
    return find_all_fields_unsigned_LT(MSBs, B, A);
}

/*
Handling signed values

Trial subtraction only works as is for unsigned. We simply transform signed into unsigned
by pushing all values up by 1<<(field_width-1). This makes all negative values positive and positive
values remain positive, although larger. Any overflow during the push can be ignored.
After that transformation, trial subtraction correctly detects the LT condition.

*/


inline uint64_t find_all_fields_signed_LT(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // XOR-ing each field with its sign bit biases every value by
    // 2^(width-1), mapping signed order onto unsigned order; then the
    // unsigned trial-subtraction kernel applies directly.
    return unsigned_LT_vector(MSBs, A ^ MSBs, B ^ MSBs);
}

inline uint64_t find_all_fields_signed_LE(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // Same sign-bit bias trick as signed LT, delegated to the unsigned
    // LE scan.
    return find_all_fields_unsigned_LE(MSBs, A ^ MSBs, B ^ MSBs);
}

inline uint64_t find_all_fields_signed_GT(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // (A > B) holds exactly when (B < A): swap the operands and reuse the
    // signed LT scan.
    return find_all_fields_signed_LT(MSBs, B, A);
}

inline uint64_t find_all_fields_signed_GE(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // (A >= B) holds exactly when (B <= A): swap the operands and reuse the
    // signed LE scan.
    return find_all_fields_signed_LE(MSBs, B, A);
}

constexpr uint32_t inverse_width[65] = {
65536 * 64 / 1, // never used
65536 * 64 / 1, 65536 * 64 / 2, 65536 * 64 / 3, 65536 * 64 / 4, 65536 * 64 / 5, 65536 * 64 / 6,
@@ -709,12 +592,10 @@ size_t parallel_subword_find(VectorCompare vector_compare, const uint64_t* data,
uint64_t found_vector = 0;
while (total_bit_count_left >= fast_scan_limit) {
// unrolling 2x
const auto word0 = it.get_with_unsafe_prefetch(bit_count_pr_iteration);
it.bump(bit_count_pr_iteration);
const auto word1 = it.get_with_unsafe_prefetch(bit_count_pr_iteration);
const auto word0 = it.consume_with_unsafe_prefetch(bit_count_pr_iteration);
const auto word1 = it.consume_with_unsafe_prefetch(bit_count_pr_iteration);
auto found_vector0 = vector_compare(MSBs, word0, search_vector);
auto found_vector1 = vector_compare(MSBs, word1, search_vector);
it.bump(bit_count_pr_iteration);
if (found_vector0) {
const auto sub_word_index = first_field_marked(width, found_vector0);
return start + sub_word_index;
@@ -726,19 +607,23 @@ size_t parallel_subword_find(VectorCompare vector_compare, const uint64_t* data,
total_bit_count_left -= 2 * bit_count_pr_iteration;
start += 2 * field_count;
}

// One word at a time
while (total_bit_count_left >= bit_count_pr_iteration) {
const auto word = it.get(bit_count_pr_iteration);
const auto word = it.consume(bit_count_pr_iteration);
found_vector = vector_compare(MSBs, word, search_vector);
if (found_vector) {
const auto sub_word_index = first_field_marked(width, found_vector);
return start + sub_word_index;
}
total_bit_count_left -= bit_count_pr_iteration;
start += field_count;
it.bump(bit_count_pr_iteration);
}
if (total_bit_count_left) { // final subword, may be partial
const auto word = it.get(total_bit_count_left); // <-- limit lookahead to avoid touching memory beyond array

// final subword, may be partial
if (total_bit_count_left) {
// limit lookahead to avoid touching memory beyond array
const auto word = it.consume(total_bit_count_left);
found_vector = vector_compare(MSBs, word, search_vector);
auto last_word_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - total_bit_count_left);
found_vector &= last_word_mask;
1 change: 1 addition & 0 deletions src/realm/integer_compressor.hpp
Original file line number Diff line number Diff line change
@@ -24,6 +24,7 @@
#include <vector>
#include <realm/query_conditions.hpp>
#include <realm/array_direct.hpp>
#include <realm/node.hpp>

namespace realm {

50 changes: 4 additions & 46 deletions src/realm/integer_flex_compressor.hpp
Original file line number Diff line number Diff line change
@@ -27,9 +27,6 @@

namespace realm {

struct WordTypeValue {};
struct WordTypeIndex {};

//
// Compress array in Flex format
// Decompress array in WTypeBits formats
@@ -99,7 +96,7 @@ inline std::vector<int64_t> FlexCompressor::get_all(const IntegerCompressor& c,
BfIterator data_iterator{data, 0, v_w, v_w, 0};
auto remaining_bits = ndx_w * range;
while (remaining_bits >= bit_per_it) {
auto word = unaligned_ndx_iterator.get(bit_per_it);
auto word = unaligned_ndx_iterator.consume(bit_per_it);
for (int i = 0; i < values_per_word; ++i) {
const auto index = word & ndx_mask;
data_iterator.move(static_cast<size_t>(index));
@@ -108,10 +105,9 @@ inline std::vector<int64_t> FlexCompressor::get_all(const IntegerCompressor& c,
word >>= ndx_w;
}
remaining_bits -= bit_per_it;
unaligned_ndx_iterator.bump(bit_per_it);
}
if (remaining_bits) {
auto last_word = unaligned_ndx_iterator.get(remaining_bits);
auto last_word = unaligned_ndx_iterator.consume(remaining_bits);
while (remaining_bits) {
const auto index = last_word & ndx_mask;
data_iterator.move(static_cast<size_t>(index));
@@ -254,44 +250,6 @@ inline bool FlexCompressor::find_linear(const Array& arr, int64_t value, size_t
return true;
}

template <typename Cond, typename Type = WordTypeValue>
inline uint64_t vector_compare(uint64_t MSBs, uint64_t a, uint64_t b)
{
if constexpr (std::is_same_v<Cond, Equal>)
return find_all_fields_EQ(MSBs, a, b);
if constexpr (std::is_same_v<Cond, NotEqual>)
return find_all_fields_NE(MSBs, a, b);

if constexpr (std::is_same_v<Cond, Greater>) {
if (std::is_same_v<Type, WordTypeValue>)
return find_all_fields_signed_GT(MSBs, a, b);
if (std::is_same_v<Type, WordTypeIndex>)
return find_all_fields_unsigned_GT(MSBs, a, b);
REALM_UNREACHABLE();
}
if constexpr (std::is_same_v<Cond, GreaterEqual>) {
if constexpr (std::is_same_v<Type, WordTypeValue>)
return find_all_fields_signed_GE(MSBs, a, b);
if constexpr (std::is_same_v<Type, WordTypeIndex>)
return find_all_fields_unsigned_GE(MSBs, a, b);
REALM_UNREACHABLE();
}
if constexpr (std::is_same_v<Cond, Less>) {
if constexpr (std::is_same_v<Type, WordTypeValue>)
return find_all_fields_signed_LT(MSBs, a, b);
if constexpr (std::is_same_v<Type, WordTypeIndex>)
return find_all_fields_unsigned_LT(MSBs, a, b);
REALM_UNREACHABLE();
}
if constexpr (std::is_same_v<Cond, LessEqual>) {
if constexpr (std::is_same_v<Type, WordTypeValue>)
return find_all_fields_signed_LT(MSBs, a, b);
if constexpr (std::is_same_v<Type, WordTypeIndex>)
return find_all_fields_unsigned_LE(MSBs, a, b);
REALM_UNREACHABLE();
}
}

template <typename CondVal, typename CondIndex>
inline bool FlexCompressor::find_parallel(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
QueryStateBase* state)
@@ -305,14 +263,14 @@ inline bool FlexCompressor::find_parallel(const Array& arr, int64_t value, size_

auto MSBs = compressor.msb();
auto search_vector = populate(v_width, value);
auto v_start = parallel_subword_find(vector_compare<CondVal>, data, 0, v_width, MSBs, search_vector, 0, v_size);
auto v_start = parallel_subword_find(find_all_fields<CondVal>, data, 0, v_width, MSBs, search_vector, 0, v_size);
if (v_start == v_size)
return true;

MSBs = compressor.ndx_msb();
search_vector = populate(ndx_width, v_start);
while (start < end) {
start = parallel_subword_find(vector_compare<CondIndex, WordTypeIndex>, data, offset, ndx_width, MSBs,
start = parallel_subword_find(find_all_fields_unsigned<CondIndex>, data, offset, ndx_width, MSBs,
search_vector, start, end);
if (start < end)
if (!state->match(start + baseindex))
21 changes: 3 additions & 18 deletions src/realm/integer_packed_compressor.hpp
Original file line number Diff line number Diff line change
@@ -82,16 +82,15 @@ inline std::vector<int64_t> PackedCompressor::get_all(const IntegerCompressor& c
UnalignedWordIter unaligned_data_iterator(data, starting_bit);
auto cnt_bits = starting_bit;
while (cnt_bits + bit_per_it < total_bits) {
auto word = unaligned_data_iterator.get(bit_per_it);
auto word = unaligned_data_iterator.consume(bit_per_it);
for (int i = 0; i < values_per_word; ++i) {
res.push_back(sign_extend_field_by_mask(sign_mask, word & mask));
word >>= v_w;
}
cnt_bits += bit_per_it;
unaligned_data_iterator.bump(bit_per_it);
}
if (cnt_bits < total_bits) {
auto last_word = unaligned_data_iterator.get(static_cast<unsigned>(total_bits - cnt_bits));
auto last_word = unaligned_data_iterator.consume(static_cast<unsigned>(total_bits - cnt_bits));
while (cnt_bits < total_bits) {
res.push_back(sign_extend_field_by_mask(sign_mask, last_word & mask));
cnt_bits += v_w;
@@ -172,26 +171,12 @@ inline bool PackedCompressor::find_parallel(const Array& arr, int64_t value, siz
// see if there is a match with what we are looking for. Reducing the number of comparison by ~logk(N) where K is
// the width of each single value within a 64 bit word and N is the total number of values stored in the array.

// apparently the compiler is not able to deduce the type of a global function after moving stuff in the header
// (not so sure why)
static auto vector_compare = [](uint64_t MSBs, uint64_t a, uint64_t b) {
if constexpr (std::is_same_v<Cond, Equal>)
return find_all_fields_EQ(MSBs, a, b);
if constexpr (std::is_same_v<Cond, NotEqual>)
return find_all_fields_NE(MSBs, a, b);
if constexpr (std::is_same_v<Cond, Greater>)
return find_all_fields_signed_GT(MSBs, a, b);
if constexpr (std::is_same_v<Cond, Less>)
return find_all_fields_signed_LT(MSBs, a, b);
REALM_UNREACHABLE();
};

const auto data = (const uint64_t*)arr.m_data;
const auto width = arr.m_width;
const auto MSBs = arr.integer_compressor().msb();
const auto search_vector = populate(arr.m_width, value);
while (start < end) {
start = parallel_subword_find(vector_compare, data, 0, width, MSBs, search_vector, start, end);
start = parallel_subword_find(find_all_fields<Cond>, data, 0, width, MSBs, search_vector, start, end);
if (start < end)
if (!state->match(start + baseindex))
return false;
Loading