Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Getting rid of some glue code #7748

Merged
merged 1 commit into from
May 30, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 29 additions & 144 deletions src/realm/array_direct.hpp
Original file line number Diff line number Diff line change
@@ -194,43 +194,47 @@ class UnalignedWordIter {
}
// 'num_bits' number of bits which must be read
// WARNING returned word may be garbage above the first 'num_bits' bits.
uint64_t get(size_t num_bits)
uint64_t consume(size_t num_bits)
{
auto first_word = m_word_ptr[0];
uint64_t result = first_word >> m_in_word_offset;
// note: above shifts in zeroes
if (m_in_word_offset + num_bits <= 64)
return result;
// if we're here, in_word_offset > 0
auto first_word_size = 64 - m_in_word_offset;
auto second_word = m_word_ptr[1];
result |= second_word << first_word_size;
// note: above shifts in zeroes below the bits we want
if (m_in_word_offset + num_bits > 64) {
// if we're here, in_word_offset > 0
auto first_word_size = 64 - m_in_word_offset;
auto second_word = m_word_ptr[1];
result |= second_word << first_word_size;
// note: above shifts in zeroes below the bits we want
}
_bump(num_bits);
return result;
}
uint64_t get_with_unsafe_prefetch(size_t num_bits)
uint64_t consume_with_unsafe_prefetch(size_t num_bits)
{
auto first_word = m_word_ptr[0];
uint64_t result = first_word >> m_in_word_offset;
// note: above shifts in zeroes
auto first_word_size = 64 - m_in_word_offset;
auto second_word = m_word_ptr[1];
REALM_ASSERT_DEBUG(num_bits <= 64);
result |= (m_in_word_offset + num_bits > 64) ? (second_word << first_word_size) : 0;
if (num_bits > first_word_size)
result |= second_word << first_word_size;
// note: above shifts in zeroes below the bits we want
_bump(num_bits);
return result;
}

private:
const uint64_t* m_word_ptr;
unsigned m_in_word_offset;

// bump the iterator the specified number of bits
void bump(size_t num_bits)
void _bump(size_t num_bits)
{
auto total_offset = m_in_word_offset + num_bits;
m_word_ptr += total_offset >> 6;
m_in_word_offset = total_offset & 0x3F;
}

private:
const uint64_t* m_word_ptr;
unsigned m_in_word_offset;
};

// Read a bit field of up to 64 bits.
@@ -524,127 +528,6 @@ constexpr uint64_t field_sign_bit(int width)
return populate(width, 1ULL << (width - 1));
}

/* Unsigned LT.

This can be determined by trial subtraction. However, some care must be exercised
since simply subtracting one vector from another will allow carries from one
bitfield to flow into the next one. To avoid this, we isolate bitfields by clamping
the MSBs to 1 in A and 0 in B before subtraction. After the subtraction the MSBs in
the result indicate borrows from the MSB. We then compute overflow (borrow OUT of MSB)
using boolean logic as described below.

Unsigned LT is also used to find all zero fields or all non-zero fields, so it is
the backbone of all comparisons returning vectors.
*/

// compute the overflows in unsigned trial subtraction A-B. The overflows
// will be marked by 1 in the sign bit of each field in the result. Other
// bits in the result are zero.
// Overflow are detected for each field pair where A is less than B.
inline uint64_t unsigned_LT_vector(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // Flag, in the sign bit of every bitfield, the field pairs where A < B
    // (unsigned). All non-sign bits of the result are zero.
    //
    // Step 1: trial subtraction without cross-field carries. Clamp each
    // field's sign bit to 1 in A and to 0 in B, so a borrow can never
    // propagate out of a field into its neighbour.
    const auto a_clamped = A | MSBs;  // 1 op
    const auto b_clamped = B & ~MSBs; // 2 ops
    // A set sign bit here marks a borrow INTO that field's MSB position.
    const auto borrow_in = ~(a_clamped - b_clamped); // 2 ops (total latency 4)

    // Step 2: finish the subtraction at the sign-bit position itself.
    //   A  B  borrow_in : (A - B - borrow_in)
    //   0  0  0         : 0
    //   0  0  1         : 1 + borrow-out
    //   0  1  0         : 1 + borrow-out
    //   0  1  1         : 0 + borrow-out
    //   1  0  0         : 1
    //   1  0  1         : 0
    //   1  1  0         : 0
    //   1  1  1         : 1 + borrow-out
    // borrow-out = (~A & B) | (~A & borrow_in) | (A & B & borrow_in)
    //            = (~A & (B | borrow_in)) | (A & B & borrow_in)
    // A borrow out of the MSB is exactly the unsigned-LT overflow.
    const auto borrow_out = (~A & (B | borrow_in)) | (A & B & borrow_in);
    // Keep only the sign-bit positions. Roughly 12 ops, latency ~7; several
    // of them can execute in parallel on a wide core.
    return borrow_out & MSBs;
}

inline uint64_t find_all_fields_unsigned_LT(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // Direct alias: the trial-subtraction kernel already computes the
    // unsigned less-than vector.
    return unsigned_LT_vector(MSBs, A, B);
}

inline uint64_t find_all_fields_NE(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // A field differs from its counterpart iff (A ^ B) is non-zero there,
    // which is exactly when the unsigned trial subtraction 0 - (A ^ B)
    // borrows out of that field.
    const auto diff = A ^ B;
    return unsigned_LT_vector(MSBs, 0, diff);
}

inline uint64_t find_all_fields_EQ(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // Equal fields are exactly those NOT flagged by the NE scan. Negating
    // the NE vector flips every bit, so mask back down to the sign-bit
    // positions only.
    return MSBs & ~find_all_fields_NE(MSBs, A, B);
}

inline uint64_t find_all_fields_unsigned_LE(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // A <= B is the complement of A > B, i.e. of B < A. Compute the GT
    // vector with the operands swapped, negate it (which flips every bit),
    // and keep only the sign-bit positions.
    const auto gt_vector = unsigned_LT_vector(MSBs, B, A);
    return MSBs & ~gt_vector;
}

inline uint64_t find_all_fields_unsigned_GE(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // A >= B holds exactly when B <= A, so reuse the LE scan with the
    // operands swapped.
    return find_all_fields_unsigned_LE(MSBs, B, A);
}

inline uint64_t find_all_fields_unsigned_GT(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // A > B holds exactly when B < A, so reuse the LT scan with the
    // operands swapped.
    return find_all_fields_unsigned_LT(MSBs, B, A);
}

/*
Handling signed values

Trial subtraction only works as is for unsigned. We simply transform signed into unsigned
by pushing all values up by 1<<(field_width-1). This makes all negative values positive and positive
values remain positive, although larger. Any overflow during the push can be ignored.
After that transformation, trial subtraction correctly detects the LT condition.

*/


inline uint64_t find_all_fields_signed_LT(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // XOR-ing each field with its sign bit biases every value by
    // 2^(width-1), mapping signed order onto unsigned order; then the
    // unsigned trial-subtraction kernel applies directly.
    return unsigned_LT_vector(MSBs, A ^ MSBs, B ^ MSBs);
}

inline uint64_t find_all_fields_signed_LE(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // Same sign-bit bias trick as signed LT, delegated to the unsigned
    // LE scan.
    return find_all_fields_unsigned_LE(MSBs, A ^ MSBs, B ^ MSBs);
}

inline uint64_t find_all_fields_signed_GT(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // (A > B) holds exactly when (B < A): swap the operands and reuse the
    // signed LT scan.
    return find_all_fields_signed_LT(MSBs, B, A);
}

inline uint64_t find_all_fields_signed_GE(uint64_t MSBs, uint64_t A, uint64_t B)
{
    // (A >= B) holds exactly when (B <= A): swap the operands and reuse the
    // signed LE scan.
    return find_all_fields_signed_LE(MSBs, B, A);
}

constexpr uint32_t inverse_width[65] = {
65536 * 64 / 1, // never used
65536 * 64 / 1, 65536 * 64 / 2, 65536 * 64 / 3, 65536 * 64 / 4, 65536 * 64 / 5, 65536 * 64 / 6,
@@ -709,12 +592,10 @@ size_t parallel_subword_find(VectorCompare vector_compare, const uint64_t* data,
uint64_t found_vector = 0;
while (total_bit_count_left >= fast_scan_limit) {
// unrolling 2x
const auto word0 = it.get_with_unsafe_prefetch(bit_count_pr_iteration);
it.bump(bit_count_pr_iteration);
const auto word1 = it.get_with_unsafe_prefetch(bit_count_pr_iteration);
const auto word0 = it.consume_with_unsafe_prefetch(bit_count_pr_iteration);
const auto word1 = it.consume_with_unsafe_prefetch(bit_count_pr_iteration);
auto found_vector0 = vector_compare(MSBs, word0, search_vector);
auto found_vector1 = vector_compare(MSBs, word1, search_vector);
it.bump(bit_count_pr_iteration);
if (found_vector0) {
const auto sub_word_index = first_field_marked(width, found_vector0);
return start + sub_word_index;
@@ -726,19 +607,23 @@ size_t parallel_subword_find(VectorCompare vector_compare, const uint64_t* data,
total_bit_count_left -= 2 * bit_count_pr_iteration;
start += 2 * field_count;
}

// One word at a time
while (total_bit_count_left >= bit_count_pr_iteration) {
const auto word = it.get(bit_count_pr_iteration);
const auto word = it.consume(bit_count_pr_iteration);
found_vector = vector_compare(MSBs, word, search_vector);
if (found_vector) {
const auto sub_word_index = first_field_marked(width, found_vector);
return start + sub_word_index;
}
total_bit_count_left -= bit_count_pr_iteration;
start += field_count;
it.bump(bit_count_pr_iteration);
}
if (total_bit_count_left) { // final subword, may be partial
const auto word = it.get(total_bit_count_left); // <-- limit lookahead to avoid touching memory beyond array

// final subword, may be partial
if (total_bit_count_left) {
// limit lookahead to avoid touching memory beyond array
const auto word = it.consume(total_bit_count_left);
found_vector = vector_compare(MSBs, word, search_vector);
auto last_word_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - total_bit_count_left);
found_vector &= last_word_mask;
1 change: 1 addition & 0 deletions src/realm/integer_compressor.hpp
Original file line number Diff line number Diff line change
@@ -24,6 +24,7 @@
#include <vector>
#include <realm/query_conditions.hpp>
#include <realm/array_direct.hpp>
#include <realm/node.hpp>

namespace realm {

50 changes: 4 additions & 46 deletions src/realm/integer_flex_compressor.hpp
Original file line number Diff line number Diff line change
@@ -27,9 +27,6 @@

namespace realm {

struct WordTypeValue {};
struct WordTypeIndex {};

//
// Compress array in Flex format
// Decompress array in WTypeBits formats
@@ -99,7 +96,7 @@ inline std::vector<int64_t> FlexCompressor::get_all(const IntegerCompressor& c,
BfIterator data_iterator{data, 0, v_w, v_w, 0};
auto remaining_bits = ndx_w * range;
while (remaining_bits >= bit_per_it) {
auto word = unaligned_ndx_iterator.get(bit_per_it);
auto word = unaligned_ndx_iterator.consume(bit_per_it);
for (int i = 0; i < values_per_word; ++i) {
const auto index = word & ndx_mask;
data_iterator.move(static_cast<size_t>(index));
@@ -108,10 +105,9 @@ inline std::vector<int64_t> FlexCompressor::get_all(const IntegerCompressor& c,
word >>= ndx_w;
}
remaining_bits -= bit_per_it;
unaligned_ndx_iterator.bump(bit_per_it);
}
if (remaining_bits) {
auto last_word = unaligned_ndx_iterator.get(remaining_bits);
auto last_word = unaligned_ndx_iterator.consume(remaining_bits);
while (remaining_bits) {
const auto index = last_word & ndx_mask;
data_iterator.move(static_cast<size_t>(index));
@@ -254,44 +250,6 @@ inline bool FlexCompressor::find_linear(const Array& arr, int64_t value, size_t
return true;
}

template <typename Cond, typename Type = WordTypeValue>
inline uint64_t vector_compare(uint64_t MSBs, uint64_t a, uint64_t b)
{
if constexpr (std::is_same_v<Cond, Equal>)
return find_all_fields_EQ(MSBs, a, b);
if constexpr (std::is_same_v<Cond, NotEqual>)
return find_all_fields_NE(MSBs, a, b);

if constexpr (std::is_same_v<Cond, Greater>) {
if (std::is_same_v<Type, WordTypeValue>)
return find_all_fields_signed_GT(MSBs, a, b);
if (std::is_same_v<Type, WordTypeIndex>)
return find_all_fields_unsigned_GT(MSBs, a, b);
REALM_UNREACHABLE();
}
if constexpr (std::is_same_v<Cond, GreaterEqual>) {
if constexpr (std::is_same_v<Type, WordTypeValue>)
return find_all_fields_signed_GE(MSBs, a, b);
if constexpr (std::is_same_v<Type, WordTypeIndex>)
return find_all_fields_unsigned_GE(MSBs, a, b);
REALM_UNREACHABLE();
}
if constexpr (std::is_same_v<Cond, Less>) {
if constexpr (std::is_same_v<Type, WordTypeValue>)
return find_all_fields_signed_LT(MSBs, a, b);
if constexpr (std::is_same_v<Type, WordTypeIndex>)
return find_all_fields_unsigned_LT(MSBs, a, b);
REALM_UNREACHABLE();
}
if constexpr (std::is_same_v<Cond, LessEqual>) {
if constexpr (std::is_same_v<Type, WordTypeValue>)
return find_all_fields_signed_LT(MSBs, a, b);
if constexpr (std::is_same_v<Type, WordTypeIndex>)
return find_all_fields_unsigned_LE(MSBs, a, b);
REALM_UNREACHABLE();
}
}

template <typename CondVal, typename CondIndex>
inline bool FlexCompressor::find_parallel(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
QueryStateBase* state)
@@ -305,14 +263,14 @@ inline bool FlexCompressor::find_parallel(const Array& arr, int64_t value, size_

auto MSBs = compressor.msb();
auto search_vector = populate(v_width, value);
auto v_start = parallel_subword_find(vector_compare<CondVal>, data, 0, v_width, MSBs, search_vector, 0, v_size);
auto v_start = parallel_subword_find(find_all_fields<CondVal>, data, 0, v_width, MSBs, search_vector, 0, v_size);
if (v_start == v_size)
return true;

MSBs = compressor.ndx_msb();
search_vector = populate(ndx_width, v_start);
while (start < end) {
start = parallel_subword_find(vector_compare<CondIndex, WordTypeIndex>, data, offset, ndx_width, MSBs,
start = parallel_subword_find(find_all_fields_unsigned<CondIndex>, data, offset, ndx_width, MSBs,
search_vector, start, end);
if (start < end)
if (!state->match(start + baseindex))
21 changes: 3 additions & 18 deletions src/realm/integer_packed_compressor.hpp
Original file line number Diff line number Diff line change
@@ -82,16 +82,15 @@ inline std::vector<int64_t> PackedCompressor::get_all(const IntegerCompressor& c
UnalignedWordIter unaligned_data_iterator(data, starting_bit);
auto cnt_bits = starting_bit;
while (cnt_bits + bit_per_it < total_bits) {
auto word = unaligned_data_iterator.get(bit_per_it);
auto word = unaligned_data_iterator.consume(bit_per_it);
for (int i = 0; i < values_per_word; ++i) {
res.push_back(sign_extend_field_by_mask(sign_mask, word & mask));
word >>= v_w;
}
cnt_bits += bit_per_it;
unaligned_data_iterator.bump(bit_per_it);
}
if (cnt_bits < total_bits) {
auto last_word = unaligned_data_iterator.get(static_cast<unsigned>(total_bits - cnt_bits));
auto last_word = unaligned_data_iterator.consume(static_cast<unsigned>(total_bits - cnt_bits));
while (cnt_bits < total_bits) {
res.push_back(sign_extend_field_by_mask(sign_mask, last_word & mask));
cnt_bits += v_w;
@@ -172,26 +171,12 @@ inline bool PackedCompressor::find_parallel(const Array& arr, int64_t value, siz
// see if there is a match with what we are looking for. Reducing the number of comparison by ~logk(N) where K is
// the width of each single value within a 64 bit word and N is the total number of values stored in the array.

// apparently the compiler is not able to deduce the type of a global function after moving stuff in the header
// (not so sure why)
static auto vector_compare = [](uint64_t MSBs, uint64_t a, uint64_t b) {
if constexpr (std::is_same_v<Cond, Equal>)
return find_all_fields_EQ(MSBs, a, b);
if constexpr (std::is_same_v<Cond, NotEqual>)
return find_all_fields_NE(MSBs, a, b);
if constexpr (std::is_same_v<Cond, Greater>)
return find_all_fields_signed_GT(MSBs, a, b);
if constexpr (std::is_same_v<Cond, Less>)
return find_all_fields_signed_LT(MSBs, a, b);
REALM_UNREACHABLE();
};

const auto data = (const uint64_t*)arr.m_data;
const auto width = arr.m_width;
const auto MSBs = arr.integer_compressor().msb();
const auto search_vector = populate(arr.m_width, value);
while (start < end) {
start = parallel_subword_find(vector_compare, data, 0, width, MSBs, search_vector, start, end);
start = parallel_subword_find(find_all_fields<Cond>, data, 0, width, MSBs, search_vector, start, end);
if (start < end)
if (!state->match(start + baseindex))
return false;
Loading