diff --git a/Package.swift b/Package.swift index 55a07398f7b..1a2e581eddb 100644 --- a/Package.swift +++ b/Package.swift @@ -52,6 +52,7 @@ let notSyncServerSources: [String] = [ "realm/array_blobs_small.cpp", "realm/array_decimal128.cpp", "realm/array_fixed_bytes.cpp", + "realm/array_aggregate_optimizations.cpp", "realm/array_integer.cpp", "realm/array_key.cpp", "realm/array_mixed.cpp", @@ -78,6 +79,9 @@ let notSyncServerSources: [String] = [ "realm/group.cpp", "realm/group_writer.cpp", "realm/history.cpp", + "realm/integer_compressor.cpp", + "realm/integer_flex_compressor.cpp", + "realm/integer_packed_compressor.cpp", "realm/impl", "realm/index_string.cpp", "realm/link_translator.cpp", diff --git a/evergreen/config.yml b/evergreen/config.yml index 8b0fed20697..debf15fd1fa 100644 --- a/evergreen/config.yml +++ b/evergreen/config.yml @@ -137,6 +137,10 @@ functions: set_cmake_var realm_vars REALM_LLVM_COVERAGE BOOL On fi + if [[ -n "${compress|}" ]]; then + set_cmake_var realm_vars REALM_COMPRESS PATH "${cmake_toolchain_file}" + fi + set_cmake_var realm_vars REALM_BUILD_COMMANDLINE_TOOLS BOOL "${build_command_line_tools|On}" set_cmake_var realm_vars REALM_ENABLE_ENCRYPTION BOOL "${enable_realm_encryption|On}" if [[ -n "${compress|}" ]]; then diff --git a/src/realm/CMakeLists.txt b/src/realm/CMakeLists.txt index b5aebd5d3bf..18583f3549a 100644 --- a/src/realm/CMakeLists.txt +++ b/src/realm/CMakeLists.txt @@ -13,6 +13,7 @@ set(REALM_SOURCES array_blobs_big.cpp array_decimal128.cpp array_fixed_bytes.cpp + array_aggregate_optimizations.cpp array_integer.cpp array_key.cpp array_mixed.cpp @@ -36,6 +37,9 @@ set(REALM_SOURCES db.cpp group_writer.cpp history.cpp + integer_compressor.cpp + integer_flex_compressor.cpp + integer_packed_compressor.cpp impl/copy_replication.cpp impl/output_stream.cpp impl/simulated_failure.cpp @@ -163,6 +167,9 @@ set(REALM_INSTALL_HEADERS handover_defs.hpp history.hpp index_string.hpp + integer_compressor.hpp + integer_flex_compressor.hpp + integer_packed_compressor.hpp keys.hpp list.hpp mixed.hpp diff --git a/src/realm/alloc_slab.cpp b/src/realm/alloc_slab.cpp index 24b122e50d6..5465603c882 100644 --- a/src/realm/alloc_slab.cpp +++ b/src/realm/alloc_slab.cpp @@ -388,6 +388,10 @@ SlabAlloc::FreeBlock* SlabAlloc::allocate_block(int size) if (remaining) push_freelist_entry(remaining); REALM_ASSERT_EX(size_from_block(block) >= size, size_from_block(block), size, get_file_path_for_assertions()); + const auto block_before = bb_before(block); + REALM_ASSERT_DEBUG(block_before && block_before->block_after_size >= size); + const auto after_block_size = size_from_block(block); + REALM_ASSERT_DEBUG(after_block_size >= size); return block; } diff --git a/src/realm/array.cpp b/src/realm/array.cpp index 2f96b15877d..be70388bb2b 100644 --- a/src/realm/array.cpp +++ b/src/realm/array.cpp @@ -42,7 +42,6 @@ #pragma warning(disable : 4127) // Condition is constant warning #endif - // Header format (8 bytes): // ------------------------ // @@ -190,38 +189,79 @@ using namespace realm::util; void QueryStateBase::dyncast() {} -size_t Array::bit_width(int64_t v) +uint8_t Array::bit_width(int64_t v) { // FIXME: Assuming there is a 64-bit CPU reverse bitscan // instruction and it is fast, then this function could be // implemented as a table lookup on the result of the scan - if ((uint64_t(v) >> 4) == 0) { static const int8_t bits[] = {0, 1, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; return bits[int8_t(v)]; } - - // First flip all bits if bit 63 is set (will now always be zero) if (v < 0) v = ~v; 
-    // Then check if bits 15-31 used (32b), 7-31 used (16b), else (8b)
     return uint64_t(v) >> 31 ? 64 : uint64_t(v) >> 15 ? 32 : uint64_t(v) >> 7 ? 16 : 8;
 }
 
+template <size_t width>
+struct Array::VTableForWidth {
+    struct PopulatedVTable : VTable {
+        PopulatedVTable()
+        {
+            getter = &Array::get<width>;
+            setter = &Array::set<width>;
+            chunk_getter = &Array::get_chunk<width>;
+            finder[cond_Equal] = &Array::find_vtable<Equal>;
+            finder[cond_NotEqual] = &Array::find_vtable<NotEqual>;
+            finder[cond_Greater] = &Array::find_vtable<Greater>;
+            finder[cond_Less] = &Array::find_vtable<Less>;
+        }
+    };
+    static const PopulatedVTable vtable;
+};
+
+template <size_t width>
+const typename Array::VTableForWidth<width>::PopulatedVTable Array::VTableForWidth<width>::vtable;
+
 void Array::init_from_mem(MemRef mem) noexcept
 {
-    char* header = Node::init_from_mem(mem);
-    // Parse header
+    // `header` is the kind of header that was actually allocated. When we are
+    // decompressing, the header is of kind A, which partly defeats the purpose of the
+    // checks below: we fetch data from the just-initialised header, yet fields that only
+    // matter for type-A arrays (width, lower/upper bound) are never reset, even though
+    // they are used both for expanding the array and for querying the data.
+    const auto header = mem.get_addr();
+    const auto is_extended = m_integer_compressor.init(header);
+
+    m_is_inner_bptree_node = get_is_inner_bptree_node_from_header(header);
     m_has_refs = get_hasrefs_from_header(header);
     m_context_flag = get_context_flag_from_header(header);
-    update_width_cache_from_header();
+
+    if (is_extended) {
+        m_ref = mem.get_ref();
+        m_data = get_data_from_header(header);
+        m_size = m_integer_compressor.size();
+        m_width = m_integer_compressor.v_width();
+        m_lbound = -m_integer_compressor.v_mask();
+        m_ubound = m_integer_compressor.v_mask() - 1;
+        m_integer_compressor.set_vtable(*this);
+        m_getter = m_vtable->getter;
+    }
+    else {
+        // Old init phase.
+        Node::init_from_mem(mem);
+        update_width_cache_from_header();
+    }
+}
+
+MemRef Array::get_mem() const noexcept
+{
+    return MemRef(get_header_from_data(m_data), m_ref, m_alloc);
 }
 
 void Array::update_from_parent() noexcept
 {
-    REALM_ASSERT_DEBUG(is_attached());
     ArrayParent* parent = get_parent();
     REALM_ASSERT_DEBUG(parent);
     ref_type new_ref = get_ref_from_parent();
@@ -230,7 +270,7 @@ void Array::update_from_parent() noexcept
 
 void Array::set_type(Type type)
 {
-    REALM_ASSERT(is_attached());
+    REALM_ASSERT_DEBUG(is_attached());
 
     copy_on_write(); // Throws
 
@@ -254,7 +294,6 @@ void Array::set_type(Type type)
         set_hasrefs_in_header(init_has_refs, header);
 }
 
-
 void Array::destroy_children(size_t offset) noexcept
 {
     for (size_t i = offset; i != m_size; ++i) {
@@ -275,15 +314,28 @@ void Array::destroy_children(size_t offset) noexcept
     }
 }
 
+// size_t Array::get_byte_size() const noexcept
+//{
+//    const auto header = get_header();
+//    auto num_bytes = get_byte_size_from_header(header);
+//    auto read_only = m_alloc.is_read_only(m_ref) == true;
+//    auto capacity = get_capacity_from_header(header);
+//    auto bytes_ok = num_bytes <= capacity;
+//    REALM_ASSERT(read_only || bytes_ok);
+//    REALM_ASSERT_7(m_alloc.is_read_only(m_ref), ==, true, ||, num_bytes, <=, get_capacity_from_header(header));
+//    return num_bytes;
+// }
 
 ref_type Array::do_write_shallow(_impl::ArrayWriterBase& out) const
 {
-    // Write flat array
+    // This is where we may compress the array before writing it out.
     const char* header = get_header_from_data(m_data);
     size_t byte_size = get_byte_size();
-    uint32_t dummy_checksum = 0x41414141UL;                                // "AAAA" in ASCII
-    ref_type new_ref = out.write_array(header, byte_size, dummy_checksum); // Throws
-    REALM_ASSERT_3(new_ref % 8, ==, 0);                                    // 8-byte alignment
+    const auto compressed = is_compressed();
+    uint32_t dummy_checksum = compressed ? 0x42424242UL : 0x41414141UL; // "BBBB" / "AAAA" in ASCII
+    uint32_t dummy_checksum_bytes = compressed ? 2 : 4; // extended (compressed) arrays only write 2 checksum bytes
+    ref_type new_ref = out.write_array(header, byte_size, dummy_checksum, dummy_checksum_bytes); // Throws
+    REALM_ASSERT_3(new_ref % 8, ==, 0); // 8-byte alignment
     return new_ref;
 }
 
@@ -308,7 +360,6 @@ ref_type Array::do_write_deep(_impl::ArrayWriterBase& out, bool only_if_modified
         }
         new_array.add(value); // Throws
     }
-
     return new_array.do_write_shallow(out); // Throws
 }
 
@@ -333,8 +384,8 @@ void Array::move(size_t begin, size_t end, size_t dest_begin)
     if (bits_per_elem < 8) {
         // FIXME: Should be optimized
         for (size_t i = begin; i != end; ++i) {
-            int_fast64_t v = (this->*m_getter)(i);
-            (this->*(m_vtable->setter))(dest_begin++, v);
+            int_fast64_t v = m_getter(*this, i);
+            m_vtable->setter(*this, dest_begin++, v);
         }
         return;
     }
@@ -360,8 +411,8 @@ void Array::move(Array& dst, size_t ndx)
 
     size_t sz = m_size;
     for (size_t i = ndx; i < sz; i++) {
-        auto v = (this->*getter)(i);
-        (dst.*setter)(dest_begin++, v);
+        auto v = getter(*this, i);
+        setter(dst, dest_begin++, v);
     }
 
     truncate(ndx);
@@ -370,17 +421,15 @@ void Array::move(Array& dst, size_t ndx)
 void Array::set(size_t ndx, int64_t value)
 {
     REALM_ASSERT_3(ndx, <, m_size);
-    if ((this->*(m_vtable->getter))(ndx) == value)
+    if (m_vtable->getter(*this, ndx) == value)
         return;
 
     // Check if we need to copy before modifying
     copy_on_write(); // Throws
-
     // Grow the array if needed to store this value
     ensure_minimum_width(value); // Throws
-
     // Set the value
-    (this->*(m_vtable->setter))(ndx, value);
+    m_vtable->setter(*this, ndx, value);
 }
 
 void Array::set_as_ref(size_t ndx, ref_type ref)
@@ -428,6 +477,7 @@ void Array::insert(size_t ndx, int_fast64_t value)
 {
     REALM_ASSERT_DEBUG(ndx <= m_size);
 
+    decompress_array(*this);
     const auto old_width = m_width;
     const auto old_size = m_size;
     const Getter old_getter = m_getter; // Save old getter before potential width expansion
@@ -447,8 +497,8 @@ void Array::insert(size_t ndx, int_fast64_t value)
         size_t i = old_size;
         while (i > ndx) {
             --i;
-            int64_t v = (this->*old_getter)(i);
-            (this->*(m_vtable->setter))(i + 1, v);
+            int64_t v = old_getter(*this, i);
+            m_vtable->setter(*this, i + 1, v);
         }
     }
     else if (ndx != old_size) {
@@ -462,19 +512,30 @@ void Array::insert(size_t ndx, int_fast64_t value)
     }
 
     // Insert the new value
-    (this->*(m_vtable->setter))(ndx, value);
+    m_vtable->setter(*this, ndx, value);
 
     // Expand values above insertion
     if (do_expand) {
        size_t i = ndx;
        while (i != 0) {
            --i;
-            int64_t v = (this->*old_getter)(i);
-            (this->*(m_vtable->setter))(i, v);
+            int64_t v = old_getter(*this, i);
+            m_vtable->setter(*this, i, v);
        }
    }
 }
 
+void Array::copy_on_write()
+{
+    if (is_read_only() && !decompress_array(*this))
+        Node::copy_on_write();
+}
+
+void Array::copy_on_write(size_t min_size)
+{
+    if (is_read_only() && !decompress_array(*this))
+        Node::copy_on_write(min_size);
+}
 
 void Array::truncate(size_t new_size)
 {
@@ -499,7 +560,6 @@ void Array::truncate(size_t new_size)
     }
 }
 
-
 void Array::truncate_and_destroy_children(size_t new_size)
 {
     REALM_ASSERT(is_attached());
@@ -528,10 +588,8 @@ void Array::truncate_and_destroy_children(size_t new_size)
     }
 }
 
-
 void Array::do_ensure_minimum_width(int_fast64_t value)
 {
-
     // Make room for the new value
     const size_t width = bit_width(value);
@@ -544,353 +602,32 @@ void Array::do_ensure_minimum_width(int_fast64_t value)
     size_t i = m_size;
     while (i != 0) {
         --i;
-        int64_t v = (this->*old_getter)(i);
-        (this->*(m_vtable->setter))(i, v);
+        int64_t v = old_getter(*this, i);
+        m_vtable->setter(*this, i, v);
     }
 }
 
-int64_t Array::sum(size_t start, size_t end) const
+bool Array::compress_array(Array& arr) const
 {
-    REALM_TEMPEX(return sum, m_width, (start, end));
+    if (m_integer_compressor.get_encoding() == NodeHeader::Encoding::WTypBits) {
+        return m_integer_compressor.compress(*this, arr);
+    }
+    return false;
 }
 
-template <size_t w>
-int64_t Array::sum(size_t start, size_t end) const
+bool Array::decompress_array(Array& arr) const
 {
-    if (end == size_t(-1))
-        end = m_size;
-    REALM_ASSERT_EX(end <= m_size && start <= end, start, end, m_size);
-
-    if (w == 0 || start == end)
-        return 0;
-
-    int64_t s = 0;
-
-    // Sum manually until 128 bit aligned
-    for (; (start < end) && (((size_t(m_data) & 0xf) * 8 + start * w) % 128 != 0); start++) {
-        s += get<w>(start);
-    }
-
-    if (w == 1 || w == 2 || w == 4) {
-        // Sum of bitwidths less than a byte (which are always positive)
-        // uses a divide and conquer algorithm that is a variation of popolation count:
-        // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-
-        // static values needed for fast sums
-        const uint64_t m2 = 0x3333333333333333ULL;
-        const uint64_t m4 = 0x0f0f0f0f0f0f0f0fULL;
-        const uint64_t h01 = 0x0101010101010101ULL;
-
-        int64_t* data = reinterpret_cast<int64_t*>(m_data + start * w / 8);
-        size_t chunks = (end - start) * w / 8 / sizeof(int64_t);
-
-        for (size_t t = 0; t < chunks; t++) {
-            if (w == 1) {
-#if 0
-#if defined(USE_SSE42) && defined(_MSC_VER) && defined(REALM_PTR_64)
-                s += __popcnt64(data[t]);
-#elif !defined(_MSC_VER) && defined(USE_SSE42) && defined(REALM_PTR_64)
-                s += __builtin_popcountll(data[t]);
-#else
-                uint64_t a = data[t];
-                const uint64_t m1 = 0x5555555555555555ULL;
-                a -= (a >> 1) & m1;
-                a = (a & m2) + ((a >> 2) & m2);
-                a = (a + (a >> 4)) & m4;
-                a = (a * h01) >> 56;
-                s += a;
-#endif
-#endif
-                s += fast_popcount64(data[t]);
-            }
-            else if (w == 2) {
-                uint64_t a = data[t];
-                a = (a & m2) + ((a >> 2) & m2);
-                a = (a + (a >> 4)) & m4;
-                a = (a * h01) >> 56;
-
-                s += a;
-            }
-            else if (w == 4) {
-                uint64_t a = data[t];
-                a = (a & m4) + ((a >> 4) & m4);
-                a = (a * h01) >> 56;
-                s += a;
-            }
-        }
-        start += sizeof(int64_t) * 8 / no0(w) * chunks;
-    }
-
-#ifdef REALM_COMPILER_SSE
-    if (sseavx<42>()) {
-        // 2000 items summed 500000 times, 8/16/32 bits, miliseconds:
-        // Naive, templated get<>: 391 371 374
-        // SSE: 97 148 282
-
-        if ((w == 8 || w == 16 || w == 32) && end - start > sizeof(__m128i) * 8 / no0(w)) {
-            __m128i* data = reinterpret_cast<__m128i*>(m_data + start * w / 8);
-            __m128i sum_result = {0};
-            __m128i sum2;
-
-            size_t chunks = (end - start) * w / 8 / sizeof(__m128i);
-
-            for (size_t t = 0; t < chunks; t++) {
-                if (w == 8) {
-                    /*
-                    // 469 ms AND disadvantage of handling max 64k elements before overflow
-                    __m128i vl = _mm_cvtepi8_epi16(data[t]);
-                    __m128i vh = data[t];
-                    vh.m128i_i64[0] = vh.m128i_i64[1];
-                    vh = _mm_cvtepi8_epi16(vh);
-                    sum_result = _mm_add_epi16(sum_result, vl);
-                    sum_result = _mm_add_epi16(sum_result, vh);
-                    */
-
-                    /*
-                    // 424 ms
-                    __m128i vl = _mm_unpacklo_epi8(data[t], _mm_set1_epi8(0));
-                    __m128i vh = _mm_unpackhi_epi8(data[t], _mm_set1_epi8(0));
-                    sum_result = _mm_add_epi32(sum_result, _mm_madd_epi16(vl, _mm_set1_epi16(1)));
-                    sum_result = _mm_add_epi32(sum_result, _mm_madd_epi16(vh, _mm_set1_epi16(1)));
-                    */
-
-                    __m128i vl = _mm_cvtepi8_epi16(data[t]); // sign extend lower words 8->16
-                    __m128i vh = data[t];
-                    vh = _mm_srli_si128(vh, 8); // v >>= 64
-                    vh = _mm_cvtepi8_epi16(vh); // sign extend lower words 8->16
-                    __m128i sum1 = _mm_add_epi16(vl, vh);
-                    __m128i sumH = _mm_cvtepi16_epi32(sum1);
-                    __m128i sumL = _mm_srli_si128(sum1, 8); // v >>= 64
-                    sumL = _mm_cvtepi16_epi32(sumL);
-                    sum_result = _mm_add_epi32(sum_result, sumL);
-                    sum_result = _mm_add_epi32(sum_result, sumH);
-                }
-                else if (w == 16) {
-                    // todo, can overflow for array size > 2^32
-                    __m128i vl = _mm_cvtepi16_epi32(data[t]); // sign extend lower words 16->32
-                    __m128i vh = data[t];
-                    vh = _mm_srli_si128(vh, 8); // v >>= 64
-                    vh = _mm_cvtepi16_epi32(vh); // sign extend lower words 16->32
-                    sum_result = _mm_add_epi32(sum_result, vl);
-                    sum_result = _mm_add_epi32(sum_result, vh);
-                }
-                else if (w == 32) {
-                    __m128i v = data[t];
-                    __m128i v0 = _mm_cvtepi32_epi64(v); // sign extend lower dwords 32->64
-                    v = _mm_srli_si128(v, 8);           // v >>= 64
-                    __m128i v1 = _mm_cvtepi32_epi64(v); // sign extend lower dwords 32->64
-                    sum_result = _mm_add_epi64(sum_result, v0);
-                    sum_result = _mm_add_epi64(sum_result, v1);
-
-                    /*
-                    __m128i m = _mm_set1_epi32(0xc000); // test if overflow could happen (still need
-                    underflow test).
-                    __m128i mm = _mm_and_si128(data[t], m);
-                    zz = _mm_or_si128(mm, zz);
-                    sum_result = _mm_add_epi32(sum_result, data[t]);
-                    */
-                }
-            }
-            start += sizeof(__m128i) * 8 / no0(w) * chunks;
-
-            // prevent taking address of 'state' to make the compiler keep it in SSE register in above loop
-            // (vc2010/gcc4.6)
-            sum2 = sum_result;
-
-            // Avoid aliasing bug where sum2 might not yet be initialized when accessed by get_universal
-            char sum3[sizeof sum2];
-            memcpy(&sum3, &sum2, sizeof sum2);
-
-            // Sum elements of sum
-            for (size_t t = 0; t < sizeof(__m128i) * 8 / ((w == 8 || w == 16) ? 32 : 64); ++t) {
-                int64_t v = get_universal < (w == 8 || w == 16) ? 32 : 64 > (reinterpret_cast<char*>(&sum3), t);
-                s += v;
-            }
-        }
-    }
-#endif
-
-    // Sum remaining elements
-    for (; start < end; ++start)
-        s += get<w>(start);
-
-    return s;
+    return arr.is_compressed() ? m_integer_compressor.decompress(arr) : false;
 }
 
-size_t Array::count(int64_t value) const noexcept
+bool Array::try_compress(Array& arr) const
 {
-    const uint64_t* next = reinterpret_cast<const uint64_t*>(m_data);
-    size_t value_count = 0;
-    const size_t end = m_size;
-    size_t i = 0;
-
-    // static values needed for fast population count
-    const uint64_t m1 = 0x5555555555555555ULL;
-    const uint64_t m2 = 0x3333333333333333ULL;
-    const uint64_t m4 = 0x0f0f0f0f0f0f0f0fULL;
-    const uint64_t h01 = 0x0101010101010101ULL;
-
-    if (m_width == 0) {
-        if (value == 0)
-            return m_size;
-        return 0;
-    }
-    if (m_width == 1) {
-        if (uint64_t(value) > 1)
-            return 0;
-
-        const size_t chunkvals = 64;
-        for (; i + chunkvals <= end; i += chunkvals) {
-            uint64_t a = next[i / chunkvals];
-            if (value == 0)
-                a = ~a; // reverse
-
-            a -= (a >> 1) & m1;
-            a = (a & m2) + ((a >> 2) & m2);
-            a = (a + (a >> 4)) & m4;
-            a = (a * h01) >> 56;
-
-            // Could use intrinsic instead:
-            // a = __builtin_popcountll(a); // gcc intrinsic
-
-            value_count += to_size_t(a);
-        }
-    }
-    else if (m_width == 2) {
-        if (uint64_t(value) > 3)
-            return 0;
-
-        const uint64_t v = ~0ULL / 0x3 * value;
-
-        // Masks to avoid spillover between segments in cascades
-        const uint64_t c1 = ~0ULL / 0x3 * 0x1;
-
-        const size_t chunkvals = 32;
-        for (; i + chunkvals <= end; i += chunkvals) {
-            uint64_t a = next[i / chunkvals];
-            a ^= v;             // zero matching bit segments
-            a |= (a >> 1) & c1; // cascade ones in non-zeroed segments
-            a &= m1;            // isolate single bit in each segment
-            a ^= m1;            // reverse isolated bits
-            // if (!a) continue;
-
-            // Population count
-            a = (a & m2) + ((a >> 2) & m2);
-            a = (a + (a >> 4)) & m4;
-            a = (a * h01) >> 56;
-
-            value_count += to_size_t(a);
-        }
-    }
-    else if (m_width == 4) {
-        if (uint64_t(value) > 15)
-            return 0;
-
-        const uint64_t v = ~0ULL / 0xF * value;
-        const uint64_t m = ~0ULL / 0xF * 0x1;
-
-        // Masks to avoid spillover between segments in cascades
-        const uint64_t c1 = ~0ULL / 0xF * 0x7;
-        const uint64_t c2 = ~0ULL / 0xF * 0x3;
-
-        const size_t chunkvals = 16;
-        for (; i + chunkvals <= end; i += chunkvals) {
-            uint64_t a = next[i / chunkvals];
-            a ^= v;             // zero matching bit segments
-            a |= (a >> 1) & c1; // cascade ones in non-zeroed segments
-            a |= (a >> 2) & c2;
-            a &= m; // isolate single bit in each segment
-            a ^= m; // reverse isolated bits
-
-            // Population count
-            a = (a + (a >> 4)) & m4;
-            a = (a * h01) >> 56;
-
-            value_count += to_size_t(a);
-        }
-    }
-    else if (m_width == 8) {
-        if (value > 0x7FLL || value < -0x80LL)
-            return 0; // by casting?
-
-        const uint64_t v = ~0ULL / 0xFF * value;
-        const uint64_t m = ~0ULL / 0xFF * 0x1;
-
-        // Masks to avoid spillover between segments in cascades
-        const uint64_t c1 = ~0ULL / 0xFF * 0x7F;
-        const uint64_t c2 = ~0ULL / 0xFF * 0x3F;
-        const uint64_t c3 = ~0ULL / 0xFF * 0x0F;
-
-        const size_t chunkvals = 8;
-        for (; i + chunkvals <= end; i += chunkvals) {
-            uint64_t a = next[i / chunkvals];
-            a ^= v;             // zero matching bit segments
-            a |= (a >> 1) & c1; // cascade ones in non-zeroed segments
-            a |= (a >> 2) & c2;
-            a |= (a >> 4) & c3;
-            a &= m; // isolate single bit in each segment
-            a ^= m; // reverse isolated bits
-
-            // Population count
-            a = (a * h01) >> 56;
-
-            value_count += to_size_t(a);
-        }
-    }
-    else if (m_width == 16) {
-        if (value > 0x7FFFLL || value < -0x8000LL)
-            return 0; // by casting?
-
-        const uint64_t v = ~0ULL / 0xFFFF * value;
-        const uint64_t m = ~0ULL / 0xFFFF * 0x1;
-
-        // Masks to avoid spillover between segments in cascades
-        const uint64_t c1 = ~0ULL / 0xFFFF * 0x7FFF;
-        const uint64_t c2 = ~0ULL / 0xFFFF * 0x3FFF;
-        const uint64_t c3 = ~0ULL / 0xFFFF * 0x0FFF;
-        const uint64_t c4 = ~0ULL / 0xFFFF * 0x00FF;
-
-        const size_t chunkvals = 4;
-        for (; i + chunkvals <= end; i += chunkvals) {
-            uint64_t a = next[i / chunkvals];
-            a ^= v;             // zero matching bit segments
-            a |= (a >> 1) & c1; // cascade ones in non-zeroed segments
-            a |= (a >> 2) & c2;
-            a |= (a >> 4) & c3;
-            a |= (a >> 8) & c4;
-            a &= m; // isolate single bit in each segment
-            a ^= m; // reverse isolated bits
-
-            // Population count
-            a = (a * h01) >> 56;
-
-            value_count += to_size_t(a);
-        }
-    }
-    else if (m_width == 32) {
-        int32_t v = int32_t(value);
-        const int32_t* d = reinterpret_cast<const int32_t*>(m_data);
-        for (; i < end; ++i) {
-            if (d[i] == v)
-                ++value_count;
-        }
-        return value_count;
-    }
-    else if (m_width == 64) {
-        const int64_t* d = reinterpret_cast<const int64_t*>(m_data);
-        for (; i < end; ++i) {
-            if (d[i] == value)
-                ++value_count;
-        }
-        return value_count;
-    }
-
-    // Check remaining elements
-    for (; i < end; ++i)
-        if (value == get(i))
-            ++value_count;
+    return compress_array(arr);
+}
 
-    return value_count;
+bool Array::try_decompress()
+{
+    return decompress_array(*this);
 }
 
 size_t Array::calc_aligned_byte_size(size_t size, int width)
@@ -990,9 +727,9 @@ MemRef Array::create(Type type, bool context_flag, WidthType width_type, size_t
 {
     REALM_ASSERT_DEBUG(value == 0 || width_type == wtype_Bits);
     REALM_ASSERT_DEBUG(size == 0 || width_type != wtype_Ignore);
-    int width = 0;
+    uint8_t width = 0;
     if (value != 0)
-        width = static_cast<int>(bit_width(value));
+        width = bit_width(value);
     auto mem = Node::create_node(size, alloc, context_flag, type, width_type, width);
     if (value != 0) {
         const auto header = mem.get_addr();
@@ -1004,52 +741,32 @@ MemRef Array::create(Type type, bool context_flag, WidthType width_type, size_t
 }
 
 // This is the one installed into the m_vtable->finder slots.
-template <class cond, size_t bitwidth>
-bool Array::find_vtable(int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const
+template <class cond>
+bool Array::find_vtable(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                        QueryStateBase* state)
 {
-    return ArrayWithFind(*this).find_optimized<cond, bitwidth>(value, start, end, baseindex, state);
+    REALM_TEMPEX2(return ArrayWithFind(arr).find_optimized, cond, arr.m_width, (value, start, end, baseindex, state));
 }
 
-
-template <size_t w>
-struct Array::VTableForWidth {
-    struct PopulatedVTable : Array::VTable {
-        PopulatedVTable()
-        {
-            getter = &Array::get<w>;
-            setter = &Array::set<w>;
-            chunk_getter = &Array::get_chunk<w>;
-            finder[cond_Equal] = &Array::find_vtable<Equal, w>;
-            finder[cond_NotEqual] = &Array::find_vtable<NotEqual, w>;
-            finder[cond_Greater] = &Array::find_vtable<Greater, w>;
-            finder[cond_Less] = &Array::find_vtable<Less, w>;
-        }
-    };
-    static const PopulatedVTable vtable;
-};
-
-template <size_t w>
-const typename Array::VTableForWidth<w>::PopulatedVTable Array::VTableForWidth<w>::vtable;
-
 void Array::update_width_cache_from_header() noexcept
 {
-    auto width = get_width_from_header(get_header());
-    m_lbound = lbound_for_width(width);
-    m_ubound = ubound_for_width(width);
-
-    m_width = width;
-
-    REALM_TEMPEX(m_vtable = &VTableForWidth, width, ::vtable);
+    m_width = get_width_from_header(get_header());
+    m_lbound = lbound_for_width(m_width);
+    m_ubound = ubound_for_width(m_width);
+    REALM_ASSERT_DEBUG(m_lbound <= m_ubound);
+    REALM_ASSERT_DEBUG(m_width >= m_lbound);
+    REALM_ASSERT_DEBUG(m_width <= m_ubound);
+    REALM_TEMPEX(m_vtable = &VTableForWidth, m_width, ::vtable);
     m_getter = m_vtable->getter;
 }
 
 // This method reads 8 consecutive values into res[8], starting from index 'ndx'. It is allowed for the 8 values to
 // exceed the array length; in that case, the remainder of res[8] will be set to 0.
 template <size_t w>
-void Array::get_chunk(size_t ndx, int64_t res[8]) const noexcept
+void Array::get_chunk(const Array& arr, size_t ndx, int64_t res[8]) noexcept
 {
-    REALM_ASSERT_3(ndx, <, m_size);
-
+    auto sz = arr.size();
+    REALM_ASSERT_3(ndx, <, sz);
     size_t i = 0;
 
     // if constexpr to avoid producing spurious warnings resulting from
@@ -1061,7 +778,7 @@ void Array::get_chunk(size_t ndx, int64_t res[8]) const noexcept
 
         // Round m_size down to byte granularity as the trailing bits in the last
         // byte are uninitialized
-        size_t bytes_available = m_size / elements_per_byte;
+        size_t bytes_available = sz / elements_per_byte;
 
         // Round start and end to be byte-aligned. Start is rounded down and
         // end is rounded up as we may read up to 7 unused bits at each end.
@@ -1073,7 +790,7 @@
         uint64_t c = 0;
         for (size_t i = end; i > start; --i) {
             c <<= 8;
-            c += *reinterpret_cast<const uint8_t*>(m_data + i - 1);
+            c += *reinterpret_cast<const uint8_t*>(arr.m_data + i - 1);
         }
         // Trim off leading bits which aren't part of the requested range
         c >>= (ndx - start * elements_per_byte) * w;
@@ -1093,31 +810,31 @@
         }
     }
 
-    for (; i + ndx < m_size && i < 8; i++)
-        res[i] = get<w>(ndx + i);
+    for (; i + ndx < sz && i < 8; i++)
+        res[i] = get<w>(arr, ndx + i);
     for (; i < 8; i++)
         res[i] = 0;
 
#ifdef REALM_DEBUG
-    for (int j = 0; j + ndx < m_size && j < 8; j++) {
-        int64_t expected = get<w>(ndx + j);
+    for (int j = 0; j + ndx < sz && j < 8; j++) {
+        int64_t expected = Array::get_universal<w>(arr.m_data, ndx + j);
         REALM_ASSERT(res[j] == expected);
     }
#endif
 }
 
 template <>
-void Array::get_chunk<0>(size_t ndx, int64_t res[8]) const noexcept
+void Array::get_chunk<0>(const Array& arr, size_t ndx, int64_t res[8]) noexcept
 {
-    REALM_ASSERT_3(ndx, <, m_size);
+    REALM_ASSERT_3(ndx, <, arr.m_size);
     memset(res, 0, sizeof(int64_t) * 8);
 }
 
 template <size_t w>
-void Array::set(size_t ndx, int64_t value)
+void Array::set(Array& arr, size_t ndx, int64_t value)
 {
-    set_direct<w>(m_data, ndx, value);
+    realm::set_direct<w>(arr.m_data, ndx, value);
 }
 
 void Array::_mem_usage(size_t& mem) const noexcept
@@ -1222,10 +939,15 @@ void Array::report_memory_usage_2(MemUsageHandler& handler) const
 void Array::verify() const
 {
#ifdef REALM_DEBUG
-    REALM_ASSERT(is_attached());
-    REALM_ASSERT(m_width == 0 || m_width == 1 || m_width == 2 || m_width == 4 || m_width == 8 || m_width == 16 ||
-                 m_width == 32 || m_width == 64);
+    REALM_ASSERT(is_attached());
+    if (!wtype_is_extended(get_header())) {
+        REALM_ASSERT(m_width == 0 || m_width == 1 || m_width == 2 || m_width == 4 || m_width == 8 || m_width == 16 ||
+                     m_width == 32 || m_width == 64);
+    }
+    else {
+        REALM_ASSERT(m_width <= 64);
+    }
 
     if (!get_parent())
         return;
@@ -1238,35 +960,60 @@ void Array::verify() const
 
 size_t Array::lower_bound_int(int64_t value) const noexcept
 {
+    if (is_compressed())
+        return lower_bound_int_compressed(value);
     REALM_TEMPEX(return lower_bound, m_width, (m_data, m_size, value));
 }
 
 size_t Array::upper_bound_int(int64_t value) const noexcept
 {
+    if (is_compressed())
+        return upper_bound_int_compressed(value);
     REALM_TEMPEX(return upper_bound, m_width, (m_data, m_size, value));
 }
 
-
-size_t Array::find_first(int64_t value, size_t start, size_t end) const
+size_t Array::lower_bound_int_compressed(int64_t value) const noexcept
 {
-    return find_first<Equal>(value, start, end);
+    static impl::CompressedDataFetcher encoder;
+    encoder.ptr = &m_integer_compressor;
+    return lower_bound(m_data, m_size, value, encoder);
 }
 
+size_t Array::upper_bound_int_compressed(int64_t value) const noexcept
+{
+    static impl::CompressedDataFetcher encoder;
+    encoder.ptr = &m_integer_compressor;
+    return upper_bound(m_data, m_size, value, encoder);
+}
 
 int_fast64_t Array::get(const char* header, size_t ndx) noexcept
 {
-    const char* data = get_data_from_header(header);
-    uint_least8_t width = get_width_from_header(header);
-    return get_direct(data, width, ndx);
+    // This is very important. Most of the time we end up here because we are traversing
+    // the cluster. The keys/refs in the cluster are not compressed (there is almost no
+    // gain), so the intent is to pollute cluster traversal as little as possible.
+    // We need to check the header wtype and only initialise the integer compressor if
+    // needed; otherwise we should just call get_direct. On average this costs one extra
+    // access to the header while traversing the cluster tree.
+    if (REALM_LIKELY(!NodeHeader::wtype_is_extended(header))) {
+        const char* data = get_data_from_header(header);
+        uint_least8_t width = get_width_from_header(header);
+        return get_direct(data, width, ndx);
+    }
+    // Ideally, we would not construct a compressor every time we end up here, but
+    // compressor initialisation should be fast enough. Creating an Array, which owns a
+    // compressor internally, is the better approach if we intend to access the same data
+    // over and over again, since the compressor caches the most important information
+    // about the layout of the data.
+    IntegerCompressor s_compressor;
+    s_compressor.init(header);
+    return s_compressor.get(ndx);
 }
 
-
 std::pair<int64_t, int64_t> Array::get_two(const char* header, size_t ndx) noexcept
 {
-    const char* data = get_data_from_header(header);
-    uint_least8_t width = get_width_from_header(header);
-    std::pair<int64_t, int64_t> p = ::get_two(data, width, ndx);
-    return std::make_pair(p.first, p.second);
+    return std::make_pair(get(header, ndx), get(header, ndx + 1));
 }
 
 bool QueryStateCount::match(size_t, Mixed) noexcept
@@ -1312,7 +1059,6 @@ bool QueryStateFindAll<std::vector<ObjKey>>::match(size_t index) noexcept
     ++m_match_count;
     int64_t key_value = (m_key_values ? m_key_values->get(index) : index) + m_key_offset;
     m_keys.push_back(ObjKey(key_value));
-
     return (m_limit > m_match_count);
 }
 
diff --git a/src/realm/array.hpp b/src/realm/array.hpp
index 1df0aa2b992..6b9569ebd82 100644
--- a/src/realm/array.hpp
+++ b/src/realm/array.hpp
@@ -21,8 +21,10 @@
 
 #include
 #include
+#include
 #include
 #include
+#include
 
 namespace realm {
 
@@ -90,12 +92,8 @@ class QueryStateFindFirst : public QueryStateBase {
 class Array : public Node, public ArrayParent {
 public:
     /// Create an array accessor in the unattached state.
-    explicit Array(Allocator& allocator) noexcept
-        : Node(allocator)
-    {
-    }
-
-    ~Array() noexcept override {}
+    explicit Array(Allocator& allocator) noexcept;
+    virtual ~Array() noexcept = default;
 
     /// Create a new integer array of the specified type and size, and filled
     /// with the specified value, and attach this accessor to it. This does not
@@ -126,6 +124,8 @@ class Array : public Node, public ArrayParent {
         init_from_ref(ref);
     }
 
+    MemRef get_mem() const noexcept;
+
     /// Called in the context of Group::commit() to ensure that attached
     /// accessors stay valid across a commit. Please note that this works only
     /// for non-transactional commits. Accessors obtained during a transaction
@@ -174,21 +174,23 @@ class Array : public Node, public ArrayParent {
     void set_as_ref(size_t ndx, ref_type ref);
 
     template <size_t w>
-    void set(size_t ndx, int64_t value);
+    static void set(Array&, size_t ndx, int64_t value);
 
     int64_t get(size_t ndx) const noexcept;
+    std::vector<int64_t> get_all(size_t b, size_t e) const;
+
     template <size_t w>
-    int64_t get(size_t ndx) const noexcept;
+    static int64_t get(const Array& arr, size_t ndx) noexcept;
 
     void get_chunk(size_t ndx, int64_t res[8]) const noexcept;
     template <size_t w>
-    void get_chunk(size_t ndx, int64_t res[8]) const noexcept;
+    static void get_chunk(const Array&, size_t ndx, int64_t res[8]) noexcept;
 
     ref_type get_as_ref(size_t ndx) const noexcept;
-    RefOrTagged get_as_ref_or_tagged(size_t ndx) const noexcept;
+
     void set(size_t ndx, RefOrTagged);
     void add(RefOrTagged);
     void ensure_minimum_width(RefOrTagged);
@@ -198,12 +200,21 @@ class Array : public Node, public ArrayParent {
 
     void alloc(size_t init_size, size_t new_width)
     {
-        REALM_ASSERT_3(m_width, ==, get_width_from_header(get_header()));
-        REALM_ASSERT_3(m_size, ==, get_size_from_header(get_header()));
+        // Node::alloc is the one that triggers copy-on-write. If we call alloc for a
+        // B (compressed) array, we have a bug in our machinery: the array should have
+        // been decompressed well before alloc is called.
+        const auto header = get_header();
+        REALM_ASSERT_3(m_width, ==, get_width_from_header(header));
+        REALM_ASSERT_3(m_size, ==, get_size_from_header(header));
         Node::alloc(init_size, new_width);
         update_width_cache_from_header();
     }
 
+    bool is_empty() const noexcept
+    {
+        return size() == 0;
+    }
+
     /// Remove the element at the specified index, and move elements at higher
     /// indexes to the next lower index.
     ///
@@ -322,6 +333,8 @@ class Array : public Node, public ArrayParent {
     /// by doing a linear search for short sequences.
     size_t lower_bound_int(int64_t value) const noexcept;
     size_t upper_bound_int(int64_t value) const noexcept;
+    size_t lower_bound_int_compressed(int64_t value) const noexcept;
+    size_t upper_bound_int_compressed(int64_t value) const noexcept;
     //@}
 
     int64_t get_sum(size_t start = 0, size_t end = size_t(-1)) const
@@ -351,6 +364,18 @@ class Array : public Node, public ArrayParent {
     /// (idempotency).
     void destroy_deep() noexcept;
 
+    /// Check whether the array is encoded (in B format).
+    inline bool is_compressed() const;
+
+    inline const IntegerCompressor& integer_compressor() const;
+
+    /// Used only for testing: encode the array passed as argument.
+    bool try_compress(Array&) const;
+
+    /// Used only for testing: decode the array on which this method is invoked.
+    /// If the array is not encoded, this is a no-op.
+    bool try_decompress();
+
     /// Shorthand for `destroy_deep(MemRef(ref, alloc), alloc)`.
     static void destroy_deep(ref_type ref, Allocator& alloc) noexcept;
 
@@ -383,25 +408,35 @@ class Array : public Node, public ArrayParent {
 
     /// Same as non-static write() with `deep` set to true. This is for the
     /// cases where you do not already have an array accessor available.
+    /// Compression may be attempted if `compress_in_flight` is true.
+    /// This should be avoided if you rely on the size of the array being unchanged.
     static ref_type write(ref_type, Allocator&, _impl::ArrayWriterBase&, bool only_if_modified,
                           bool compress_in_flight);
 
-    size_t find_first(int64_t value, size_t begin = 0, size_t end = size_t(-1)) const;
+    inline size_t find_first(int64_t value, size_t begin = 0, size_t end = size_t(-1)) const
+    {
+        return find_first<Equal>(value, begin, end);
+    }
 
     // Wrappers for backwards compatibility and for simple use without
     // setting up state initialization etc
     template <class cond>
     size_t find_first(int64_t value, size_t start = 0, size_t end = size_t(-1)) const
     {
-        REALM_ASSERT(start <= m_size && (end <= m_size || end == size_t(-1)) && start <= end);
-        // todo, would be nice to avoid this in order to speed up find_first loops
         QueryStateFindFirst state;
         Finder finder = m_vtable->finder[cond::condition];
-        (this->*finder)(value, start, end, 0, &state);
+        finder(*this, value, start, end, 0, &state);
+        return state.m_state;
+    }
 
-        return static_cast<size_t>(state.m_state);
+    template <class cond>
+    bool find(int64_t value, size_t start, size_t end, size_t baseIndex, QueryStateBase* state) const
+    {
+        Finder finder = m_vtable->finder[cond::condition];
+        return finder(*this, value, start, end, baseIndex, state);
     }
 
+
     /// Get the specified element without the cost of constructing an
     /// array instance. If an array instance is already available, or
     /// you need to get multiple values, then this method will be
@@ -463,11 +498,15 @@ class Array : public Node, public ArrayParent {
     /// Takes a 64-bit value and returns the minimum number of bits needed
     /// to fit the value. For alignment this is rounded up to nearest
     /// log2. Possible results {0, 1, 2, 4, 8, 16, 32, 64}
-    static size_t bit_width(int64_t value);
+    static uint8_t bit_width(int64_t value);
 
     void typed_print(std::string prefix) const;
 
 protected:
+    friend class NodeTree;
+    void copy_on_write();
+    void copy_on_write(size_t min_size);
+
     // This returns the minimum value ("lower bound") of the representable values
     // for the given bit width. Valid widths are 0, 1, 2, 4, 8, 16, 32, and 64.
     static constexpr int_fast64_t lbound_for_width(size_t width) noexcept;
@@ -505,14 +544,17 @@ class Array : public Node, public ArrayParent {
 
 protected:
     // Getters and Setters for adaptive-packed arrays
-    typedef int64_t (Array::*Getter)(size_t) const; // Note: getters must not throw
-    typedef void (Array::*Setter)(size_t, int64_t);
-    typedef bool (Array::*Finder)(int64_t, size_t, size_t, size_t, QueryStateBase*) const;
-    typedef void (Array::*ChunkGetter)(size_t, int64_t res[8]) const; // Note: getters must not throw
+    typedef int64_t (*Getter)(const Array&, size_t); // Note: getters must not throw
+    typedef void (*Setter)(Array&, size_t, int64_t);
+    typedef bool (*Finder)(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*);
+    typedef void (*ChunkGetter)(const Array&, size_t, int64_t res[8]); // Note: getters must not throw
+
+    typedef std::vector<int64_t> (*GetterAll)(const Array&, size_t, size_t); // Note: getters must not throw
 
     struct VTable {
         Getter getter;
         ChunkGetter chunk_getter;
+        GetterAll getter_all;
         Setter setter;
         Finder finder[cond_VTABLE_FINDER_COUNT]; // one for each active function pointer
     };
+
     template <size_t w>
     struct VTableForWidth;
 
     // This is the one installed into the m_vtable->finder slots.
-    template <class cond, size_t bitwidth>
-    bool find_vtable(int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;
+    template <class cond>
+    static bool find_vtable(const Array&, int64_t value, size_t start, size_t end, size_t baseindex,
+                            QueryStateBase* state);
 
     template <size_t w>
-    int64_t get_universal(const char* const data, const size_t ndx) const;
+    static int64_t get_universal(const char* const data, const size_t ndx);
 
 protected:
     Getter m_getter = nullptr; // cached to avoid indirection
@@ -538,6 +581,11 @@ class Array : public Node, public ArrayParent {
     bool m_has_refs;     // Elements whose first bit is zero are refs to subarrays.
     bool m_context_flag; // Meaning depends on context.
 
+    IntegerCompressor m_integer_compressor;
+    // compress/decompress this array
+    bool compress_array(Array&) const;
+    bool decompress_array(Array& arr) const;
+
 private:
     ref_type do_write_shallow(_impl::ArrayWriterBase&) const;
     ref_type do_write_deep(_impl::ArrayWriterBase&, bool only_if_modified, bool compress) const;
@@ -548,10 +596,15 @@ class Array : public Node, public ArrayParent {
     void report_memory_usage_2(MemUsageHandler&) const;
#endif
 
+
+private:
     friend class Allocator;
     friend class SlabAlloc;
     friend class GroupWriter;
     friend class ArrayWithFind;
+    friend class IntegerCompressor;
+    friend class PackedCompressor;
+    friend class FlexCompressor;
 };
 
 class TempArray : public Array {
@@ -573,6 +626,57 @@ class TempArray : public Array {
 
 // Implementation:
 
+inline Array::Array(Allocator& allocator) noexcept
+    : Node(allocator)
+{
+}
+
+inline bool Array::is_compressed() const
+{
+    const auto enc = m_integer_compressor.get_encoding();
+    return enc == NodeHeader::Encoding::Flex || enc == NodeHeader::Encoding::Packed;
+}
+
+inline const IntegerCompressor& Array::integer_compressor() const
+{
+    return m_integer_compressor;
+}
+
+inline int64_t Array::get(size_t ndx) const noexcept
+{
+    REALM_ASSERT_DEBUG(is_attached());
+    REALM_ASSERT_DEBUG_EX(ndx < m_size, ndx, m_size);
+    return m_getter(*this, ndx);
+
+    // Two ideas that are not efficient but may be worth looking into again:
+    /*
+    // Assume correct width is found early in REALM_TEMPEX, which is the case for B tree offsets that
+    // are probably either 2^16 long. Turns out to be 25% faster if found immediately, but 50-300% slower
+    // if found later
+    REALM_TEMPEX(return get, (ndx));
+    */
+    /*
+    // Slightly slower in both of the if-cases. Also needs a matchcount m_size check too, to avoid
+    // reading beyond the array.
+    if (m_width >= 8 && m_size > ndx + 7)
+        return get<64>(ndx >> m_shift) & m_widthmask;
+    else
+        return (this->*(m_vtable->getter))(ndx);
+    */
+}
+
+inline std::vector<int64_t> Array::get_all(size_t b, size_t e) const
+{
+    REALM_ASSERT_DEBUG(is_compressed());
+    return m_vtable->getter_all(*this, b, e);
+}
+
+template <size_t w>
+inline int64_t Array::get(const Array& arr, size_t ndx) noexcept
+{
+    REALM_ASSERT_DEBUG(arr.is_attached());
+    return get_universal<w>(arr.m_data, ndx);
+}
 
 constexpr inline int_fast64_t Array::lbound_for_width(size_t width) noexcept
 {
@@ -673,7 +777,6 @@ inline void Array::create(Type type, bool context_flag, size_t length, int_fast6
     init_from_mem(mem);
 }
 
-
 inline Array::Type Array::get_type() const noexcept
 {
     if (m_is_inner_bptree_node) {
@@ -689,41 +792,44 @@ inline Array::Type Array::get_type() const noexcept
 inline void Array::get_chunk(size_t ndx, int64_t res[8]) const noexcept
 {
     REALM_ASSERT_DEBUG(ndx < m_size);
-    (this->*(m_vtable->chunk_getter))(ndx, res);
+    m_vtable->chunk_getter(*this, ndx, res);
 }
 
 template <size_t w>
-int64_t Array::get_universal(const char* data, size_t ndx) const
+inline int64_t Array::get_universal(const char* data, size_t ndx)
 {
-    if (w == 0) {
-        return 0;
-    }
-    else if (w == 1) {
-        size_t offset = ndx >> 3;
-        return (data[offset] >> (ndx & 7)) & 0x01;
+    if (w == 64) {
+        size_t offset = ndx << 3;
+        return *reinterpret_cast<const int64_t*>(data + offset);
     }
-    else if (w == 2) {
-        size_t offset = ndx >> 2;
-        return (data[offset] >> ((ndx & 3) << 1)) & 0x03;
+    else if (w == 32) {
+        size_t offset = ndx << 2;
+        return *reinterpret_cast<const int32_t*>(data + offset);
     }
-    else if (w == 4) {
-        size_t offset = ndx >> 1;
-        return (data[offset] >> ((ndx & 1) << 2)) & 0x0F;
+    else if (w == 16) {
+        size_t offset = ndx << 1;
+        return *reinterpret_cast<const int16_t*>(data + offset);
     }
     else if (w == 8) {
        return *reinterpret_cast<const int8_t*>(data + ndx);
    }
-    else if (w == 16) {
-        size_t offset = ndx * 2;
-        return *reinterpret_cast<const int16_t*>(data + offset);
+    else if (w == 4) {
+        size_t offset = ndx >> 1;
+        auto d = data[offset];
+        return (d >> ((ndx & 1) << 2)) & 0x0F;
     }
-    else if (w == 32) {
-        size_t offset = ndx * 4;
-        return *reinterpret_cast<const int32_t*>(data + offset);
+    else if (w == 2) {
+        size_t offset = ndx >> 2;
+        auto d = data[offset];
+        return (d >> ((ndx & 3) << 1)) & 0x03;
    }
-    else if (w == 64) {
-        size_t offset = ndx * 8;
-        return *reinterpret_cast<const int64_t*>(data + offset);
+    else if (w == 1) {
+        size_t offset = ndx >> 3;
+        auto d = data[offset];
+        return (d >> (ndx & 7)) & 0x01;
+    }
+    else if (w == 0) {
+        return 0;
    }
    else {
        REALM_ASSERT_DEBUG(false);
@@ -731,35 +837,6 @@ int64_t Array::get_universal(const char* data, size_t ndx) const
     }
 }
 
-template <size_t w>
-int64_t Array::get(size_t ndx) const noexcept
-{
-    return get_universal<w>(m_data, ndx);
-}
-
-inline int64_t Array::get(size_t ndx) const noexcept
-{
-    REALM_ASSERT_DEBUG(is_attached());
-    REALM_ASSERT_DEBUG_EX(ndx < m_size, ndx, m_size);
-    return (this->*m_getter)(ndx);
-
-    // Two ideas that are not efficient but may be worth looking into again:
-    /*
-    // Assume correct width is found early in REALM_TEMPEX, which is the case for B tree offsets that
-    // are probably either 2^16 long. Turns out to be 25% faster if found immediately, but 50-300% slower
-    // if found later
-    REALM_TEMPEX(return get, (ndx));
-    */
-    /*
-    // Slightly slower in both of the if-cases. Also needs an matchcount m_size check too, to avoid
-    // reading beyond array.
-    if (m_width >= 8 && m_size > ndx + 7)
-        return get<64>(ndx >> m_shift) & m_widthmask;
-    else
-        return (this->*(m_vtable->getter))(ndx);
-    */
-}
-
 inline int64_t Array::front() const noexcept
 {
     return get(0);
@@ -848,34 +925,6 @@ inline void Array::destroy_deep() noexcept
     m_data = nullptr;
 }
 
-inline ref_type Array::write(_impl::ArrayWriterBase& out, bool deep, bool only_if_modified, bool compress) const
-{
-    REALM_ASSERT(is_attached());
-
-    if (only_if_modified && m_alloc.is_read_only(m_ref))
-        return m_ref;
-
-    if (!deep || !m_has_refs)
-        return do_write_shallow(out); // Throws
-
-    return do_write_deep(out, only_if_modified, compress); // Throws
-}
-
-inline ref_type Array::write(ref_type ref, Allocator& alloc, _impl::ArrayWriterBase& out, bool only_if_modified,
-                             bool compress)
-{
-    if (only_if_modified && alloc.is_read_only(ref))
-        return ref;
-
-    Array array(alloc);
-    array.init_from_ref(ref);
-
-    if (!array.m_has_refs)
-        return array.do_write_shallow(out); // Throws
-
-    return array.do_write_deep(out, only_if_modified, compress); // Throws
-}
-
 inline void Array::add(int_fast64_t value)
 {
     insert(m_size, value);
@@ -986,7 +1035,6 @@ inline size_t Array::get_max_byte_size(size_t num_elems) noexcept
     return header_size + num_elems * max_bytes_per_elem;
 }
 
-
 inline void Array::update_child_ref(size_t child_ndx, ref_type new_ref)
 {
     set(child_ndx, new_ref);
@@ -1004,6 +1052,73 @@ inline void Array::ensure_minimum_width(int_fast64_t value)
         do_ensure_minimum_width(value);
 }
 
+inline ref_type Array::write(_impl::ArrayWriterBase& out, bool deep, bool only_if_modified,
+                             bool compress_in_flight) const
+{
+    REALM_ASSERT_DEBUG(is_attached());
+    // The default allocator cannot be trusted wrt is_read_only():
+    REALM_ASSERT_DEBUG(!only_if_modified || &m_alloc != &Allocator::get_default());
+    if (only_if_modified && m_alloc.is_read_only(m_ref))
+        return m_ref;
+
+    if (!deep || !m_has_refs) {
+        // However, creating an array using ANYTHING BUT the default allocator during
+        // commit is also wrong: it only works by accident, because the whole slab area
+        // is reinitialized after commit. Hence the default allocator below.
+        Array compressed_array{Allocator::get_default()};
+        if (compress_in_flight && compress_array(compressed_array)) {
+#ifdef REALM_DEBUG
+            const auto encoding = compressed_array.m_integer_compressor.get_encoding();
+            REALM_ASSERT_DEBUG(encoding == Encoding::Flex || encoding == Encoding::Packed);
+            REALM_ASSERT_DEBUG(size() == compressed_array.size());
+            for (size_t i = 0; i < compressed_array.size(); ++i) {
+                REALM_ASSERT_DEBUG(get(i) == compressed_array.get(i));
+            }
+#endif
+            auto ref = compressed_array.do_write_shallow(out);
+            compressed_array.destroy();
+            return ref;
+        }
+        return do_write_shallow(out); // Throws
+    }
+
+    return do_write_deep(out, only_if_modified, compress_in_flight); // Throws
+}
+
+inline ref_type Array::write(ref_type ref, Allocator& alloc, _impl::ArrayWriterBase& out, bool only_if_modified,
+                             bool compress_in_flight)
+{
+    // The default allocator cannot be trusted wrt is_read_only():
+    REALM_ASSERT_DEBUG(!only_if_modified || &alloc != &Allocator::get_default());
+    if (only_if_modified && alloc.is_read_only(ref))
+        return ref;
+
+    Array array(alloc);
+    array.init_from_ref(ref);
+    REALM_ASSERT_DEBUG(array.is_attached());
+
+    if (!array.m_has_refs) {
+        Array compressed_array{Allocator::get_default()};
+        if (compress_in_flight && array.compress_array(compressed_array)) {
+#ifdef REALM_DEBUG
+            const auto encoding = compressed_array.m_integer_compressor.get_encoding();
+            REALM_ASSERT_DEBUG(encoding == Encoding::Flex || encoding == Encoding::Packed);
+            REALM_ASSERT_DEBUG(array.size() == compressed_array.size());
+            for (size_t i = 0; i < compressed_array.size(); ++i) {
+                REALM_ASSERT_DEBUG(array.get(i) == compressed_array.get(i));
+            }
+#endif
+            auto ref = compressed_array.do_write_shallow(out);
+            compressed_array.destroy();
+            return ref;
+        }
+        else {
+            return array.do_write_shallow(out); // Throws
+        }
+    }
+    return array.do_write_deep(out, only_if_modified, compress_in_flight); // Throws
+}
+
 } // namespace realm
diff --git a/src/realm/array_aggregate_optimizations.cpp b/src/realm/array_aggregate_optimizations.cpp
new file mode 100644
index 00000000000..6242e6853dd
--- /dev/null
+++ b/src/realm/array_aggregate_optimizations.cpp
@@ -0,0 +1,369 @@
+/*************************************************************************
+ *
+ * Copyright 2023 Realm Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * + **************************************************************************/ + +#include +#include + +using namespace realm; + +int64_t Array::sum(size_t start, size_t end) const +{ + REALM_TEMPEX(return sum, m_width, (start, end)); +} + +template +int64_t Array::sum(size_t start, size_t end) const +{ + if (end == size_t(-1)) + end = m_size; + + REALM_ASSERT_EX(end <= m_size && start <= end, start, end, m_size); + + if (start == end) + return 0; + + int64_t s = 0; + + // Sum manually until 128 bit aligned + for (; (start < end) && (((size_t(m_data) & 0xf) * 8 + start * w) % 128 != 0); start++) { + s += get(*this, start); + } + + if (w == 1 || w == 2 || w == 4) { + // Sum of bitwidths less than a byte (which are always positive) + // uses a divide and conquer algorithm that is a variation of popolation count: + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + + // static values needed for fast sums + const uint64_t m2 = 0x3333333333333333ULL; + const uint64_t m4 = 0x0f0f0f0f0f0f0f0fULL; + const uint64_t h01 = 0x0101010101010101ULL; + + int64_t* data = reinterpret_cast(m_data + start * w / 8); + size_t chunks = (end - start) * w / 8 / sizeof(int64_t); + + for (size_t t = 0; t < chunks; t++) { + if (w == 1) { +#if 0 +#if defined(USE_SSE42) && defined(_MSC_VER) && defined(REALM_PTR_64) + s += __popcnt64(data[t]); +#elif !defined(_MSC_VER) && defined(USE_SSE42) && defined(REALM_PTR_64) + s += __builtin_popcountll(data[t]); +#else + uint64_t a = data[t]; + const uint64_t m1 = 0x5555555555555555ULL; + a -= (a >> 1) & m1; + a = (a & m2) + ((a >> 2) & m2); + a = (a + (a >> 4)) & m4; + a = (a * h01) >> 56; + s += a; +#endif +#endif + s += fast_popcount64(data[t]); + } + else if (w == 2) { + uint64_t a = data[t]; + a = (a & m2) + ((a >> 2) & m2); + a = (a + (a >> 4)) & m4; + a = (a * h01) >> 56; + + s += a; + } + else if (w == 4) { + uint64_t a = data[t]; + a = (a & m4) + ((a >> 4) & m4); + a = (a * h01) >> 56; + s += a; + } + } + start += sizeof(int64_t) * 8 / no0(w) * chunks; + } + +#ifdef REALM_COMPILER_SSE + if (sseavx<42>()) { + // 2000 items summed 500000 times, 8/16/32 bits, miliseconds: + // Naive, templated get<>: 391 371 374 + // SSE: 97 148 282 + + if ((w == 8 || w == 16 || w == 32) && end - start > sizeof(__m128i) * 8 / no0(w)) { + __m128i* data = reinterpret_cast<__m128i*>(m_data + start * w / 8); + __m128i sum_result = {0}; + __m128i sum2; + + size_t chunks = (end - start) * w / 8 / sizeof(__m128i); + + for (size_t t = 0; t < chunks; t++) { + if (w == 8) { + /* + // 469 ms AND disadvantage of handling max 64k elements before overflow + __m128i vl = _mm_cvtepi8_epi16(data[t]); + __m128i vh = data[t]; + vh.m128i_i64[0] = vh.m128i_i64[1]; + vh = _mm_cvtepi8_epi16(vh); + sum_result = _mm_add_epi16(sum_result, vl); + sum_result = _mm_add_epi16(sum_result, vh); + */ + + /* + // 424 ms + __m128i vl = _mm_unpacklo_epi8(data[t], _mm_set1_epi8(0)); + __m128i vh = _mm_unpackhi_epi8(data[t], _mm_set1_epi8(0)); + sum_result = _mm_add_epi32(sum_result, _mm_madd_epi16(vl, _mm_set1_epi16(1))); + sum_result = _mm_add_epi32(sum_result, _mm_madd_epi16(vh, _mm_set1_epi16(1))); + */ + + __m128i vl = _mm_cvtepi8_epi16(data[t]); // sign extend lower words 8->16 + __m128i vh = data[t]; + vh = _mm_srli_si128(vh, 8); // v >>= 64 + vh = _mm_cvtepi8_epi16(vh); // sign extend lower words 8->16 + __m128i sum1 = _mm_add_epi16(vl, vh); + __m128i sumH = _mm_cvtepi16_epi32(sum1); + __m128i sumL = _mm_srli_si128(sum1, 8); // v >>= 64 + sumL = _mm_cvtepi16_epi32(sumL); + sum_result = 
_mm_add_epi32(sum_result, sumL); + sum_result = _mm_add_epi32(sum_result, sumH); + } + else if (w == 16) { + // todo, can overflow for array size > 2^32 + __m128i vl = _mm_cvtepi16_epi32(data[t]); // sign extend lower words 16->32 + __m128i vh = data[t]; + vh = _mm_srli_si128(vh, 8); // v >>= 64 + vh = _mm_cvtepi16_epi32(vh); // sign extend lower words 16->32 + sum_result = _mm_add_epi32(sum_result, vl); + sum_result = _mm_add_epi32(sum_result, vh); + } + else if (w == 32) { + __m128i v = data[t]; + __m128i v0 = _mm_cvtepi32_epi64(v); // sign extend lower dwords 32->64 + v = _mm_srli_si128(v, 8); // v >>= 64 + __m128i v1 = _mm_cvtepi32_epi64(v); // sign extend lower dwords 32->64 + sum_result = _mm_add_epi64(sum_result, v0); + sum_result = _mm_add_epi64(sum_result, v1); + + /* + __m128i m = _mm_set1_epi32(0xc000); // test if overflow could happen (still need + underflow test). + __m128i mm = _mm_and_si128(data[t], m); + zz = _mm_or_si128(mm, zz); + sum_result = _mm_add_epi32(sum_result, data[t]); + */ + } + } + start += sizeof(__m128i) * 8 / no0(w) * chunks; + + // prevent taking address of 'state' to make the compiler keep it in SSE register in above loop + // (vc2010/gcc4.6) + sum2 = sum_result; + + // Avoid aliasing bug where sum2 might not yet be initialized when accessed by get_universal + char sum3[sizeof sum2]; + memcpy(&sum3, &sum2, sizeof sum2); + + // Sum elements of sum + for (size_t t = 0; t < sizeof(__m128i) * 8 / ((w == 8 || w == 16) ? 32 : 64); ++t) { + int64_t v = get_universal < (w == 8 || w == 16) ? 32 : 64 > (reinterpret_cast(&sum3), t); + s += v; + } + } + } +#endif + + // Sum remaining elements + for (; start < end; ++start) + s += get(*this, start); + + return s; +} + +size_t Array::count(int64_t value) const noexcept +{ + // This is not used anywhere in the code, I believe we can delete this + // since the query logic does not use this + const uint64_t* next = reinterpret_cast(m_data); + size_t value_count = 0; + const size_t end = m_size; + size_t i = 0; + + // static values needed for fast population count + const uint64_t m1 = 0x5555555555555555ULL; + const uint64_t m2 = 0x3333333333333333ULL; + const uint64_t m4 = 0x0f0f0f0f0f0f0f0fULL; + const uint64_t h01 = 0x0101010101010101ULL; + + if (m_width == 0) { + if (value == 0) + return m_size; + return 0; + } + if (m_width == 1) { + if (uint64_t(value) > 1) + return 0; + + const size_t chunkvals = 64; + for (; i + chunkvals <= end; i += chunkvals) { + uint64_t a = next[i / chunkvals]; + if (value == 0) + a = ~a; // reverse + + a -= (a >> 1) & m1; + a = (a & m2) + ((a >> 2) & m2); + a = (a + (a >> 4)) & m4; + a = (a * h01) >> 56; + + // Could use intrinsic instead: + // a = __builtin_popcountll(a); // gcc intrinsic + + value_count += to_size_t(a); + } + } + else if (m_width == 2) { + if (uint64_t(value) > 3) + return 0; + + const uint64_t v = ~0ULL / 0x3 * value; + + // Masks to avoid spillover between segments in cascades + const uint64_t c1 = ~0ULL / 0x3 * 0x1; + + const size_t chunkvals = 32; + for (; i + chunkvals <= end; i += chunkvals) { + uint64_t a = next[i / chunkvals]; + a ^= v; // zero matching bit segments + a |= (a >> 1) & c1; // cascade ones in non-zeroed segments + a &= m1; // isolate single bit in each segment + a ^= m1; // reverse isolated bits + // if (!a) continue; + + // Population count + a = (a & m2) + ((a >> 2) & m2); + a = (a + (a >> 4)) & m4; + a = (a * h01) >> 56; + + value_count += to_size_t(a); + } + } + else if (m_width == 4) { + if (uint64_t(value) > 15) + return 0; + + const uint64_t v = 
~0ULL / 0xF * value; + const uint64_t m = ~0ULL / 0xF * 0x1; + + // Masks to avoid spillover between segments in cascades + const uint64_t c1 = ~0ULL / 0xF * 0x7; + const uint64_t c2 = ~0ULL / 0xF * 0x3; + + const size_t chunkvals = 16; + for (; i + chunkvals <= end; i += chunkvals) { + uint64_t a = next[i / chunkvals]; + a ^= v; // zero matching bit segments + a |= (a >> 1) & c1; // cascade ones in non-zeroed segments + a |= (a >> 2) & c2; + a &= m; // isolate single bit in each segment + a ^= m; // reverse isolated bits + + // Population count + a = (a + (a >> 4)) & m4; + a = (a * h01) >> 56; + + value_count += to_size_t(a); + } + } + else if (m_width == 8) { + if (value > 0x7FLL || value < -0x80LL) + return 0; // by casting? + + const uint64_t v = ~0ULL / 0xFF * value; + const uint64_t m = ~0ULL / 0xFF * 0x1; + + // Masks to avoid spillover between segments in cascades + const uint64_t c1 = ~0ULL / 0xFF * 0x7F; + const uint64_t c2 = ~0ULL / 0xFF * 0x3F; + const uint64_t c3 = ~0ULL / 0xFF * 0x0F; + + const size_t chunkvals = 8; + for (; i + chunkvals <= end; i += chunkvals) { + uint64_t a = next[i / chunkvals]; + a ^= v; // zero matching bit segments + a |= (a >> 1) & c1; // cascade ones in non-zeroed segments + a |= (a >> 2) & c2; + a |= (a >> 4) & c3; + a &= m; // isolate single bit in each segment + a ^= m; // reverse isolated bits + + // Population count + a = (a * h01) >> 56; + + value_count += to_size_t(a); + } + } + else if (m_width == 16) { + if (value > 0x7FFFLL || value < -0x8000LL) + return 0; // by casting? + + const uint64_t v = ~0ULL / 0xFFFF * value; + const uint64_t m = ~0ULL / 0xFFFF * 0x1; + + // Masks to avoid spillover between segments in cascades + const uint64_t c1 = ~0ULL / 0xFFFF * 0x7FFF; + const uint64_t c2 = ~0ULL / 0xFFFF * 0x3FFF; + const uint64_t c3 = ~0ULL / 0xFFFF * 0x0FFF; + const uint64_t c4 = ~0ULL / 0xFFFF * 0x00FF; + + const size_t chunkvals = 4; + for (; i + chunkvals <= end; i += chunkvals) { + uint64_t a = next[i / chunkvals]; + a ^= v; // zero matching bit segments + a |= (a >> 1) & c1; // cascade ones in non-zeroed segments + a |= (a >> 2) & c2; + a |= (a >> 4) & c3; + a |= (a >> 8) & c4; + a &= m; // isolate single bit in each segment + a ^= m; // reverse isolated bits + + // Population count + a = (a * h01) >> 56; + + value_count += to_size_t(a); + } + } + else if (m_width == 32) { + int32_t v = int32_t(value); + const int32_t* d = reinterpret_cast(m_data); + for (; i < end; ++i) { + if (d[i] == v) + ++value_count; + } + return value_count; + } + else if (m_width == 64) { + const int64_t* d = reinterpret_cast(m_data); + for (; i < end; ++i) { + if (d[i] == value) + ++value_count; + } + return value_count; + } + + // Check remaining elements + for (; i < end; ++i) + if (value == get(i)) + ++value_count; + + return value_count; +} diff --git a/src/realm/array_blobs_small.cpp b/src/realm/array_blobs_small.cpp index bca4d012a1f..4e93f40c5f4 100644 --- a/src/realm/array_blobs_small.cpp +++ b/src/realm/array_blobs_small.cpp @@ -91,7 +91,8 @@ void ArraySmallBlobs::erase(size_t ndx) REALM_ASSERT_3(ndx, <, m_offsets.size()); size_t start = ndx ? 
to_size_t(m_offsets.get(ndx - 1)) : 0;
-    size_t end = to_size_t(m_offsets.get(ndx));
+    auto offset = m_offsets.get(ndx);
+    size_t end = to_size_t(offset);
     m_blob.erase(start, end);
     m_offsets.erase(ndx);
diff --git a/src/realm/array_blobs_small.hpp b/src/realm/array_blobs_small.hpp
index 8db3467a209..e1a08e43e4f 100644
--- a/src/realm/array_blobs_small.hpp
+++ b/src/realm/array_blobs_small.hpp
@@ -176,7 +176,8 @@ inline BinaryData ArraySmallBlobs::get(size_t ndx) const noexcept
     }
     else {
         size_t begin = ndx ? to_size_t(m_offsets.get(ndx - 1)) : 0;
-        size_t end = to_size_t(m_offsets.get(ndx));
+        auto offset = m_offsets.get(ndx);
+        size_t end = to_size_t(offset);
         BinaryData bd = BinaryData(m_blob.get(begin), end - begin);
         // Old database file (non-nullable column should never return null)
diff --git a/src/realm/array_direct.hpp b/src/realm/array_direct.hpp
index 5380876700f..4b92141bf55 100644
--- a/src/realm/array_direct.hpp
+++ b/src/realm/array_direct.hpp
@@ -26,48 +26,48 @@
// clang-format off
/* wid == 16/32 likely when accessing offsets in B tree */
#define REALM_TEMPEX(fun, wid, arg) \
-    if (wid == 16) {fun<16> arg;} \
-    else if (wid == 32) {fun<32> arg;} \
-    else if (wid == 0) {fun<0> arg;} \
-    else if (wid == 1) {fun<1> arg;} \
-    else if (wid == 2) {fun<2> arg;} \
-    else if (wid == 4) {fun<4> arg;} \
-    else if (wid == 8) {fun<8> arg;} \
-    else if (wid == 64) {fun<64> arg;} \
-    else {REALM_ASSERT_DEBUG(false); fun<0> arg;}
+if (wid == 16) {fun<16> arg;} \
+else if (wid == 32) {fun<32> arg;} \
+else if (wid == 0) {fun<0> arg;} \
+else if (wid == 1) {fun<1> arg;} \
+else if (wid == 2) {fun<2> arg;} \
+else if (wid == 4) {fun<4> arg;} \
+else if (wid == 8) {fun<8> arg;} \
+else if (wid == 64) {fun<64> arg;} \
+else {REALM_ASSERT_DEBUG(false); fun<0> arg;}

#define REALM_TEMPEX2(fun, targ, wid, arg) \
-    if (wid == 16) {fun<targ, 16> arg;} \
-    else if (wid == 32) {fun<targ, 32> arg;} \
-    else if (wid == 0) {fun<targ, 0> arg;} \
-    else if (wid == 1) {fun<targ, 1> arg;} \
-    else if (wid == 2) {fun<targ, 2> arg;} \
-    else if (wid == 4) {fun<targ, 4> arg;} \
-    else if (wid == 8) {fun<targ, 8> arg;} \
-    else if (wid == 64) {fun<targ, 64> arg;} \
-    else {REALM_ASSERT_DEBUG(false); fun<targ, 0> arg;}
+if (wid == 16) {fun<targ, 16> arg;} \
+else if (wid == 32) {fun<targ, 32> arg;} \
+else if (wid == 0) {fun<targ, 0> arg;} \
+else if (wid == 1) {fun<targ, 1> arg;} \
+else if (wid == 2) {fun<targ, 2> arg;} \
+else if (wid == 4) {fun<targ, 4> arg;} \
+else if (wid == 8) {fun<targ, 8> arg;} \
+else if (wid == 64) {fun<targ, 64> arg;} \
+else {REALM_ASSERT_DEBUG(false); fun<targ, 0> arg;}

#define REALM_TEMPEX3(fun, targ1, wid, targ3, arg) \
-    if (wid == 16) {fun<targ1, 16, targ3> arg;} \
-    else if (wid == 32) {fun<targ1, 32, targ3> arg;} \
-    else if (wid == 0) {fun<targ1, 0, targ3> arg;} \
-    else if (wid == 1) {fun<targ1, 1, targ3> arg;} \
-    else if (wid == 2) {fun<targ1, 2, targ3> arg;} \
-    else if (wid == 4) {fun<targ1, 4, targ3> arg;} \
-    else if (wid == 8) {fun<targ1, 8, targ3> arg;} \
-    else if (wid == 64) {fun<targ1, 64, targ3> arg;} \
-    else {REALM_ASSERT_DEBUG(false); fun<targ1, 0, targ3> arg;}
+if (wid == 16) {fun<targ1, 16, targ3> arg;} \
+else if (wid == 32) {fun<targ1, 32, targ3> arg;} \
+else if (wid == 0) {fun<targ1, 0, targ3> arg;} \
+else if (wid == 1) {fun<targ1, 1, targ3> arg;} \
+else if (wid == 2) {fun<targ1, 2, targ3> arg;} \
+else if (wid == 4) {fun<targ1, 4, targ3> arg;} \
+else if (wid == 8) {fun<targ1, 8, targ3> arg;} \
+else if (wid == 64) {fun<targ1, 64, targ3> arg;} \
+else {REALM_ASSERT_DEBUG(false); fun<targ1, 0, targ3> arg;}

#define REALM_TEMPEX4(fun, targ1, targ3, targ4, wid, arg) \
-    if (wid == 16) {fun<targ1, targ3, targ4, 16> arg;} \
-    else if (wid == 32) {fun<targ1, targ3, targ4, 32> arg;} \
-    else if (wid == 0) {fun<targ1, targ3, targ4, 0> arg;} \
-    else if (wid == 1) {fun<targ1, targ3, targ4, 1> arg;} \
-    else if (wid == 2) {fun<targ1, targ3, targ4, 2> arg;} \
-    else if (wid == 4) {fun<targ1, targ3, targ4, 4> arg;} \
-    else if (wid == 8) {fun<targ1, targ3, targ4, 8> arg;} \
-    else if (wid == 64) {fun<targ1, targ3, targ4, 64> arg;} \
-    else {REALM_ASSERT_DEBUG(false); fun<targ1, targ3, targ4, 0> arg;}
+if (wid == 16) {fun<targ1, targ3, targ4, 16> arg;} \
+else if (wid == 32) {fun<targ1, targ3, targ4, 32> arg;} \
+else if (wid == 0) {fun<targ1, targ3, targ4, 0> arg;} \
+else if (wid == 1) {fun<targ1, targ3, targ4, 1> arg;} \
+else if (wid == 2) {fun<targ1, targ3, targ4, 2> arg;} \
+else if (wid == 4) {fun<targ1, targ3, targ4, 4> arg;} \
+else if (wid == 8) {fun<targ1, targ3, targ4, 8> arg;} \
+else if (wid == 64) {fun<targ1, targ3, targ4, 64> arg;} \
+else {REALM_ASSERT_DEBUG(false); fun<targ1, targ3, targ4, 0> arg;}
// clang-format on

namespace realm {
@@ -194,21 +194,22 @@ class UnalignedWordIter {
     }
     // 'num_bits' number of bits which must be read
     // WARNING returned word may be garbage above the first 'num_bits' bits.
-    uint64_t get(size_t num_bits)
+    uint64_t consume(size_t num_bits)
     {
         auto first_word = m_word_ptr[0];
         uint64_t result = first_word >> m_in_word_offset;
         // note: above shifts in zeroes
-        if (m_in_word_offset + num_bits <= 64)
-            return result;
-        // if we're here, in_word_offset > 0
-        auto first_word_size = 64 - m_in_word_offset;
-        auto second_word = m_word_ptr[1];
-        result |= second_word << first_word_size;
-        // note: above shifts in zeroes below the bits we want
+        if (m_in_word_offset + num_bits > 64) {
+            // if we're here, in_word_offset > 0
+            auto first_word_size = 64 - m_in_word_offset;
+            auto second_word = m_word_ptr[1];
+            result |= second_word << first_word_size;
+            // note: above shifts in zeroes below the bits we want
+        }
+        _bump(num_bits);
         return result;
     }
-    uint64_t get_with_unsafe_prefetch(size_t num_bits)
+    uint64_t consume_with_unsafe_prefetch(size_t num_bits)
    {
        auto first_word = m_word_ptr[0];
        uint64_t result = first_word >> m_in_word_offset;
@@ -216,21 +217,24 @@ class UnalignedWordIter {
        auto first_word_size = 64 - m_in_word_offset;
        auto second_word = m_word_ptr[1];
        REALM_ASSERT_DEBUG(num_bits <= 64);
-        result |= (m_in_word_offset + num_bits > 64) ? (second_word << first_word_size) : 0;
+        if (num_bits > first_word_size)
+            result |= second_word << first_word_size;
        // note: above shifts in zeroes below the bits we want
+        _bump(num_bits);
        return result;
    }
+
+private:
+    const uint64_t* m_word_ptr;
+    unsigned m_in_word_offset;
+
    // bump the iterator the specified number of bits
-    void bump(size_t num_bits)
+    void _bump(size_t num_bits)
    {
        auto total_offset = m_in_word_offset + num_bits;
        m_word_ptr += total_offset >> 6;
        m_in_word_offset = total_offset & 0x3F;
    }
-
-private:
-    const uint64_t* m_word_ptr;
-    unsigned m_in_word_offset;
};

// Read a bit field of up to 64 bits.
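Fusing the old get/bump pair into consume() means a scan can no longer read a word and forget to advance. A minimal sketch of how a caller might stream fixed-width fields with it, assuming only the two-argument constructor and consume() used elsewhere in this patch (the helper itself is illustrative, not part of the change; the mask is needed because consume() may return garbage above the requested bits):

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical helper: sum 'count' fields of 'width' bits (1..64) starting at
// bit 'offset' in 'data'.
uint64_t sum_fields(const uint64_t* data, size_t offset, unsigned width, size_t count)
{
    realm::UnalignedWordIter it(data, offset);
    const uint64_t mask = ~0ULL >> (64 - width); // keep only the low 'width' bits
    uint64_t sum = 0;
    for (size_t i = 0; i < count; ++i)
        sum += it.consume(width) & mask; // consume() reads and bumps in one call
    return sum;
}
```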
@@ -241,16 +245,19 @@ class UnalignedWordIter {
// iterator useful for scanning arrays faster than by indexing each element
// supports arrays of pairs by differentiating field size and step size.
class BfIterator {
+    friend class FlexCompressor;
+    friend class PackedCompressor;
+
public:
    BfIterator() = default;
    BfIterator(const BfIterator&) = default;
    BfIterator(BfIterator&&) = default;
    BfIterator& operator=(const BfIterator&) = default;
    BfIterator& operator=(BfIterator&&) = default;
-    BfIterator(uint64_t* data_area, size_t initial_offset, size_t field_size, size_t step_size, size_t index)
+    BfIterator(uint64_t* data_area, size_t initial_offset, uint8_t field_size, uint8_t step_size, size_t index)
        : data_area(data_area)
-        , field_size(static_cast<uint8_t>(field_size))
-        , step_size(static_cast<uint8_t>(step_size))
+        , field_size(field_size)
+        , step_size(step_size)
        , offset(initial_offset)
    {
        if (field_size < 64)
@@ -376,13 +383,13 @@ inline bool operator<(const BfIterator& a, const BfIterator& b)
    return a.field_position < b.field_position;
}

-inline uint64_t read_bitfield(uint64_t* data_area, size_t field_position, size_t width)
+inline uint64_t read_bitfield(uint64_t* data_area, size_t field_position, uint8_t width)
{
    BfIterator it(data_area, field_position, width, width, 0);
    return *it;
}

-inline void write_bitfield(uint64_t* data_area, size_t field_position, size_t width, uint64_t value)
+inline void write_bitfield(uint64_t* data_area, size_t field_position, uint8_t width, uint64_t value)
{
    BfIterator it(data_area, field_position, width, width, 0);
    it.set_value(value);
@@ -414,26 +421,26 @@ inline std::pair<int64_t, int64_t> get_two(const char* data, size_t width, size_

/* Subword parallel search

-   The following provides facilities for subword parallel search for bitfields of any size.
-   To simplify, the first bitfield must be aligned within the word: it must occupy the lowest
-   bits of the word.
+   The following provides facilities for subword parallel search for bitfields of any size.
+   To simplify, the first bitfield must be aligned within the word: it must occupy the lowest
+   bits of the word.

-   In general the metods here return a vector with the most significant bit in each field
-   marking that a condition was met when comparing the corresponding pair of fields in two
-   vectors. Checking if any field meets a condition is as simple as comparing the return
-   vector against 0. Finding the first to meet a condition is also supported.
+   In general the methods here return a vector with the most significant bit in each field
+   marking that a condition was met when comparing the corresponding pair of fields in two
+   vectors. Checking if any field meets a condition is as simple as comparing the return
+   vector against 0. Finding the first to meet a condition is also supported.

-   Vectors are "split" into fields according to a MSB vector, wich indicates the most
-   significant bit of each field. The MSB must be passed in as an argument to most
-   bit field comparison functions. It can be generated by the field_sign_bit template.
+   Vectors are "split" into fields according to an MSB vector, which indicates the most
+   significant bit of each field. The MSB must be passed in as an argument to most
+   bit field comparison functions. It can be generated by the field_sign_bit template.

-   The simplest condition to test is any_field_NE(A,B), where A and B are words.
-   This condition should be true if any bitfield in A is not equal to the corresponding
-   field in B.
+   The simplest condition to test is any_field_NE(A,B), where A and B are words.
+   This condition should be true if any bitfield in A is not equal to the corresponding
+   field in B.
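The any_field_NE idea described above can be demonstrated in isolation. A self-contained sketch for 8-bit fields, using the trial-subtraction trick this comment block refers to (the helper names are mine, not the header's):

```cpp
#include <cassert>
#include <cstdint>

constexpr uint64_t rep8(uint64_t v) { return ~0ULL / 0xFF * v; } // value in every byte
constexpr uint64_t MSBs = rep8(0x80);                            // sign bit of each 8-bit field

// MSB set in every byte where the fields of A and B differ:
// 0 != (A ^ B) is the same as asking whether 0 - (A ^ B) borrows out of the MSB.
uint64_t any_field_ne(uint64_t A, uint64_t B)
{
    uint64_t X = A ^ B;                  // zero exactly where fields are equal
    uint64_t left_iso = MSBs;            // left operand is 0, MSBs clamped to 1
    uint64_t right_iso = X & ~MSBs;      // right operand with MSBs clamped to 0
    uint64_t borrows = ~(left_iso - right_iso); // borrows into each sign bit
    // overflow formula with A == 0: (~A & B) | (~A & borrows) simplifies to X | borrows
    return (X | borrows) & MSBs;
}

int main()
{
    uint64_t a = 0x1122334455667788ULL;
    uint64_t b = 0x1122334455667789ULL; // differs only in the lowest byte
    assert(any_field_ne(a, b) == 0x80); // only field 0 is flagged
    assert(any_field_ne(a, a) == 0);    // identical words: no field flagged
}
```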
- This is almost as simple as a direct word compare, but needs to take into account that - we may want to have part of the words undefined. -*/ + This is almost as simple as a direct word compare, but needs to take into account that + we may want to have part of the words undefined. + */ constexpr uint8_t num_fields_table[65] = {0, 64, 32, 21, 16, 12, 10, 9, // 0-7 8, 7, 6, 5, 5, 4, 4, 4, // 8-15 4, 3, 3, 3, 3, 3, 2, 2, // 16-23 @@ -521,127 +528,6 @@ constexpr uint64_t field_sign_bit(int width) return populate(width, 1ULL << (width - 1)); } -/* Unsigned LT. - - This can be determined by trial subtaction. However, some care must be exercised - since simply subtracting one vector from another will allow carries from one - bitfield to flow into the next one. To avoid this, we isolate bitfields by clamping - the MSBs to 1 in A and 0 in B before subtraction. After the subtraction the MSBs in - the result indicate borrows from the MSB. We then compute overflow (borrow OUT of MSB) - using boolean logic as described below. - - Unsigned LT is also used to find all zero fields or all non-zero fields, so it is - the backbone of all comparisons returning vectors. -*/ - -// compute the overflows in unsigned trial subtraction A-B. The overflows -// will be marked by 1 in the sign bit of each field in the result. Other -// bits in the result are zero. -// Overflow are detected for each field pair where A is less than B. -inline uint64_t unsigned_LT_vector(uint64_t MSBs, uint64_t A, uint64_t B) -{ - // 1. compute borrow from most significant bit - // Isolate bitfields inside A and B before subtraction (prevent carries from spilling over) - // do this by clamping most significant bit in A to 1, and msb in B to 0 - auto A_isolated = A | MSBs; // 1 op - auto B_isolated = B & ~MSBs; // 2 ops - auto borrows_into_sign_bit = ~(A_isolated - B_isolated); // 2 ops (total latency 4) - - // 2. determine what subtraction against most significant bit would give: - // A B borrow-in: (A-B-borrow-in) - // 0 0 0 (0-0-0) = 0 - // 0 0 1 (0-0-1) = 1 + borrow-out - // 0 1 0 (0-1-0) = 1 + borrow-out - // 0 1 1 (0-1-1) = 0 + borrow-out - // 1 0 0 (1-0-0) = 1 - // 1 0 1 (1-0-1) = 0 - // 1 1 0 (1-1-0) = 0 - // 1 1 1 (1-1-1) = 1 + borrow-out - // borrow-out = (~A & B) | (~A & borrow-in) | (A & B & borrow-in) - // The overflows are simply the borrow-out, now encoded into the sign bits of each field. - auto overflows = (~A & B) | (~A & borrows_into_sign_bit) | (A & B & borrows_into_sign_bit); - // ^ 6 ops, total latency 6 (4+2) - return overflows & MSBs; // 1 op, total latency 7 - // total of 12 ops and a latency of 7. On a beefy CPU 3-4 of those can run in parallel - // and still reach a combined latency of 10 or less. -} - -inline uint64_t find_all_fields_unsigned_LT(uint64_t MSBs, uint64_t A, uint64_t B) -{ - return unsigned_LT_vector(MSBs, A, B); -} - -inline uint64_t find_all_fields_NE(uint64_t MSBs, uint64_t A, uint64_t B) -{ - // 0 != A^B, same as asking 0 - (A^B) overflows. - return unsigned_LT_vector(MSBs, 0, A ^ B); -} - -inline uint64_t find_all_fields_EQ(uint64_t MSBs, uint64_t A, uint64_t B) -{ - // get the fields which are EQ and negate the result - auto all_fields_NE = find_all_fields_NE(MSBs, A, B); - auto all_fields_NE_negated = ~all_fields_NE; - // must filter the negated vector so only MSB are left. - return MSBs & all_fields_NE_negated; -} - -inline uint64_t find_all_fields_unsigned_LE(uint64_t MSBs, uint64_t A, uint64_t B) -{ - // Now A <= B is the same as !(A > B) so... 
- // reverse A and B to turn (A>B) --> (B B is the same as B < A - return find_all_fields_signed_LT(MSBs, B, A); -} - -inline uint64_t find_all_fields_signed_GE(uint64_t MSBs, uint64_t A, uint64_t B) -{ - // A >= B is the same as B <= A - return find_all_fields_signed_LE(MSBs, B, A); -} - constexpr uint32_t inverse_width[65] = { 65536 * 64 / 1, // never used 65536 * 64 / 1, 65536 * 64 / 2, 65536 * 64 / 3, 65536 * 64 / 4, 65536 * 64 / 5, 65536 * 64 / 6, @@ -706,12 +592,10 @@ size_t parallel_subword_find(VectorCompare vector_compare, const uint64_t* data, uint64_t found_vector = 0; while (total_bit_count_left >= fast_scan_limit) { // unrolling 2x - const auto word0 = it.get_with_unsafe_prefetch(bit_count_pr_iteration); - it.bump(bit_count_pr_iteration); - const auto word1 = it.get_with_unsafe_prefetch(bit_count_pr_iteration); + const auto word0 = it.consume_with_unsafe_prefetch(bit_count_pr_iteration); + const auto word1 = it.consume_with_unsafe_prefetch(bit_count_pr_iteration); auto found_vector0 = vector_compare(MSBs, word0, search_vector); auto found_vector1 = vector_compare(MSBs, word1, search_vector); - it.bump(bit_count_pr_iteration); if (found_vector0) { const auto sub_word_index = first_field_marked(width, found_vector0); return start + sub_word_index; @@ -723,8 +607,10 @@ size_t parallel_subword_find(VectorCompare vector_compare, const uint64_t* data, total_bit_count_left -= 2 * bit_count_pr_iteration; start += 2 * field_count; } + + // One word at a time while (total_bit_count_left >= bit_count_pr_iteration) { - const auto word = it.get(bit_count_pr_iteration); + const auto word = it.consume(bit_count_pr_iteration); found_vector = vector_compare(MSBs, word, search_vector); if (found_vector) { const auto sub_word_index = first_field_marked(width, found_vector); @@ -732,10 +618,12 @@ size_t parallel_subword_find(VectorCompare vector_compare, const uint64_t* data, } total_bit_count_left -= bit_count_pr_iteration; start += field_count; - it.bump(bit_count_pr_iteration); } - if (total_bit_count_left) { // final subword, may be partial - const auto word = it.get(total_bit_count_left); // <-- limit lookahead to avoid touching memory beyond array + + // final subword, may be partial + if (total_bit_count_left) { + // limit lookahead to avoid touching memory beyond array + const auto word = it.consume(total_bit_count_left); found_vector = vector_compare(MSBs, word, search_vector); auto last_word_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - total_bit_count_left); found_vector &= last_word_mask; diff --git a/src/realm/array_integer.cpp b/src/realm/array_integer.cpp index 8cf854c671f..f86871c3225 100644 --- a/src/realm/array_integer.cpp +++ b/src/realm/array_integer.cpp @@ -24,6 +24,12 @@ using namespace realm; +ArrayInteger::ArrayInteger(Allocator& allocator) noexcept + : Array(allocator) +{ + m_is_inner_bptree_node = false; +} + Mixed ArrayInteger::get_any(size_t ndx) const { return Mixed(get(ndx)); @@ -112,7 +118,6 @@ void ArrayIntNull::replace_nulls_with(int64_t new_null) } } - void ArrayIntNull::avoid_null_collision(int64_t value) { if (m_width == 64) { diff --git a/src/realm/array_integer.hpp b/src/realm/array_integer.hpp index 3b50d3757d1..b8739414091 100644 --- a/src/realm/array_integer.hpp +++ b/src/realm/array_integer.hpp @@ -29,16 +29,10 @@ namespace realm { class ArrayInteger : public Array, public ArrayPayload { public: using value_type = int64_t; - - using Array::add; using Array::find_first; - using Array::get; - using Array::insert; - using Array::move; - using Array::set; explicit 
ArrayInteger(Allocator&) noexcept; - ~ArrayInteger() noexcept override {} + ~ArrayInteger() noexcept override = default; static value_type default_value(bool) { @@ -171,12 +165,6 @@ class ArrayIntNull : public Array, public ArrayPayload { // Implementation: -inline ArrayInteger::ArrayInteger(Allocator& allocator) noexcept - : Array(allocator) -{ - m_is_inner_bptree_node = false; -} - inline ArrayIntNull::ArrayIntNull(Allocator& allocator) noexcept : Array(allocator) { diff --git a/src/realm/array_integer_tpl.hpp b/src/realm/array_integer_tpl.hpp index 9d96584ab3c..0914b1bae65 100644 --- a/src/realm/array_integer_tpl.hpp +++ b/src/realm/array_integer_tpl.hpp @@ -27,9 +27,10 @@ namespace realm { template bool ArrayInteger::find(value_type value, size_t start, size_t end, QueryStateBase* state) const { - return ArrayWithFind(*this).find(value, start, end, 0, state); + return Array::find(value, start, end, 0, state); } + inline bool ArrayIntNull::find_impl(int cond, value_type value, size_t start, size_t end, QueryStateBase* state) const { switch (cond) { @@ -74,9 +75,7 @@ bool ArrayIntNull::find_impl(value_type opt_value, size_t start, size_t end, Que value = *opt_value; } } - - // Fall back to plain Array find. - return ArrayWithFind(*this).find(value, start2, end2, baseindex2, state); + return Array::find(value, start2, end2, baseindex2, state); } else { cond c; diff --git a/src/realm/array_mixed.cpp b/src/realm/array_mixed.cpp index b0542da93b0..7d00991ad5b 100644 --- a/src/realm/array_mixed.cpp +++ b/src/realm/array_mixed.cpp @@ -274,6 +274,7 @@ size_t ArrayMixed::find_first(Mixed value, size_t begin, size_t end) const noexc DataType type = value.get_type(); if (end == realm::npos) end = size(); + for (size_t i = begin; i < end; i++) { if (Mixed::data_types_are_comparable(this->get_type(i), type) && get(i) == value) { return i; diff --git a/src/realm/array_unsigned.cpp b/src/realm/array_unsigned.cpp index e1aac8dbf80..938fe5aece8 100644 --- a/src/realm/array_unsigned.cpp +++ b/src/realm/array_unsigned.cpp @@ -71,6 +71,7 @@ inline uint64_t ArrayUnsigned::_get(size_t ndx, uint8_t width) const return reinterpret_cast(m_data)[ndx]; } return get_direct(m_data, width, ndx); + REALM_UNREACHABLE(); } void ArrayUnsigned::create(size_t initial_size, uint64_t ubound_value) @@ -168,7 +169,8 @@ size_t ArrayUnsigned::upper_bound(uint64_t value) const noexcept void ArrayUnsigned::insert(size_t ndx, uint64_t value) { REALM_ASSERT_DEBUG(m_width >= 8); - bool do_expand = value > m_ubound; + + bool do_expand = value > (uint64_t)m_ubound; const uint8_t old_width = m_width; const uint8_t new_width = do_expand ? bit_width(value) : m_width; const auto old_size = m_size; @@ -215,6 +217,7 @@ void ArrayUnsigned::insert(size_t ndx, uint64_t value) void ArrayUnsigned::erase(size_t ndx) { REALM_ASSERT_DEBUG(m_width >= 8); + copy_on_write(); // Throws size_t w = m_width >> 3; diff --git a/src/realm/array_unsigned.hpp b/src/realm/array_unsigned.hpp index f1926ec7fc0..3e13b35e8dd 100644 --- a/src/realm/array_unsigned.hpp +++ b/src/realm/array_unsigned.hpp @@ -19,7 +19,7 @@ #ifndef REALM_ARRAY_UNSIGNED_HPP #define REALM_ARRAY_UNSIGNED_HPP -#include +#include namespace realm { @@ -81,13 +81,13 @@ class ArrayUnsigned : public Node { } private: - uint_least8_t m_width = 0; // Size of an element (meaning depend on type of array). 
- uint64_t m_ubound; // max number that can be stored with current m_width + uint_least8_t m_width = 0; + uint64_t m_ubound = 0; // max is 0xFFFFFFFFFFFFFFFFLL void init_from_mem(MemRef mem) noexcept { - Node::init_from_mem(mem); - set_width(get_width_from_header(get_header())); + auto header = Node::init_from_mem(mem); + set_width(get_width_from_header(header)); } void adjust(size_t ndx, int64_t diff) diff --git a/src/realm/array_with_find.cpp b/src/realm/array_with_find.cpp index e33513ef28e..2cf528a5c47 100644 --- a/src/realm/array_with_find.cpp +++ b/src/realm/array_with_find.cpp @@ -34,32 +34,6 @@ void ArrayWithFind::find_all(IntegerColumn* result, int64_t value, size_t col_of return; } - -bool ArrayWithFind::find(int cond, int64_t value, size_t start, size_t end, size_t baseindex, - QueryStateBase* state) const -{ - if (cond == cond_Equal) { - return find(value, start, end, baseindex, state); - } - if (cond == cond_NotEqual) { - return find(value, start, end, baseindex, state); - } - if (cond == cond_Greater) { - return find(value, start, end, baseindex, state); - } - if (cond == cond_Less) { - return find(value, start, end, baseindex, state); - } - if (cond == cond_None) { - return find(value, start, end, baseindex, state); - } - else if (cond == cond_LeftNotNull) { - return find(value, start, end, baseindex, state); - } - REALM_ASSERT_DEBUG(false); - return false; -} - size_t ArrayWithFind::first_set_bit(uint32_t v) const { // (v & -v) is UB when v is INT_MIN @@ -79,5 +53,15 @@ size_t ArrayWithFind::first_set_bit64(int64_t v) const return first_set_bit(v1) + 32; } +bool ArrayWithFind::find_all_will_match(size_t start2, size_t end, size_t baseindex, QueryStateBase* state) const +{ + REALM_ASSERT_DEBUG(state->match_count() < state->limit()); + size_t process = state->limit() - state->match_count(); + size_t end2 = end - start2 > process ? start2 + process : end; + for (; start2 < end2; start2++) + if (!state->match(start2 + baseindex)) + return false; + return true; +} } // namespace realm diff --git a/src/realm/array_with_find.hpp b/src/realm/array_with_find.hpp index 81d86d47e44..b35ed85e808 100644 --- a/src/realm/array_with_find.hpp +++ b/src/realm/array_with_find.hpp @@ -89,8 +89,6 @@ class ArrayWithFind { } // Main finding function - used for find_first, find_all, sum, max, min, etc. - bool find(int cond, int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const; - template bool find(int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const; @@ -161,7 +159,6 @@ class ArrayWithFind { private: const Array& m_array; - template bool find_all_will_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const; }; //************************************************************************************* @@ -276,19 +273,6 @@ uint64_t ArrayWithFind::cascade(uint64_t a) const } } -template -REALM_NOINLINE bool ArrayWithFind::find_all_will_match(size_t start2, size_t end, size_t baseindex, - QueryStateBase* state) const -{ - REALM_ASSERT_DEBUG(state->match_count() < state->limit()); - size_t process = state->limit() - state->match_count(); - size_t end2 = end - start2 > process ? start2 + process : end; - for (; start2 < end2; start2++) - if (!state->match(start2 + baseindex)) - return false; - return true; -} - // This is the main finding function for Array. Other finding functions are just // wrappers around this one. 
Search for 'value' using condition cond (Equal, // NotEqual, Less, etc) and call QueryStateBase::match() for each match. Break and @@ -318,7 +302,7 @@ bool ArrayWithFind::find_optimized(int64_t value, size_t start, size_t end, size // optimization if all items are guaranteed to match (such as cond == NotEqual && value == 100 && m_ubound == 15) if (c.will_match(value, lbound, ubound)) { - return find_all_will_match(start2, end, baseindex, state); + return find_all_will_match(start2, end, baseindex, state); } // finder cannot handle this bitwidth @@ -567,14 +551,18 @@ inline bool ArrayWithFind::compare_equality(int64_t value, size_t start, size_t QueryStateBase* state) const { REALM_ASSERT_DEBUG(start <= m_array.m_size && (end <= m_array.m_size || end == size_t(-1)) && start <= end); + REALM_ASSERT_DEBUG(width == m_array.m_width); - size_t ee = round_up(start, 64 / no0(width)); + auto v = 64 / no0(width); + size_t ee = round_up(start, v); ee = ee > end ? end : ee; - for (; start < ee; ++start) - if (eq ? (m_array.get(start) == value) : (m_array.get(start) != value)) { + for (; start < ee; ++start) { + auto v = Array::get(m_array, start); + if (eq ? (v == value) : (v != value)) { if (!state->match(start + baseindex)) return false; } + } if (start >= end) return true; @@ -624,7 +612,7 @@ inline bool ArrayWithFind::compare_equality(int64_t value, size_t start, size_t } while (start < end) { - if (eq ? m_array.get(start) == value : m_array.get(start) != value) { + if (eq ? Array::get(m_array, start) == value : Array::get(m_array, start) != value) { if (!state->match(start + baseindex)) { return false; } @@ -903,8 +891,8 @@ bool ArrayWithFind::compare_relation(int64_t value, size_t start, size_t end, si size_t ee = round_up(start, 64 / no0(bitwidth)); ee = ee > end ? end : ee; for (; start < ee; start++) { - if (gt ? (m_array.get(start) > value) : (m_array.get(start) < value)) { - if (!state->match(start + baseindex, m_array.get(start))) + if (gt ? (Array::get(m_array, start) > value) : (Array::get(m_array, start) < value)) { + if (!state->match(start + baseindex, Array::get(m_array, start))) return false; } } @@ -969,7 +957,7 @@ bool ArrayWithFind::compare_relation(int64_t value, size_t start, size_t end, si // Test unaligned end and/or values of width > 16 manually while (start < end) { - if (gt ? m_array.get(start) > value : m_array.get(start) < value) { + if (gt ? 
Array::get(m_array, start) > value : Array::get(m_array, start) < value) { if (!state->match(start + baseindex)) return false; } diff --git a/src/realm/group.cpp b/src/realm/group.cpp index ab3bef4c68a..eeecbaed4f5 100644 --- a/src/realm/group.cpp +++ b/src/realm/group.cpp @@ -1012,10 +1012,6 @@ ref_type Group::DefaultTableWriter::write_names(_impl::OutputStream& out) } ref_type Group::DefaultTableWriter::write_tables(_impl::OutputStream& out) { - // bool deep = true; // Deep - // bool only_if_modified = false; // Always - // bool compress = false; // true; - // return m_group->m_tables.write(out, deep, only_if_modified, compress); // Throws return m_group->typed_write_tables(out); } @@ -1141,7 +1137,6 @@ void Group::write(std::ostream& out, int file_format_version, TableWriter& table REALM_ASSERT(version_number == 0 || version_number == 1); } else { - // table_writer.typed_print(""); // Because we need to include the total logical file size in the // top-array, we have to start by writing everything except the // top-array, and then finally compute and write a correct version of @@ -1151,7 +1146,8 @@ void Group::write(std::ostream& out, int file_format_version, TableWriter& table // DB to compact the database by writing only the live data // into a separate file. ref_type names_ref = table_writer.write_names(out_2); // Throws - ref_type tables_ref = table_writer.write_tables(out_2); // Throws + ref_type tables_ref = table_writer.write_tables(out_2); + SlabAlloc new_alloc; new_alloc.attach_empty(); // Throws Array top(new_alloc); @@ -1214,8 +1210,8 @@ void Group::write(std::ostream& out, int file_format_version, TableWriter& table top.set(2, RefOrTagged::make_tagged(final_file_size)); // Throws // Write the top array - bool deep = false; // Shallow - bool only_if_modified = false; // Always + bool deep = false; // Shallow + bool only_if_modified = false; // Always bool compress = false; top.write(out_2, deep, only_if_modified, compress); // Throws REALM_ASSERT_3(size_t(out_2.get_ref_of_next_array()), ==, final_file_size); diff --git a/src/realm/group.hpp b/src/realm/group.hpp index 434c0258336..08ddd9acd44 100644 --- a/src/realm/group.hpp +++ b/src/realm/group.hpp @@ -1133,6 +1133,7 @@ class Group::TableWriter { { m_group->typed_print(prefix); } + virtual ~TableWriter() noexcept {} void set_group(const Group* g) diff --git a/src/realm/group_writer.cpp b/src/realm/group_writer.cpp index 2990e010d3a..4ce470fec62 100644 --- a/src/realm/group_writer.cpp +++ b/src/realm/group_writer.cpp @@ -41,15 +41,16 @@ class InMemoryWriter : public _impl::ArrayWriterBase { , m_alloc(owner.m_alloc) { } - ref_type write_array(const char* data, size_t size, uint32_t checksum) override + ref_type write_array(const char* data, size_t size, uint32_t checksum, uint32_t checksum_bytes) override { + REALM_ASSERT(checksum_bytes == 4 || checksum_bytes == 2); size_t pos = m_owner.get_free_space(size); // Write the block char* dest_addr = translate(pos); REALM_ASSERT_RELEASE(dest_addr && (reinterpret_cast(dest_addr) & 7) == 0); - memcpy(dest_addr, &checksum, 4); - memcpy(dest_addr + 4, data + 4, size - 4); + memcpy(dest_addr, &checksum, checksum_bytes); + memcpy(dest_addr + checksum_bytes, data + checksum_bytes, size - checksum_bytes); // return ref of the written array ref_type ref = to_ref(pos); return ref; @@ -1339,8 +1340,9 @@ bool inline is_aligned(char* addr) return (as_binary & 7) == 0; } -ref_type GroupWriter::write_array(const char* data, size_t size, uint32_t checksum) +ref_type GroupWriter::write_array(const 
char* data, size_t size, uint32_t checksum, uint32_t checksum_bytes) { + REALM_ASSERT(checksum_bytes == 4 || checksum_bytes == 2); // Get position of free space to write in (expanding file if needed) size_t pos = get_free_space(size); @@ -1349,8 +1351,8 @@ ref_type GroupWriter::write_array(const char* data, size_t size, uint32_t checks char* dest_addr = window->translate(pos); REALM_ASSERT_RELEASE(is_aligned(dest_addr)); window->encryption_read_barrier(dest_addr, size); - memcpy(dest_addr, &checksum, 4); - memcpy(dest_addr + 4, data + 4, size - 4); + memcpy(dest_addr, &checksum, checksum_bytes); + memcpy(dest_addr + checksum_bytes, data + checksum_bytes, size - checksum_bytes); window->encryption_write_barrier(dest_addr, size); // return ref of the written array ref_type ref = to_ref(pos); diff --git a/src/realm/group_writer.hpp b/src/realm/group_writer.hpp index 438879114c6..b6caed048f6 100644 --- a/src/realm/group_writer.hpp +++ b/src/realm/group_writer.hpp @@ -135,7 +135,7 @@ class GroupWriter : public _impl::ArrayWriterBase { size_t get_file_size() const noexcept; - ref_type write_array(const char*, size_t, uint32_t) override; + ref_type write_array(const char*, size_t, uint32_t, uint32_t) override; #ifdef REALM_DEBUG void dump(); diff --git a/src/realm/impl/array_writer.hpp b/src/realm/impl/array_writer.hpp index 55fd42574bc..4096805e0fa 100644 --- a/src/realm/impl/array_writer.hpp +++ b/src/realm/impl/array_writer.hpp @@ -39,7 +39,7 @@ class ArrayWriterBase { /// /// Returns the ref (position in the target stream) of the written copy of /// the specified array data. - virtual ref_type write_array(const char* data, size_t size, uint32_t checksum) = 0; + virtual ref_type write_array(const char* data, size_t size, uint32_t checksum, uint32_t checksum_bytes) = 0; }; } // namespace _impl diff --git a/src/realm/impl/output_stream.cpp b/src/realm/impl/output_stream.cpp index 04db91235b6..1b0d870aa2f 100644 --- a/src/realm/impl/output_stream.cpp +++ b/src/realm/impl/output_stream.cpp @@ -39,17 +39,18 @@ void OutputStream::write(const char* data, size_t size) } -ref_type OutputStream::write_array(const char* data, size_t size, uint32_t checksum) +ref_type OutputStream::write_array(const char* data, size_t size, uint32_t checksum, uint32_t checksum_bytes) { REALM_ASSERT(size % 8 == 0); + REALM_ASSERT(checksum_bytes == 4 || checksum_bytes == 2); const char* data_1 = data; size_t size_1 = size; const char* cksum_bytes = reinterpret_cast(&checksum); - m_out.write(cksum_bytes, 4); // Throws - data_1 += 4; - size_1 -= 4; + m_out.write(cksum_bytes, checksum_bytes); // Throws + data_1 += checksum_bytes; + size_1 -= checksum_bytes; do_write(data_1, size_1); // Throws diff --git a/src/realm/impl/output_stream.hpp b/src/realm/impl/output_stream.hpp index eb459900485..ba287f92c30 100644 --- a/src/realm/impl/output_stream.hpp +++ b/src/realm/impl/output_stream.hpp @@ -41,7 +41,7 @@ class OutputStream : public ArrayWriterBase { void write(const char* data, size_t size); - ref_type write_array(const char* data, size_t size, uint32_t checksum) override; + ref_type write_array(const char* data, size_t size, uint32_t checksum, uint32_t checksum_bytes) override; private: ref_type m_next_ref; diff --git a/src/realm/integer_compressor.cpp b/src/realm/integer_compressor.cpp new file mode 100644 index 00000000000..5246928e775 --- /dev/null +++ b/src/realm/integer_compressor.cpp @@ -0,0 +1,318 @@ +/************************************************************************* + * + * Copyright 2023 Realm Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include + +using namespace realm; + +namespace { + +template +inline void init_compress_array(Array& arr, size_t byte_size, Arg&&... args) +{ + Allocator& allocator = arr.get_alloc(); + auto mem = allocator.alloc(byte_size); + auto h = mem.get_addr(); + T::init_header(h, std::forward(args)...); + NodeHeader::set_capacity_in_header(byte_size, h); + arr.init_from_mem(mem); +} + +} // namespace + +bool IntegerCompressor::always_compress(const Array& origin, Array& arr, NodeHeader::Encoding encoding) const +{ + using Encoding = NodeHeader::Encoding; + std::vector values; + std::vector indices; + compress_values(origin, values, indices); + if (!values.empty()) { + const uint8_t flags = NodeHeader::get_flags(origin.get_header()); + uint8_t v_width = std::max(Node::signed_to_num_bits(values.front()), Node::signed_to_num_bits(values.back())); + + if (encoding == Encoding::Packed) { + const auto packed_size = NodeHeader::calc_size(indices.size(), v_width, NodeHeader::Encoding::Packed); + init_compress_array(arr, packed_size, flags, v_width, origin.size()); + PackedCompressor::copy_data(origin, arr); + } + else if (encoding == Encoding::Flex) { + uint8_t ndx_width = NodeHeader::unsigned_to_num_bits(values.size()); + const auto flex_size = NodeHeader::calc_size(values.size(), indices.size(), v_width, ndx_width); + init_compress_array(arr, flex_size, flags, v_width, ndx_width, values.size(), + indices.size()); + FlexCompressor::copy_data(arr, values, indices); + } + else { + REALM_UNREACHABLE(); + } + return true; + } + return false; +} + +bool IntegerCompressor::compress(const Array& origin, Array& arr) const +{ + if (origin.m_width < 2 || origin.m_size == 0) + return false; + +#if REALM_COMPRESS + return always_compress(origin, arr, NodeHeader::Encoding::Flex); +#else + std::vector values; + std::vector indices; + compress_values(origin, values, indices); + REALM_ASSERT(!values.empty()); + const auto uncompressed_size = origin.get_byte_size(); + uint8_t ndx_width = NodeHeader::unsigned_to_num_bits(values.size()); + uint8_t v_width = std::max(Node::signed_to_num_bits(values.front()), Node::signed_to_num_bits(values.back())); + const auto packed_size = NodeHeader::calc_size(indices.size(), v_width, NodeHeader::Encoding::Packed); + const auto flex_size = NodeHeader::calc_size(values.size(), indices.size(), v_width, ndx_width); + // heuristic: only compress to packed if gain at least 11.1% + const auto adjusted_packed_size = packed_size + packed_size / 8; + // heuristic: only compress to flex if gain at least 20% + const auto adjusted_flex_size = flex_size + flex_size / 4; + if (adjusted_flex_size < adjusted_packed_size && adjusted_flex_size < uncompressed_size) { + const uint8_t flags = NodeHeader::get_flags(origin.get_header()); + init_compress_array(arr, flex_size, flags, v_width, 
ndx_width, values.size(), indices.size());
+        FlexCompressor::copy_data(arr, values, indices);
+        return true;
+    }
+    else if (adjusted_packed_size < uncompressed_size) {
+        const uint8_t flags = NodeHeader::get_flags(origin.get_header());
+        init_compress_array<PackedCompressor>(arr, packed_size, flags, v_width, origin.size());
+        PackedCompressor::copy_data(origin, arr);
+        return true;
+    }
+    return false;
+#endif
+}
+
+bool IntegerCompressor::decompress(Array& arr) const
+{
+    int64_t min_v = std::numeric_limits<int64_t>::max();
+    int64_t max_v = std::numeric_limits<int64_t>::min();
+    REALM_ASSERT_DEBUG(arr.is_attached());
+    auto values_fetcher = [&]() {
+        const auto sz = arr.size();
+        if (is_packed()) {
+            std::vector<int64_t> res;
+            res.reserve(sz);
+            for (size_t i = 0; i < sz; ++i) {
+                auto val = arr.get(i);
+                if (val > max_v)
+                    max_v = val;
+                if (val < min_v)
+                    min_v = val;
+                res.push_back(val);
+            }
+            return res;
+        }
+        min_v = FlexCompressor::min(*this);
+        max_v = FlexCompressor::max(*this);
+        return FlexCompressor::get_all(*this, 0, sz);
+    };
+    const auto& values = values_fetcher();
+    // do the reverse of compressing the array
+    REALM_ASSERT_DEBUG(!values.empty());
+    using Encoding = NodeHeader::Encoding;
+    const auto flags = NodeHeader::get_flags(arr.get_header());
+    const auto size = values.size();
+    const auto width = std::max(Array::bit_width(min_v), Array::bit_width(max_v));
+    REALM_ASSERT_DEBUG(width == 0 || width == 1 || width == 2 || width == 4 || width == 8 || width == 16 ||
+                       width == 32 || width == 64);
+    // 64 is some slab allocator magic number.
+    // The padding is needed in order to account for bit width expansion.
+    const auto byte_size = 64 + NodeHeader::calc_size(size, width, Encoding::WTypBits);
+    REALM_ASSERT_DEBUG(byte_size % 8 == 0); // nevertheless all the values may be aligned to 8
+
+    // Create new array with the correct width
+    const auto mem = arr.get_alloc().alloc(byte_size);
+    const auto header = mem.get_addr();
+    init_header(header, Encoding::WTypBits, flags, width, size);
+    NodeHeader::set_capacity_in_header(byte_size, header);
+
+    // Destroy old array before initializing
+    arr.destroy();
+    arr.init_from_mem(mem);
+
+    // This copies the bits straight back without doing any COW, since the array is simply being
+    // restored. Decompressing thus plays the same role for a compressed array that copy-on-write
+    // plays for a normal array, and yields the same result: we skip COW and just decompress.
+    auto setter = arr.m_vtable->setter;
+    for (size_t ndx = 0; ndx < size; ++ndx)
+        setter(arr, ndx, values[ndx]);
+
+    // very important: since the ref of the current array has changed, the parent must be informed.
+    // Otherwise we will lose the link between parent array and child array.
+    arr.update_parent();
+    REALM_ASSERT_DEBUG(width == arr.get_width());
+    REALM_ASSERT_DEBUG(arr.size() == values.size());
+
+    return true;
+}
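The encoding choice in compress() boils down to comparing three byte sizes with safety margins. A toy model of that decision, with the thresholds quoted from the code above (the sizes are made-up inputs, not API calls):

```cpp
#include <cstddef>

enum class Choice { None, Packed, Flex };

// Mirror of the compress() heuristic: packed must save at least ~11.1%
// (size + size/8 still below uncompressed), flex at least 20% (size + size/4).
Choice choose_encoding(size_t uncompressed, size_t packed, size_t flex)
{
    const size_t adjusted_packed = packed + packed / 8;
    const size_t adjusted_flex = flex + flex / 4;
    if (adjusted_flex < adjusted_packed && adjusted_flex < uncompressed)
        return Choice::Flex;
    if (adjusted_packed < uncompressed)
        return Choice::Packed;
    return Choice::None; // not worth it: keep the array uncompressed
}

// e.g. choose_encoding(1024, 700, 512) == Choice::Flex,
//      choose_encoding(1024, 700, 900) == Choice::Packed,
//      choose_encoding(1024, 1000, 990) == Choice::None.
```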
+
+bool IntegerCompressor::init(const char* h)
+{
+    m_encoding = NodeHeader::get_encoding(h);
+    // Avoid checking the wtype here; that would be one more access to the header, which we can skip.
+    // We just need to know if the encoding is packed or flex.
+    // This makes Array::init_from_mem faster.
+    if (REALM_LIKELY(!(is_packed() || is_flex())))
+        return false;
+
+    if (is_packed()) {
+        init_packed(h);
+    }
+    else {
+        init_flex(h);
+    }
+    return true;
+}
+int64_t IntegerCompressor::get_packed(const Array& arr, size_t ndx)
+{
+    return PackedCompressor::get(arr.m_integer_compressor, ndx);
+}
+
+int64_t IntegerCompressor::get_flex(const Array& arr, size_t ndx)
+{
+    return FlexCompressor::get(arr.m_integer_compressor, ndx);
+}
+
+std::vector<int64_t> IntegerCompressor::get_all_packed(const Array& arr, size_t begin, size_t end)
+{
+    return PackedCompressor::get_all(arr.m_integer_compressor, begin, end);
+}
+
+std::vector<int64_t> IntegerCompressor::get_all_flex(const Array& arr, size_t begin, size_t end)
+{
+    return FlexCompressor::get_all(arr.m_integer_compressor, begin, end);
+}
+
+void IntegerCompressor::get_chunk_packed(const Array& arr, size_t ndx, int64_t res[8])
+{
+    PackedCompressor::get_chunk(arr.m_integer_compressor, ndx, res);
+}
+
+void IntegerCompressor::get_chunk_flex(const Array& arr, size_t ndx, int64_t res[8])
+{
+    FlexCompressor::get_chunk(arr.m_integer_compressor, ndx, res);
+}
+
+void IntegerCompressor::set_packed(Array& arr, size_t ndx, int64_t val)
+{
+    PackedCompressor::set_direct(arr.m_integer_compressor, ndx, val);
+}
+
+void IntegerCompressor::set_flex(Array& arr, size_t ndx, int64_t val)
+{
+    FlexCompressor::set_direct(arr.m_integer_compressor, ndx, val);
+}
+
+template <typename Cond>
+bool IntegerCompressor::find_packed(const Array& arr, int64_t val, size_t begin, size_t end, size_t base_index,
+                                    QueryStateBase* st)
+{
+    return PackedCompressor::find_all<Cond>(arr, val, begin, end, base_index, st);
+}
+
+template <typename Cond>
+bool IntegerCompressor::find_flex(const Array& arr, int64_t val, size_t begin, size_t end, size_t base_index,
+                                  QueryStateBase* st)
+{
+    return FlexCompressor::find_all<Cond>(arr, val, begin, end, base_index, st);
+}
+
+void IntegerCompressor::set_vtable(Array& arr)
+{
+    static const Array::VTable vtable_packed = {get_packed,
+                                                get_chunk_packed,
+                                                get_all_packed,
+                                                set_packed,
+                                                {
+                                                    find_packed<Equal>,
+                                                    find_packed<NotEqual>,
+                                                    find_packed<Greater>,
+                                                    find_packed<Less>,
+                                                }};
+    static const Array::VTable vtable_flex = {get_flex,
+                                              get_chunk_flex,
+                                              get_all_flex,
+                                              set_flex,
+                                              {
+                                                  find_flex<Equal>,
+                                                  find_flex<NotEqual>,
+                                                  find_flex<Greater>,
+                                                  find_flex<Less>,
+                                              }};
+    if (is_packed()) {
+        arr.m_vtable = &vtable_packed;
+    }
+    else {
+        arr.m_vtable = &vtable_flex;
+    }
+}
+
+int64_t IntegerCompressor::get(size_t ndx) const
+{
+    if (is_packed()) {
+        return PackedCompressor::get(*this, ndx);
+    }
+    else {
+        return FlexCompressor::get(*this, ndx);
+    }
+}
+
+void IntegerCompressor::compress_values(const Array& arr, std::vector<int64_t>& values,
+                                        std::vector<unsigned>& indices) const
+{
+    // The main idea is to compress the values in flex format. If Packed is better, it will be chosen by
+    // IntegerCompressor::compress. The algorithm is O(n log n); this gives us nice properties, but we could use
+    // an efficient hash table and try to boost performance during insertion, although leaf arrays are relatively
+    // small in general (256 entries). The two compression formats are packed and flex, and the data in the array
+    // is re-arranged in the following ways (if compressed):
+    // Packed: || node header || ..... values ..... ||
+    // Flex:   || node header || ..... values ..... || ..... indices .....
|| + + const auto sz = arr.size(); + REALM_ASSERT_DEBUG(sz > 0); + values.reserve(sz); + indices.reserve(sz); + + for (size_t i = 0; i < sz; ++i) { + auto item = arr.get(i); + values.push_back(item); + } + + std::sort(values.begin(), values.end()); + auto last = std::unique(values.begin(), values.end()); + values.erase(last, values.end()); + + for (size_t i = 0; i < sz; ++i) { + auto pos = std::lower_bound(values.begin(), values.end(), arr.get(i)); + indices.push_back(unsigned(std::distance(values.begin(), pos))); + REALM_ASSERT_DEBUG(values[indices[i]] == arr.get(i)); + } +} diff --git a/src/realm/integer_compressor.hpp b/src/realm/integer_compressor.hpp new file mode 100644 index 00000000000..4e9023cfe18 --- /dev/null +++ b/src/realm/integer_compressor.hpp @@ -0,0 +1,202 @@ +/************************************************************************* + * + * Copyright 2023 Realm Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **************************************************************************/ + +#ifndef REALM_INTEGER_COMPRESSOR_HPP +#define REALM_INTEGER_COMPRESSOR_HPP + +#include +#include +#include +#include +#include +#include + +namespace realm { + +class Array; +class QueryStateBase; +class IntegerCompressor { +public: + // commit => encode, COW/insert => decode + bool compress(const Array&, Array&) const; + bool decompress(Array&) const; + + bool init(const char*); + void set_vtable(Array&); + + // init from mem B + inline uint64_t* data() const; + inline size_t size() const; + inline NodeHeader::Encoding get_encoding() const; + inline uint8_t v_width() const; + inline uint8_t ndx_width() const; + inline size_t v_size() const; + inline size_t ndx_size() const; + + inline uint64_t v_mask() const; + inline uint64_t ndx_mask() const; + inline uint64_t msb() const; + inline uint64_t ndx_msb() const; + inline uint64_t bitmask_v() const; + inline uint64_t bitmask_ndx() const; + + int64_t get(size_t) const; + +private: + // getting and setting interface specifically for encoding formats + inline void init_packed(const char*); + inline void init_flex(const char*); + + static int64_t get_packed(const Array& arr, size_t ndx); + static int64_t get_flex(const Array& arr, size_t ndx); + + static std::vector get_all_packed(const Array& arr, size_t begin, size_t end); + static std::vector get_all_flex(const Array& arr, size_t begin, size_t end); + + static void get_chunk_packed(const Array& arr, size_t ndx, int64_t res[8]); + static void get_chunk_flex(const Array& arr, size_t ndx, int64_t res[8]); + static void set_packed(Array& arr, size_t ndx, int64_t val); + static void set_flex(Array& arr, size_t ndx, int64_t val); + // query interface + template + static bool find_packed(const Array& arr, int64_t val, size_t begin, size_t end, size_t base_index, + QueryStateBase* st); + template + static bool find_flex(const Array& arr, int64_t val, size_t begin, size_t end, size_t base_index, + QueryStateBase* st); + + // internal impl + void compress_values(const Array&, 
std::vector&, std::vector&) const; + inline bool is_packed() const; + inline bool is_flex() const; + + // for testing + bool always_compress(const Array&, Array&, Node::Encoding) const; + +private: + using Encoding = NodeHeader::Encoding; + Encoding m_encoding{NodeHeader::Encoding::WTypBits}; + uint64_t* m_data; + uint8_t m_v_width = 0, m_ndx_width = 0; + size_t m_v_size = 0, m_ndx_size = 0; +}; + +inline void IntegerCompressor::init_packed(const char* h) +{ + m_data = (uint64_t*)NodeHeader::get_data_from_header(h); + m_v_width = NodeHeader::get_element_size(h, Encoding::Packed); + m_v_size = NodeHeader::get_num_elements(h, Encoding::Packed); +} + +inline void IntegerCompressor::init_flex(const char* h) +{ + m_data = (uint64_t*)NodeHeader::get_data_from_header(h); + m_v_width = NodeHeader::get_elementA_size(h); + m_v_size = NodeHeader::get_arrayA_num_elements(h); + m_ndx_width = NodeHeader::get_elementB_size(h); + m_ndx_size = NodeHeader::get_arrayB_num_elements(h); +} + +inline uint64_t* IntegerCompressor::data() const +{ + return m_data; +} + +inline bool IntegerCompressor::is_packed() const +{ + return m_encoding == NodeHeader::Encoding::Packed; +} + +inline bool IntegerCompressor::is_flex() const +{ + return m_encoding == NodeHeader::Encoding::Flex; +} + +inline size_t IntegerCompressor::size() const +{ + REALM_ASSERT_DEBUG(is_packed() || is_flex()); + return m_encoding == NodeHeader::Encoding::Packed ? v_size() : ndx_size(); +} + +inline size_t IntegerCompressor::v_size() const +{ + REALM_ASSERT_DEBUG(is_packed() || is_flex()); + return m_v_size; +} + +inline size_t IntegerCompressor::ndx_size() const +{ + REALM_ASSERT_DEBUG(is_flex()); + return m_ndx_size; +} + +inline uint8_t IntegerCompressor::v_width() const +{ + REALM_ASSERT_DEBUG(is_packed() || is_flex()); + return m_v_width; +} + +inline uint8_t IntegerCompressor::ndx_width() const +{ + REALM_ASSERT_DEBUG(is_flex()); + return m_ndx_width; +} + +inline NodeHeader::Encoding IntegerCompressor::get_encoding() const +{ + return m_encoding; +} + +inline uint64_t IntegerCompressor::v_mask() const +{ + REALM_ASSERT_DEBUG(is_packed() || is_flex()); + return 1ULL << (m_v_width - 1); +} + +inline uint64_t IntegerCompressor::ndx_mask() const +{ + REALM_ASSERT_DEBUG(is_flex()); + return 1ULL << (m_ndx_width - 1); +} + +inline uint64_t IntegerCompressor::msb() const +{ + REALM_ASSERT_DEBUG(is_packed() || is_flex()); + return populate(m_v_width, v_mask()); +} + +inline uint64_t IntegerCompressor::ndx_msb() const +{ + REALM_ASSERT_DEBUG(is_flex()); + return populate(m_ndx_width, ndx_mask()); +} + +inline uint64_t IntegerCompressor::bitmask_v() const +{ + REALM_ASSERT_DEBUG(is_packed() || is_flex()); + return 0xFFFFFFFFFFFFFFFFULL >> (64 - m_v_width); +} + +inline uint64_t IntegerCompressor::bitmask_ndx() const +{ + REALM_ASSERT_DEBUG(is_flex()); + return 0xFFFFFFFFFFFFFFFFULL >> (64 - m_ndx_width); +} + +} // namespace realm +#endif // REALM_INTEGER_COMPRESSOR_HPP diff --git a/src/realm/integer_flex_compressor.cpp b/src/realm/integer_flex_compressor.cpp new file mode 100644 index 00000000000..ef5e3b2fe6f --- /dev/null +++ b/src/realm/integer_flex_compressor.cpp @@ -0,0 +1,79 @@ +/************************************************************************* + * + * Copyright 2023 Realm Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **************************************************************************/ + +#include +#include +#include + +#include +#include + +#ifdef REALM_DEBUG +#include +#include +#endif + +using namespace realm; + +void FlexCompressor::init_header(char* h, uint8_t flags, uint8_t v_width, uint8_t ndx_width, size_t v_size, + size_t ndx_size) +{ + using Encoding = NodeHeader::Encoding; + ::init_header(h, Encoding::Flex, flags, v_width, ndx_width, v_size, ndx_size); +} + +void FlexCompressor::copy_data(const Array& arr, const std::vector& values, + const std::vector& indices) +{ + using Encoding = NodeHeader::Encoding; + REALM_ASSERT_DEBUG(arr.is_attached()); + const auto& compressor = arr.integer_compressor(); + REALM_ASSERT_DEBUG(compressor.get_encoding() == Encoding::Flex); + const auto v_width = compressor.v_width(); + const auto ndx_width = compressor.ndx_width(); + const auto v_size = values.size(); + const auto data = (uint64_t*)arr.m_data; + const auto offset = static_cast(v_size * v_width); + BfIterator it_value{data, 0, v_width, v_width, 0}; + BfIterator it_index{data, offset, ndx_width, ndx_width, 0}; + for (size_t i = 0; i < v_size; ++i) { + it_value.set_value(values[i]); + REALM_ASSERT_DEBUG(sign_extend_value(v_width, it_value.get_value()) == values[i]); + ++it_value; + } + for (size_t i = 0; i < indices.size(); ++i) { + REALM_ASSERT_DEBUG(values[indices[i]] == + sign_extend_value(v_width, read_bitfield(data, indices[i] * v_width, v_width))); + it_index.set_value(indices[i]); + REALM_ASSERT_DEBUG(indices[i] == it_index.get_value()); + REALM_ASSERT_DEBUG(values[indices[i]] == + sign_extend_value(v_width, read_bitfield(data, indices[i] * v_width, v_width))); + ++it_index; + } +} + +bool FlexCompressor::find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) +{ + REALM_ASSERT_DEBUG(state->match_count() < state->limit()); + const auto process = state->limit() - state->match_count(); + const auto end2 = end - start > process ? start + process : end; + for (; start < end2; start++) + if (!state->match(start + baseindex)) + return false; + return true; +} diff --git a/src/realm/integer_flex_compressor.hpp b/src/realm/integer_flex_compressor.hpp new file mode 100644 index 00000000000..a7338978af8 --- /dev/null +++ b/src/realm/integer_flex_compressor.hpp @@ -0,0 +1,305 @@ +/************************************************************************* + * + * Copyright 2023 Realm Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + **************************************************************************/ + +#ifndef FLEX_COMPRESSOR_HPP +#define FLEX_COMPRESSOR_HPP + +#include + +#include +#include +#include + +namespace realm { + +// +// Compress array in Flex format +// Decompress array in WTypeBits formats +// +class FlexCompressor { +public: + // encoding/decoding + static void init_header(char*, uint8_t, uint8_t, uint8_t, size_t, size_t); + static void copy_data(const Array&, const std::vector&, const std::vector&); + // getters/setters + static int64_t get(const IntegerCompressor&, size_t); + static std::vector get_all(const IntegerCompressor&, size_t, size_t); + static void get_chunk(const IntegerCompressor&, size_t, int64_t[8]); + static void set_direct(const IntegerCompressor&, size_t, int64_t); + + template + static bool find_all(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*); + + static int64_t min(const IntegerCompressor&); + static int64_t max(const IntegerCompressor&); + +private: + static bool find_all_match(size_t, size_t, size_t, QueryStateBase*); + + template + static bool find_linear(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*); + + template + static bool find_parallel(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*); + + template + static bool do_find_all(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*); + + template + static bool run_parallel_subscan(size_t, size_t, size_t); +}; + +inline int64_t FlexCompressor::get(const IntegerCompressor& c, size_t ndx) +{ + const auto offset = c.v_width() * c.v_size(); + const auto ndx_w = c.ndx_width(); + const auto v_w = c.v_width(); + const auto data = c.data(); + BfIterator ndx_iterator{data, offset, ndx_w, ndx_w, ndx}; + BfIterator data_iterator{data, 0, v_w, v_w, static_cast(*ndx_iterator)}; + return sign_extend_field_by_mask(c.v_mask(), *data_iterator); +} + +inline std::vector FlexCompressor::get_all(const IntegerCompressor& c, size_t b, size_t e) +{ + const auto offset = c.v_width() * c.v_size(); + const auto ndx_w = c.ndx_width(); + const auto v_w = c.v_width(); + const auto data = c.data(); + const auto sign_mask = c.v_mask(); + const auto range = (e - b); + const auto starting_bit = offset + b * ndx_w; + const auto bit_per_it = num_bits_for_width(ndx_w); + const auto ndx_mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - ndx_w); + const auto values_per_word = num_fields_for_width(ndx_w); + + // this is very important, x4 faster pre-allocating the array + std::vector res; + res.reserve(range); + + UnalignedWordIter unaligned_ndx_iterator(data, starting_bit); + BfIterator data_iterator{data, 0, v_w, v_w, 0}; + auto remaining_bits = ndx_w * range; + while (remaining_bits >= bit_per_it) { + auto word = unaligned_ndx_iterator.consume(bit_per_it); + for (int i = 0; i < values_per_word; ++i) { + const auto index = word & ndx_mask; + data_iterator.move(static_cast(index)); + const auto sv = sign_extend_field_by_mask(sign_mask, *data_iterator); + res.push_back(sv); + word >>= ndx_w; + } + remaining_bits -= bit_per_it; + } + if (remaining_bits) { + auto last_word = unaligned_ndx_iterator.consume(remaining_bits); + while (remaining_bits) { + const auto index = last_word & ndx_mask; + data_iterator.move(static_cast(index)); + const auto sv = sign_extend_field_by_mask(sign_mask, *data_iterator); + res.push_back(sv); + remaining_bits -= ndx_w; + last_word >>= ndx_w; + } + } + return res; +} + +inline int64_t FlexCompressor::min(const IntegerCompressor& c) +{ + const auto v_w = c.v_width(); + const auto 
data = c.data();
+    const auto sign_mask = c.v_mask();
+    BfIterator data_iterator{data, 0, v_w, v_w, 0};
+    return sign_extend_field_by_mask(sign_mask, *data_iterator);
+}
+
+inline int64_t FlexCompressor::max(const IntegerCompressor& c)
+{
+    const auto v_w = c.v_width();
+    const auto data = c.data();
+    const auto sign_mask = c.v_mask();
+    BfIterator data_iterator{data, 0, v_w, v_w, c.v_size() - 1};
+    return sign_extend_field_by_mask(sign_mask, *data_iterator);
+}
+
+inline void FlexCompressor::get_chunk(const IntegerCompressor& c, size_t ndx, int64_t res[8])
+{
+    auto sz = 8;
+    std::memset(res, 0, sizeof(int64_t) * sz);
+    auto supposed_end = ndx + sz;
+    size_t i = ndx;
+    size_t index = 0;
+    for (; i < supposed_end; ++i) {
+        res[index++] = get(c, i);
+    }
+    for (; index < 8; ++index) {
+        res[index++] = get(c, i++);
+    }
+}
+
+inline void FlexCompressor::set_direct(const IntegerCompressor& c, size_t ndx, int64_t value)
+{
+    const auto offset = c.v_width() * c.v_size();
+    const auto ndx_w = c.ndx_width();
+    const auto v_w = c.v_width();
+    const auto data = c.data();
+    BfIterator ndx_iterator{data, offset, ndx_w, ndx_w, ndx};
+    BfIterator data_iterator{data, 0, v_w, v_w, static_cast<size_t>(*ndx_iterator)};
+    data_iterator.set_value(value);
+}
+
+template <typename Cond>
+inline bool FlexCompressor::find_all(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                                     QueryStateBase* state)
+{
+    REALM_ASSERT_DEBUG(start <= arr.m_size && (end <= arr.m_size || end == size_t(-1)) && start <= end);
+    Cond c;
+
+    if (end == npos)
+        end = arr.m_size;
+
+    if (!(arr.m_size > start && start < end))
+        return true;
+
+    const auto lbound = arr.m_lbound;
+    const auto ubound = arr.m_ubound;
+
+    if (!c.can_match(value, lbound, ubound))
+        return true;
+
+    if (c.will_match(value, lbound, ubound)) {
+        return find_all_match(start, end, baseindex, state);
+    }
+
+    REALM_ASSERT_DEBUG(arr.m_width != 0);
+
+    if constexpr (std::is_same_v<Cond, Equal>) {
+        return do_find_all<Cond, Equal, Equal>(arr, value, start, end, baseindex, state);
+    }
+    else if constexpr (std::is_same_v<Cond, NotEqual>) {
+        return do_find_all<Cond, Equal, NotEqual>(arr, value, start, end, baseindex, state);
+    }
+    else if constexpr (std::is_same_v<Cond, Less>) {
+        return do_find_all<Cond, GreaterEqual, Less>(arr, value, start, end, baseindex, state);
+    }
+    else if constexpr (std::is_same_v<Cond, Greater>) {
+        return do_find_all<Cond, Greater, GreaterEqual>(arr, value, start, end, baseindex, state);
+    }
+    return true;
+}
+
+template <typename Cond, typename VectorCond1, typename VectorCond2>
+inline bool FlexCompressor::do_find_all(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                                        QueryStateBase* state)
+{
+    const auto v_width = arr.m_width;
+    const auto v_range = arr.integer_compressor().v_size();
+    const auto ndx_range = end - start;
+    if (!run_parallel_subscan<Cond>(v_width, v_range, ndx_range))
+        return find_linear<Cond>(arr, value, start, end, baseindex, state);
+    return find_parallel<VectorCond1, VectorCond2>(arr, value, start, end, baseindex, state);
+}
+
+template <typename Cond>
+inline bool FlexCompressor::find_linear(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                                        QueryStateBase* state)
+{
+    const auto cmp = [](int64_t item, int64_t key) {
+        if constexpr (std::is_same_v<Cond, Equal>)
+            return item == key;
+        if constexpr (std::is_same_v<Cond, NotEqual>)
+            return item != key;
+        if constexpr (std::is_same_v<Cond, Less>)
+            return item < key;
+        if constexpr (std::is_same_v<Cond, Greater>)
+            return item > key;
+        REALM_UNREACHABLE();
+    };
+
+    const auto& c = arr.integer_compressor();
+    const auto offset = c.v_width() * c.v_size();
+    const auto ndx_w = c.ndx_width();
+    const auto v_w = c.v_width();
+    const auto data = c.data();
+    const auto mask = c.v_mask();
+    BfIterator ndx_iterator{data, offset, ndx_w, ndx_w, start};
+    BfIterator data_iterator{data, 0, v_w, v_w, static_cast<size_t>(*ndx_iterator)};
+    while (start < end) {
+        const auto sv = sign_extend_field_by_mask(mask, *data_iterator);
+        if (cmp(sv, value) && !state->match(start + baseindex))
+            return false;
+        ndx_iterator.move(++start);
+        data_iterator.move(static_cast<size_t>(*ndx_iterator));
+    }
+    return true;
+}
+
+template <typename VectorCond1, typename VectorCond2>
+inline bool FlexCompressor::find_parallel(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                                          QueryStateBase* state)
+{
+    //
+    // algorithm idea: first try to find in the array of values (should be shorter in size but more bits) using
+    // VectorCond1.
+    // Then match the index found in the array of indices using VectorCond2
+    //
+
+    const auto& compressor = arr.integer_compressor();
+    const auto v_width = compressor.v_width();
+    const auto v_size = compressor.v_size();
+    const auto ndx_width = compressor.ndx_width();
+    const auto offset = v_size * v_width;
+    uint64_t* data = (uint64_t*)arr.m_data;
+
+    auto MSBs = compressor.msb();
+    auto search_vector = populate(v_width, value);
+    auto v_start =
+        parallel_subword_find(find_all_fields<VectorCond1>, data, 0, v_width, MSBs, search_vector, 0, v_size);
+
+    if constexpr (!std::is_same_v<VectorCond2, NotEqual>) {
+        if (v_start == v_size)
+            return true;
+    }
+
+    MSBs = compressor.ndx_msb();
+    search_vector = populate(ndx_width, v_start);
+    while (start < end) {
+        start = parallel_subword_find(find_all_fields_unsigned<VectorCond2>, data, offset, ndx_width, MSBs,
+                                      search_vector, start, end);
+
+        if (start < end && !state->match(start + baseindex))
+            return false;
+
+        ++start;
+    }
+    return true;
+}
+
+template <typename Cond>
+inline bool FlexCompressor::run_parallel_subscan(size_t v_width, size_t v_range, size_t ndx_range)
+{
+    if constexpr (std::is_same_v<Cond, Equal> || std::is_same_v<Cond, NotEqual>) {
+        return v_width < 32 && v_range >= 20 && ndx_range >= 20;
+    }
+    // > and < look slower in a parallel scan for large values
+    return v_width <= 16 && v_range >= 20 && ndx_range >= 20;
+}
+
+} // namespace realm
+#endif // FLEX_COMPRESSOR_HPP
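To make the two-phase find above concrete: a Flex array stores a sorted, deduplicated value table followed by one small index per element, and a query first resolves the key against the value table, then scans only the narrow indices. A minimal sketch of the same idea on plain vectors (illustrative only, no bit packing, names are mine, not from the patch):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Toy "flex" image of the sequence {10, 20, 10, 30, 20}:
    struct Flex {
        std::vector<int64_t> values = {10, 20, 30};    // sorted, deduplicated
        std::vector<size_t> indices = {0, 1, 0, 2, 1}; // one entry per element
    };

    // Phase 1: locate the key among the distinct values; phase 2: every
    // element whose index equals that position matches (the Equal case).
    inline std::vector<size_t> find_all_equal(const Flex& f, int64_t key)
    {
        std::vector<size_t> matches;
        auto it = std::lower_bound(f.values.begin(), f.values.end(), key);
        if (it == f.values.end() || *it != key)
            return matches; // key not present: nothing can match
        const size_t pos = size_t(it - f.values.begin());
        for (size_t i = 0; i < f.indices.size(); ++i)
            if (f.indices[i] == pos)
                matches.push_back(i);
        return matches; // for key 20: {1, 4}
    }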
diff --git a/src/realm/integer_packed_compressor.cpp b/src/realm/integer_packed_compressor.cpp
new file mode 100644
index 00000000000..2f7646b1b0c
--- /dev/null
+++ b/src/realm/integer_packed_compressor.cpp
@@ -0,0 +1,68 @@
+/*************************************************************************
+ *
+ * Copyright 2023 Realm Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **************************************************************************/
+
+#include <realm/integer_packed_compressor.hpp>
+#include <realm/array.hpp>
+#include <realm/array_direct.hpp>
+#include <realm/integer_compressor.hpp>
+#include <realm/node_header.hpp>
+
+#include <cstring>
+#include <vector>
+
+#ifdef REALM_DEBUG
+#include <iostream>
+#include <sstream>
+#endif
+
+using namespace realm;
+
+void PackedCompressor::init_header(char* h, uint8_t flags, uint8_t v_width, size_t v_size)
+{
+    using Encoding = NodeHeader::Encoding;
+    ::init_header((char*)h, Encoding::Packed, flags, static_cast<uint8_t>(v_width), v_size);
+}
+
+void PackedCompressor::copy_data(const Array& origin, Array& arr)
+{
+    // this can be boosted a little bit: width and size should be known at this stage.
+    using Encoding = NodeHeader::Encoding;
+    REALM_ASSERT_DEBUG(arr.is_attached());
+    REALM_ASSERT_DEBUG(arr.integer_compressor().get_encoding() == Encoding::Packed);
+    // we don't need to access the header, init from mem must have been called
+    const auto v_width = arr.m_width;
+    const auto v_size = arr.m_size;
+    auto data = (uint64_t*)arr.m_data;
+    BfIterator it_value{data, 0, v_width, v_width, 0};
+    for (size_t i = 0; i < v_size; ++i) {
+        it_value.set_value(origin.get(i));
+        REALM_ASSERT_DEBUG(sign_extend_value(v_width, it_value.get_value()) == origin.get(i));
+        ++it_value;
+    }
+}
+
+bool PackedCompressor::find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state)
+{
+    REALM_ASSERT_DEBUG(state->match_count() < state->limit());
+    const auto process = state->limit() - state->match_count();
+    const auto end2 = end - start > process ? start + process : end;
+    for (; start < end2; start++)
+        if (!state->match(start + baseindex))
+            return false;
+    return true;
+}
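The Packed payload written by copy_data above is simply m_size sign-extended fields of m_width bits laid out back to back in 64-bit words. A self-contained sketch of that round trip, with hypothetical helpers standing in for BfIterator and sign_extend_value:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Write the w-bit field at position ndx (fields may straddle word boundaries).
    inline void put_field(uint64_t* data, unsigned w, size_t ndx, int64_t v)
    {
        const size_t bit = ndx * w;
        const uint64_t mask = (w == 64) ? ~0ULL : ((1ULL << w) - 1);
        data[bit / 64] &= ~(mask << (bit % 64));
        data[bit / 64] |= (uint64_t(v) & mask) << (bit % 64);
        if (bit % 64 + w > 64) { // spill into the next word
            const unsigned lo = unsigned(64 - bit % 64);
            data[bit / 64 + 1] &= ~(mask >> lo);
            data[bit / 64 + 1] |= (uint64_t(v) & mask) >> lo;
        }
    }

    // Read it back, sign-extending from bit w-1 (what sign_extend_value does).
    inline int64_t get_field(const uint64_t* data, unsigned w, size_t ndx)
    {
        const size_t bit = ndx * w;
        uint64_t raw = data[bit / 64] >> (bit % 64);
        if (bit % 64 + w > 64)
            raw |= data[bit / 64 + 1] << (64 - bit % 64);
        raw &= (w == 64) ? ~0ULL : ((1ULL << w) - 1);
        const uint64_t sign = 1ULL << (w - 1);
        return int64_t(raw ^ sign) - int64_t(sign);
    }

    int main()
    {
        uint64_t buf[2];
        std::memset(buf, 0, sizeof buf);
        put_field(buf, 7, 9, -42); // field 9 of width 7 straddles buf[0]/buf[1]
        assert(get_field(buf, 7, 9) == -42);
    }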
diff --git a/src/realm/integer_packed_compressor.hpp b/src/realm/integer_packed_compressor.hpp
new file mode 100644
index 00000000000..91d94fc5eab
--- /dev/null
+++ b/src/realm/integer_packed_compressor.hpp
@@ -0,0 +1,229 @@
+/*************************************************************************
+ *
+ * Copyright 2024 Realm Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **************************************************************************/
+
+#ifndef PACKED_COMPRESSOR_HPP
+#define PACKED_COMPRESSOR_HPP
+
+#include <realm/array.hpp>
+#include <realm/array_direct.hpp>
+
+#include <cstring>
+#include <vector>
+
+namespace realm {
+
+//
+// Compress array in Packed format
+// Decompress array in WTypeBits formats
+//
+class PackedCompressor {
+public:
+    // encoding/decoding
+    static void init_header(char*, uint8_t, uint8_t, size_t);
+    static void copy_data(const Array&, Array&);
+    // get or set
+    static int64_t get(const IntegerCompressor&, size_t);
+    static std::vector<int64_t> get_all(const IntegerCompressor& c, size_t b, size_t e);
+    static void get_chunk(const IntegerCompressor&, size_t, int64_t res[8]);
+    static void set_direct(const IntegerCompressor&, size_t, int64_t);
+
+    template <typename Cond>
+    static bool find_all(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*);
+
+private:
+    static bool find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state);
+
+    template <typename Cond>
+    static bool find_parallel(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*);
+
+    template <typename Cond>
+    static bool find_linear(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*);
+
+    template <typename Cond>
+    static bool run_parallel_scan(size_t, size_t);
+};
+
+inline int64_t PackedCompressor::get(const IntegerCompressor& c, size_t ndx)
+{
+    BfIterator it{c.data(), 0, c.v_width(), c.v_width(), ndx};
+    return sign_extend_field_by_mask(c.v_mask(), *it);
+}
+
+inline std::vector<int64_t> PackedCompressor::get_all(const IntegerCompressor& c, size_t b, size_t e)
+{
+    const auto range = (e - b);
+    const auto v_w = c.v_width();
+    const auto data = c.data();
+    const auto sign_mask = c.v_mask();
+    const auto starting_bit = b * v_w;
+    const auto total_bits = starting_bit + (v_w * range);
+    const auto mask = 0xFFFFFFFFFFFFFFFFULL >> (64 - v_w);
+    const auto bit_per_it = num_bits_for_width(v_w);
+    const auto values_per_word = num_fields_for_width(v_w);
+
+    std::vector<int64_t> res;
+    res.reserve(range);
+
+    UnalignedWordIter unaligned_data_iterator(data, starting_bit);
+    auto cnt_bits = starting_bit;
+    while (cnt_bits + bit_per_it < total_bits) {
+        auto word = unaligned_data_iterator.consume(bit_per_it);
+        for (int i = 0; i < values_per_word; ++i) {
+            res.push_back(sign_extend_field_by_mask(sign_mask, word & mask));
+            word >>= v_w;
+        }
+        cnt_bits += bit_per_it;
+    }
+    if (cnt_bits < total_bits) {
+        auto last_word = unaligned_data_iterator.consume(static_cast<size_t>(total_bits - cnt_bits));
+        while (cnt_bits < total_bits) {
+            res.push_back(sign_extend_field_by_mask(sign_mask, last_word & mask));
+            cnt_bits += v_w;
+            last_word >>= v_w;
+        }
+    }
+    return res;
+}
+
+inline void PackedCompressor::set_direct(const IntegerCompressor& c, size_t ndx, int64_t value)
+{
+    BfIterator it{c.data(), 0, c.v_width(), c.v_width(), ndx};
+    it.set_value(value);
+}
+
+inline void PackedCompressor::get_chunk(const IntegerCompressor& c, size_t ndx, int64_t res[8])
+{
+    auto sz = 8;
+    std::memset(res, 0, sizeof(int64_t) * sz);
+    auto supposed_end = ndx + sz;
+    size_t i = ndx;
+    size_t index = 0;
+    // this can be done better, in one go, retrieve both!!!
+    for (; i < supposed_end; ++i) {
+        res[index++] = get(c, i);
+    }
+    for (; index < 8; ++index) {
+        res[index++] = get(c, i++);
+    }
+}
+
+
+template <typename Cond>
+inline bool PackedCompressor::find_all(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                                       QueryStateBase* state)
+{
+    REALM_ASSERT_DEBUG(start <= arr.m_size && (end <= arr.m_size || end == size_t(-1)) && start <= end);
+    Cond c;
+
+    if (end == npos)
+        end = arr.m_size;
+
+    if (!(arr.m_size > start && start < end))
+        return true;
+
+    const auto lbound = arr.m_lbound;
+    const auto ubound = arr.m_ubound;
+
+    if (!c.can_match(value, lbound, ubound))
+        return true;
+
+    if (c.will_match(value, lbound, ubound)) {
+        return find_all_match(start, end, baseindex, state);
+    }
+
+    REALM_ASSERT_DEBUG(arr.m_width != 0);
+
+    if (!run_parallel_scan<Cond>(arr.m_width, end - start))
+        return find_linear<Cond>(arr, value, start, end, baseindex, state);
+
+    return find_parallel<Cond>(arr, value, start, end, baseindex, state);
+}
+
+template <typename Cond>
+inline bool PackedCompressor::find_parallel(const Array& arr, int64_t value, size_t start, size_t end,
+                                            size_t baseindex, QueryStateBase* state)
+{
+    //
+    // Main idea around find parallel (applicable to flex arrays too).
+    // Try to find the starting point where the condition can be met, comparing as many values as a single 64bit can
+    // contain in parallel. Once we have found the starting point, keep matching values as much as we can between
+    // start and end.
+    //
+    // EG: let's store 6, it gets stored in 4 bits (0110). 6 is 4 bits because 110 (6) + sign bit 0.
+    // Inside 64bits we can fit max 16 times 6. If we go from index 0 to 15 throughout the same 64 bits, we need to
+    // apply a mask and a shift bits every time, then compare the extracted values.
+    // This is not the cheapest thing to do. Instead we can compare all values contained within 64 bits in one go and
+    // see if there is a match with what we are looking for, cutting the number of comparisons by a factor of roughly
+    // 64/K, where K is the width of each single value within a 64 bit word.
+
+    const auto data = (const uint64_t*)arr.m_data;
+    const auto width = arr.m_width;
+    const auto MSBs = arr.integer_compressor().msb();
+    const auto search_vector = populate(arr.m_width, value);
+    while (start < end) {
+        start = parallel_subword_find(find_all_fields<Cond>, data, 0, width, MSBs, search_vector, start, end);
+        if (start < end && !state->match(start + baseindex))
+            return false;
+        ++start;
+    }
+    return true;
+}
+
+template <typename Cond>
+inline bool PackedCompressor::find_linear(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex,
+                                          QueryStateBase* state)
+{
+    auto compare = [](int64_t a, int64_t b) {
+        if constexpr (std::is_same_v<Cond, Equal>)
+            return a == b;
+        if constexpr (std::is_same_v<Cond, NotEqual>)
+            return a != b;
+        if constexpr (std::is_same_v<Cond, Greater>)
+            return a > b;
+        if constexpr (std::is_same_v<Cond, Less>)
+            return a < b;
+    };
+    const auto& c = arr.integer_compressor();
+    BfIterator it{c.data(), 0, c.v_width(), c.v_width(), start};
+    for (; start < end; ++start) {
+        it.move(start);
+        const auto sv = sign_extend_field_by_mask(c.v_mask(), *it);
+        if (compare(sv, value) && !state->match(start + baseindex))
+            return false;
+    }
+    return true;
+}
+
+template <typename Cond>
+inline bool PackedCompressor::run_parallel_scan(size_t width, size_t range)
+{
+    if constexpr (std::is_same_v<Cond, NotEqual>) {
+        // we seem to be particularly slow doing parallel scan in packed for NotEqual.
+        // we are much better with a linear scan. TODO: investigate this.
+        return false;
+    }
+    if constexpr (std::is_same_v<Cond, Equal>) {
+        return width < 32 && range >= 20;
+    }
+    // > and < need a different heuristic
+    return width <= 20 && range >= 20;
+}
+
+} // namespace realm
+
+#endif // PACKED_COMPRESSOR_HPP
diff --git a/src/realm/node.cpp b/src/realm/node.cpp
index f23cff4316b..63ef4d3962c 100644
--- a/src/realm/node.cpp
+++ b/src/realm/node.cpp
@@ -26,7 +26,8 @@
 using namespace realm;
 
-MemRef Node::create_node(size_t size, Allocator& alloc, bool context_flag, Type type, WidthType width_type, int width)
+MemRef Node::create_node(size_t size, Allocator& alloc, bool context_flag, Type type, WidthType width_type,
+                         uint8_t width)
 {
     size_t byte_size_0 = calc_byte_size(width_type, size, width);
     size_t byte_size = std::max(byte_size_0, size_t(initial_capacity));
@@ -81,9 +82,9 @@ size_t Node::calc_item_count(size_t bytes, size_t width) const noexcept
 
 void Node::alloc(size_t init_size, size_t new_width)
 {
-    REALM_ASSERT(is_attached());
+    REALM_ASSERT_DEBUG(is_attached());
     char* header = get_header_from_data(m_data);
-    REALM_ASSERT(!wtype_is_extended(header));
+    REALM_ASSERT_DEBUG(!wtype_is_extended(header));
     size_t needed_bytes = calc_byte_len(init_size, new_width);
     // this method is not public and callers must (and currently do) ensure that
     // needed_bytes are never larger than max_array_payload.
@@ -132,7 +133,7 @@ void Node::alloc(size_t init_size, size_t new_width)
     }
     // update width (important when we convert from normal uncompressed array into compressed format)
     if (new_width != orig_width) {
-        set_width_in_header(int(new_width), header);
+        set_width_in_header(new_width, header);
     }
     set_size_in_header(init_size, header);
     m_size = init_size;
diff --git a/src/realm/node.hpp b/src/realm/node.hpp
index 5cb637ab7d1..8a4b862a701 100644
--- a/src/realm/node.hpp
+++ b/src/realm/node.hpp
@@ -323,7 +323,7 @@ class Node : public NodeHeader {
     }
 
     static MemRef create_node(size_t size, Allocator& alloc, bool context_flag = false, Type type = type_Normal,
-                              WidthType width_type = wtype_Ignore, int width = 1);
+                              WidthType width_type = wtype_Ignore, uint8_t width = 1);
 
     void set_header_size(size_t value) noexcept
     {
diff --git a/src/realm/node_header.hpp b/src/realm/node_header.hpp
index 2ffe073b721..ca7d5638025 100644
--- a/src/realm/node_header.hpp
+++ b/src/realm/node_header.hpp
@@ -205,7 +205,7 @@ class NodeHeader {
         h[4] = h4;
     }
 
-    static size_t unsigned_to_num_bits(uint64_t value)
+    static uint8_t unsigned_to_num_bits(uint64_t value)
     {
         if constexpr (sizeof(size_t) == sizeof(uint64_t))
             return 1 + log2(static_cast<size_t>(value));
@@ -218,7 +218,7 @@ class NodeHeader {
         return 0;
     }
 
-    static inline size_t signed_to_num_bits(int64_t value)
+    static inline uint8_t signed_to_num_bits(int64_t value)
     {
         if (value >= 0)
             return 1 + unsigned_to_num_bits(value);
@@ -292,7 +292,6 @@ class NodeHeader {
             (reinterpret_cast<uint8_t*>(header))[0] = static_cast<uint8_t>(value >> 3);
         }
     }
-    static size_t get_byte_size_from_header(const char* header) noexcept;
 
     // ^ First 3 must overlap numerically with corresponding wtype_X enum.
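The two width helpers changed above return the minimum number of bits needed to store a value. A reference model of their semantics (hypothetical names; the expected values match the Array_Bits test added further down):

    #include <cstdint>

    // An unsigned value needs floor(log2(v)) + 1 bits; a signed value needs
    // one extra sign bit, with negative values measured on ~v.
    inline uint8_t unsigned_bits_ref(uint64_t v)
    {
        uint8_t n = 0;
        while (v) {
            ++n;
            v >>= 1;
        }
        return n; // 0 -> 0, 1 -> 1, 2..3 -> 2, 4..7 -> 3, 8 -> 4
    }

    inline uint8_t signed_bits_ref(int64_t v)
    {
        return v >= 0 ? 1 + unsigned_bits_ref(uint64_t(v)) : 1 + unsigned_bits_ref(uint64_t(~v));
    }
    // signed_bits_ref(0) == 1, signed_bits_ref(-1) == 1,
    // signed_bits_ref(-4) == 3, signed_bits_ref(7) == 4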
@@ -343,17 +342,18 @@ class NodeHeader {
 
 private:
     friend class Node;
+    friend class IntegerCompressor;
     // Setting element size for encodings with a single element size:
-    static void inline set_element_size(char* header, size_t bits_per_element, Encoding);
+    static void inline set_element_size(char* header, uint8_t bits_per_element, Encoding);
     // Getting element size for encodings with a single element size:
-    static inline size_t get_element_size(const char* header, Encoding);
+    static inline uint8_t get_element_size(const char* header, Encoding);
     // Used only by flex at this stage.
     // Setting element sizes for encodings with two element sizes (called A and B)
-    static inline void set_elementA_size(char* header, size_t bits_per_element);
-    static inline void set_elementB_size(char* header, size_t bits_per_element);
+    static inline void set_elementA_size(char* header, uint8_t bits_per_element);
+    static inline void set_elementB_size(char* header, uint8_t bits_per_element);
     // Getting element sizes for encodings with two element sizes (called A and B)
-    static inline size_t get_elementA_size(const char* header);
-    static inline size_t get_elementB_size(const char* header);
+    static inline uint8_t get_elementA_size(const char* header);
+    static inline uint8_t get_elementB_size(const char* header);
     // Setting num of elements for encodings with two element sizes (called A and B)
     static inline void set_arrayA_num_elements(char* header, size_t num_elements);
     static inline void set_arrayB_num_elements(char* header, size_t num_elements);
@@ -366,9 +366,9 @@ class NodeHeader {
     static inline void set_num_elements(char* header, size_t num_elements, Encoding);
 
     static inline size_t calc_size(size_t num_elements);
-    static inline size_t calc_size(size_t num_elements, size_t element_size, Encoding);
-    static inline size_t calc_size(size_t arrayA_num_elements, size_t arrayB_num_elements, size_t elementA_size,
-                                   size_t elementB_size);
+    static inline size_t calc_size(size_t num_elements, uint8_t element_size, Encoding);
+    static inline size_t calc_size(size_t arrayA_num_elements, size_t arrayB_num_elements, uint8_t elementA_size,
+                                   uint8_t elementB_size);
 
     static size_t calc_byte_size(WidthType wtype, size_t size, uint_least8_t width) noexcept
     {
@@ -441,7 +441,7 @@ class NodeHeader {
     }
 };
 
-inline void NodeHeader::set_element_size(char* header, size_t bits_per_element, Encoding encoding)
+inline void NodeHeader::set_element_size(char* header, uint8_t bits_per_element, Encoding encoding)
 {
     switch (encoding) {
         case NodeHeader::Encoding::Packed: {
@@ -469,7 +469,7 @@ inline void NodeHeader::set_element_size(char* header, size_t bits_per_element,
     }
 }
 
-inline size_t NodeHeader::get_element_size(const char* header, Encoding encoding)
+inline uint8_t NodeHeader::get_element_size(const char* header, Encoding encoding)
 {
     switch (encoding) {
         case NodeHeader::Encoding::Packed: {
@@ -496,7 +496,7 @@ inline size_t NodeHeader::get_element_size(const char* header, Encoding encoding
     }
 }
 
-inline void NodeHeader::set_elementA_size(char* header, size_t bits_per_element)
+inline void NodeHeader::set_elementA_size(char* header, uint8_t bits_per_element)
 {
     // we're a bit low on bits for the Flex encoding, so we need to squeeze stuff
     REALM_ASSERT_DEBUG(get_encoding(header) == Encoding::Flex);
@@ -509,7 +509,7 @@ inline void NodeHeader::set_elementA_size(char* header, size_t bits_per_element)
     (reinterpret_cast<uint16_t*>(header))[1] = word;
 }
 
-inline void NodeHeader::set_elementB_size(char* header, size_t bits_per_element)
+inline void NodeHeader::set_elementB_size(char* header, uint8_t bits_per_element)
 {
     // we're a bit low on bits for the Flex encoding, so we need to squeeze stuff
     REALM_ASSERT_DEBUG(get_encoding(header) == Encoding::Flex);
@@ -522,7 +522,7 @@ inline void NodeHeader::set_elementB_size(char* header, size_t bits_per_element)
     (reinterpret_cast<uint16_t*>(header))[3] = word;
 }
 
-inline size_t NodeHeader::get_elementA_size(const char* header)
+inline uint8_t NodeHeader::get_elementA_size(const char* header)
 {
     const auto encoding = get_encoding(header);
     REALM_ASSERT_DEBUG(encoding == Encoding::Flex);
@@ -536,7 +536,7 @@ inline size_t NodeHeader::get_elementA_size(const char* header)
     return bits_per_element;
 }
 
-inline size_t NodeHeader::get_elementB_size(const char* header)
+inline uint8_t NodeHeader::get_elementB_size(const char* header)
 {
     REALM_ASSERT_DEBUG(get_encoding(header) == Encoding::Flex);
     uint16_t word = (reinterpret_cast<const uint16_t*>(header))[3];
@@ -643,7 +643,7 @@ inline size_t NodeHeader::calc_size(size_t num_elements)
     return calc_byte_size(wtype_Ignore, num_elements, 0);
 }
 
-inline size_t NodeHeader::calc_size(size_t num_elements, size_t element_size, Encoding encoding)
+inline size_t NodeHeader::calc_size(size_t num_elements, uint8_t element_size, Encoding encoding)
 {
     using Encoding = NodeHeader::Encoding;
     switch (encoding) {
@@ -660,8 +660,8 @@ inline size_t NodeHeader::calc_size(size_t num_elements, size_t element_size, En
     }
 }
 
-inline size_t NodeHeader::calc_size(size_t arrayA_num_elements, size_t arrayB_num_elements, size_t elementA_size,
-                                    size_t elementB_size)
+inline size_t NodeHeader::calc_size(size_t arrayA_num_elements, size_t arrayB_num_elements, uint8_t elementA_size,
+                                    uint8_t elementB_size)
 {
     return NodeHeader::header_size +
            align_bits_to8(arrayA_num_elements * elementA_size + arrayB_num_elements * elementB_size);
@@ -757,6 +757,7 @@ static inline void init_header(char* header, realm::NodeHeader::Encoding enc, ui
     REALM_ASSERT_DEBUG(num_elemsB < 1024);
     hw[1] = static_cast<uint16_t>(((bits_pr_elemA - 1) << 10) | num_elemsA);
     hw[3] = static_cast<uint16_t>(((bits_pr_elemB - 1) << 10) | num_elemsB);
+    REALM_ASSERT_DEBUG(realm::NodeHeader::get_encoding(header) == realm::NodeHeader::Encoding::Flex);
 }
 } // namespace
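The Flex header squeezes a 6-bit element width and a 10-bit element count into each of two 16-bit halfwords, which is what set_elementA_size/set_elementB_size and init_header above are doing. A standalone sketch of the packing (helper names are mine):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Width is biased by 1 so 1..64 fits in 6 bits; the count uses the low
    // 10 bits, hence the num_elems < 1024 assertions in init_header.
    inline uint16_t pack_flex(uint8_t bits_per_element, size_t num_elements)
    {
        assert(bits_per_element >= 1 && bits_per_element <= 64);
        assert(num_elements < 1024);
        return static_cast<uint16_t>(((bits_per_element - 1) << 10) | num_elements);
    }

    inline uint8_t unpack_width(uint16_t word)
    {
        return static_cast<uint8_t>((word >> 10) + 1);
    }

    inline size_t unpack_size(uint16_t word)
    {
        return word & 0x03FF;
    }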
diff --git a/src/realm/obj.cpp b/src/realm/obj.cpp
index 8a1267029b9..eb8138dd8f5 100644
--- a/src/realm/obj.cpp
+++ b/src/realm/obj.cpp
@@ -549,12 +549,9 @@ int64_t Obj::_get<int64_t>(ColKey::Idx col_ndx) const
     if (current_version != m_storage_version) {
         update();
     }
-
     ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1));
     char* header = alloc.translate(ref);
-    int width = Array::get_width_from_header(header);
-    char* data = Array::get_data_from_header(header);
-    REALM_TEMPEX(return get_direct, width, (data, m_row_ndx));
+    return Array::get(header, m_row_ndx);
 }
 
 template <>
diff --git a/src/realm/query_conditions.hpp b/src/realm/query_conditions.hpp
index cf3cf9e73d8..ea16fb4a736 100644
--- a/src/realm/query_conditions.hpp
+++ b/src/realm/query_conditions.hpp
@@ -1002,6 +1002,155 @@ struct GreaterEqual : public HackClass {
     static const int condition = -1;
 };
 
+/* Unsigned LT.
+
+   This can be determined by trial subtraction. However, some care must be exercised
+   since simply subtracting one vector from another will allow carries from one
+   bitfield to flow into the next one. To avoid this, we isolate bitfields by clamping
+   the MSBs to 1 in A and 0 in B before subtraction. After the subtraction the MSBs in
+   the result indicate borrows from the MSB. We then compute overflow (borrow OUT of MSB)
+   using boolean logic as described below.
+
+   Unsigned LT is also used to find all zero fields or all non-zero fields, so it is
+   the backbone of all comparisons returning vectors.
+ */
+
+// compute the overflows in unsigned trial subtraction A-B. The overflows
+// will be marked by 1 in the sign bit of each field in the result. Other
+// bits in the result are zero.
+// Overflows are detected for each field pair where A is less than B.
+inline uint64_t unsigned_LT_vector(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // 1. compute borrow from most significant bit
+    // Isolate bitfields inside A and B before subtraction (prevent carries from spilling over)
+    // do this by clamping most significant bit in A to 1, and msb in B to 0
+    auto A_isolated = A | MSBs;                              // 1 op
+    auto B_isolated = B & ~MSBs;                             // 2 ops
+    auto borrows_into_sign_bit = ~(A_isolated - B_isolated); // 2 ops (total latency 4)
+
+    // 2. determine what subtraction against most significant bit would give:
+    // A B borrow-in:  (A - B - borrow-in)
+    // 0 0 0           (0-0-0) = 0
+    // 0 0 1           (0-0-1) = 1 + borrow-out
+    // 0 1 0           (0-1-0) = 1 + borrow-out
+    // 0 1 1           (0-1-1) = 0 + borrow-out
+    // 1 0 0           (1-0-0) = 1
+    // 1 0 1           (1-0-1) = 0
+    // 1 1 0           (1-1-0) = 0
+    // 1 1 1           (1-1-1) = 1 + borrow-out
+    // borrow-out = (~A & B) | (~A & borrow-in) | (A & B & borrow-in)
+    // The overflows are simply the borrow-out, now encoded into the sign bits of each field.
+    auto overflows = (~A & B) | (~A & borrows_into_sign_bit) | (A & B & borrows_into_sign_bit);
+    // ^ 6 ops, total latency 6 (4+2)
+    return overflows & MSBs; // 1 op, total latency 7
+    // total of 12 ops and a latency of 7. On a beefy CPU 3-4 of those can run in parallel
+    // and still reach a combined latency of 10 or less.
+}
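A worked instance of the trial subtraction above, for two 8-bit lanes (a sketch, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const uint64_t MSBs = 0x8080; // MSB of each 8-bit lane (two lanes used here)
        // lane 0: A=3,  B=9  -> 3 < 9,   expect the lane's sign bit set
        // lane 1: A=12, B=5  -> 12 >= 5, expect the lane's sign bit clear
        const uint64_t A = (12 << 8) | 3;
        const uint64_t B = (5 << 8) | 9;
        auto A_isolated = A | MSBs;
        auto B_isolated = B & ~MSBs;
        auto borrows = ~(A_isolated - B_isolated);
        auto overflows = ((~A & B) | (~A & borrows) | (A & B & borrows)) & MSBs;
        std::printf("%llx\n", (unsigned long long)overflows); // prints 80: only lane 0 matched
        // Signed comparison reuses the same machinery after biasing both sides
        // with the sign bit (A ^ MSBs, B ^ MSBs), as done in the specializations below.
    }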
+template <typename Cond>
+uint64_t find_all_fields_unsigned(uint64_t MSBs, uint64_t A, uint64_t B);
+
+template <typename Cond>
+uint64_t find_all_fields(uint64_t MSBs, uint64_t A, uint64_t B);
+
+template <>
+inline uint64_t find_all_fields<NotEqual>(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // 0 != A^B, same as asking 0 - (A^B) overflows.
+    return unsigned_LT_vector(MSBs, 0, A ^ B);
+}
+
+template <>
+inline uint64_t find_all_fields<Equal>(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // get the fields which are NE and negate the result
+    auto all_fields_NE = find_all_fields<NotEqual>(MSBs, A, B);
+    auto all_fields_NE_negated = ~all_fields_NE;
+    // must filter the negated vector so only MSB are left.
+    return MSBs & all_fields_NE_negated;
+}
+
+template <>
+inline uint64_t find_all_fields_unsigned<Equal>(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    return find_all_fields<Equal>(MSBs, A, B);
+}
+
+template <>
+inline uint64_t find_all_fields_unsigned<NotEqual>(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    return find_all_fields<NotEqual>(MSBs, A, B);
+}
+
+template <>
+inline uint64_t find_all_fields_unsigned<Less>(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    return unsigned_LT_vector(MSBs, A, B);
+}
+
+template <>
+inline uint64_t find_all_fields_unsigned<LessEqual>(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // Now A <= B is the same as !(A > B) so...
+    // reverse A and B to turn (A>B) --> (B<A), then negate and keep only the MSBs
+    return ~unsigned_LT_vector(MSBs, B, A) & MSBs;
+}
+
+template <>
+inline uint64_t find_all_fields_unsigned<Greater>(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    return find_all_fields_unsigned<Less>(MSBs, B, A);
+}
+
+template <>
+inline uint64_t find_all_fields_unsigned<GreaterEqual>(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    return find_all_fields_unsigned<LessEqual>(MSBs, B, A);
+}
+
+/*
+   Handling signed values
+
+   Trial subtraction only works as-is for unsigned. We simply transform signed into unsigned
+   by pushing all values up by 1<<(field_width-1). This makes all negative values positive and positive
+   values remain positive, although larger. Any overflow during the push can be ignored.
+   After that transformation trial subtraction should correctly detect the LT condition.
+ */
+
+
+template <>
+inline uint64_t find_all_fields<Less>(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    auto sign_bits = MSBs;
+    return unsigned_LT_vector(MSBs, A ^ sign_bits, B ^ sign_bits);
+}
+
+template <>
+inline uint64_t find_all_fields<LessEqual>(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    auto sign_bits = MSBs;
+    return find_all_fields_unsigned<LessEqual>(MSBs, A ^ sign_bits, B ^ sign_bits);
+}
+
+template <>
+inline uint64_t find_all_fields<Greater>(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // A > B is the same as B < A
+    return find_all_fields<Less>(MSBs, B, A);
+}
+
+template <>
+inline uint64_t find_all_fields<GreaterEqual>(uint64_t MSBs, uint64_t A, uint64_t B)
+{
+    // A >= B is the same as B <= A
+    return find_all_fields<LessEqual>(MSBs, B, A);
+}
+
 } // namespace realm
 
 #endif // REALM_QUERY_CONDITIONS_HPP
diff --git a/src/realm/query_engine.hpp b/src/realm/query_engine.hpp
index 8b7ecf2d1e8..26a07377536 100644
--- a/src/realm/query_engine.hpp
+++ b/src/realm/query_engine.hpp
@@ -449,6 +449,7 @@ static size_t find_first_haystack(LeafType& leaf, NeedleContainer& needles, size
 {
     // for a small number of conditions, it is faster to do a linear search than to compute the hash
     // the exact thresholds were found experimentally
+
     if (needles.size() < linear_search_threshold) {
         for (size_t i = start; i < end; ++i) {
             auto element = leaf.get(i);
diff --git a/src/realm/query_state.hpp b/src/realm/query_state.hpp
index b2812276539..ac0480d7166 100644
--- a/src/realm/query_state.hpp
+++ b/src/realm/query_state.hpp
@@ -22,8 +22,6 @@
 #include <cstddef> // size_t
 #include <cstdint> // uint8_t etc
 
-#include
-
 namespace realm {
 
 enum Action { act_ReturnFirst, act_Sum, act_Max, act_Min, act_Count, act_FindAll, act_Average };
@@ -34,6 +32,7 @@ enum { cond_Equal, cond_NotEqual, cond_Greater, cond_Less, cond_VTABLE_FINDER_CO
 
 class ArrayUnsigned;
 class Mixed;
+class ArrayPayload;
 
 class QueryStateBase {
 public:
diff --git a/src/realm/table.hpp b/src/realm/table.hpp
index 3709669400d..0830d7c733f 100644
--- a/src/realm/table.hpp
+++ b/src/realm/table.hpp
@@ -544,6 +544,10 @@ class Table {
         return false;
     }
 
+    ref_type typed_write(ref_type ref, _impl::ArrayWriterBase& out, bool deep, bool only_modified,
+                         bool compress) const;
+    void typed_print(std::string prefix, ref_type ref) const;
+
 private:
     template <class T>
     TableView find_all(ColKey col_key, T value);
@@ -689,7 +693,6 @@ class Table {
     };
 
     ref_type typed_write(ref_type ref, _impl::ArrayWriterBase& out) const;
-    void typed_print(std::string prefix, ref_type ref) const;
 
 private:
     enum LifeCycleCookie {
diff --git a/test/benchmark-common-tasks/main.cpp b/test/benchmark-common-tasks/main.cpp
index 5333e464dfc..b837834796b 100644
--- a/test/benchmark-common-tasks/main.cpp
+++ b/test/benchmark-common-tasks/main.cpp
@@ -1413,7 +1413,6 @@ struct BenchmarkQueryChainedOrIntsIndexed : BenchmarkQueryChainedOrInts {
     }
 };
 
-
 struct BenchmarkQueryIntEquality : BenchmarkQueryChainedOrInts {
     const char* name() const
     {
diff --git a/test/object-store/results.cpp b/test/object-store/results.cpp
index 5815d258e84..3fedacfeaec 100644
--- a/test/object-store/results.cpp
+++ b/test/object-store/results.cpp
@@ -103,7 +103,6 @@ struct TestContext : CppContext {
     }
 };
 
-
 TEST_CASE("notifications: async delivery", "[notifications]") {
     _impl::RealmCoordinator::assert_no_open_realms();
     TestFile config;
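The linear_search_threshold dispatch referenced in the query_engine.hpp hunk above is easy to see in isolation. A sketch with an assumed threshold of 22 (the patch does not show realm's actual constant):

    #include <algorithm>
    #include <cstdint>
    #include <unordered_set>
    #include <vector>

    // Return the first position in `haystack` whose value is in `needles`,
    // switching strategy on the needle count as described above.
    inline size_t find_first_in(const std::vector<int64_t>& haystack, const std::vector<int64_t>& needles)
    {
        const size_t linear_search_threshold = 22; // assumed; realm's value was found experimentally
        if (needles.size() < linear_search_threshold) {
            // few needles: a linear scan beats computing hashes
            for (size_t i = 0; i < haystack.size(); ++i)
                if (std::find(needles.begin(), needles.end(), haystack[i]) != needles.end())
                    return i;
        }
        else {
            std::unordered_set<int64_t> set(needles.begin(), needles.end());
            for (size_t i = 0; i < haystack.size(); ++i)
                if (set.count(haystack[i]))
                    return i;
        }
        return size_t(-1); // i.e. realm::not_found
    }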
diff --git a/test/test_array.cpp b/test/test_array.cpp
index 8a86ac15718..a77c698b7fa 100644
--- a/test/test_array.cpp
+++ b/test/test_array.cpp
@@ -96,6 +96,27 @@ void has_zero_byte(TestContext& test_context, int64_t value, size_t reps)
 
 } // anonymous namespace
 
+TEST(Array_Bits)
+{
+    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(0), 0);
+    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(1), 1);
+    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(2), 2);
+    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(3), 2);
+    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(4), 3);
+    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(5), 3);
+    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(7), 3);
+    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(8), 4);
+    CHECK_EQUAL(NodeHeader::signed_to_num_bits(0), 1);
+    CHECK_EQUAL(NodeHeader::signed_to_num_bits(1), 2);
+    CHECK_EQUAL(NodeHeader::signed_to_num_bits(-1), 1);
+    CHECK_EQUAL(NodeHeader::signed_to_num_bits(-2), 2);
+    CHECK_EQUAL(NodeHeader::signed_to_num_bits(-3), 3);
+    CHECK_EQUAL(NodeHeader::signed_to_num_bits(-4), 3);
+    CHECK_EQUAL(NodeHeader::signed_to_num_bits(3), 3);
+    CHECK_EQUAL(NodeHeader::signed_to_num_bits(4), 4);
+    CHECK_EQUAL(NodeHeader::signed_to_num_bits(7), 4);
+}
+
 TEST(Array_General)
 {
     Array c(Allocator::get_default());
@@ -1560,25 +1581,56 @@ NONCONCURRENT_TEST(Array_count)
     c.destroy();
 }
 
-TEST(Array_Bits)
+TEST(DirectBitFields)
 {
-    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(0), 0);
-    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(1), 1);
-    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(2), 2);
-    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(3), 2);
-    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(4), 3);
-    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(5), 3);
-    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(7), 3);
-    CHECK_EQUAL(NodeHeader::unsigned_to_num_bits(8), 4);
-    CHECK_EQUAL(NodeHeader::signed_to_num_bits(0), 1);
-    CHECK_EQUAL(NodeHeader::signed_to_num_bits(1), 2);
-    CHECK_EQUAL(NodeHeader::signed_to_num_bits(-1), 1);
-    CHECK_EQUAL(NodeHeader::signed_to_num_bits(-2), 2);
-    CHECK_EQUAL(NodeHeader::signed_to_num_bits(-3), 3);
-    CHECK_EQUAL(NodeHeader::signed_to_num_bits(-4), 3);
-    CHECK_EQUAL(NodeHeader::signed_to_num_bits(3), 3);
-    CHECK_EQUAL(NodeHeader::signed_to_num_bits(4), 4);
-    CHECK_EQUAL(NodeHeader::signed_to_num_bits(7), 4);
+    uint64_t a[2];
+    a[0] = a[1] = 0;
+    {
+        BfIterator it(a, 0, 7, 7, 8);
+        REALM_ASSERT(*it == 0);
+        auto it2(it);
+        ++it2;
+        it2.set_value(127 + 128);
+        REALM_ASSERT(*it == 0);
+        ++it;
+        REALM_ASSERT(*it == 127);
+        ++it;
+        REALM_ASSERT(*it == 0);
+    }
+    // reverse polarity
+    a[0] = a[1] = -1ULL;
+    {
+        BfIterator it(a, 0, 7, 7, 8);
+        REALM_ASSERT(*it == 127);
+        auto it2(it);
+        ++it2;
+        it2.set_value(42 + 128);
+        REALM_ASSERT(*it == 127);
+        ++it;
+        REALM_ASSERT(*it == 42);
+        ++it;
+        REALM_ASSERT(*it == 127);
+    }
+}
+
+TEST(Extended_Array_encoding)
+{
+    using Encoding = NodeHeader::Encoding;
+    Array array(Allocator::get_default());
+    auto mem = array.get_alloc().alloc(10);
+    init_header(mem.get_addr(), Encoding::Flex, 7, 1, 1, 1, 1);
+    array.init_from_mem(mem);
+    auto array_header = array.get_header();
+    auto encoding = array.get_encoding(array_header);
+    CHECK(encoding == Encoding::Flex);
+
+    Array another_array(Allocator::get_default());
+    another_array.init_from_ref(array.get_ref());
+    auto another_header = another_array.get_header();
+    auto another_encoding = another_array.get_encoding(another_header);
+    CHECK(encoding == another_encoding);
+
+    array.get_alloc().free_(mem);
 }
 
 TEST(Array_cares_about)
@@ -1710,9 +1762,8 @@ TEST(VerifyIterationAcrossWords)
unaligned iterator UnalignedWordIter u_it(a, 0); for (size_t i = 0; i < 51; ++i) { - const auto v = sign_extend_value(5, u_it.get(5) & 0x1F); + const auto v = sign_extend_value(5, u_it.consume(5) & 0x1F); CHECK_EQUAL(v, values[i]); - u_it.bump(5); } } } @@ -1859,7 +1910,7 @@ TEST(ParallelSearchEqualMatch) // Now use the optimized version static auto vector_compare_eq = [](auto msb, auto a, auto b) { - return find_all_fields_EQ(msb, a, b); + return find_all_fields(msb, a, b); }; start = 0; @@ -1901,7 +1952,7 @@ TEST(ParallelSearchEqualNoMatch) const auto search_vector = populate(width, key); static auto vector_compare_eq = [](auto msb, auto a, auto b) { - return find_all_fields_EQ(msb, a, b); + return find_all_fields(msb, a, b); }; size_t start = 0; @@ -1951,7 +2002,7 @@ TEST(ParallelSearchNotEqual) const auto search_vector = populate(width, key); static auto vector_compare_neq = [](auto msb, auto a, auto b) { - return find_all_fields_NE(msb, a, b); + return find_all_fields(msb, a, b); }; size_t start = 0; @@ -2002,7 +2053,7 @@ TEST(ParallelSearchLessThan) const auto search_vector = populate(width, key); static auto vector_compare_lt = [](auto msb, auto a, auto b) { - return find_all_fields_signed_LT(msb, a, b); + return find_all_fields(msb, a, b); }; size_t start = 0; @@ -2052,7 +2103,7 @@ TEST(ParallelSearchGreaterThan) const auto search_vector = populate(width, key); static auto vector_compare_gt = [](auto msb, auto a, auto b) { - return find_all_fields_signed_GT(msb, a, b); + return find_all_fields(msb, a, b); }; size_t start = 0; diff --git a/test/test_array_integer.cpp b/test/test_array_integer.cpp index a26cecf52b2..9ccdf25653e 100644 --- a/test/test_array_integer.cpp +++ b/test/test_array_integer.cpp @@ -19,6 +19,7 @@ #include "testsettings.hpp" #include +#include #include #include @@ -31,6 +32,1575 @@ using namespace realm; using namespace realm::test_util; +// #define ARRAY_PERFORMANCE_TESTING +#if !defined(REALM_DEBUG) && defined(ARRAY_PERFORMANCE_TESTING) +NONCONCURRENT_TEST(perf_array_encode_get_vs_array_get_less_32bit) +{ + using namespace std; + using namespace std::chrono; + size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " < 32 bit values " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger compressed_a(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) + REALM_ASSERT(a.get(i) == input_array[i]); + } + auto t2 = high_resolution_clock::now(); + + std::cout << " Positive values - Array::get(): " << duration_cast(t2 - t1).count() << " ns" + << std::endl; + std::cout << " Positive values - Array::get(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_compress(compressed_a); + CHECK(compressed_a.is_compressed()); + CHECK(compressed_a.size() == a.size()); + t1 = high_resolution_clock::now(); + + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + REALM_ASSERT(compressed_a.get(i) == a.get(i)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Positive values - 
ArrayCompress::get(): " << duration_cast(t2 - t1).count() << " ns" + << std::endl; + std::cout << " Positive values - ArrayCompress::get(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + compressed_a.destroy(); + a.create(); + input_array.clear(); + for (size_t i = 0; i < n_values; i++) + input_array.push_back(-i); + std::random_device rd1; + std::mt19937 g1(rd1()); + std::shuffle(input_array.begin(), input_array.end(), g1); + for (const auto& v : input_array) + a.add(v); + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) + REALM_ASSERT(a.get(i) == input_array[i]); + } + t2 = high_resolution_clock::now(); + + std::cout << std::endl; + + std::cout << " Negative values - Array::get(): " << duration_cast(t2 - t1).count() << " ns" + << std::endl; + std::cout << " Negative values - Array::get(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_compress(compressed_a); + CHECK(compressed_a.is_compressed()); + CHECK(compressed_a.size() == a.size()); + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + REALM_ASSERT(compressed_a.get(i) == a.get(i)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Negative values - ArrayCompress::get(): " << duration_cast(t2 - t1).count() << " ns" + << std::endl; + std::cout << " Negative values - ArrayCompress::get(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + compressed_a.destroy(); +} + + +NONCONCURRENT_TEST(Test_basic_find_EQ_less_32bit) +{ + using namespace std; + using namespace std::chrono; + size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " Value with bitwidth < 32 " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger compressed_a(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto ndx = a.find_first(input_array[i]); + REALM_ASSERT(ndx != realm::not_found); + REALM_ASSERT(a.get(ndx) == input_array[ndx]); + } + } + auto t2 = high_resolution_clock::now(); + + std::cout << " Positive values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_compress(compressed_a); + CHECK(compressed_a.is_compressed()); + CHECK(compressed_a.size() == a.size()); + + // verify that both find the same thing + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto v = a.find_first(input_array[i]); + auto v1 = compressed_a.find_first(input_array[i]); + REALM_ASSERT(v == v1); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto ndx = compressed_a.find_first(input_array[i]); + REALM_ASSERT(ndx != realm::not_found); + 
REALM_ASSERT(compressed_a.get(ndx) == input_array[ndx]); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Positive values - ArrayCompress::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - ArrayCompress::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + std::cout << std::endl; + + a.destroy(); + compressed_a.destroy(); + a.create(); + input_array.clear(); + for (size_t i = 0; i < n_values; i++) + input_array.push_back(-i); + std::random_device rd1; + std::mt19937 g1(rd1()); + std::shuffle(input_array.begin(), input_array.end(), g1); + for (const auto& v : input_array) + a.add(v); + + a.try_compress(compressed_a); + CHECK(compressed_a.is_compressed()); + CHECK(compressed_a.size() == a.size()); + + // verify that both find the same thing + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto v = a.find_first(input_array[i]); + auto v1 = compressed_a.find_first(input_array[i]); + REALM_ASSERT(v == v1); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto ndx = a.find_first(input_array[i]); + REALM_ASSERT(ndx != realm::not_found); + REALM_ASSERT(a.get(ndx) == input_array[ndx]); + } + } + t2 = high_resolution_clock::now(); + + std::cout << " Negative values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + auto ndx = compressed_a.find_first(input_array[i]); + REALM_ASSERT(ndx != realm::not_found); + REALM_ASSERT(compressed_a.get(ndx) == a.get(ndx)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Negative values - ArrayCompress::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - ArrayCompress::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + compressed_a.destroy(); +} + +NONCONCURRENT_TEST(Test_basic_find_NEQ_value_less_32bit) +{ + using namespace std; + using namespace std::chrono; + size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " Value with bitwidth < 32 " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger compressed_a(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + QueryStateFindFirst state1; + QueryStateFindFirst state2; + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + auto t2 = high_resolution_clock::now(); + + std::cout << " Positive values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - Array::find(): " + << (double)duration_cast(t2 
- t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_compress(compressed_a); + CHECK(compressed_a.is_compressed()); + CHECK(compressed_a.size() == a.size()); + + // verify that both find the same thing + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(i, 0, a.size(), &state1); + compressed_a.find(i, 0, compressed_a.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + compressed_a.find(i, 0, compressed_a.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(compressed_a.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Positive values - ArrayCompress::find(): " + << duration_cast(t2 - t1).count() << " ms" << std::endl; + std::cout << " Positive values - ArrayCompress::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + std::cout << std::endl; + + a.destroy(); + compressed_a.destroy(); + a.create(); + input_array.clear(); + for (size_t i = 0; i < n_values; i++) + input_array.push_back(-i); + std::random_device rd1; + std::mt19937 g1(rd1()); + std::shuffle(input_array.begin(), input_array.end(), g1); + for (const auto& v : input_array) + a.add(v); + + a.try_compress(compressed_a); + CHECK(compressed_a.is_compressed()); + CHECK(compressed_a.size() == a.size()); + + // NEQ for signed integers is not working. TODO: investigate this. + // verify that both find the same thing + + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(-i, 0, a.size(), &state1); + compressed_a.find(-i, 0, compressed_a.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(-i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + t2 = high_resolution_clock::now(); + + std::cout << " Negative values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + compressed_a.find(-i, 0, compressed_a.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(compressed_a.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Negative values - ArrayCompress::find(): " + << duration_cast(t2 - t1).count() << " ms" << std::endl; + std::cout << " Negative values - ArrayCompress::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + compressed_a.destroy(); +} + +NONCONCURRENT_TEST(Test_basic_find_LT_value_less_32bit) +{ + using namespace std; + using namespace std::chrono; + size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " Value with bitwidth < 32 " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger 
compressed_a(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + QueryStateFindFirst state1{}; + QueryStateFindFirst state2{}; + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 1; i < n_values; ++i) { // there is nothing less than 0 + a.find(i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + auto t2 = high_resolution_clock::now(); + + std::cout << " Positive values - Array::find(): " << duration_cast(t2 - t1).count() << " ms" + << std::endl; + std::cout << " Positive values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_compress(compressed_a); + CHECK(compressed_a.is_compressed()); + CHECK(compressed_a.size() == a.size()); + + // verify that both find the same thing + state1 = {}; + state2 = {}; + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(i, 0, a.size(), &state1); + compressed_a.find(i, 0, compressed_a.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 1; i < n_values; ++i) { // there is nothing less than 0 + compressed_a.find(i, 0, compressed_a.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(compressed_a.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Positive values - ArrayCompress::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - ArrayCompress::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + std::cout << std::endl; + + a.destroy(); + compressed_a.destroy(); + a.create(); + input_array.clear(); + for (size_t i = 0; i < n_values; i++) + input_array.push_back(-i); + std::random_device rd1; + std::mt19937 g1(rd1()); + std::shuffle(input_array.begin(), input_array.end(), g1); + for (const auto& v : input_array) + a.add(v); + + a.try_compress(compressed_a); + CHECK(compressed_a.is_compressed()); + CHECK(compressed_a.size() == a.size()); + + // verify that both find the same thing + state1 = {}; + state2 = {}; + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(-i, 0, a.size(), &state1); + compressed_a.find(-i, 0, compressed_a.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values - 1; ++i) { // nothing less than the biggest negative number + a.find(-i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + t2 = high_resolution_clock::now(); + + std::cout << " Negative values - Array::find(): " << duration_cast(t2 - t1).count() << " ms" + << std::endl; + std::cout << " Negative values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values - 1; 
++i) { // nothing less than the biggest negative number + compressed_a.find(-i, 0, compressed_a.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(compressed_a.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Negative values - ArrayCompress::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - ArrayCompress::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + compressed_a.destroy(); +} + +NONCONCURRENT_TEST(Test_basic_find_GT_value_less_32bit) +{ + // GT subword parallel search is not working... TODO : investigate + using namespace std; + using namespace std::chrono; + size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " Value with bitwidth < 32 " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger compressed_a(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + QueryStateFindFirst state1; + QueryStateFindFirst state2; + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values - 1; ++i) { // nothing greatest than the last number + a.find(i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + auto t2 = high_resolution_clock::now(); + + std::cout << " Positive values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Positive values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_compress(compressed_a); + CHECK(compressed_a.is_compressed()); + CHECK(compressed_a.size() == a.size()); + + // verify that both find the same thing + state1 = {}; + state2 = {}; + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(i, 0, a.size(), &state1); + compressed_a.find(i, 0, compressed_a.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values - 1; ++i) { // nothing bigger than the last val + compressed_a.find(i, 0, compressed_a.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(compressed_a.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Positive values - ArrayCompress::find(): " + << duration_cast(t2 - t1).count() << " ms" << std::endl; + std::cout << " Positive values - ArrayCompress::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + std::cout << std::endl; + + a.destroy(); + compressed_a.destroy(); + a.create(); + input_array.clear(); + for (size_t i = 0; i < n_values; i++) + input_array.push_back(-i); + std::random_device rd1; + std::mt19937 g1(rd1()); + std::shuffle(input_array.begin(), input_array.end(), g1); + for (const auto& v : input_array) + a.add(v); + + a.try_compress(compressed_a); + 
CHECK(compressed_a.is_compressed()); + CHECK(compressed_a.size() == a.size()); + + // verify that both find the same thing + state1 = {}; + state2 = {}; + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + a.find(-i, 0, a.size(), &state1); + compressed_a.find(-i, 0, compressed_a.size(), &state2); + REALM_ASSERT(state1.m_state == state2.m_state); + } + } + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 1; i < n_values; ++i) { // nothing bigger than 0 + a.find(-i, 0, a.size(), &state1); + REALM_ASSERT(state1.m_state != realm::not_found); + REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]); + } + } + t2 = high_resolution_clock::now(); + + std::cout << " Negative values - Array::find(): " << duration_cast(t2 - t1).count() + << " ms" << std::endl; + std::cout << " Negative values - Array::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 1; i < n_values; ++i) { // nothing bigger than 0 + compressed_a.find(-i, 0, compressed_a.size(), &state2); + REALM_ASSERT(state2.m_state != realm::not_found); + REALM_ASSERT(compressed_a.get(state2.m_state) == a.get(state2.m_state)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Negative values - ArrayCompress::find(): " + << duration_cast(t2 - t1).count() << " ms" << std::endl; + std::cout << " Negative values - ArrayCompress::find(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + compressed_a.destroy(); +} + +NONCONCURRENT_TEST(perf_array_encode_get_vs_array_get_greater_32bit) +{ + using namespace std; + using namespace std::chrono; + size_t start_value = 0x0000000100000000; // 32 bit val + size_t n_values = 1000; + size_t n_runs = 100; + std::cout << " >= 32 bit values " << std::endl; + std::cout << " N values = " << n_values << std::endl; + std::cout << " N runs = " << n_runs << std::endl; + + std::vector input_array; + ArrayInteger a(Allocator::get_default()); + ArrayInteger compressed_a(Allocator::get_default()); + a.create(); + + for (size_t i = 0; i < n_values; i++) + input_array.push_back(start_value + i); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(input_array.begin(), input_array.end(), g); + for (const auto& v : input_array) + a.add(v); + + auto t1 = high_resolution_clock::now(); + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) + REALM_ASSERT(a.get(i) == input_array[i]); + } + auto t2 = high_resolution_clock::now(); + + std::cout << " Positive values - Array::get(): " << duration_cast(t2 - t1).count() << " ns" + << std::endl; + std::cout << " Positive values - Array::get(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.try_compress(compressed_a); + CHECK(compressed_a.is_compressed()); + CHECK(compressed_a.size() == a.size()); + t1 = high_resolution_clock::now(); + + for (size_t j = 0; j < n_runs; ++j) { + for (size_t i = 0; i < n_values; ++i) { + REALM_ASSERT(compressed_a.get(i) == a.get(i)); + } + } + t2 = high_resolution_clock::now(); + std::cout << " Positive values - ArrayCompress::get(): " << duration_cast(t2 - t1).count() << " ns" + << std::endl; + std::cout << " Positive values - ArrayCompress::get(): " + << (double)duration_cast(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl; + + a.destroy(); + 
+    compressed_a.destroy();
+    a.create();
+    input_array.clear();
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(-int64_t(i));
+    std::random_device rd1;
+    std::mt19937 g1(rd1());
+    std::shuffle(input_array.begin(), input_array.end(), g1);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i)
+            REALM_ASSERT(a.get(i) == input_array[i]);
+    }
+    t2 = high_resolution_clock::now();
+
+    std::cout << std::endl;
+
+    std::cout << "   Negative values - Array::get(): " << duration_cast<nanoseconds>(t2 - t1).count() << " ns"
+              << std::endl;
+    std::cout << "   Negative values - Array::get(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_compress(compressed_a);
+    CHECK(compressed_a.is_compressed());
+    CHECK(compressed_a.size() == a.size());
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            REALM_ASSERT(compressed_a.get(i) == a.get(i));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Negative values - ArrayCompress::get(): " << duration_cast<nanoseconds>(t2 - t1).count() << " ns"
+              << std::endl;
+    std::cout << "   Negative values - ArrayCompress::get(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    compressed_a.destroy();
+}
+
+NONCONCURRENT_TEST(Test_basic_find_EQ_greater_32bit)
+{
+    using namespace std;
+    using namespace std::chrono;
+    size_t start_value = 0x000001000000000; // a value well past the 32-bit range
+    size_t n_values = 1000;
+    size_t n_runs = 100;
+    std::cout << "   Value with bitwidth >= 32 " << std::endl;
+    std::cout << "   N values = " << n_values << std::endl;
+    std::cout << "   N runs = " << n_runs << std::endl;
+
+    std::vector<int64_t> input_array;
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger compressed_a(Allocator::get_default());
+    a.create();
+
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(start_value + i);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(input_array.begin(), input_array.end(), g);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    auto t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto ndx = a.find_first(start_value + i);
+            REALM_ASSERT(ndx != realm::not_found);
+            REALM_ASSERT(a.get(ndx) == input_array[ndx]);
+        }
+    }
+    auto t2 = high_resolution_clock::now();
+
+    std::cout << "   Positive values - Array::find(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - Array::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_compress(compressed_a);
+    CHECK(compressed_a.is_compressed());
+    CHECK(compressed_a.size() == a.size());
+
+    // verify that both find the same thing
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            REALM_ASSERT(a.find_first(start_value + i) == compressed_a.find_first(start_value + i));
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto ndx = compressed_a.find_first(start_value + i);
+            REALM_ASSERT(ndx != realm::not_found);
+            REALM_ASSERT(compressed_a.get(ndx) == a.get(ndx));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Positive values - ArrayCompress::find(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - ArrayCompress::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    std::cout << std::endl;
+
+    a.destroy();
+    compressed_a.destroy();
+    a.create();
+    input_array.clear();
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(-int64_t(start_value + i));
+    std::random_device rd1;
+    std::mt19937 g1(rd1());
+    std::shuffle(input_array.begin(), input_array.end(), g1);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    a.try_compress(compressed_a);
+    CHECK(compressed_a.is_compressed());
+    CHECK(compressed_a.size() == a.size());
+
+    // verify that both find the same thing
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            const auto k = -int64_t(start_value + i);
+            const auto v1 = a.find_first(k);
+            const auto v2 = compressed_a.find_first(k);
+            REALM_ASSERT(v1 == v2);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto ndx = a.find_first(-int64_t(start_value + i));
+            REALM_ASSERT(ndx != realm::not_found);
+            REALM_ASSERT(a.get(ndx) == input_array[ndx]);
+        }
+    }
+    t2 = high_resolution_clock::now();
+
+    std::cout << "   Negative values - Array::find(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - Array::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            auto ndx = compressed_a.find_first(-int64_t(start_value + i));
+            REALM_ASSERT(ndx != realm::not_found);
+            REALM_ASSERT(compressed_a.get(ndx) == a.get(ndx));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Negative values - ArrayCompress::find(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - ArrayCompress::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    compressed_a.destroy();
+}
+
+NONCONCURRENT_TEST(Test_basic_find_NEQ_value_greater_32bit)
+{
+    using namespace std;
+    using namespace std::chrono;
+    size_t start_value = 0x0000000100000000; // first value past the 32-bit range
+    size_t n_values = 1000;
+    size_t n_runs = 100;
+    std::cout << "   Value with bitwidth >= 32 " << std::endl;
+    std::cout << "   N values = " << n_values << std::endl;
+    std::cout << "   N runs = " << n_runs << std::endl;
+
+    std::vector<int64_t> input_array;
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger compressed_a(Allocator::get_default());
+    a.create();
+
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(start_value + i);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(input_array.begin(), input_array.end(), g);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    QueryStateFindFirst state1;
+    QueryStateFindFirst state2;
+    auto t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<NotEqual>(start_value + i, 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    auto t2 = high_resolution_clock::now();
+
+    std::cout << "   Positive values - Array::find(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - Array::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_compress(compressed_a);
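+    // (Comment added for clarity: with 1000 distinct values, find<NotEqual>(v) should
+    // match at index 0, or at index 1 when v itself happens to sit at index 0, so
+    // every probe below is expected to succeed on both representations.)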
+    CHECK(compressed_a.is_compressed());
+    CHECK(compressed_a.size() == a.size());
+
+    // verify that both find the same thing
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<NotEqual>(start_value + i, 0, a.size(), &state1);
+            compressed_a.find<NotEqual>(start_value + i, 0, compressed_a.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            compressed_a.find<NotEqual>(start_value + i, 0, compressed_a.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(compressed_a.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Positive values - ArrayCompress::find(): "
+              << duration_cast<milliseconds>(t2 - t1).count() << " ms" << std::endl;
+    std::cout << "   Positive values - ArrayCompress::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    std::cout << std::endl;
+
+    a.destroy();
+    compressed_a.destroy();
+    a.create();
+    input_array.clear();
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(-int64_t(start_value + i));
+    std::random_device rd1;
+    std::mt19937 g1(rd1());
+    std::shuffle(input_array.begin(), input_array.end(), g1);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    a.try_compress(compressed_a);
+    CHECK(compressed_a.is_compressed());
+    CHECK(compressed_a.size() == a.size());
+
+    // verify that both find the same thing
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<NotEqual>(-int64_t(start_value + i), 0, a.size(), &state1);
+            compressed_a.find<NotEqual>(-int64_t(start_value + i), 0, compressed_a.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<NotEqual>(-int64_t(start_value + i), 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    t2 = high_resolution_clock::now();
+
+    std::cout << "   Negative values - Array::find(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - Array::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            compressed_a.find<NotEqual>(-int64_t(start_value + i), 0, compressed_a.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(compressed_a.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Negative values - ArrayCompress::find(): "
+              << duration_cast<milliseconds>(t2 - t1).count() << " ms" << std::endl;
+    std::cout << "   Negative values - ArrayCompress::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    compressed_a.destroy();
+}
+
+NONCONCURRENT_TEST(Test_basic_find_LT_value_greater_32bit)
+{
+    using namespace std;
+    using namespace std::chrono;
+    size_t start_value = 0x0000000100000000; // first value past the 32-bit range
+    size_t n_values = 1000;
+    size_t n_runs = 100;
+    std::cout << "   Value with bitwidth >= 32 " << std::endl;
+    std::cout << "   N values = " << n_values << std::endl;
+    std::cout << "   N runs = " << n_runs << std::endl;
+
+    std::vector<int64_t> input_array;
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger compressed_a(Allocator::get_default());
+    a.create();
+
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(start_value + i);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(input_array.begin(), input_array.end(), g);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    QueryStateFindFirst state1;
+    QueryStateFindFirst state2;
+    auto t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 1; i < n_values; ++i) { // nothing less than the smallest value
+            a.find<Less>(start_value + i, 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    auto t2 = high_resolution_clock::now();
+
+    std::cout << "   Positive values - Array::find(): " << duration_cast<milliseconds>(t2 - t1).count() << " ms"
+              << std::endl;
+    std::cout << "   Positive values - Array::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_compress(compressed_a);
+    CHECK(compressed_a.is_compressed());
+    CHECK(compressed_a.size() == a.size());
+
+    // verify that both find the same thing
+    state1 = {};
+    state2 = {};
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<Less>(start_value + i, 0, a.size(), &state1);
+            compressed_a.find<Less>(start_value + i, 0, compressed_a.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 1; i < n_values; ++i) {
+            compressed_a.find<Less>(start_value + i, 0, compressed_a.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(compressed_a.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Positive values - ArrayCompress::find(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - ArrayCompress::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    std::cout << std::endl;
+
+    a.destroy();
+    compressed_a.destroy();
+    a.create();
+    input_array.clear();
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(-int64_t(start_value + i));
+    std::random_device rd1;
+    std::mt19937 g1(rd1());
+    std::shuffle(input_array.begin(), input_array.end(), g1);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    a.try_compress(compressed_a);
+    CHECK(compressed_a.is_compressed());
+    CHECK(compressed_a.size() == a.size());
+
+    // verify that both find the same thing
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            a.find<Less>(-int64_t(start_value + i), 0, a.size(), &state1);
+            compressed_a.find<Less>(-int64_t(start_value + i), 0, compressed_a.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values - 1; ++i) { // nothing less than the most negative value
+            a.find<Less>(-int64_t(start_value + i), 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    t2 = high_resolution_clock::now();
+
+    std::cout << "   Negative values - Array::find(): " << duration_cast<milliseconds>(t2 - t1).count() << " ms"
+              << std::endl;
+    std::cout << "   Negative values - Array::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values - 1; ++i) { // nothing less than the most negative value
+            compressed_a.find<Less>(-int64_t(start_value + i), 0, compressed_a.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(compressed_a.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Negative values - ArrayCompress::find(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - ArrayCompress::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    compressed_a.destroy();
+}
+
+NONCONCURRENT_TEST(Test_basic_find_GT_value_greater_32bit)
+{
+    using namespace std;
+    using namespace std::chrono;
+    size_t start_value = 0x0000100000000; // first value past the 32-bit range
+    size_t n_values = 1000;
+    size_t n_runs = 100;
+    std::cout << "   Value with bitwidth >= 32 " << std::endl;
+    std::cout << "   N values = " << n_values << std::endl;
+    std::cout << "   N runs = " << n_runs << std::endl;
+
+    std::vector<int64_t> input_array;
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger compressed_a(Allocator::get_default());
+    a.create();
+
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(start_value + i);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(input_array.begin(), input_array.end(), g);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    QueryStateFindFirst state1;
+    QueryStateFindFirst state2;
+    auto t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values - 1; ++i) { // nothing greater than the last value
+            a.find<Greater>(start_value + i, 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    auto t2 = high_resolution_clock::now();
+
+    std::cout << "   Positive values - Array::find(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Positive values - Array::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.try_compress(compressed_a);
+    CHECK(compressed_a.is_compressed());
+    CHECK(compressed_a.size() == a.size());
+
+    // verify that both find the same thing
+    state1 = {};
+    state2 = {};
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values; ++i) {
+            const auto k = start_value + i;
+            a.find<Greater>(k, 0, a.size(), &state1);
+            compressed_a.find<Greater>(k, 0, compressed_a.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 0; i < n_values - 1; ++i) {
+            compressed_a.find<Greater>(start_value + i, 0, compressed_a.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(compressed_a.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Positive values - ArrayCompress::find(): "
+              << duration_cast<milliseconds>(t2 - t1).count() << " ms" << std::endl;
+    std::cout << "   Positive values - ArrayCompress::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    std::cout << std::endl;
+
+    a.destroy();
+    compressed_a.destroy();
+    a.create();
+    input_array.clear();
+    for (size_t i = 0; i < n_values; i++)
+        input_array.push_back(-int64_t(start_value + i));
+    std::random_device rd1;
+    std::mt19937 g1(rd1());
+    std::shuffle(input_array.begin(), input_array.end(), g1);
+    for (const auto& v : input_array)
+        a.add(v);
+
+    a.try_compress(compressed_a);
+    CHECK(compressed_a.is_compressed());
+    CHECK(compressed_a.size() == a.size());
+
+    // verify that both find the same thing
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 1; i < n_values; ++i) {
+            a.find<Greater>(-int64_t(start_value + i), 0, a.size(), &state1);
+            compressed_a.find<Greater>(-int64_t(start_value + i), 0, compressed_a.size(), &state2);
+            REALM_ASSERT(state1.m_state == state2.m_state);
+        }
+    }
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 1; i < n_values; ++i) { // nothing greater than the largest value
+            a.find<Greater>(-int64_t(start_value + i), 0, a.size(), &state1);
+            REALM_ASSERT(state1.m_state != realm::not_found);
+            REALM_ASSERT(a.get(state1.m_state) == input_array[state1.m_state]);
+        }
+    }
+    t2 = high_resolution_clock::now();
+
+    std::cout << "   Negative values - Array::find(): " << duration_cast<milliseconds>(t2 - t1).count()
+              << " ms" << std::endl;
+    std::cout << "   Negative values - Array::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    t1 = high_resolution_clock::now();
+    for (size_t j = 0; j < n_runs; ++j) {
+        for (size_t i = 1; i < n_values; ++i) {
+            compressed_a.find<Greater>(-int64_t(start_value + i), 0, compressed_a.size(), &state2);
+            REALM_ASSERT(state2.m_state != realm::not_found);
+            REALM_ASSERT(compressed_a.get(state2.m_state) == a.get(state2.m_state));
+        }
+    }
+    t2 = high_resolution_clock::now();
+    std::cout << "   Negative values - ArrayCompress::find(): "
+              << duration_cast<milliseconds>(t2 - t1).count() << " ms" << std::endl;
+    std::cout << "   Negative values - ArrayCompress::find(): "
+              << (double)duration_cast<nanoseconds>(t2 - t1).count() / n_values / n_runs << " ns/value" << std::endl;
+
+    a.destroy();
+    compressed_a.destroy();
+}
+
+#endif
+
+// disable this test if forcing compression to Packed.
+#if !REALM_COMPRESS
+TEST(Test_ArrayInt_no_compress)
+{
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a1(Allocator::get_default());
+    a.create();
+    a.add(10);
+    a.add(11);
+    a.add(12);
+    // the original array is never encoded; a1 is the array to be written to disk.
+    // In this case compression is not needed.
+    CHECK_NOT(a.try_compress(a1));
+    CHECK_NOT(a.is_compressed());
+    CHECK(a.get(0) == 10);
+    CHECK(a.get(1) == 11);
+    CHECK(a.get(2) == 12);
+    a.destroy();
+    a1.destroy();
+}
+
+TEST(Test_ArrayInt_compress_decompress_needed)
+{
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a1(Allocator::get_default());
+    a.create();
+    a.add(10);
+    a.add(5);
+    a.add(5);
+    // uncompressed requires 3 x 4 bits, compressed takes 2 x 5 bits + 3 x 2 bits;
+    // with 8 byte alignment this is 16 bytes either way.
+    CHECK_NOT(a.try_compress(a1));
+    CHECK_NOT(a.is_compressed());
+    a.add(10);
+    a.add(15);
+    // uncompressed is 5x4 bits, compressed is 3x5 bits + 5x2 bits;
+    // with 8 byte alignment this is 16 bytes either way.
+    CHECK_NOT(a.try_compress(a1));
+    CHECK_NOT(a.is_compressed());
+    a.add(10);
+    a.add(15);
+    a.add(10);
+    a.add(15);
+    // uncompressed is 9x4 bits, compressed is 3x5 bits + 9x2 bits;
+    // with 8 byte alignment this is 16 bytes either way.
+    CHECK_NOT(a.try_compress(a1));
+    CHECK_NOT(a.is_compressed());
+    a.add(-1);
+    // the addition of -1 forces the array from unsigned to signed form,
+    // changing from 4 bits per element to 8 bits
+    // (1, 2 and 4 bit elements are unsigned, larger elements are signed)
+    // uncompressed is 10x8 bits, compressed is 3x5 bits + 10x2 bits;
+    // with alignment, this is 24 bytes uncompressed and 16 bytes compressed
+    CHECK(a.try_compress(a1));
+    CHECK_NOT(a.is_compressed());
+    CHECK(a.get(0) == 10);
+    CHECK(a.get(1) == 5);
+    CHECK(a.get(2) == 5);
+    CHECK(a.get(3) == 10);
+    CHECK(a.get(4) == 15);
+    CHECK(a1.is_compressed());
+    auto v = a1.get(0);
+    CHECK(v == a.get(0));
+    CHECK(a1.get(1) == a.get(1));
+    CHECK(a1.get(2) == a.get(2));
+    CHECK(a1.get(3) == a.get(3));
+    CHECK(a1.get(4) == a.get(4));
+    a.destroy();
+    a1.destroy();
+}
+#endif
+
+TEST(Test_ArrayInt_get_all)
+{
+    std::vector<int64_t> vs = {3656152302, 2814021986, 4195757081, 3272933168, 3466127978, 2777289082,
+                               4247467684, 3825361855, 2496524560, 4052938301, 3765455798, 2527633011,
+                               3448934593, 3699340964, 4057735040, 3294068800};
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a1(Allocator::get_default());
+    a.create();
+    for (const auto i : vs)
+        a.add(i);
+    CHECK(a.try_compress(a1));
+    CHECK(a1.is_compressed());
+    auto res = a1.get_all(0, a1.size());
+    CHECK(res == vs);
+    a.destroy();
+    a1.destroy();
+}
+
+TEST(Test_array_same_size_less_bits)
+{
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a1(Allocator::get_default());
+    a.create();
+    a.add(1000000);
+    a.add(1000000);
+    a.add(1000000);
+    CHECK(a.try_compress(a1));
+    CHECK_NOT(a.is_compressed());
+    CHECK(a.get_any(0) == 1000000);
+    CHECK(a.get_any(1) == 1000000);
+    CHECK(a.get_any(2) == 1000000);
+    CHECK(a1.is_compressed());
+    CHECK(a1.get_any(0) == 1000000);
+    CHECK(a1.get_any(1) == 1000000);
+    CHECK(a1.get_any(2) == 1000000);
+    a.destroy();
+    a1.destroy();
+}
+
+TEST(Test_ArrayInt_negative_nums)
+{
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a1(Allocator::get_default());
+    a.create();
+    a.add(-1000000);
+    a.add(0);
+    a.add(1000000);
+    CHECK_NOT(a.is_compressed());
+    CHECK(a.try_compress(a1));
+    a1.destroy();
+    CHECK(a.get(0) == -1000000);
+    CHECK(a.get(1) == 0);
+    CHECK(a.get(2) == 1000000);
+    a.add(-1000000);
+    a.add(-1000000);
+    CHECK(a.try_compress(a1));
+    CHECK_NOT(a.is_compressed());
+    CHECK(a.get(0) == -1000000);
+    CHECK(a.get(1) == 0);
+    CHECK(a.get(2) == 1000000);
+    CHECK(a.get(3) == -1000000);
+    CHECK(a.get(4) == -1000000);
+    a.add(0);
+    a1.destroy();
+    CHECK(a.try_compress(a1));
+    CHECK_NOT(a.is_compressed());
+    CHECK(a1.is_compressed());
+
+    CHECK(a1.get(0) == a.get(0));
+    CHECK(a1.get(1) == a.get(1));
+    CHECK(a1.get(2) == a.get(2));
+    CHECK(a1.get(3) == a.get(3));
+    CHECK(a1.get(4) == a.get(4));
+    CHECK(a1.get(5) == a.get(5));
+
+    a.add(1000000);
+    a1.destroy(); // drop the previous compressed copy before compressing again
+    CHECK(a.try_compress(a1));
+    CHECK_NOT(a.is_compressed());
+    CHECK(a1.is_compressed());
+    CHECK(a1.get(0) == a.get(0));
+    CHECK(a1.get(1) == a.get(1));
+    CHECK(a1.get(2) == a.get(2));
+    CHECK(a1.try_decompress());
+    a.add(-1000000);
+    a1.destroy();
+    CHECK(a.try_compress(a1));
+    CHECK_NOT(a.is_compressed());
+    CHECK(a1.is_compressed());
+    CHECK(a1.get(0) == a.get(0));
+    CHECK(a1.get(1) == a.get(1));
+    CHECK(a1.get(2) == a.get(2));
+    a.add(0);
+    a1.destroy();
+    CHECK(a.try_compress(a1));
+    CHECK_NOT(a.is_compressed());
+    CHECK(a1.is_compressed());
+    CHECK(a1.get(0) == a.get(0));
+    CHECK(a1.get(1) == a.get(1));
+    CHECK(a1.get(2) == a.get(2));
+    a.add(1000000);
+    a1.destroy();
+    CHECK(a.try_compress(a1));
+    CHECK_NOT(a.is_compressed());
+    CHECK(a1.is_compressed());
+    CHECK(a.size() == 10);
+    CHECK(a.size() == a1.size());
+    CHECK(a1.get(0) == a.get(0));
+    CHECK(a1.get(1) == a.get(1));
+    CHECK(a1.get(2) == a.get(2));
+    CHECK(a1.get(3) == a.get(3));
+    CHECK(a1.get(4) == a.get(4));
+    CHECK(a1.get(5) == a.get(5));
+    CHECK(a1.get(6) == a.get(6));
+    CHECK(a1.get(7) == a.get(7));
+    CHECK(a1.get(8) == a.get(8));
+    a.destroy();
+    a1.destroy();
+}
+
+TEST(Test_ArrayInt_compress_data)
+{
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a1(Allocator::get_default());
+
+    a.create();
+    a.add(-4427957085475570907);
+    a.add(-4427957085475570907);
+    a.add(-4427957085475570907);
+    a.add(-4427957085475570907);
+    a.add(4);
+    a.add(5);
+    a.add(6);
+    a.add(7);
+    a.add(8);
+    a.add(4);
+    a.try_compress(a1);
+    CHECK(a1.is_compressed());
+    CHECK(a1.is_attached());
+    CHECK(a.is_attached());
+    for (size_t i = 0; i < a.size(); ++i) {
+        auto v0 = a1.get(i);
+        auto v1 = a.get(i);
+        CHECK(v0 == v1);
+    }
+    a.destroy();
+    a1.destroy();
+
+    a.create();
+    a.add(-4427957085475570907);
+    a.add(-4427957085475570907);
+    a.add(-4427957085475570907);
+    a.add(-4427957085475570907);
+    a.try_compress(a1);
+    for (size_t i = 0; i < a.size(); ++i)
+        CHECK(a1.get(i) == a.get(i));
+    a.destroy();
+    a1.destroy();
+
+    a.create();
+
+    a.add(16388);
+    a.add(409);
+    a.add(16388);
+    a.add(16388);
+    a.add(409);
+    a.add(16388);
+    CHECK(a.size() == 6);
+    // Current: [16388:16, 409:16, 16388:16, 16388:16, 409:16, 16388:16],
+    // space needed: 6*16 bits = 96 bits + header.
+    // Compressing the array is a good option.
+    CHECK(a.try_compress(a1));
+    CHECK(a1.is_compressed());
+    // Compressed: [409:16, 16388:16][1:1,0:1,1:1,1:1,0:1,1:1],
+    // space needed: 2*16 bits + 6*1 bit = 38 bits + header.
+    CHECK(a1.size() == a.size());
+    CHECK(a1.get(0) == a.get(0));
+    CHECK(a1.get(1) == a.get(1));
+    CHECK(a1.get(2) == a.get(2));
+    CHECK(a1.get(3) == a.get(3));
+    CHECK(a1.get(4) == a.get(4));
+    CHECK(a1.get(5) == a.get(5));
+    // decompress
+    CHECK(a1.try_decompress());
+    a.add(20);
+    // compress again, it should still be a viable option
+    a1.destroy();
+    CHECK(a.try_compress(a1));
+    CHECK(a1.is_compressed());
+    CHECK(a1.size() == 7);
+    CHECK(a1.get(0) == a.get(0));
+    CHECK(a1.get(1) == a.get(1));
+    CHECK(a1.get(2) == a.get(2));
+    CHECK(a1.get(3) == a.get(3));
+    CHECK(a1.get(4) == a.get(4));
+    CHECK(a1.get(5) == a.get(5));
+    CHECK(a1.get(6) == a.get(6));
+    CHECK(a1.try_decompress());
+    CHECK_NOT(a1.is_compressed());
+    CHECK(a1.get(0) == a.get(0));
+    CHECK(a1.get(1) == a.get(1));
+    CHECK(a1.get(2) == a.get(2));
+    CHECK(a1.get(3) == a.get(3));
+    CHECK(a1.get(4) == a.get(4));
+    CHECK(a1.get(5) == a.get(5));
+    CHECK(a1.get(6) == a.get(6));
+    a.destroy();
+    a1.destroy();
+}
+
+TEST(Test_ArrayInt_compress_data_init_from_mem)
+{
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a1(Allocator::get_default());
+    a.create();
+    a.add(16388);
+    a.add(409);
+    a.add(16388);
+    a.add(16388);
+    a.add(409);
+    a.add(16388);
+    const auto sz = a.size();
+    CHECK(sz == 6);
+    // Current: [16388:16, 409:16, 16388:16, 16388:16, 409:16, 16388:16],
+    // space needed: 6*16 bits = 96 bits + header.
+    // Compressing the array is a good option (the copy placed in a1 comes out compressed).
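+    // (Sketch of the expected flex layout, mirroring the arithmetic in
+    // Test_ArrayInt_compress_data above: a table of the two distinct values
+    // [409:16, 16388:16] plus one 1-bit index per element [1,0,1,1,0,1]
+    // -> 2*16 + 6*1 = 38 bits + header, versus 96 bits + header uncompressed.)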
+    CHECK(a.try_compress(a1));
+    CHECK(a1.is_compressed());
+    // Array should be in compressed form now
+    auto mem = a1.get_mem();
+    ArrayInteger a2(Allocator::get_default());
+    a2.init_from_mem(mem); // initialise a2 from a1's compressed memory
+    // check a2
+    CHECK(a2.is_compressed());
+    const auto sz2 = a2.size();
+    CHECK(sz2 == 6);
+    CHECK(a2.get(0) == 16388);
+    CHECK(a2.get(1) == 409);
+    CHECK(a2.get(2) == 16388);
+    CHECK(a2.get(3) == 16388);
+    CHECK(a2.get(4) == 409);
+    CHECK(a2.get(5) == 16388);
+    // decompress a2 and compress it again
+    CHECK(a2.is_compressed());
+    CHECK(a2.try_decompress());
+    CHECK_NOT(a2.is_compressed());
+    a2.add(20);
+    CHECK(a2.try_compress(a1));
+    CHECK(a1.is_compressed());
+    CHECK(a1.size() == 7);
+    CHECK(a1.get(0) == 16388);
+    CHECK(a1.get(1) == 409);
+    CHECK(a1.get(2) == 16388);
+    CHECK(a1.get(3) == 16388);
+    CHECK(a1.get(4) == 409);
+    CHECK(a1.get(5) == 16388);
+    CHECK(a1.get(6) == 20);
+    CHECK(a1.try_decompress());
+    a.destroy();
+    a1.destroy();
+    a2.destroy();
+    CHECK_NOT(a.is_attached());
+    CHECK_NOT(a1.is_attached());
+    CHECK_NOT(a2.is_attached());
+}
 
 TEST(ArrayIntNull_SetNull)
 {
@@ -244,3 +1814,114 @@ TEST(ArrayRef_Basic)
 
     a.destroy();
 }
+
+TEST_TYPES(ArrayInt_comparison, Equal, NotEqual, Less, Greater)
+{
+    using Cond = TEST_TYPE;
+    ArrayInteger a(Allocator::get_default());
+    ArrayInteger a1(Allocator::get_default());
+    a.create();
+
+    // check first positive values < 32 bits
+    constexpr auto N = 300;
+    constexpr auto M = 3;
+    for (size_t i = 0; i < N; i++)
+        for (size_t j = 0; j < M; ++j)
+            a.add(i);
+
+    auto sz = a.size();
+    CHECK(sz == M * N);
+
+    CHECK(a.try_compress(a1));
+    CHECK(a1.is_compressed());
+
+    // Array should be in compressed form now and values should match
+    for (size_t i = 0; i < sz; ++i)
+        CHECK(a.get(i) == a1.get(i));
+
+    for (int i = (int)(sz)-1; i >= 0; --i) {
+        QueryStateFindFirst m_first1, m_first2;
+        CHECK(a.find<Cond>(i, 0, sz, &m_first1) == a1.find<Cond>(i, 0, sz, &m_first2));
+        CHECK(m_first1.m_state == m_first2.m_state);
+    }
+
+    IntegerColumn accu1(Allocator::get_default());
+    IntegerColumn accu2(Allocator::get_default());
+    accu1.create();
+    accu2.create();
+    for (int i = (int)(sz)-1; i >= 0; --i) {
+        QueryStateFindAll m1{accu1}, m2{accu2};
+        CHECK(a.find<Cond>(i, 0, sz, &m1) == a1.find<Cond>(i, 0, sz, &m2));
+        CHECK(m1.match_count() == m2.match_count());
+    }
+
+    // check negative numbers now.
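+    // (Note added for clarity: the negative pass below re-runs the same
+    // Cond-parameterised comparison -- Equal, NotEqual, Less, Greater -- over -i,
+    // which is assumed to force the signed encoding paths of the compressor.)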
+    a1.destroy();
+    a.clear();
+
+    for (size_t i = 0; i < N; i++)
+        for (size_t j = 0; j < M; ++j)
+            a.add(-int64_t(i));
+
+    sz = a.size();
+    CHECK(sz == M * N);
+
+    CHECK(a.try_compress(a1));
+    CHECK(a1.is_compressed());
+
+    // Array should be in compressed form now and values should match
+    for (size_t i = 0; i < sz; ++i)
+        CHECK(a.get(i) == a1.get(i));
+
+    for (int64_t i = (int64_t)(sz)-1; i >= 0; --i) {
+        QueryStateFindFirst m_first1, m_first2;
+        CHECK(a.find<Cond>(-i, 0, sz, &m_first1) == a1.find<Cond>(-i, 0, sz, &m_first2));
+        CHECK(m_first1.m_state == m_first2.m_state);
+    }
+
+    accu1.clear();
+    accu2.clear();
+    for (int i = (int)(sz)-1; i >= 0; --i) {
+        QueryStateFindAll m1{accu1}, m2{accu2};
+        CHECK(a.find<Cond>(-i, 0, sz, &m1) == a1.find<Cond>(-i, 0, sz, &m2));
+        CHECK(m1.match_count() == m2.match_count());
+    }
+
+    accu1.destroy();
+    accu2.destroy();
+    a.destroy();
+    a1.destroy();
+
+#if REALM_COMPRESS
+    a.create();
+    std::random_device dev;
+    std::mt19937 rng(dev());
+    const auto min_range_t = (size_t)std::numeric_limits<int>::min();
+    const auto max_range_t = (size_t)std::numeric_limits<int>::max();
+    std::uniform_int_distribution<size_t> dist(min_range_t, max_range_t);
+    sz = 100;
+    for (size_t i = 0; i < sz; ++i) {
+        auto v = (int)dist(rng);
+        a.add(v);
+    }
+    a.try_compress(a1);
+
+    for (size_t i = 0; i < sz; ++i)
+        CHECK(a.get(i) == a1.get(i));
+
+    CHECK(a1.is_compressed());
+    for (size_t i = 0; i < sz; ++i) {
+        QueryStateFindFirst m_first1, m_first2;
+        CHECK(a.find<Cond>(a.get(i), 0, sz, &m_first1) == a1.find<Cond>(a1.get(i), 0, sz, &m_first2));
+        CHECK(m_first1.m_state == m_first2.m_state);
+        if (m_first1.m_state != realm::not_found)
+            CHECK(a.get(m_first1.m_state) == a1.get(m_first2.m_state));
+    }
+
+    a.destroy();
+    a1.destroy();
+#endif
+
+    CHECK_NOT(a.is_attached());
+    CHECK_NOT(a1.is_attached());
+}
diff --git a/test/test_group.cpp b/test/test_group.cpp
index 54cd141485b..651a582463c 100644
--- a/test/test_group.cpp
+++ b/test/test_group.cpp
@@ -2315,4 +2315,198 @@ TEST(Group_UniqueColumnKeys)
     CHECK_NOT_EQUAL(col_foo, col_bar);
 }
 
+TEST(Group_ArrayCompression_Correctness)
+{
+    GROUP_TEST_PATH(path);
+
+    // Create group with one list which maps to array_integer
+    Group to_disk;
+    TableRef table = to_disk.add_table("test");
+    auto col_key = table->add_column_list(type_Int, "lint");
+    auto obj = table->create_object();
+    auto array = obj.get_list<Int>(col_key);
+    array.add(16388);
+    array.add(409);
+    array.add(16388);
+    array.add(16388);
+    array.add(409);
+    array.add(16388);
+    CHECK_EQUAL(array.size(), 6);
+    CHECK_EQUAL(array.get_any(0).get_int(), 16388);
+    CHECK_EQUAL(array.get_any(1).get_int(), 409);
+    CHECK_EQUAL(array.get_any(2).get_int(), 16388);
+    CHECK_EQUAL(array.get_any(3).get_int(), 16388);
+    CHECK_EQUAL(array.get_any(4).get_int(), 409);
+    CHECK_EQUAL(array.get_any(5).get_int(), 16388);
+
+    // Serialize to disk (compression should happen when the proper leaf array is serialized to disk)
+    to_disk.write(path, crypt_key());
+
+#ifdef REALM_DEBUG
+    to_disk.verify();
+#endif
+
+    // Load the tables
+    Group from_disk(path, crypt_key());
+    TableRef read_table = from_disk.get_table("test");
+    auto col_key1 = read_table->get_column_key("lint");
+    auto obj1 = read_table->get_object(0);
+    auto l1 = obj1.get_list<Int>(col_key1);
+    CHECK(l1.size() == array.size());
+    CHECK(*read_table == *table);
+    for (size_t i = 0; i < l1.size(); ++i) {
+        CHECK_EQUAL(l1.get_any(i), array.get_any(i));
+    }
+
+#ifdef REALM_DEBUG
+    from_disk.verify();
+#endif
+}
+
+TEST(Group_ArrayCompression_Correctness_Negative)
+{
+    GROUP_TEST_PATH(path);
+
+    // Create group with one list which maps to array_integer
+    Group to_disk;
+    TableRef table = to_disk.add_table("test");
+    auto col_key = table->add_column_list(type_Int, "lint");
+    auto obj = table->create_object();
+    auto array = obj.get_list<Int>(col_key);
+
+    array.add(-1);
+    array.add(-1);
+    array.add(-1);
+    array.add(-1);
+    array.add(std::numeric_limits<int64_t>::max());
+    array.add(std::numeric_limits<int64_t>::max());
+
+    CHECK_EQUAL(array.size(), 6);
+    CHECK_EQUAL(array.get_any(0).get_int(), -1);
+    CHECK_EQUAL(array.get_any(1).get_int(), -1);
+    CHECK_EQUAL(array.get_any(2).get_int(), -1);
+    CHECK_EQUAL(array.get_any(3).get_int(), -1);
+    CHECK_EQUAL(array.get_any(4).get_int(), std::numeric_limits<int64_t>::max());
+    CHECK_EQUAL(array.get_any(5).get_int(), std::numeric_limits<int64_t>::max());
+
+    // Serialize to disk (compression should happen when the proper leaf array is serialized to disk)
+    to_disk.write(path, crypt_key());
+
+#ifdef REALM_DEBUG
+    to_disk.verify();
+#endif
+
+    // Load the tables
+    Group from_disk(path, crypt_key());
+    TableRef read_table = from_disk.get_table("test");
+    auto col_key1 = read_table->get_column_key("lint");
+    auto obj1 = read_table->get_object(0);
+    auto l1 = obj1.get_list<Int>(col_key1);
+    CHECK(l1.size() == array.size());
+    CHECK(*read_table == *table);
+    for (size_t i = 0; i < l1.size(); ++i) {
+        CHECK_EQUAL(l1.get_any(i), array.get_any(i));
+    }
+
+#ifdef REALM_DEBUG
+    from_disk.verify();
+#endif
+}
+
+TEST(Group_ArrayCompression_Correctness_Funny_Values)
+{
+    GROUP_TEST_PATH(path);
+
+    // Create group with one list which maps to array_integer
+    Group to_disk;
+    TableRef table = to_disk.add_table("test");
+    auto col_key = table->add_column_list(type_Int, "lint");
+    auto obj = table->create_object();
+    auto array = obj.get_list<Int>(col_key);
+
+    std::vector<int64_t> vs = {3656152302, 2814021986, 4195757081, 3272933168, 3466127978, 2777289082,
+                               4247467684, 3825361855, 2496524560, 4052938301, 3765455798, 2527633011,
+                               3448934593, 3699340964, 4057735040, 3294068800};
+
+    size_t ndx = 0;
+    for (const auto v : vs) {
+        array.add(v);
+        CHECK_EQUAL(v, array.get(ndx++));
+    }
+    CHECK_EQUAL(array.size(), vs.size());
+
+    // Serialize to disk (compression should happen when the proper leaf array is serialized to disk)
+    to_disk.write(path, crypt_key());
+
+#ifdef REALM_DEBUG
+    to_disk.verify();
+#endif
+
+    // Load the tables
+    Group from_disk(path, crypt_key());
+    TableRef read_table = from_disk.get_table("test");
+    auto col_key1 = read_table->get_column_key("lint");
+    auto obj1 = read_table->get_object(0);
+    auto l1 = obj1.get_list<Int>(col_key1);
+    CHECK(l1.size() == array.size());
+    CHECK(*read_table == *table);
+    for (size_t i = 0; i < l1.size(); ++i) {
+        CHECK_EQUAL(l1.get_any(i), array.get_any(i));
+    }
+
+#ifdef REALM_DEBUG
+    from_disk.verify();
+#endif
+}
+
+
+TEST(Group_ArrayCompression_Correctness_Random_Input)
+{
+    GROUP_TEST_PATH(path);
+
+    // Create group with one list which maps to array_integer
+    Group to_disk;
+    TableRef table = to_disk.add_table("test");
+    auto col_key = table->add_column_list(type_Int, "lint");
+    auto obj = table->create_object();
+    auto array = obj.get_list<Int>(col_key);
+
+    std::random_device dev;
+    std::mt19937 rng(dev());
+    constexpr auto min = std::numeric_limits<int64_t>::min();
+    constexpr auto max = std::numeric_limits<int64_t>::max();
+    std::uniform_int_distribution<int64_t> dist6(static_cast<int64_t>(min),
+                                                 static_cast<int64_t>(max));
+    for (size_t i = 0; i < 1000; ++i) {
+        const auto v = dist6(rng);
+        array.add(v);
+        const auto stored_v = array.get_any(i).get_int();
+        CHECK_EQUAL(stored_v, v);
+    }
+
+    // Serialize to disk (compression should happen when the proper leaf array is serialized to disk)
+    to_disk.write(path, crypt_key());
+
+#ifdef REALM_DEBUG
+    to_disk.verify();
+#endif
+
+    // Load the tables
+    Group from_disk(path, crypt_key());
+    TableRef read_table = from_disk.get_table("test");
+    auto col_key1 = read_table->get_column_key("lint");
+    auto obj1 = read_table->get_object(0);
+    auto l1 = obj1.get_list<Int>(col_key1);
+    CHECK(l1.size() == array.size());
+    CHECK(*read_table == *table);
+    for (size_t i = 0; i < l1.size(); ++i) {
+        CHECK_EQUAL(l1.get_any(i), array.get_any(i));
+    }
+
+#ifdef REALM_DEBUG
+    from_disk.verify();
+#endif
+}
+
+
 #endif // TEST_GROUP
diff --git a/test/test_links.cpp b/test/test_links.cpp
index be08d2c7392..7561364089b 100644
--- a/test/test_links.cpp
+++ b/test/test_links.cpp
@@ -1167,11 +1167,13 @@ TEST(Links_FormerMemLeakCase)
         auto col = origin->add_column(*target, "link");
         origin->create_object().set(col, k);
         origin->create_object().set(col, k);
+        wt.get_group().verify();
         wt.commit();
     }
     {
         WriteTransaction wt(sg_w);
         TableRef target = wt.get_table("target");
+        wt.get_group().verify();
         target->begin()->remove();
         wt.get_group().verify();
         wt.commit();
diff --git a/test/test_list.cpp b/test/test_list.cpp
index b29935981b1..d8e3f1fc1de 100644
--- a/test/test_list.cpp
+++ b/test/test_list.cpp
@@ -633,6 +633,41 @@ TEST(List_AggOps)
     test_lists_numeric_agg(test_context, sg, type_Decimal, Decimal128(realm::null()), true);
 }
 
+TEST(Test_Write_List_Nested_In_Mixed)
+{
+    SHARED_GROUP_TEST_PATH(path);
+    std::string message;
+    DBOptions options;
+    options.logger = test_context.logger;
+    DBRef db = DB::create(make_in_realm_history(), path, options);
+    auto tr = db->start_write();
+    auto table = tr->add_table("table");
+    auto col_any = table->add_column(type_Mixed, "something");
+
+    Obj obj = table->create_object();
+    obj.set_any(col_any, Mixed{20});
+    tr->verify();
+    tr->commit_and_continue_writing(); // commit simple mixed
+    tr->verify();
+
+    obj.set_collection(col_any, CollectionType::List);
+    auto list = obj.get_list_ptr<Mixed>(col_any);
+    list->add(Mixed{10});
+    list->add(Mixed{11});
+    tr->verify();
+    tr->commit_and_continue_writing(); // commit nested list in mixed
+    tr->verify();
+
+    // spice it up a little bit...
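+    // (Comment added for clarity, an assumption rather than part of the original
+    // test: nested collections inside a Mixed list go through the same leaf-write
+    // path on commit, so this should also exercise compression of the nested
+    // integer leaves.)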
+    list->insert_collection(2, CollectionType::List);
+    list->insert_collection(3, CollectionType::List);
+    list->get_list(2)->add(Mixed{20});
+    list->get_list(3)->add(Mixed{21});
+    tr->commit_and_continue_writing();
+    tr->verify();
+    tr->close();
+}
+
 TEST(List_Nested_InMixed)
 {
     SHARED_GROUP_TEST_PATH(path);
diff --git a/test/test_query.cpp b/test/test_query.cpp
index 6df86fb2b1f..c2d6b196b32 100644
--- a/test/test_query.cpp
+++ b/test/test_query.cpp
@@ -5772,4 +5772,38 @@ TEST(Query_NestedLinkCount)
     CHECK_EQUAL(q.count(), 3);
 }
 
+TEST_TYPES(Query_IntCompressed, Equal, NotEqual, Less, LessEqual, Greater, GreaterEqual)
+{
+    TEST_TYPE c;
+    SHARED_GROUP_TEST_PATH(path);
+    int ints[] = {-120, -111, -70, -61, -55, -45, -22, -15, -3, 2, 7, 18, 25, 33, 55, 56, 66, 78, 104, 125};
+    std::vector<int> values;
+    for (int j = 1; j < 21; j++) {
+        for (int i = 0; i < j; i++) {
+            values.push_back(ints[i]);
+        }
+    }
+
+    auto db = DB::create(path);
+    auto wt = db->start_write();
+    auto t = wt->add_table("table");
+    auto col = t->add_column(type_Int, "id");
+    for (auto val : values) {
+        t->create_object().set(col, val);
+    }
+    wt->commit_and_continue_as_read();
+
+    for (int val : {-1000, -125, 2, 3, 6, 126, 1000}) {
+        size_t num_matches = 0;
+        for (auto i : values) {
+            if (c(i, val))
+                num_matches++;
+        }
+
+        char query_str[20];
+        snprintf(query_str, 20, "id %s %d", c.description().c_str(), val);
+        CHECK_EQUAL(t->query(query_str).count(), num_matches);
+    }
+}
+
 #endif // TEST_QUERY
diff --git a/test/test_shared.cpp b/test/test_shared.cpp
index 78ede3b4a0c..85c3de4f8ab 100644
--- a/test/test_shared.cpp
+++ b/test/test_shared.cpp
@@ -95,34 +95,32 @@ using unit_test::TestContext;
 // `experiments/testcase.cpp` and then run `sh build.sh
 // check-testcase` (or one of its friends) from the command line.
-#if 0
+
 // Sorting benchmark
-ONLY(Query_QuickSort2)
+TEST(Query_QuickSort2)
 {
     Random random(random_int()); // Seed from slow global generator
 
     // Triggers QuickSort because range > len
     Table ttt;
-    auto ints = ttt.add_column(type_Int, "1");
+    // auto ints = ttt.add_column(type_Int, "1");
     auto strings = ttt.add_column(type_String, "2");
 
     for (size_t t = 0; t < 10000; t++) {
         Obj o = ttt.create_object();
-        // o.set(ints, random.draw_int_mod(1100));
+        // o.set(ints, random.draw_int_mod(1100));
         o.set(strings, "a");
     }
 
     Query q = ttt.where();
-    std::cerr << "GO";
-
     for (size_t t = 0; t < 1000; t++) {
         TableView tv = q.find_all();
         tv.sort(strings);
-        // tv.ints(strings);
+        // tv.ints(strings);
     }
 }
-#endif
+
 
 #if REALM_WINDOWS
 namespace {
diff --git a/test/test_table.cpp b/test/test_table.cpp
index 80df42e1824..52e06fb2659 100644
--- a/test/test_table.cpp
+++ b/test/test_table.cpp
@@ -46,7 +46,7 @@ using namespace std::chrono;
 #include "test_types_helper.hpp"
 
 // #include <valgrind/callgrind.h>
-// #define PERFORMACE_TESTING
+// #define PERFORMANCE_TESTING
 
 using namespace realm;
 using namespace realm::util;
@@ -2954,9 +2954,122 @@ NONCONCURRENT_TEST(Table_QuickSort2)
     std::cout << "   time: " << duration_cast<nanoseconds>(t2 - t1).count() / nb_reps << " ns/rep" << std::endl;
 }
 
+NONCONCURRENT_TEST(Table_object_timestamp)
+{
+#if !defined(REALM_DEBUG) && defined(PERFORMANCE_TESTING)
+    int nb_rows = 10'000'000;
+    int num_runs = 100;
+#else
+    int nb_rows = 100'000;
+    int num_runs = 1;
+#endif
+    SHARED_GROUP_TEST_PATH(path);
+    std::unique_ptr<Replication> hist(make_in_realm_history());
+    DBRef sg = DB::create(*hist, path, DBOptions(crypt_key()));
+    ColKey c0;
+
+    CALLGRIND_START_INSTRUMENTATION;
+
+    std::cout << nb_rows << " rows - timestamps" << std::endl;
+
+    {
+        WriteTransaction wt(sg);
+        auto table = wt.add_table("test");
+
+        c0 = table->add_column(type_Timestamp, "ts");
+
+        auto t1 = steady_clock::now();
+
+        for (int i = 0; i < nb_rows; i++) {
+            Timestamp t(i, i);
+            table->create_object(ObjKey(i)).set_all(t);
+        }
+
+        auto t2 = steady_clock::now();
+        std::cout << "   insertion time: " << duration_cast<nanoseconds>(t2 - t1).count() / nb_rows << " ns/key"
+                  << std::endl;
+
+        CHECK_EQUAL(table->size(), nb_rows);
+        wt.commit();
+    }
+    {
+        ReadTransaction rt(sg);
+        auto table = rt.get_table("test");
+
+        auto t1 = steady_clock::now();
+        Timestamp t(nb_rows / 2, nb_rows / 2);
+        for (int j = 0; j < num_runs; ++j) {
+            auto result = table->where().equal(c0, t).find_all();
+        }
+
+        auto t2 = steady_clock::now();
+
+        std::cout << "   find all : " << duration_cast<milliseconds>(t2 - t1).count() / num_runs << " ms"
+                  << std::endl;
+    }
+}
+
+NONCONCURRENT_TEST(Table_object_search)
+{
+#if !defined(REALM_DEBUG) && defined(PERFORMANCE_TESTING)
+    int nb_rows = 10'000'000;
+    int num_runs = 100;
+#else
+    int nb_rows = 100'000;
+    int num_runs = 1;
+#endif
+    SHARED_GROUP_TEST_PATH(path);
+    std::unique_ptr<Replication> hist(make_in_realm_history());
+    DBRef sg = DB::create(*hist, path, DBOptions(crypt_key()));
+    ColKey c0;
+    ColKey c1;
+
+    CALLGRIND_START_INSTRUMENTATION;
+
+    std::cout << nb_rows << " rows - sequential" << std::endl;
+
+    {
+        WriteTransaction wt(sg);
+        auto table = wt.add_table("test");
+
+        c0 = table->add_column(type_Int, "int1");
+        c1 = table->add_column(type_Int, "int2", true);
+
+        auto t1 = steady_clock::now();
+
+        for (int i = 0; i < nb_rows; i++) {
+            table->create_object(ObjKey(i)).set_all(i << 1, i << 2);
+        }
+
+        auto t2 = steady_clock::now();
+        std::cout << "   insertion time: " << duration_cast<nanoseconds>(t2 - t1).count() / nb_rows << " ns/key"
+                  << std::endl;
+
+        CHECK_EQUAL(table->size(), nb_rows);
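+        // (Comment added for clarity: commit() is where the leaves get written out
+        // and, per the Group_ArrayCompression tests above, where integer leaves may
+        // be compressed on their way to disk.)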
+        wt.commit();
+    }
+    {
+        ReadTransaction rt(sg);
+        auto table = rt.get_table("test");
+
+        auto t1 = steady_clock::now();
+
+        for (int j = 0; j < num_runs; ++j) {
+            auto result = table->find_all_int(c0, nb_rows / 2);
+        }
+
+        auto t2 = steady_clock::now();
+
+        std::cout << "   find all : " << duration_cast<milliseconds>(t2 - t1).count() / num_runs << " ms"
+                  << std::endl;
+    }
+}
+
 NONCONCURRENT_TEST(Table_object_sequential)
 {
-#ifdef PERFORMACE_TESTING
+#if !defined(REALM_DEBUG) && defined(PERFORMANCE_TESTING)
     int nb_rows = 10'000'000;
     int num_runs = 1;
 #else
@@ -3106,7 +3219,7 @@ NONCONCURRENT_TEST(Table_object_sequential)
 
 NONCONCURRENT_TEST(Table_object_seq_rnd)
 {
-#ifdef PERFORMACE_TESTING
+#if !defined(REALM_DEBUG) && defined(PERFORMANCE_TESTING)
     size_t rows = 1'000'000;
     int runs = 100; // runs for building scenario
 #else
@@ -3149,7 +3262,7 @@ NONCONCURRENT_TEST(Table_object_seq_rnd)
     }
     // scenario established!
    int nb_rows = int(key_values.size());
-#ifdef PERFORMACE_TESTING
+#if !defined(REALM_DEBUG) && defined(PERFORMANCE_TESTING)
     int num_runs = 10; // runs for timing access
 #else
     int num_runs = 1; // runs for timing access
@@ -3221,7 +3334,7 @@ NONCONCURRENT_TEST(Table_object_seq_rnd)
 
 NONCONCURRENT_TEST(Table_object_random)
 {
-#ifdef PERFORMACE_TESTING
+#if !defined(REALM_DEBUG) && defined(PERFORMANCE_TESTING)
     int nb_rows = 1'000'000;
     int num_runs = 10;
 #else
diff --git a/test/test_unresolved_links.cpp b/test/test_unresolved_links.cpp
index adaf6981130..60f50ee3488 100644
--- a/test/test_unresolved_links.cpp
+++ b/test/test_unresolved_links.cpp
@@ -837,35 +837,6 @@ TEST(Links_ManyObjects)
     tr->commit();
 }
 
-TEST(Unresolved_PerformanceLinks)
-{
-    constexpr int nb_objects = 1000;
-    using namespace std::chrono;
-
-    SHARED_GROUP_TEST_PATH(path);
-    auto hist = make_in_realm_history();
-    DBRef db = DB::create(*hist, path);
-
-    auto tr = db->start_write();
-    auto table = tr->add_table_with_primary_key("table", type_Int, "id");
-    auto origin = tr->add_table("origin");
-    auto col = origin->add_column(*table, "link");
-    auto key = table->get_objkey_from_primary_key(1);
-    for (int i = 0; i < nb_objects; i++) {
-        origin->create_object().set(col, key);
-    }
-    tr->commit_and_continue_as_read();
-    tr->promote_to_write();
-    auto t1 = steady_clock::now();
-    table->create_object_with_primary_key(1);
-    auto t2 = steady_clock::now();
-    tr->commit_and_continue_as_read();
-    CHECK(t2 > t1);
-    // std::cout << "Time: " << duration_cast<microseconds>(t2 - t1).count() << " us" << std::endl;
-    tr->promote_to_write();
-    tr->verify();
-}
-
 TEST(Unresolved_PerformanceLinkList)
 {
     constexpr int nb_objects = 1000;
@@ -889,6 +860,7 @@ TEST(Unresolved_PerformanceLinkList)
         ll.add(key3);
     }
     tr->commit_and_continue_as_read();
+    // compresses
     tr->promote_to_write();
     auto t1 = steady_clock::now();
     table->create_object_with_primary_key(1);
@@ -897,7 +869,6 @@ TEST(Unresolved_PerformanceLinkList)
     auto t2 = steady_clock::now();
     tr->commit_and_continue_as_read();
     CHECK(t2 > t1);
-    // std::cout << "Time: " << duration_cast<microseconds>(t2 - t1).count() << " us" << std::endl;
     tr->promote_to_write();
     tr->verify();
 }