Fix Bug - Handle unsigned long in sorting order assertion of LongHash…

…Set (#17207) * Fix Bug - handle unsigned long in assertion of LongHashSet Signed-off-by: Shailesh Singh <[email protected]> * renamed TestDocValuesUnsignedLongHashSet.java to DocValuesUnsignedLongHashSetTests.java Signed-off-by: Shailesh Singh <[email protected]> * Update server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java Co-authored-by: Andriy Redko <[email protected]> Signed-off-by: Shailesh Singh <[email protected]> --------- Signed-off-by: Shailesh Singh <[email protected]> Signed-off-by: Shailesh Singh <[email protected]> Co-authored-by: Shailesh Singh <[email protected]> Co-authored-by: Andriy Redko <[email protected]>
opensearch-project · Mar 5, 2025 · 82bbdfb · 82bbdfb
1 parent 09af518
commit 82bbdfb
Show file tree

Hide file tree

Showing 4 changed files with 284 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Introduce a setting to disable download of full cluster state from remote on term mismatch([#16798](https://github.com/opensearch-project/OpenSearch/pull/16798/))
 - Added ability to retrieve value from DocValues in a flat_object filed([#16802](https://github.com/opensearch-project/OpenSearch/pull/16802))
 - Improve performace of NumericTermAggregation by avoiding unnecessary sorting([#17252](https://github.com/opensearch-project/OpenSearch/pull/17252))
+- Fix Bug - Handle unsigned long in sorting order assertion of LongHashSet ([#17207](https://github.com/opensearch-project/OpenSearch/pull/17207))
 - Implemented computation of segment replication stats at shard level ([#17055](https://github.com/opensearch-project/OpenSearch/pull/17055))
 - [Rule Based Auto-tagging] Add in-memory attribute value store ([#17342](https://github.com/opensearch-project/OpenSearch/pull/17342))
 

diff --git a/server/src/main/java/org/opensearch/index/document/SortedUnsignedLongDocValuesSetQuery.java b/server/src/main/java/org/opensearch/index/document/SortedUnsignedLongDocValuesSetQuery.java
@@ -25,7 +25,7 @@
 import org.apache.lucene.search.ScorerSupplier;
 import org.apache.lucene.search.TwoPhaseIterator;
 import org.apache.lucene.search.Weight;
-import org.opensearch.lucene.util.LongHashSet;
+import org.opensearch.lucene.util.UnsignedLongHashSet;
 
 import java.io.IOException;
 import java.math.BigInteger;
@@ -40,12 +40,12 @@
 public abstract class SortedUnsignedLongDocValuesSetQuery extends Query {
 
     private final String field;
-    private final LongHashSet numbers;
+    private final UnsignedLongHashSet numbers;
 
     SortedUnsignedLongDocValuesSetQuery(String field, BigInteger[] numbers) {
         this.field = Objects.requireNonNull(field);
         Arrays.sort(numbers);
-        this.numbers = new LongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray());
+        this.numbers = new UnsignedLongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray());
     }
 
     @Override

diff --git a/server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java b/server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java
@@ -0,0 +1,139 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.lucene.util;
+
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.packed.PackedInts;
+import org.opensearch.common.Numbers;
+
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.stream.Collectors;
+import java.util.stream.LongStream;
+
+/** Set of unsigned-longs, optimized for docvalues usage */
+public final class UnsignedLongHashSet implements Accountable {
+    private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(UnsignedLongHashSet.class);
+
+    private static final long MISSING = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG;
+
+    final long[] table;
+    final int mask;
+    final boolean hasMissingValue;
+    final int size;
+    /** minimum value in the set, or Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */
+    public final long minValue;
+    /** maximum value in the set, or Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */
+    public final long maxValue;
+
+    /** Construct a set. Values must be in sorted order. */
+    public UnsignedLongHashSet(long[] values) {
+        int tableSize = Math.toIntExact(values.length * 3L / 2);
+        tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2
+        assert tableSize >= values.length * 3L / 2;
+        table = new long[tableSize];
+        Arrays.fill(table, MISSING);
+        mask = tableSize - 1;
+        boolean hasMissingValue = false;
+        int size = 0;
+        long previousValue = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG; // for assert
+        for (long value : values) {
+            if (value == MISSING) {
+                size += hasMissingValue ? 0 : 1;
+                hasMissingValue = true;
+            } else if (add(value)) {
+                ++size;
+            }
+            assert Long.compareUnsigned(value, previousValue) >= 0 : " values must be provided in sorted order";
+            previousValue = value;
+        }
+        this.hasMissingValue = hasMissingValue;
+        this.size = size;
+        this.minValue = values.length == 0 ? Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG : values[0];
+        this.maxValue = values.length == 0 ? Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG : values[values.length - 1];
+    }
+
+    private boolean add(long l) {
+        assert l != MISSING;
+        final int slot = Long.hashCode(l) & mask;
+        for (int i = slot;; i = (i + 1) & mask) {
+            if (table[i] == MISSING) {
+                table[i] = l;
+                return true;
+            } else if (table[i] == l) {
+                // already added
+                return false;
+            }
+        }
+    }
+
+    /**
+     * check for membership in the set.
+     *
+     * <p>You should use {@link #minValue} and {@link #maxValue} to guide/terminate iteration before
+     * calling this.
+     */
+    public boolean contains(long l) {
+        if (l == MISSING) {
+            return hasMissingValue;
+        }
+        final int slot = Long.hashCode(l) & mask;
+        for (int i = slot;; i = (i + 1) & mask) {
+            if (table[i] == MISSING) {
+                return false;
+            } else if (table[i] == l) {
+                return true;
+            }
+        }
+    }
+
+    /** returns a stream of all values contained in this set */
+    public LongStream stream() {
+        LongStream stream = Arrays.stream(table).filter(v -> v != MISSING);
+        if (hasMissingValue) {
+            stream = LongStream.concat(LongStream.of(MISSING), stream);
+        }
+        return stream;
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(size, minValue, maxValue, mask, hasMissingValue, Arrays.hashCode(table));
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (obj != null && obj instanceof UnsignedLongHashSet) {
+            UnsignedLongHashSet that = (UnsignedLongHashSet) obj;
+            return size == that.size
+                && minValue == that.minValue
+                && maxValue == that.maxValue
+                && mask == that.mask
+                && hasMissingValue == that.hasMissingValue
+                && Arrays.equals(table, that.table);
+        }
+        return false;
+    }
+
+    @Override
+    public String toString() {
+        return stream().mapToObj(Long::toUnsignedString).collect(Collectors.joining(", ", "[", "]"));
+    }
+
+    /** number of elements in the set */
+    public int size() {
+        return size;
+    }
+
+    @Override
+    public long ramBytesUsed() {
+        return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table);
+    }
+}
diff --git a/server/src/test/java/org/opensearch/lucene/util/DocValuesUnsignedLongHashSetTests.java b/server/src/test/java/org/opensearch/lucene/util/DocValuesUnsignedLongHashSetTests.java
@@ -0,0 +1,141 @@
+package org.opensearch.lucene.util;
+
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.opensearch.common.Numbers;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.LongStream;
+
+public class DocValuesUnsignedLongHashSetTests extends LuceneTestCase {
+
+    private void assertEquals(Set<Long> set1, UnsignedLongHashSet unsignedLongHashSet) {
+        assertEquals(set1.size(), unsignedLongHashSet.size());
+
+        Set<Long> set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet());
+        LuceneTestCase.assertEquals(set1, set2);
+
+        if (set1.isEmpty() == false) {
+            Set<Long> set3 = new HashSet<>(set1);
+            long removed = set3.iterator().next();
+            while (true) {
+                long next = random().nextLong();
+                if (next != removed && set3.add(next)) {
+                    assertFalse(unsignedLongHashSet.contains(next));
+                    break;
+                }
+            }
+            assertNotEquals(set3, unsignedLongHashSet);
+        }
+
+        assertTrue(set1.stream().allMatch(unsignedLongHashSet::contains));
+    }
+
+    private void assertNotEquals(Set<Long> set1, UnsignedLongHashSet unsignedLongHashSet) {
+        Set<Long> set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet());
+
+        LuceneTestCase.assertNotEquals(set1, set2);
+
+        UnsignedLongHashSet set3 = new UnsignedLongHashSet(
+            set1.stream().sorted(Long::compareUnsigned).mapToLong(Long::longValue).toArray()
+        );
+
+        LuceneTestCase.assertNotEquals(set2, set3.stream().boxed().collect(Collectors.toSet()));
+
+        assertFalse(set1.stream().allMatch(unsignedLongHashSet::contains));
+    }
+
+    public void testEmpty() {
+        Set<Long> set1 = new HashSet<>();
+        UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] {});
+        assertEquals(0, set2.size());
+        assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
+        assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
+        assertEquals(set1, set2);
+    }
+
+    public void testOneValue() {
+        Set<Long> set1 = new HashSet<>(Arrays.asList(42L));
+        UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L });
+        assertEquals(1, set2.size());
+        assertEquals(42L, set2.minValue);
+        assertEquals(42L, set2.maxValue);
+        assertEquals(set1, set2);
+
+        set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG));
+        set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG });
+        assertEquals(1, set2.size());
+        assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
+        assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
+        assertEquals(set1, set2);
+    }
+
+    public void testTwoValues() {
+        Set<Long> set1 = new HashSet<>(Arrays.asList(42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG));
+        UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG });
+        assertEquals(2, set2.size());
+        assertEquals(42, set2.minValue);
+        assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
+        assertEquals(set1, set2);
+
+        set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L));
+        set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L });
+        assertEquals(2, set2.size());
+        assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
+        assertEquals(42, set2.maxValue);
+        assertEquals(set1, set2);
+    }
+
+    public void testSameValue() {
+        UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, 42L });
+        assertEquals(1, set2.size());
+        assertEquals(42L, set2.minValue);
+        assertEquals(42L, set2.maxValue);
+    }
+
+    public void testSameMissingPlaceholder() {
+        UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { Long.MIN_VALUE, Long.MIN_VALUE });
+        assertEquals(1, set2.size());
+        assertEquals(Long.MIN_VALUE, set2.minValue);
+        assertEquals(Long.MIN_VALUE, set2.maxValue);
+    }
+
+    public void testRandom() {
+        final int iters = atLeast(10);
+        for (int iter = 0; iter < iters; ++iter) {
+            long[] values = new long[random().nextInt(1 << random().nextInt(16))];
+            for (int i = 0; i < values.length; ++i) {
+                if (i == 0 || random().nextInt(10) < 9) {
+                    values[i] = random().nextLong();
+                } else {
+                    values[i] = values[random().nextInt(i)];
+                }
+            }
+            if (values.length > 0 && random().nextBoolean()) {
+                values[values.length / 2] = Long.MIN_VALUE;
+            }
+            Set<Long> set1 = LongStream.of(values).boxed().collect(Collectors.toSet());
+            Long[] longObjects = Arrays.stream(values).boxed().toArray(Long[]::new);
+            // Sort using compareUnsigned
+            Arrays.sort(longObjects, Long::compareUnsigned);
+
+            long[] arr = new long[values.length];
+            // Convert back to long[]
+            for (int i = 0; i < arr.length; i++) {
+                arr[i] = longObjects[i];
+            }
+            UnsignedLongHashSet set2 = new UnsignedLongHashSet(arr);
+            assertEquals(set1, set2);
+        }
+    }
+}