Skip to content

Commit

Permalink
Fix Bug - Handle unsigned long in sorting order assertion of LongHash…
Browse files Browse the repository at this point in the history
…Set (#17207)

* Fix Bug - handle unsigned long in assertion of LongHashSet

Signed-off-by: Shailesh Singh <[email protected]>

* renamed TestDocValuesUnsignedLongHashSet.java to DocValuesUnsignedLongHashSetTests.java

Signed-off-by: Shailesh Singh <[email protected]>

* Update server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java

Co-authored-by: Andriy Redko <[email protected]>
Signed-off-by: Shailesh Singh <[email protected]>

---------

Signed-off-by: Shailesh Singh <[email protected]>
Signed-off-by: Shailesh Singh <[email protected]>
Co-authored-by: Shailesh Singh <[email protected]>
Co-authored-by: Andriy Redko <[email protected]>
  • Loading branch information
3 people authored Mar 5, 2025
1 parent 09af518 commit 82bbdfb
Show file tree
Hide file tree
Showing 4 changed files with 284 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Introduce a setting to disable download of full cluster state from remote on term mismatch([#16798](https://github.com/opensearch-project/OpenSearch/pull/16798/))
- Added ability to retrieve value from DocValues in a flat_object filed([#16802](https://github.com/opensearch-project/OpenSearch/pull/16802))
- Improve performace of NumericTermAggregation by avoiding unnecessary sorting([#17252](https://github.com/opensearch-project/OpenSearch/pull/17252))
- Fix Bug - Handle unsigned long in sorting order assertion of LongHashSet ([#17207](https://github.com/opensearch-project/OpenSearch/pull/17207))
- Implemented computation of segment replication stats at shard level ([#17055](https://github.com/opensearch-project/OpenSearch/pull/17055))
- [Rule Based Auto-tagging] Add in-memory attribute value store ([#17342](https://github.com/opensearch-project/OpenSearch/pull/17342))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.opensearch.lucene.util.LongHashSet;
import org.opensearch.lucene.util.UnsignedLongHashSet;

import java.io.IOException;
import java.math.BigInteger;
Expand All @@ -40,12 +40,12 @@
public abstract class SortedUnsignedLongDocValuesSetQuery extends Query {

private final String field;
private final LongHashSet numbers;
private final UnsignedLongHashSet numbers;

SortedUnsignedLongDocValuesSetQuery(String field, BigInteger[] numbers) {
this.field = Objects.requireNonNull(field);
Arrays.sort(numbers);
this.numbers = new LongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray());
this.numbers = new UnsignedLongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.lucene.util;

import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;
import org.opensearch.common.Numbers;

import java.util.Arrays;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.LongStream;

/** Set of unsigned-longs, optimized for docvalues usage */
public final class UnsignedLongHashSet implements Accountable {
private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(UnsignedLongHashSet.class);

private static final long MISSING = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG;

final long[] table;
final int mask;
final boolean hasMissingValue;
final int size;
/** minimum value in the set, or Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */
public final long minValue;
/** maximum value in the set, or Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */
public final long maxValue;

/** Construct a set. Values must be in sorted order. */
public UnsignedLongHashSet(long[] values) {
int tableSize = Math.toIntExact(values.length * 3L / 2);
tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2
assert tableSize >= values.length * 3L / 2;
table = new long[tableSize];
Arrays.fill(table, MISSING);
mask = tableSize - 1;
boolean hasMissingValue = false;
int size = 0;
long previousValue = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG; // for assert
for (long value : values) {
if (value == MISSING) {
size += hasMissingValue ? 0 : 1;
hasMissingValue = true;
} else if (add(value)) {
++size;
}
assert Long.compareUnsigned(value, previousValue) >= 0 : " values must be provided in sorted order";
previousValue = value;
}
this.hasMissingValue = hasMissingValue;
this.size = size;
this.minValue = values.length == 0 ? Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG : values[0];
this.maxValue = values.length == 0 ? Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG : values[values.length - 1];
}

private boolean add(long l) {
assert l != MISSING;
final int slot = Long.hashCode(l) & mask;
for (int i = slot;; i = (i + 1) & mask) {
if (table[i] == MISSING) {
table[i] = l;
return true;
} else if (table[i] == l) {
// already added
return false;
}
}
}

/**
* check for membership in the set.
*
* <p>You should use {@link #minValue} and {@link #maxValue} to guide/terminate iteration before
* calling this.
*/
public boolean contains(long l) {
if (l == MISSING) {
return hasMissingValue;
}
final int slot = Long.hashCode(l) & mask;
for (int i = slot;; i = (i + 1) & mask) {
if (table[i] == MISSING) {
return false;
} else if (table[i] == l) {
return true;
}
}
}

/** returns a stream of all values contained in this set */
public LongStream stream() {
LongStream stream = Arrays.stream(table).filter(v -> v != MISSING);
if (hasMissingValue) {
stream = LongStream.concat(LongStream.of(MISSING), stream);
}
return stream;
}

@Override
public int hashCode() {
return Objects.hash(size, minValue, maxValue, mask, hasMissingValue, Arrays.hashCode(table));
}

@Override
public boolean equals(Object obj) {
if (obj != null && obj instanceof UnsignedLongHashSet) {
UnsignedLongHashSet that = (UnsignedLongHashSet) obj;
return size == that.size
&& minValue == that.minValue
&& maxValue == that.maxValue
&& mask == that.mask
&& hasMissingValue == that.hasMissingValue
&& Arrays.equals(table, that.table);
}
return false;
}

@Override
public String toString() {
return stream().mapToObj(Long::toUnsignedString).collect(Collectors.joining(", ", "[", "]"));
}

/** number of elements in the set */
public int size() {
return size;
}

@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
package org.opensearch.lucene.util;

/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

import org.apache.lucene.tests.util.LuceneTestCase;
import org.opensearch.common.Numbers;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.LongStream;

public class DocValuesUnsignedLongHashSetTests extends LuceneTestCase {

private void assertEquals(Set<Long> set1, UnsignedLongHashSet unsignedLongHashSet) {
assertEquals(set1.size(), unsignedLongHashSet.size());

Set<Long> set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet());
LuceneTestCase.assertEquals(set1, set2);

if (set1.isEmpty() == false) {
Set<Long> set3 = new HashSet<>(set1);
long removed = set3.iterator().next();
while (true) {
long next = random().nextLong();
if (next != removed && set3.add(next)) {
assertFalse(unsignedLongHashSet.contains(next));
break;
}
}
assertNotEquals(set3, unsignedLongHashSet);
}

assertTrue(set1.stream().allMatch(unsignedLongHashSet::contains));
}

private void assertNotEquals(Set<Long> set1, UnsignedLongHashSet unsignedLongHashSet) {
Set<Long> set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet());

LuceneTestCase.assertNotEquals(set1, set2);

UnsignedLongHashSet set3 = new UnsignedLongHashSet(
set1.stream().sorted(Long::compareUnsigned).mapToLong(Long::longValue).toArray()
);

LuceneTestCase.assertNotEquals(set2, set3.stream().boxed().collect(Collectors.toSet()));

assertFalse(set1.stream().allMatch(unsignedLongHashSet::contains));
}

public void testEmpty() {
Set<Long> set1 = new HashSet<>();
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] {});
assertEquals(0, set2.size());
assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
assertEquals(set1, set2);
}

public void testOneValue() {
Set<Long> set1 = new HashSet<>(Arrays.asList(42L));
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L });
assertEquals(1, set2.size());
assertEquals(42L, set2.minValue);
assertEquals(42L, set2.maxValue);
assertEquals(set1, set2);

set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG));
set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG });
assertEquals(1, set2.size());
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
assertEquals(set1, set2);
}

public void testTwoValues() {
Set<Long> set1 = new HashSet<>(Arrays.asList(42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG));
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG });
assertEquals(2, set2.size());
assertEquals(42, set2.minValue);
assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue);
assertEquals(set1, set2);

set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L));
set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L });
assertEquals(2, set2.size());
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue);
assertEquals(42, set2.maxValue);
assertEquals(set1, set2);
}

public void testSameValue() {
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, 42L });
assertEquals(1, set2.size());
assertEquals(42L, set2.minValue);
assertEquals(42L, set2.maxValue);
}

public void testSameMissingPlaceholder() {
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { Long.MIN_VALUE, Long.MIN_VALUE });
assertEquals(1, set2.size());
assertEquals(Long.MIN_VALUE, set2.minValue);
assertEquals(Long.MIN_VALUE, set2.maxValue);
}

public void testRandom() {
final int iters = atLeast(10);
for (int iter = 0; iter < iters; ++iter) {
long[] values = new long[random().nextInt(1 << random().nextInt(16))];
for (int i = 0; i < values.length; ++i) {
if (i == 0 || random().nextInt(10) < 9) {
values[i] = random().nextLong();
} else {
values[i] = values[random().nextInt(i)];
}
}
if (values.length > 0 && random().nextBoolean()) {
values[values.length / 2] = Long.MIN_VALUE;
}
Set<Long> set1 = LongStream.of(values).boxed().collect(Collectors.toSet());
Long[] longObjects = Arrays.stream(values).boxed().toArray(Long[]::new);
// Sort using compareUnsigned
Arrays.sort(longObjects, Long::compareUnsigned);

long[] arr = new long[values.length];
// Convert back to long[]
for (int i = 0; i < arr.length; i++) {
arr[i] = longObjects[i];
}
UnsignedLongHashSet set2 = new UnsignedLongHashSet(arr);
assertEquals(set1, set2);
}
}
}

0 comments on commit 82bbdfb

Please sign in to comment.