-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix Bug - Handle unsigned long in sorting order assertion of LongHash…
…Set (#17207) * Fix Bug - handle unsigned long in assertion of LongHashSet Signed-off-by: Shailesh Singh <[email protected]> * renamed TestDocValuesUnsignedLongHashSet.java to DocValuesUnsignedLongHashSetTests.java Signed-off-by: Shailesh Singh <[email protected]> * Update server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java Co-authored-by: Andriy Redko <[email protected]> Signed-off-by: Shailesh Singh <[email protected]> --------- Signed-off-by: Shailesh Singh <[email protected]> Signed-off-by: Shailesh Singh <[email protected]> Co-authored-by: Shailesh Singh <[email protected]> Co-authored-by: Andriy Redko <[email protected]>
- Loading branch information
1 parent
09af518
commit 82bbdfb
Showing
4 changed files
with
284 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
139 changes: 139 additions & 0 deletions
139
server/src/main/java/org/opensearch/lucene/util/UnsignedLongHashSet.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
*/ | ||
|
||
package org.opensearch.lucene.util; | ||
|
||
import org.apache.lucene.util.Accountable; | ||
import org.apache.lucene.util.RamUsageEstimator; | ||
import org.apache.lucene.util.packed.PackedInts; | ||
import org.opensearch.common.Numbers; | ||
|
||
import java.util.Arrays; | ||
import java.util.Objects; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.LongStream; | ||
|
||
/** Set of unsigned-longs, optimized for docvalues usage */ | ||
public final class UnsignedLongHashSet implements Accountable { | ||
private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(UnsignedLongHashSet.class); | ||
|
||
private static final long MISSING = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG; | ||
|
||
final long[] table; | ||
final int mask; | ||
final boolean hasMissingValue; | ||
final int size; | ||
/** minimum value in the set, or Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */ | ||
public final long minValue; | ||
/** maximum value in the set, or Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG for an empty set */ | ||
public final long maxValue; | ||
|
||
/** Construct a set. Values must be in sorted order. */ | ||
public UnsignedLongHashSet(long[] values) { | ||
int tableSize = Math.toIntExact(values.length * 3L / 2); | ||
tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2 | ||
assert tableSize >= values.length * 3L / 2; | ||
table = new long[tableSize]; | ||
Arrays.fill(table, MISSING); | ||
mask = tableSize - 1; | ||
boolean hasMissingValue = false; | ||
int size = 0; | ||
long previousValue = Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG; // for assert | ||
for (long value : values) { | ||
if (value == MISSING) { | ||
size += hasMissingValue ? 0 : 1; | ||
hasMissingValue = true; | ||
} else if (add(value)) { | ||
++size; | ||
} | ||
assert Long.compareUnsigned(value, previousValue) >= 0 : " values must be provided in sorted order"; | ||
previousValue = value; | ||
} | ||
this.hasMissingValue = hasMissingValue; | ||
this.size = size; | ||
this.minValue = values.length == 0 ? Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG : values[0]; | ||
this.maxValue = values.length == 0 ? Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG : values[values.length - 1]; | ||
} | ||
|
||
private boolean add(long l) { | ||
assert l != MISSING; | ||
final int slot = Long.hashCode(l) & mask; | ||
for (int i = slot;; i = (i + 1) & mask) { | ||
if (table[i] == MISSING) { | ||
table[i] = l; | ||
return true; | ||
} else if (table[i] == l) { | ||
// already added | ||
return false; | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* check for membership in the set. | ||
* | ||
* <p>You should use {@link #minValue} and {@link #maxValue} to guide/terminate iteration before | ||
* calling this. | ||
*/ | ||
public boolean contains(long l) { | ||
if (l == MISSING) { | ||
return hasMissingValue; | ||
} | ||
final int slot = Long.hashCode(l) & mask; | ||
for (int i = slot;; i = (i + 1) & mask) { | ||
if (table[i] == MISSING) { | ||
return false; | ||
} else if (table[i] == l) { | ||
return true; | ||
} | ||
} | ||
} | ||
|
||
/** returns a stream of all values contained in this set */ | ||
public LongStream stream() { | ||
LongStream stream = Arrays.stream(table).filter(v -> v != MISSING); | ||
if (hasMissingValue) { | ||
stream = LongStream.concat(LongStream.of(MISSING), stream); | ||
} | ||
return stream; | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return Objects.hash(size, minValue, maxValue, mask, hasMissingValue, Arrays.hashCode(table)); | ||
} | ||
|
||
@Override | ||
public boolean equals(Object obj) { | ||
if (obj != null && obj instanceof UnsignedLongHashSet) { | ||
UnsignedLongHashSet that = (UnsignedLongHashSet) obj; | ||
return size == that.size | ||
&& minValue == that.minValue | ||
&& maxValue == that.maxValue | ||
&& mask == that.mask | ||
&& hasMissingValue == that.hasMissingValue | ||
&& Arrays.equals(table, that.table); | ||
} | ||
return false; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return stream().mapToObj(Long::toUnsignedString).collect(Collectors.joining(", ", "[", "]")); | ||
} | ||
|
||
/** number of elements in the set */ | ||
public int size() { | ||
return size; | ||
} | ||
|
||
@Override | ||
public long ramBytesUsed() { | ||
return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table); | ||
} | ||
} |
141 changes: 141 additions & 0 deletions
141
server/src/test/java/org/opensearch/lucene/util/DocValuesUnsignedLongHashSetTests.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
package org.opensearch.lucene.util; | ||
|
||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
*/ | ||
|
||
import org.apache.lucene.tests.util.LuceneTestCase; | ||
import org.opensearch.common.Numbers; | ||
|
||
import java.util.Arrays; | ||
import java.util.HashSet; | ||
import java.util.Set; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.LongStream; | ||
|
||
public class DocValuesUnsignedLongHashSetTests extends LuceneTestCase { | ||
|
||
private void assertEquals(Set<Long> set1, UnsignedLongHashSet unsignedLongHashSet) { | ||
assertEquals(set1.size(), unsignedLongHashSet.size()); | ||
|
||
Set<Long> set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet()); | ||
LuceneTestCase.assertEquals(set1, set2); | ||
|
||
if (set1.isEmpty() == false) { | ||
Set<Long> set3 = new HashSet<>(set1); | ||
long removed = set3.iterator().next(); | ||
while (true) { | ||
long next = random().nextLong(); | ||
if (next != removed && set3.add(next)) { | ||
assertFalse(unsignedLongHashSet.contains(next)); | ||
break; | ||
} | ||
} | ||
assertNotEquals(set3, unsignedLongHashSet); | ||
} | ||
|
||
assertTrue(set1.stream().allMatch(unsignedLongHashSet::contains)); | ||
} | ||
|
||
private void assertNotEquals(Set<Long> set1, UnsignedLongHashSet unsignedLongHashSet) { | ||
Set<Long> set2 = unsignedLongHashSet.stream().boxed().collect(Collectors.toSet()); | ||
|
||
LuceneTestCase.assertNotEquals(set1, set2); | ||
|
||
UnsignedLongHashSet set3 = new UnsignedLongHashSet( | ||
set1.stream().sorted(Long::compareUnsigned).mapToLong(Long::longValue).toArray() | ||
); | ||
|
||
LuceneTestCase.assertNotEquals(set2, set3.stream().boxed().collect(Collectors.toSet())); | ||
|
||
assertFalse(set1.stream().allMatch(unsignedLongHashSet::contains)); | ||
} | ||
|
||
public void testEmpty() { | ||
Set<Long> set1 = new HashSet<>(); | ||
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] {}); | ||
assertEquals(0, set2.size()); | ||
assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue); | ||
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue); | ||
assertEquals(set1, set2); | ||
} | ||
|
||
public void testOneValue() { | ||
Set<Long> set1 = new HashSet<>(Arrays.asList(42L)); | ||
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L }); | ||
assertEquals(1, set2.size()); | ||
assertEquals(42L, set2.minValue); | ||
assertEquals(42L, set2.maxValue); | ||
assertEquals(set1, set2); | ||
|
||
set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG)); | ||
set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG }); | ||
assertEquals(1, set2.size()); | ||
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue); | ||
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue); | ||
assertEquals(set1, set2); | ||
} | ||
|
||
public void testTwoValues() { | ||
Set<Long> set1 = new HashSet<>(Arrays.asList(42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG)); | ||
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG }); | ||
assertEquals(2, set2.size()); | ||
assertEquals(42, set2.minValue); | ||
assertEquals(Numbers.MAX_UNSIGNED_LONG_VALUE_AS_LONG, set2.maxValue); | ||
assertEquals(set1, set2); | ||
|
||
set1 = new HashSet<>(Arrays.asList(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L)); | ||
set2 = new UnsignedLongHashSet(new long[] { Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, 42L }); | ||
assertEquals(2, set2.size()); | ||
assertEquals(Numbers.MIN_UNSIGNED_LONG_VALUE_AS_LONG, set2.minValue); | ||
assertEquals(42, set2.maxValue); | ||
assertEquals(set1, set2); | ||
} | ||
|
||
public void testSameValue() { | ||
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { 42L, 42L }); | ||
assertEquals(1, set2.size()); | ||
assertEquals(42L, set2.minValue); | ||
assertEquals(42L, set2.maxValue); | ||
} | ||
|
||
public void testSameMissingPlaceholder() { | ||
UnsignedLongHashSet set2 = new UnsignedLongHashSet(new long[] { Long.MIN_VALUE, Long.MIN_VALUE }); | ||
assertEquals(1, set2.size()); | ||
assertEquals(Long.MIN_VALUE, set2.minValue); | ||
assertEquals(Long.MIN_VALUE, set2.maxValue); | ||
} | ||
|
||
public void testRandom() { | ||
final int iters = atLeast(10); | ||
for (int iter = 0; iter < iters; ++iter) { | ||
long[] values = new long[random().nextInt(1 << random().nextInt(16))]; | ||
for (int i = 0; i < values.length; ++i) { | ||
if (i == 0 || random().nextInt(10) < 9) { | ||
values[i] = random().nextLong(); | ||
} else { | ||
values[i] = values[random().nextInt(i)]; | ||
} | ||
} | ||
if (values.length > 0 && random().nextBoolean()) { | ||
values[values.length / 2] = Long.MIN_VALUE; | ||
} | ||
Set<Long> set1 = LongStream.of(values).boxed().collect(Collectors.toSet()); | ||
Long[] longObjects = Arrays.stream(values).boxed().toArray(Long[]::new); | ||
// Sort using compareUnsigned | ||
Arrays.sort(longObjects, Long::compareUnsigned); | ||
|
||
long[] arr = new long[values.length]; | ||
// Convert back to long[] | ||
for (int i = 0; i < arr.length; i++) { | ||
arr[i] = longObjects[i]; | ||
} | ||
UnsignedLongHashSet set2 = new UnsignedLongHashSet(arr); | ||
assertEquals(set1, set2); | ||
} | ||
} | ||
} |