From bb386ab10a158198836442f26621d0695f57337c Mon Sep 17 00:00:00 2001
From: Federico Lois
Date: Tue, 9 Apr 2024 13:34:51 -0300
Subject: [PATCH] RavenDB-20103: ARM support for the different variants of
 VectorizedAnd

---
 src/Corax/Querying/IndexSearcher.cs      |  3 +-
 .../Querying/Matches/Meta/MergeHelper.cs | 29 ++++++++++---------
 src/Corax/Querying/Matches/TermMatch.cs  | 10 +++----
 3 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/src/Corax/Querying/IndexSearcher.cs b/src/Corax/Querying/IndexSearcher.cs
index 3509b253d18d..0efe82d1c2cd 100644
--- a/src/Corax/Querying/IndexSearcher.cs
+++ b/src/Corax/Querying/IndexSearcher.cs
@@ -27,6 +27,7 @@
 using InvalidOperationException = System.InvalidOperationException;
 using static Voron.Data.CompactTrees.CompactTree;
 using Voron.Util;
+using System.Runtime.Intrinsics;
 
 namespace Corax.Querying;
 
@@ -47,7 +48,7 @@ public sealed unsafe partial class IndexSearcher : IDisposable
     /// </summary>
     public bool ForceNonAccelerated { get; set; }
 
-    public bool IsAccelerated => Avx2.IsSupported && !ForceNonAccelerated;
+    public bool IsAccelerated => Vector256.IsHardwareAccelerated && !ForceNonAccelerated;
 
     public long NumberOfEntries => _numberOfEntries ??= _metadataTree?.ReadInt64(Constants.IndexWriter.NumberOfEntriesSlice) ?? 0;
 
diff --git a/src/Corax/Querying/Matches/Meta/MergeHelper.cs b/src/Corax/Querying/Matches/Meta/MergeHelper.cs
index 9aa3b0a9e369..5d9f028b380e 100644
--- a/src/Corax/Querying/Matches/Meta/MergeHelper.cs
+++ b/src/Corax/Querying/Matches/Meta/MergeHelper.cs
@@ -24,16 +24,17 @@ public static int And(Span<long> dst, Span<long> left, Span<long> right)
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static int And(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
     {
-        if (Avx2.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
             return AndVectorized(dst, dstLength, left, leftLength, right, rightLength);
+
         return AndScalar(dst, dstLength, left, leftLength, right, rightLength);
     }
 
     /// <summary>
-    /// AVX2 implementation of vectorized AND.
+    /// Vector256 implementation of vectorized AND that works on both Intel/AMD and ARM.
     /// </summary>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    internal static unsafe int AndVectorized(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
+    internal static int AndVectorized(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
     {
         // This is effectively a constant.
         uint N = (uint)Vector256<ulong>.Count;
@@ -64,8 +65,8 @@ internal static unsafe int AndVectorized(long* dst, int dstLength, long* left, i
         {
             while (true)
             {
-                // TODO: In here we can do SIMD galloping with gather operations. Therefore we will be able to do
-                //       multiple checks at once and find the right amount of skipping using a table.
+                // TODO: In here we can do SIMD galloping with gather operations. Therefore, we will be able to do
+                //       multiple checks at once and find the right amount of skipping using a table.
 
                 // If the value to compare is bigger than the biggest element in the block, we advance the block.
                 if ((ulong)*smallerPtr > (ulong)*(largerPtr + N - 1))
@@ -91,10 +92,10 @@ internal static unsafe int AndVectorized(long* dst, int dstLength, long* left, i
                     break; //In case when block is smaller than N we've to use scalar version.
 
                 Vector256<ulong> value = Vector256.Create((ulong)*smallerPtr);
-                Vector256<ulong> blockValues = Avx.LoadVector256((ulong*)largerPtr);
+                Vector256<ulong> blockValues = Vector256.Load((ulong*)largerPtr);
 
                 // We are going to select which direction we are going to be moving forward.
-                if (!Avx2.CompareEqual(value, blockValues).Equals(Vector256<ulong>.Zero))
+                if (Vector256.EqualsAny(value, blockValues))
                 {
                     // We found the value, therefore we need to store this value in the destination.
                     *dstPtr = *smallerPtr;
@@ -107,7 +108,7 @@ internal static unsafe int AndVectorized(long* dst, int dstLength, long* left, i
             }
         }
 
-        // The scalar version. This shouldnt cost much either way.
+        // The scalar version. This shouldn't cost much either way.
         while (smallerPtr < smallerEndPtr && largerPtr < largerEndPtr)
         {
             ulong leftValue = (ulong)*smallerPtr;
@@ -160,7 +161,7 @@ internal static int AndScalar(Span<long> dst, Span<long> left, Span<long> right)
     /// is also used for testing purposes.
     /// </summary>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    internal static unsafe int AndScalar(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
+    internal static int AndScalar(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
     {
         long* dstPtr = dst;
         long* leftPtr = left;
@@ -209,7 +210,7 @@ public static int Or(Span<long> dst, Span<long> left, Span<long> right)
     }
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static unsafe int Or(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
+    public static int Or(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
     {
         if (Sse2.IsSupported)
             return OrNonTemporal(dst, dstLength, left, leftLength, right, rightLength);
@@ -220,7 +221,7 @@ public static unsafe int Or(long* dst, int dstLength, long* left, int leftLength
     /// dst and left may *not* be the same buffer
     /// </summary>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static unsafe int OrNonTemporal(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
+    public static int OrNonTemporal(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
     {
         long* dstPtr = dst;
         long* dstEndPtr = dst + dstLength;
@@ -280,7 +281,7 @@ public static unsafe int OrNonTemporal(long* dst, int dstLength, long* left, int
     /// dst and left may *not* be the same buffer
     /// </summary>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static unsafe int OrScalar(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
+    public static int OrScalar(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
     {
         long* dstPtr = dst;
         long* dstEndPtr = dst + dstLength;
@@ -347,7 +348,7 @@ public static int AndNot(Span<long> dst, Span<long> left, Span<long> right)
     }
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static unsafe int AndNot(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
+    public static int AndNot(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
     {
         // PERF: This can be improved implementing support Sse2 implementation. This type of algorithms
         //       are very suitable for instruction level parallelism.
@@ -359,7 +360,7 @@ public static unsafe int AndNot(long* dst, int dstLength, long* left, int leftLe
     /// is also used for testing purposes.
     /// </summary>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    internal static unsafe int AndNotScalar(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
+    internal static int AndNotScalar(long* dst, int dstLength, long* left, int leftLength, long* right, int rightLength)
     {
         long* dstPtr = dst;
         long* leftPtr = left;
diff --git a/src/Corax/Querying/Matches/TermMatch.cs b/src/Corax/Querying/Matches/TermMatch.cs
index d77e57e10665..e7909a655a88 100644
--- a/src/Corax/Querying/Matches/TermMatch.cs
+++ b/src/Corax/Querying/Matches/TermMatch.cs
@@ -477,12 +477,12 @@ static int AndWithVectorizedFunc(ref TermMatch term, Span<long> b
 
                     if (largerEndPtr - largerPtr < N)
                         break; // boundary guardian for vector load.
-                    
+
                     Vector256<ulong> value = Vector256.Create((ulong)*smallerPtr);
-                    Vector256<ulong> blockValues = Avx.LoadVector256((ulong*)largerPtr);
+                    Vector256<ulong> blockValues = Vector256.Load((ulong*)largerPtr);
 
                     // We are going to select which direction we are going to be moving forward.
-                    if (!Avx2.CompareEqual(value, blockValues).Equals(Vector256<ulong>.Zero))
+                    if (Vector256.EqualsAny(value, blockValues))
                     {
                         // We found the value, therefore we need to store this value in the destination.
                         *dstPtr = *smallerPtr;
@@ -567,7 +567,7 @@ static void ScoreFunc(ref TermMatch term, Span<long> matches, Span<float> scores
             term._bm25Relevance.Score(matches, scores, boostFactor);
         }
 
-        if (Avx2.IsSupported == false)
+        if (Vector256.IsHardwareAccelerated == false)
             useAccelerated = false;
 
         var bm25Relevance = isBoosting
@@ -577,7 +577,7 @@ static void ScoreFunc(ref TermMatch term, Span<long> matches, Span<float> scores
 
         var isStored = isBoosting && bm25Relevance.IsStored;
 
-        // We will select the AVX version if supported.
+        // We will select the Vector256 version if supported.
         return new TermMatch(indexSearcher, ctx, postingList.State.NumberOfEntries, (isBoosting, isStored) switch
         {
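
Note (illustrative, not part of the patch): a minimal stand-alone sketch of the cross-platform pattern the diff switches to. The names Vector256AndSketch and ContainsInBlock are made up for this example. Vector256.EqualsAny(value, blockValues) expresses the same membership test as the removed !Avx2.CompareEqual(value, blockValues).Equals(Vector256<ulong>.Zero), and Vector256.IsHardwareAccelerated takes over the role of the Avx2.IsSupported gate, so a single source path compiles for both x64 and ARM64.

// Illustrative sketch only; not part of the patch above.
using System.Runtime.Intrinsics;

public static class Vector256AndSketch
{
    // Returns true when 'value' appears anywhere in the block of
    // Vector256<ulong>.Count elements starting at 'block'.
    public static unsafe bool ContainsInBlock(ulong value, ulong* block)
    {
        if (!Vector256.IsHardwareAccelerated)
        {
            // Scalar fallback, analogous to the AndScalar path in MergeHelper.
            for (int i = 0; i < Vector256<ulong>.Count; i++)
            {
                if (block[i] == value)
                    return true;
            }

            return false;
        }

        Vector256<ulong> needle = Vector256.Create(value);    // broadcast the probe value
        Vector256<ulong> blockValues = Vector256.Load(block); // 256-bit load without an Avx dependency
        return Vector256.EqualsAny(needle, blockValues);      // true if any lane matches
    }
}

Relying on the portable System.Runtime.Intrinsics.Vector256 helpers instead of the Avx/Avx2 classes keeps one implementation; where 256-bit vectors are not hardware accelerated, the IsHardwareAccelerated check routes callers to the scalar variants, as MergeHelper.And does above.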