Tiny opts for simd APIs (zilliztech#1090)

Signed-off-by: CaiYudong <[email protected]>
adarshs1310 · Feb 26, 2025 · 2a02afd · 2a02afd
1 parent a6d5dd4
commit 2a02afd
Show file tree

Hide file tree

Showing 15 changed files with 96 additions and 203 deletions.
diff --git a/ci/E2E-arm.groovy b/ci/E2E-arm.groovy
@@ -1,4 +1,4 @@
-int total_timeout_minutes = 60
+int total_timeout_minutes = 90
 def knowhere_wheel=''
 pipeline {
     agent {

diff --git a/ci/E2E2.groovy b/ci/E2E2.groovy
@@ -1,4 +1,4 @@
-int total_timeout_minutes = 60
+int total_timeout_minutes = 90
 def knowhere_wheel=''
 pipeline {
     agent {

diff --git a/ci/E2E_GPU.groovy b/ci/E2E_GPU.groovy
@@ -1,4 +1,4 @@
-int total_timeout_minutes = 60
+int total_timeout_minutes = 90
 def knowhere_wheel=''
 pipeline {
     agent {

diff --git a/src/simd/distances_avx.cc b/src/simd/distances_avx.cc
@@ -85,10 +85,9 @@ _mm256_reduce_add_ps(const __m256 res) {
 FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 float
 fvec_inner_product_avx(const float* x, const float* y, size_t d) {
-    size_t i;
     float res = 0;
     FAISS_PRAGMA_IMPRECISE_LOOP
-    for (i = 0; i < d; i++) {
+    for (size_t i = 0; i < d; i++) {
         res += x[i] * y[i];
     }
     return res;
@@ -99,10 +98,9 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
 FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 float
 fvec_L2sqr_avx(const float* x, const float* y, size_t d) {
-    size_t i;
     float res = 0;
     FAISS_PRAGMA_IMPRECISE_LOOP
-    for (i = 0; i < d; i++) {
+    for (size_t i = 0; i < d; i++) {
         const float tmp = x[i] - y[i];
         res += tmp * tmp;
     }
@@ -209,10 +207,8 @@ void
 fvec_inner_product_batch_4_avx(const float* __restrict x, const float* __restrict y0, const float* __restrict y1,
                                const float* __restrict y2, const float* __restrict y3, const size_t d, float& dis0,
                                float& dis1, float& dis2, float& dis3) {
-    float d0 = 0;
-    float d1 = 0;
-    float d2 = 0;
-    float d3 = 0;
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
     FAISS_PRAGMA_IMPRECISE_LOOP
     for (size_t i = 0; i < d; ++i) {
         d0 += x[i] * y0[i];
@@ -233,10 +229,8 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 void
 fvec_L2sqr_batch_4_avx(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
                        const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
-    float d0 = 0;
-    float d1 = 0;
-    float d2 = 0;
-    float d3 = 0;
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
     FAISS_PRAGMA_IMPRECISE_LOOP
     for (size_t i = 0; i < d; ++i) {
         const float q0 = x[i] - y0[i];
@@ -465,9 +459,8 @@ fvec_L2sqr_ny_nearest_avx(float* __restrict distances_tmp_buffer, const float* _
 // trust the compiler to unroll this properly
 int32_t
 ivec_inner_product_avx(const int8_t* x, const int8_t* y, size_t d) {
-    size_t i;
     int32_t res = 0;
-    for (i = 0; i < d; i++) {
+    for (size_t i = 0; i < d; i++) {
         res += (int32_t)x[i] * y[i];
     }
     return res;
@@ -476,9 +469,8 @@ ivec_inner_product_avx(const int8_t* x, const int8_t* y, size_t d) {
 // trust the compiler to unroll this properly
 int32_t
 ivec_L2sqr_avx(const int8_t* x, const int8_t* y, size_t d) {
-    size_t i;
     int32_t res = 0;
-    for (i = 0; i < d; i++) {
+    for (size_t i = 0; i < d; i++) {
         const int32_t tmp = (int32_t)x[i] - (int32_t)y[i];
         res += tmp * tmp;
     }
@@ -633,7 +625,6 @@ fp16_vec_inner_product_batch_4_avx(const knowhere::fp16* x, const knowhere::fp16
     dis1 = _mm256_reduce_add_ps(msum_1);
     dis2 = _mm256_reduce_add_ps(msum_2);
     dis3 = _mm256_reduce_add_ps(msum_3);
-    return;
 }
 
 void
@@ -685,7 +676,6 @@ fp16_vec_L2sqr_batch_4_avx(const knowhere::fp16* x, const knowhere::fp16* y0, co
     dis1 = _mm256_reduce_add_ps(msum_1);
     dis2 = _mm256_reduce_add_ps(msum_2);
     dis3 = _mm256_reduce_add_ps(msum_3);
-    return;
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -833,8 +823,6 @@ bf16_vec_inner_product_batch_4_avx(const knowhere::bf16* x, const knowhere::bf16
     dis1 = _mm256_reduce_add_ps(msum_1);
     dis2 = _mm256_reduce_add_ps(msum_2);
     dis3 = _mm256_reduce_add_ps(msum_3);
-
-    return;
 }
 
 void
@@ -886,7 +874,6 @@ bf16_vec_L2sqr_batch_4_avx(const knowhere::bf16* x, const knowhere::bf16* y0, co
     dis1 = _mm256_reduce_add_ps(msum_1);
     dis2 = _mm256_reduce_add_ps(msum_2);
     dis3 = _mm256_reduce_add_ps(msum_3);
-    return;
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -895,10 +882,9 @@ bf16_vec_L2sqr_batch_4_avx(const knowhere::bf16* x, const knowhere::bf16* y0, co
 FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 float
 fvec_inner_product_bf16_patch_avx(const float* x, const float* y, size_t d) {
-    size_t i;
     float res = 0;
     FAISS_PRAGMA_IMPRECISE_LOOP
-    for (i = 0; i < d; i++) {
+    for (size_t i = 0; i < d; i++) {
         res += x[i] * bf16_float(y[i]);
     }
     return res;
@@ -908,10 +894,9 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
 FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 float
 fvec_L2sqr_bf16_patch_avx(const float* x, const float* y, size_t d) {
-    size_t i;
     float res = 0;
     FAISS_PRAGMA_IMPRECISE_LOOP
-    for (i = 0; i < d; i++) {
+    for (size_t i = 0; i < d; i++) {
         const float tmp = x[i] - bf16_float(y[i]);
         res += tmp * tmp;
     }
@@ -925,10 +910,8 @@ fvec_inner_product_batch_4_bf16_patch_avx(const float* __restrict x, const float
                                           const float* __restrict y1, const float* __restrict y2,
                                           const float* __restrict y3, const size_t d, float& dis0, float& dis1,
                                           float& dis2, float& dis3) {
-    float d0 = 0;
-    float d1 = 0;
-    float d2 = 0;
-    float d3 = 0;
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
     FAISS_PRAGMA_IMPRECISE_LOOP
     for (size_t i = 0; i < d; ++i) {
         d0 += x[i] * bf16_float(y0[i]);
@@ -948,10 +931,8 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 void
 fvec_L2sqr_batch_4_bf16_patch_avx(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
                                   const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
-    float d0 = 0;
-    float d1 = 0;
-    float d2 = 0;
-    float d3 = 0;
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
     FAISS_PRAGMA_IMPRECISE_LOOP
     for (size_t i = 0; i < d; ++i) {
         const float q0 = x[i] - bf16_float(y0[i]);

diff --git a/src/simd/distances_avx.h b/src/simd/distances_avx.h
@@ -9,8 +9,7 @@
 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 // or implied. See the License for the specific language governing permissions and limitations under the License.
 
-#ifndef DISTANCES_AVX_H
-#define DISTANCES_AVX_H
+#pragma once
 
 #include <cstddef>
 #include <cstdint>
@@ -126,5 +125,3 @@ fvec_L2sqr_batch_4_bf16_patch_avx(const float* x, const float* y0, const float*
                                   const size_t d, float& dis0, float& dis1, float& dis2, float& dis3);
 
 }  // namespace faiss
-
-#endif /* DISTANCES_AVX_H */
diff --git a/src/simd/distances_avx512.cc b/src/simd/distances_avx512.cc
@@ -10,6 +10,7 @@
 // or implied. See the License for the specific language governing permissions and limitations under the License.
 
 #if defined(__x86_64__)
+
 #include "distances_avx512.h"
 
 #include <immintrin.h>
@@ -50,10 +51,9 @@ _mm512_bf16_to_fp32(const __m256i& x) {
 FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 float
 fvec_inner_product_avx512(const float* x, const float* y, size_t d) {
-    size_t i;
     float res = 0;
     FAISS_PRAGMA_IMPRECISE_LOOP
-    for (i = 0; i < d; i++) {
+    for (size_t i = 0; i < d; i++) {
         res += x[i] * y[i];
     }
     return res;
@@ -64,10 +64,9 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
 FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 float
 fvec_L2sqr_avx512(const float* x, const float* y, size_t d) {
-    size_t i;
     float res = 0;
     FAISS_PRAGMA_IMPRECISE_LOOP
-    for (i = 0; i < d; i++) {
+    for (size_t i = 0; i < d; i++) {
         const float tmp = x[i] - y[i];
         res += tmp * tmp;
     }
@@ -202,10 +201,8 @@ void
 fvec_inner_product_batch_4_avx512(const float* __restrict x, const float* __restrict y0, const float* __restrict y1,
                                   const float* __restrict y2, const float* __restrict y3, const size_t d, float& dis0,
                                   float& dis1, float& dis2, float& dis3) {
-    float d0 = 0;
-    float d1 = 0;
-    float d2 = 0;
-    float d3 = 0;
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
     FAISS_PRAGMA_IMPRECISE_LOOP
     for (size_t i = 0; i < d; ++i) {
         d0 += x[i] * y0[i];
@@ -225,10 +222,8 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 void
 fvec_L2sqr_batch_4_avx512(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
                           const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
-    float d0 = 0;
-    float d1 = 0;
-    float d2 = 0;
-    float d3 = 0;
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
     FAISS_PRAGMA_IMPRECISE_LOOP
     for (size_t i = 0; i < d; ++i) {
         const float q0 = x[i] - y0[i];
@@ -280,19 +275,17 @@ fvec_norm_L2sqr_avx512(const float* x, size_t d) {
 
 int32_t
 ivec_inner_product_avx512(const int8_t* x, const int8_t* y, size_t d) {
-    size_t i;
     int32_t res = 0;
-    for (i = 0; i < d; i++) {
+    for (size_t i = 0; i < d; i++) {
         res += (int32_t)x[i] * y[i];
     }
     return res;
 }
 
 int32_t
 ivec_L2sqr_avx512(const int8_t* x, const int8_t* y, size_t d) {
-    size_t i;
     int32_t res = 0;
-    for (i = 0; i < d; i++) {
+    for (size_t i = 0; i < d; i++) {
         const int32_t tmp = (int32_t)x[i] - (int32_t)y[i];
         res += tmp * tmp;
     }
@@ -439,7 +432,6 @@ fp16_vec_inner_product_batch_4_avx512(const knowhere::fp16* x, const knowhere::f
     dis1 = _mm512_reduce_add_ps(m512_res_1);
     dis2 = _mm512_reduce_add_ps(m512_res_2);
     dis3 = _mm512_reduce_add_ps(m512_res_3);
-    return;
 }
 
 void
@@ -492,7 +484,6 @@ fp16_vec_L2sqr_batch_4_avx512(const knowhere::fp16* x, const knowhere::fp16* y0,
     dis1 = _mm512_reduce_add_ps(m512_res_1);
     dis2 = _mm512_reduce_add_ps(m512_res_2);
     dis3 = _mm512_reduce_add_ps(m512_res_3);
-    return;
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -637,7 +628,6 @@ bf16_vec_inner_product_batch_4_avx512(const knowhere::bf16* x, const knowhere::b
     dis1 = _mm512_reduce_add_ps(m512_res_1);
     dis2 = _mm512_reduce_add_ps(m512_res_2);
     dis3 = _mm512_reduce_add_ps(m512_res_3);
-    return;
 }
 
 void
@@ -690,7 +680,6 @@ bf16_vec_L2sqr_batch_4_avx512(const knowhere::bf16* x, const knowhere::bf16* y0,
     dis1 = _mm512_reduce_add_ps(m512_res_1);
     dis2 = _mm512_reduce_add_ps(m512_res_2);
     dis3 = _mm512_reduce_add_ps(m512_res_3);
-    return;
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -699,10 +688,9 @@ bf16_vec_L2sqr_batch_4_avx512(const knowhere::bf16* x, const knowhere::bf16* y0,
 FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 float
 fvec_inner_product_bf16_patch_avx512(const float* x, const float* y, size_t d) {
-    size_t i;
     float res = 0;
     FAISS_PRAGMA_IMPRECISE_LOOP
-    for (i = 0; i < d; i++) {
+    for (size_t i = 0; i < d; i++) {
         res += x[i] * bf16_float(y[i]);
     }
     return res;
@@ -712,10 +700,9 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
 FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 float
 fvec_L2sqr_bf16_patch_avx512(const float* x, const float* y, size_t d) {
-    size_t i;
     float res = 0;
     FAISS_PRAGMA_IMPRECISE_LOOP
-    for (i = 0; i < d; i++) {
+    for (size_t i = 0; i < d; i++) {
         const float tmp = x[i] - bf16_float(y[i]);
         res += tmp * tmp;
     }
@@ -729,10 +716,8 @@ fvec_inner_product_batch_4_bf16_patch_avx512(const float* __restrict x, const fl
                                              const float* __restrict y1, const float* __restrict y2,
                                              const float* __restrict y3, const size_t d, float& dis0, float& dis1,
                                              float& dis2, float& dis3) {
-    float d0 = 0;
-    float d1 = 0;
-    float d2 = 0;
-    float d3 = 0;
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
     FAISS_PRAGMA_IMPRECISE_LOOP
     for (size_t i = 0; i < d; ++i) {
         d0 += x[i] * bf16_float(y0[i]);
@@ -752,10 +737,8 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 void
 fvec_L2sqr_batch_4_bf16_patch_avx512(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
                                      const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
-    float d0 = 0;
-    float d1 = 0;
-    float d2 = 0;
-    float d3 = 0;
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
     FAISS_PRAGMA_IMPRECISE_LOOP
     for (size_t i = 0; i < d; ++i) {
         const float q0 = x[i] - bf16_float(y0[i]);
@@ -776,5 +759,4 @@ fvec_L2sqr_batch_4_bf16_patch_avx512(const float* x, const float* y0, const floa
 FAISS_PRAGMA_IMPRECISE_FUNCTION_END
 
 }  // namespace faiss
-
 #endif
diff --git a/src/simd/distances_avx512.h b/src/simd/distances_avx512.h
@@ -9,8 +9,7 @@
 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 // or implied. See the License for the specific language governing permissions and limitations under the License.
 
-#ifndef DISTANCES_AVX512_H
-#define DISTANCES_AVX512_H
+#pragma once
 
 #include <cstddef>
 #include <cstdint>
@@ -120,5 +119,3 @@ fvec_L2sqr_batch_4_bf16_patch_avx512(const float* x, const float* y0, const floa
                                      const size_t d, float& dis0, float& dis1, float& dis2, float& dis3);
 
 }  // namespace faiss
-
-#endif /* DISTANCES_AVX512_H */