Add Int8Vector SIMD APIs for avx512/avx2/sse/ref

Signed-off-by: CaiYudong <[email protected]>
zilliztech · Feb 26, 2025 · 18afefb · 18afefb
1 parent e886c73
commit 18afefb
Show file tree

Hide file tree

Showing 13 changed files with 531 additions and 1 deletion.
diff --git a/src/simd/distances_avx.cc b/src/simd/distances_avx.cc
@@ -876,6 +876,95 @@ bf16_vec_L2sqr_batch_4_avx(const knowhere::bf16* x, const knowhere::bf16* y0, co
     dis3 = _mm256_reduce_add_ps(msum_3);
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// int8
+
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+float
+int8_vec_inner_product_avx(const int8_t* x, const int8_t* y, size_t d) {
+    float res = 0;
+    FAISS_PRAGMA_IMPRECISE_LOOP
+    for (size_t i = 0; i < d; i++) {
+        res += (float)x[i] * (float)y[i];
+    }
+    return res;
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+float
+int8_vec_L2sqr_avx(const int8_t* x, const int8_t* y, size_t d) {
+    float res = 0;
+    FAISS_PRAGMA_IMPRECISE_LOOP
+    for (size_t i = 0; i < d; i++) {
+        const float tmp = (float)x[i] - (float)y[i];
+        res += tmp * tmp;
+    }
+    return res;
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+float
+int8_vec_norm_L2sqr_avx(const int8_t* x, size_t d) {
+    double res = 0;
+    FAISS_PRAGMA_IMPRECISE_LOOP
+    for (size_t i = 0; i < d; i++) {
+        res += (float)x[i] * (float)x[i];
+    }
+    return res;
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+void
+int8_vec_inner_product_batch_4_avx(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2,
+                                   const int8_t* y3, const size_t d, float& dis0, float& dis1, float& dis2,
+                                   float& dis3) {
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
+    FAISS_PRAGMA_IMPRECISE_LOOP
+    for (size_t i = 0; i < d; ++i) {
+        auto x_i = (float)x[i];
+        d0 += x_i * (float)y0[i];
+        d1 += x_i * (float)y1[i];
+        d2 += x_i * (float)y2[i];
+        d3 += x_i * (float)y3[i];
+    }
+
+    dis0 = d0;
+    dis1 = d1;
+    dis2 = d2;
+    dis3 = d3;
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+void
+int8_vec_L2sqr_batch_4_avx(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2, const int8_t* y3,
+                           const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
+    FAISS_PRAGMA_IMPRECISE_LOOP
+    for (size_t i = 0; i < d; ++i) {
+        auto x_i = (float)x[i];
+        const float q0 = x_i - (float)y0[i];
+        const float q1 = x_i - (float)y1[i];
+        const float q2 = x_i - (float)y2[i];
+        const float q3 = x_i - (float)y3[i];
+        d0 += q0 * q0;
+        d1 += q1 * q1;
+        d2 += q2 * q2;
+        d3 += q3 * q3;
+    }
+
+    dis0 = d0;
+    dis1 = d1;
+    dis2 = d2;
+    dis3 = d3;
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
 ///////////////////////////////////////////////////////////////////////////////
 // for cardinal
 

diff --git a/src/simd/distances_avx.h b/src/simd/distances_avx.h
@@ -106,6 +106,27 @@ bf16_vec_L2sqr_batch_4_avx(const knowhere::bf16* x, const knowhere::bf16* y0, co
                            const knowhere::bf16* y2, const knowhere::bf16* y3, const size_t d, float& dis0, float& dis1,
                            float& dis2, float& dis3);
 
+///////////////////////////////////////////////////////////////////////////////
+// int8
+
+float
+int8_vec_inner_product_avx(const int8_t* x, const int8_t* y, size_t d);
+
+float
+int8_vec_L2sqr_avx(const int8_t* x, const int8_t* y, size_t d);
+
+float
+int8_vec_norm_L2sqr_avx(const int8_t* x, size_t d);
+
+void
+int8_vec_inner_product_batch_4_avx(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2,
+                                   const int8_t* y3, const size_t d, float& dis0, float& dis1, float& dis2,
+                                   float& dis3);
+
+void
+int8_vec_L2sqr_batch_4_avx(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2, const int8_t* y3,
+                           const size_t d, float& dis0, float& dis1, float& dis2, float& dis3);
+
 ///////////////////////////////////////////////////////////////////////////////
 // for cardinal
 

diff --git a/src/simd/distances_avx512.cc b/src/simd/distances_avx512.cc
@@ -682,6 +682,95 @@ bf16_vec_L2sqr_batch_4_avx512(const knowhere::bf16* x, const knowhere::bf16* y0,
     dis3 = _mm512_reduce_add_ps(m512_res_3);
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// int8
+
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+float
+int8_vec_inner_product_avx512(const int8_t* x, const int8_t* y, size_t d) {
+    float res = 0;
+    FAISS_PRAGMA_IMPRECISE_LOOP
+    for (size_t i = 0; i < d; i++) {
+        res += (float)x[i] * (float)y[i];
+    }
+    return res;
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+float
+int8_vec_L2sqr_avx512(const int8_t* x, const int8_t* y, size_t d) {
+    float res = 0;
+    FAISS_PRAGMA_IMPRECISE_LOOP
+    for (size_t i = 0; i < d; i++) {
+        const float tmp = (float)x[i] - (float)y[i];
+        res += tmp * tmp;
+    }
+    return res;
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+float
+int8_vec_norm_L2sqr_avx512(const int8_t* x, size_t d) {
+    double res = 0;
+    FAISS_PRAGMA_IMPRECISE_LOOP
+    for (size_t i = 0; i < d; i++) {
+        res += (float)x[i] * (float)x[i];
+    }
+    return res;
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+void
+int8_vec_inner_product_batch_4_avx512(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2,
+                                      const int8_t* y3, const size_t d, float& dis0, float& dis1, float& dis2,
+                                      float& dis3) {
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
+    FAISS_PRAGMA_IMPRECISE_LOOP
+    for (size_t i = 0; i < d; ++i) {
+        auto x_i = (float)x[i];
+        d0 += x_i * (float)y0[i];
+        d1 += x_i * (float)y1[i];
+        d2 += x_i * (float)y2[i];
+        d3 += x_i * (float)y3[i];
+    }
+
+    dis0 = d0;
+    dis1 = d1;
+    dis2 = d2;
+    dis3 = d3;
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+void
+int8_vec_L2sqr_batch_4_avx512(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2, const int8_t* y3,
+                              const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
+    FAISS_PRAGMA_IMPRECISE_LOOP
+    for (size_t i = 0; i < d; ++i) {
+        auto x_i = (float)x[i];
+        const float q0 = x_i - (float)y0[i];
+        const float q1 = x_i - (float)y1[i];
+        const float q2 = x_i - (float)y2[i];
+        const float q3 = x_i - (float)y3[i];
+        d0 += q0 * q0;
+        d1 += q1 * q1;
+        d2 += q2 * q2;
+        d3 += q3 * q3;
+    }
+
+    dis0 = d0;
+    dis1 = d1;
+    dis2 = d2;
+    dis3 = d3;
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
 ///////////////////////////////////////////////////////////////////////////////
 // for cardinal
 

diff --git a/src/simd/distances_avx512.h b/src/simd/distances_avx512.h
@@ -100,6 +100,27 @@ bf16_vec_L2sqr_batch_4_avx512(const knowhere::bf16* x, const knowhere::bf16* y0,
                               const knowhere::bf16* y2, const knowhere::bf16* y3, const size_t d, float& dis0,
                               float& dis1, float& dis2, float& dis3);
 
+///////////////////////////////////////////////////////////////////////////////
+// int8
+
+float
+int8_vec_inner_product_avx512(const int8_t* x, const int8_t* y, size_t d);
+
+float
+int8_vec_L2sqr_avx512(const int8_t* x, const int8_t* y, size_t d);
+
+float
+int8_vec_norm_L2sqr_avx512(const int8_t* x, size_t d);
+
+void
+int8_vec_inner_product_batch_4_avx512(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2,
+                                      const int8_t* y3, const size_t d, float& dis0, float& dis1, float& dis2,
+                                      float& dis3);
+
+void
+int8_vec_L2sqr_batch_4_avx512(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2, const int8_t* y3,
+                              const size_t d, float& dis0, float& dis1, float& dis2, float& dis3);
+
 ///////////////////////////////////////////////////////////////////////////////
 // for cardinal
 

diff --git a/src/simd/distances_neon.cc b/src/simd/distances_neon.cc
@@ -2113,6 +2113,37 @@ bf16_vec_L2sqr_batch_4_neon(const knowhere::bf16* x, const knowhere::bf16* y0, c
     dis3 = vaddvq_f32(res.val[3]);
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// int8
+
+float
+int8_vec_inner_product_neon(const int8_t* x, const int8_t* y, size_t d) {
+    throw std::runtime_error("not implemented");
+}
+
+float
+int8_vec_L2sqr_neon(const int8_t* x, const int8_t* y, size_t d) {
+    throw std::runtime_error("not implemented");
+}
+
+float
+int8_vec_norm_L2sqr_neon(const int8_t* x, size_t d) {
+    throw std::runtime_error("not implemented");
+}
+
+void
+int8_vec_inner_product_batch_4_neon(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2,
+                                    const int8_t* y3, const size_t d, float& dis0, float& dis1, float& dis2,
+                                    float& dis3) {
+    throw std::runtime_error("not implemented");
+}
+
+void
+int8_vec_L2sqr_batch_4_neon(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2, const int8_t* y3,
+                            const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
+    throw std::runtime_error("not implemented");
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // for cardinal
 

diff --git a/src/simd/distances_neon.h b/src/simd/distances_neon.h
@@ -117,6 +117,27 @@ bf16_vec_L2sqr_batch_4_neon(const knowhere::bf16* x, const knowhere::bf16* y0, c
                             const knowhere::bf16* y2, const knowhere::bf16* y3, const size_t d, float& dis0,
                             float& dis1, float& dis2, float& dis3);
 
+///////////////////////////////////////////////////////////////////////////////
+// int8
+
+float
+int8_vec_inner_product_neon(const int8_t* x, const int8_t* y, size_t d);
+
+float
+int8_vec_L2sqr_neon(const int8_t* x, const int8_t* y, size_t d);
+
+float
+int8_vec_norm_L2sqr_neon(const int8_t* x, size_t d);
+
+void
+int8_vec_inner_product_batch_4_neon(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2,
+                                    const int8_t* y3, const size_t d, float& dis0, float& dis1, float& dis2,
+                                    float& dis3);
+
+void
+int8_vec_L2sqr_batch_4_neon(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2, const int8_t* y3,
+                            const size_t d, float& dis0, float& dis1, float& dis2, float& dis3);
+
 ///////////////////////////////////////////////////////////////////////////////
 // for cardinal
 

diff --git a/src/simd/distances_ref.cc b/src/simd/distances_ref.cc
@@ -379,6 +379,80 @@ bf16_vec_L2sqr_batch_4_ref(const knowhere::bf16* x, const knowhere::bf16* y0, co
     dis3 = d3;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// int8
+
+float
+int8_vec_inner_product_ref(const int8_t* x, const int8_t* y, size_t d) {
+    float res = 0;
+    for (size_t i = 0; i < d; i++) {
+        res += (float)x[i] * (float)y[i];
+    }
+    return res;
+}
+
+float
+int8_vec_L2sqr_ref(const int8_t* x, const int8_t* y, size_t d) {
+    float res = 0;
+    for (size_t i = 0; i < d; i++) {
+        const float tmp = (float)x[i] - (float)y[i];
+        res += tmp * tmp;
+    }
+    return res;
+}
+
+float
+int8_vec_norm_L2sqr_ref(const int8_t* x, size_t d) {
+    double res = 0;
+    for (size_t i = 0; i < d; i++) {
+        res += (float)x[i] * (float)x[i];
+    }
+    return res;
+}
+
+void
+int8_vec_inner_product_batch_4_ref(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2,
+                                   const int8_t* y3, const size_t d, float& dis0, float& dis1, float& dis2,
+                                   float& dis3) {
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
+    for (size_t i = 0; i < d; ++i) {
+        auto x_i = (float)x[i];
+        d0 += x_i * (float)y0[i];
+        d1 += x_i * (float)y1[i];
+        d2 += x_i * (float)y2[i];
+        d3 += x_i * (float)y3[i];
+    }
+
+    dis0 = d0;
+    dis1 = d1;
+    dis2 = d2;
+    dis3 = d3;
+}
+
+void
+int8_vec_L2sqr_batch_4_ref(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2, const int8_t* y3,
+                           const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
+    float d0 = 0, d1 = 0, d2 = 0, d3 = 0;
+
+    for (size_t i = 0; i < d; ++i) {
+        auto x_i = (float)x[i];
+        const float q0 = x_i - (float)y0[i];
+        const float q1 = x_i - (float)y1[i];
+        const float q2 = x_i - (float)y2[i];
+        const float q3 = x_i - (float)y3[i];
+        d0 += q0 * q0;
+        d1 += q1 * q1;
+        d2 += q2 * q2;
+        d3 += q3 * q3;
+    }
+
+    dis0 = d0;
+    dis1 = d1;
+    dis2 = d2;
+    dis3 = d3;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // for cardinal