Skip to content

Commit

Permalink
Tiny opts for simd APIs (zilliztech#1090)
Browse files Browse the repository at this point in the history
Signed-off-by: CaiYudong <[email protected]>
  • Loading branch information
cydrain authored and adarshs1310 committed Feb 26, 2025
1 parent a6d5dd4 commit 2a02afd
Show file tree
Hide file tree
Showing 15 changed files with 96 additions and 203 deletions.
2 changes: 1 addition & 1 deletion ci/E2E-arm.groovy
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
int total_timeout_minutes = 60
int total_timeout_minutes = 90
def knowhere_wheel=''
pipeline {
agent {
Expand Down
2 changes: 1 addition & 1 deletion ci/E2E2.groovy
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
int total_timeout_minutes = 60
int total_timeout_minutes = 90
def knowhere_wheel=''
pipeline {
agent {
Expand Down
2 changes: 1 addition & 1 deletion ci/E2E_GPU.groovy
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
int total_timeout_minutes = 60
int total_timeout_minutes = 90
def knowhere_wheel=''
pipeline {
agent {
Expand Down
47 changes: 14 additions & 33 deletions src/simd/distances_avx.cc
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,9 @@ _mm256_reduce_add_ps(const __m256 res) {
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_inner_product_avx(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
res += x[i] * y[i];
}
return res;
Expand All @@ -99,10 +98,9 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_L2sqr_avx(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
const float tmp = x[i] - y[i];
res += tmp * tmp;
}
Expand Down Expand Up @@ -209,10 +207,8 @@ void
fvec_inner_product_batch_4_avx(const float* __restrict x, const float* __restrict y0, const float* __restrict y1,
const float* __restrict y2, const float* __restrict y3, const size_t d, float& dis0,
float& dis1, float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
d0 += x[i] * y0[i];
Expand All @@ -233,10 +229,8 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
void
fvec_L2sqr_batch_4_avx(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
const float q0 = x[i] - y0[i];
Expand Down Expand Up @@ -465,9 +459,8 @@ fvec_L2sqr_ny_nearest_avx(float* __restrict distances_tmp_buffer, const float* _
// trust the compiler to unroll this properly
int32_t
ivec_inner_product_avx(const int8_t* x, const int8_t* y, size_t d) {
size_t i;
int32_t res = 0;
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
res += (int32_t)x[i] * y[i];
}
return res;
Expand All @@ -476,9 +469,8 @@ ivec_inner_product_avx(const int8_t* x, const int8_t* y, size_t d) {
// trust the compiler to unroll this properly
int32_t
ivec_L2sqr_avx(const int8_t* x, const int8_t* y, size_t d) {
size_t i;
int32_t res = 0;
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
const int32_t tmp = (int32_t)x[i] - (int32_t)y[i];
res += tmp * tmp;
}
Expand Down Expand Up @@ -633,7 +625,6 @@ fp16_vec_inner_product_batch_4_avx(const knowhere::fp16* x, const knowhere::fp16
dis1 = _mm256_reduce_add_ps(msum_1);
dis2 = _mm256_reduce_add_ps(msum_2);
dis3 = _mm256_reduce_add_ps(msum_3);
return;
}

void
Expand Down Expand Up @@ -685,7 +676,6 @@ fp16_vec_L2sqr_batch_4_avx(const knowhere::fp16* x, const knowhere::fp16* y0, co
dis1 = _mm256_reduce_add_ps(msum_1);
dis2 = _mm256_reduce_add_ps(msum_2);
dis3 = _mm256_reduce_add_ps(msum_3);
return;
}

///////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -833,8 +823,6 @@ bf16_vec_inner_product_batch_4_avx(const knowhere::bf16* x, const knowhere::bf16
dis1 = _mm256_reduce_add_ps(msum_1);
dis2 = _mm256_reduce_add_ps(msum_2);
dis3 = _mm256_reduce_add_ps(msum_3);

return;
}

void
Expand Down Expand Up @@ -886,7 +874,6 @@ bf16_vec_L2sqr_batch_4_avx(const knowhere::bf16* x, const knowhere::bf16* y0, co
dis1 = _mm256_reduce_add_ps(msum_1);
dis2 = _mm256_reduce_add_ps(msum_2);
dis3 = _mm256_reduce_add_ps(msum_3);
return;
}

///////////////////////////////////////////////////////////////////////////////
Expand All @@ -895,10 +882,9 @@ bf16_vec_L2sqr_batch_4_avx(const knowhere::bf16* x, const knowhere::bf16* y0, co
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_inner_product_bf16_patch_avx(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
res += x[i] * bf16_float(y[i]);
}
return res;
Expand All @@ -908,10 +894,9 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_L2sqr_bf16_patch_avx(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
const float tmp = x[i] - bf16_float(y[i]);
res += tmp * tmp;
}
Expand All @@ -925,10 +910,8 @@ fvec_inner_product_batch_4_bf16_patch_avx(const float* __restrict x, const float
const float* __restrict y1, const float* __restrict y2,
const float* __restrict y3, const size_t d, float& dis0, float& dis1,
float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
d0 += x[i] * bf16_float(y0[i]);
Expand All @@ -948,10 +931,8 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
void
fvec_L2sqr_batch_4_bf16_patch_avx(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
const float q0 = x[i] - bf16_float(y0[i]);
Expand Down
5 changes: 1 addition & 4 deletions src/simd/distances_avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.

#ifndef DISTANCES_AVX_H
#define DISTANCES_AVX_H
#pragma once

#include <cstddef>
#include <cstdint>
Expand Down Expand Up @@ -126,5 +125,3 @@ fvec_L2sqr_batch_4_bf16_patch_avx(const float* x, const float* y0, const float*
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3);

} // namespace faiss

#endif /* DISTANCES_AVX_H */
48 changes: 15 additions & 33 deletions src/simd/distances_avx512.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
// or implied. See the License for the specific language governing permissions and limitations under the License.

#if defined(__x86_64__)

#include "distances_avx512.h"

#include <immintrin.h>
Expand Down Expand Up @@ -50,10 +51,9 @@ _mm512_bf16_to_fp32(const __m256i& x) {
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_inner_product_avx512(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
res += x[i] * y[i];
}
return res;
Expand All @@ -64,10 +64,9 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_L2sqr_avx512(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
const float tmp = x[i] - y[i];
res += tmp * tmp;
}
Expand Down Expand Up @@ -202,10 +201,8 @@ void
fvec_inner_product_batch_4_avx512(const float* __restrict x, const float* __restrict y0, const float* __restrict y1,
const float* __restrict y2, const float* __restrict y3, const size_t d, float& dis0,
float& dis1, float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
d0 += x[i] * y0[i];
Expand All @@ -225,10 +222,8 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
void
fvec_L2sqr_batch_4_avx512(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
const float q0 = x[i] - y0[i];
Expand Down Expand Up @@ -280,19 +275,17 @@ fvec_norm_L2sqr_avx512(const float* x, size_t d) {

int32_t
ivec_inner_product_avx512(const int8_t* x, const int8_t* y, size_t d) {
size_t i;
int32_t res = 0;
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
res += (int32_t)x[i] * y[i];
}
return res;
}

int32_t
ivec_L2sqr_avx512(const int8_t* x, const int8_t* y, size_t d) {
size_t i;
int32_t res = 0;
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
const int32_t tmp = (int32_t)x[i] - (int32_t)y[i];
res += tmp * tmp;
}
Expand Down Expand Up @@ -439,7 +432,6 @@ fp16_vec_inner_product_batch_4_avx512(const knowhere::fp16* x, const knowhere::f
dis1 = _mm512_reduce_add_ps(m512_res_1);
dis2 = _mm512_reduce_add_ps(m512_res_2);
dis3 = _mm512_reduce_add_ps(m512_res_3);
return;
}

void
Expand Down Expand Up @@ -492,7 +484,6 @@ fp16_vec_L2sqr_batch_4_avx512(const knowhere::fp16* x, const knowhere::fp16* y0,
dis1 = _mm512_reduce_add_ps(m512_res_1);
dis2 = _mm512_reduce_add_ps(m512_res_2);
dis3 = _mm512_reduce_add_ps(m512_res_3);
return;
}

///////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -637,7 +628,6 @@ bf16_vec_inner_product_batch_4_avx512(const knowhere::bf16* x, const knowhere::b
dis1 = _mm512_reduce_add_ps(m512_res_1);
dis2 = _mm512_reduce_add_ps(m512_res_2);
dis3 = _mm512_reduce_add_ps(m512_res_3);
return;
}

void
Expand Down Expand Up @@ -690,7 +680,6 @@ bf16_vec_L2sqr_batch_4_avx512(const knowhere::bf16* x, const knowhere::bf16* y0,
dis1 = _mm512_reduce_add_ps(m512_res_1);
dis2 = _mm512_reduce_add_ps(m512_res_2);
dis3 = _mm512_reduce_add_ps(m512_res_3);
return;
}

///////////////////////////////////////////////////////////////////////////////
Expand All @@ -699,10 +688,9 @@ bf16_vec_L2sqr_batch_4_avx512(const knowhere::bf16* x, const knowhere::bf16* y0,
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_inner_product_bf16_patch_avx512(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
res += x[i] * bf16_float(y[i]);
}
return res;
Expand All @@ -712,10 +700,9 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_L2sqr_bf16_patch_avx512(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
const float tmp = x[i] - bf16_float(y[i]);
res += tmp * tmp;
}
Expand All @@ -729,10 +716,8 @@ fvec_inner_product_batch_4_bf16_patch_avx512(const float* __restrict x, const fl
const float* __restrict y1, const float* __restrict y2,
const float* __restrict y3, const size_t d, float& dis0, float& dis1,
float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
d0 += x[i] * bf16_float(y0[i]);
Expand All @@ -752,10 +737,8 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
void
fvec_L2sqr_batch_4_bf16_patch_avx512(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
const float q0 = x[i] - bf16_float(y0[i]);
Expand All @@ -776,5 +759,4 @@ fvec_L2sqr_batch_4_bf16_patch_avx512(const float* x, const float* y0, const floa
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

} // namespace faiss

#endif
5 changes: 1 addition & 4 deletions src/simd/distances_avx512.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.

#ifndef DISTANCES_AVX512_H
#define DISTANCES_AVX512_H
#pragma once

#include <cstddef>
#include <cstdint>
Expand Down Expand Up @@ -120,5 +119,3 @@ fvec_L2sqr_batch_4_bf16_patch_avx512(const float* x, const float* y0, const floa
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3);

} // namespace faiss

#endif /* DISTANCES_AVX512_H */
Loading

0 comments on commit 2a02afd

Please sign in to comment.