Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tiny opts for simd APIs #1090

Merged
merged 1 commit into from
Feb 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ci/E2E-arm.groovy
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
int total_timeout_minutes = 60
int total_timeout_minutes = 90
def knowhere_wheel=''
pipeline {
agent {
Expand Down
2 changes: 1 addition & 1 deletion ci/E2E2.groovy
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
int total_timeout_minutes = 60
int total_timeout_minutes = 90
def knowhere_wheel=''
pipeline {
agent {
Expand Down
2 changes: 1 addition & 1 deletion ci/E2E_GPU.groovy
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
int total_timeout_minutes = 60
int total_timeout_minutes = 90
def knowhere_wheel=''
pipeline {
agent {
Expand Down
47 changes: 14 additions & 33 deletions src/simd/distances_avx.cc
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,9 @@ _mm256_reduce_add_ps(const __m256 res) {
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_inner_product_avx(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
res += x[i] * y[i];
}
return res;
Expand All @@ -99,10 +98,9 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_L2sqr_avx(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
const float tmp = x[i] - y[i];
res += tmp * tmp;
}
Expand Down Expand Up @@ -209,10 +207,8 @@ void
fvec_inner_product_batch_4_avx(const float* __restrict x, const float* __restrict y0, const float* __restrict y1,
const float* __restrict y2, const float* __restrict y3, const size_t d, float& dis0,
float& dis1, float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
d0 += x[i] * y0[i];
Expand All @@ -233,10 +229,8 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
void
fvec_L2sqr_batch_4_avx(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
const float q0 = x[i] - y0[i];
Expand Down Expand Up @@ -465,9 +459,8 @@ fvec_L2sqr_ny_nearest_avx(float* __restrict distances_tmp_buffer, const float* _
// trust the compiler to unroll this properly
int32_t
ivec_inner_product_avx(const int8_t* x, const int8_t* y, size_t d) {
size_t i;
int32_t res = 0;
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
res += (int32_t)x[i] * y[i];
}
return res;
Expand All @@ -476,9 +469,8 @@ ivec_inner_product_avx(const int8_t* x, const int8_t* y, size_t d) {
// trust the compiler to unroll this properly
int32_t
ivec_L2sqr_avx(const int8_t* x, const int8_t* y, size_t d) {
size_t i;
int32_t res = 0;
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
const int32_t tmp = (int32_t)x[i] - (int32_t)y[i];
res += tmp * tmp;
}
Expand Down Expand Up @@ -633,7 +625,6 @@ fp16_vec_inner_product_batch_4_avx(const knowhere::fp16* x, const knowhere::fp16
dis1 = _mm256_reduce_add_ps(msum_1);
dis2 = _mm256_reduce_add_ps(msum_2);
dis3 = _mm256_reduce_add_ps(msum_3);
return;
}

void
Expand Down Expand Up @@ -685,7 +676,6 @@ fp16_vec_L2sqr_batch_4_avx(const knowhere::fp16* x, const knowhere::fp16* y0, co
dis1 = _mm256_reduce_add_ps(msum_1);
dis2 = _mm256_reduce_add_ps(msum_2);
dis3 = _mm256_reduce_add_ps(msum_3);
return;
}

///////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -833,8 +823,6 @@ bf16_vec_inner_product_batch_4_avx(const knowhere::bf16* x, const knowhere::bf16
dis1 = _mm256_reduce_add_ps(msum_1);
dis2 = _mm256_reduce_add_ps(msum_2);
dis3 = _mm256_reduce_add_ps(msum_3);

return;
}

void
Expand Down Expand Up @@ -886,7 +874,6 @@ bf16_vec_L2sqr_batch_4_avx(const knowhere::bf16* x, const knowhere::bf16* y0, co
dis1 = _mm256_reduce_add_ps(msum_1);
dis2 = _mm256_reduce_add_ps(msum_2);
dis3 = _mm256_reduce_add_ps(msum_3);
return;
}

///////////////////////////////////////////////////////////////////////////////
Expand All @@ -895,10 +882,9 @@ bf16_vec_L2sqr_batch_4_avx(const knowhere::bf16* x, const knowhere::bf16* y0, co
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_inner_product_bf16_patch_avx(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
res += x[i] * bf16_float(y[i]);
}
return res;
Expand All @@ -908,10 +894,9 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_L2sqr_bf16_patch_avx(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
const float tmp = x[i] - bf16_float(y[i]);
res += tmp * tmp;
}
Expand All @@ -925,10 +910,8 @@ fvec_inner_product_batch_4_bf16_patch_avx(const float* __restrict x, const float
const float* __restrict y1, const float* __restrict y2,
const float* __restrict y3, const size_t d, float& dis0, float& dis1,
float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
d0 += x[i] * bf16_float(y0[i]);
Expand All @@ -948,10 +931,8 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
void
fvec_L2sqr_batch_4_bf16_patch_avx(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
const float q0 = x[i] - bf16_float(y0[i]);
Expand Down
5 changes: 1 addition & 4 deletions src/simd/distances_avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.

#ifndef DISTANCES_AVX_H
#define DISTANCES_AVX_H
#pragma once

#include <cstddef>
#include <cstdint>
Expand Down Expand Up @@ -126,5 +125,3 @@ fvec_L2sqr_batch_4_bf16_patch_avx(const float* x, const float* y0, const float*
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3);

} // namespace faiss

#endif /* DISTANCES_AVX_H */
48 changes: 15 additions & 33 deletions src/simd/distances_avx512.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
// or implied. See the License for the specific language governing permissions and limitations under the License.

#if defined(__x86_64__)

#include "distances_avx512.h"

#include <immintrin.h>
Expand Down Expand Up @@ -50,10 +51,9 @@ _mm512_bf16_to_fp32(const __m256i& x) {
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_inner_product_avx512(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
res += x[i] * y[i];
}
return res;
Expand All @@ -64,10 +64,9 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_L2sqr_avx512(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
const float tmp = x[i] - y[i];
res += tmp * tmp;
}
Expand Down Expand Up @@ -202,10 +201,8 @@ void
fvec_inner_product_batch_4_avx512(const float* __restrict x, const float* __restrict y0, const float* __restrict y1,
const float* __restrict y2, const float* __restrict y3, const size_t d, float& dis0,
float& dis1, float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
d0 += x[i] * y0[i];
Expand All @@ -225,10 +222,8 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
void
fvec_L2sqr_batch_4_avx512(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
const float q0 = x[i] - y0[i];
Expand Down Expand Up @@ -280,19 +275,17 @@ fvec_norm_L2sqr_avx512(const float* x, size_t d) {

int32_t
ivec_inner_product_avx512(const int8_t* x, const int8_t* y, size_t d) {
size_t i;
int32_t res = 0;
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
res += (int32_t)x[i] * y[i];
}
return res;
}

int32_t
ivec_L2sqr_avx512(const int8_t* x, const int8_t* y, size_t d) {
size_t i;
int32_t res = 0;
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
const int32_t tmp = (int32_t)x[i] - (int32_t)y[i];
res += tmp * tmp;
}
Expand Down Expand Up @@ -439,7 +432,6 @@ fp16_vec_inner_product_batch_4_avx512(const knowhere::fp16* x, const knowhere::f
dis1 = _mm512_reduce_add_ps(m512_res_1);
dis2 = _mm512_reduce_add_ps(m512_res_2);
dis3 = _mm512_reduce_add_ps(m512_res_3);
return;
}

void
Expand Down Expand Up @@ -492,7 +484,6 @@ fp16_vec_L2sqr_batch_4_avx512(const knowhere::fp16* x, const knowhere::fp16* y0,
dis1 = _mm512_reduce_add_ps(m512_res_1);
dis2 = _mm512_reduce_add_ps(m512_res_2);
dis3 = _mm512_reduce_add_ps(m512_res_3);
return;
}

///////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -637,7 +628,6 @@ bf16_vec_inner_product_batch_4_avx512(const knowhere::bf16* x, const knowhere::b
dis1 = _mm512_reduce_add_ps(m512_res_1);
dis2 = _mm512_reduce_add_ps(m512_res_2);
dis3 = _mm512_reduce_add_ps(m512_res_3);
return;
}

void
Expand Down Expand Up @@ -690,7 +680,6 @@ bf16_vec_L2sqr_batch_4_avx512(const knowhere::bf16* x, const knowhere::bf16* y0,
dis1 = _mm512_reduce_add_ps(m512_res_1);
dis2 = _mm512_reduce_add_ps(m512_res_2);
dis3 = _mm512_reduce_add_ps(m512_res_3);
return;
}

///////////////////////////////////////////////////////////////////////////////
Expand All @@ -699,10 +688,9 @@ bf16_vec_L2sqr_batch_4_avx512(const knowhere::bf16* x, const knowhere::bf16* y0,
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_inner_product_bf16_patch_avx512(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
res += x[i] * bf16_float(y[i]);
}
return res;
Expand All @@ -712,10 +700,9 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_END
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_L2sqr_bf16_patch_avx512(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
for (size_t i = 0; i < d; i++) {
const float tmp = x[i] - bf16_float(y[i]);
res += tmp * tmp;
}
Expand All @@ -729,10 +716,8 @@ fvec_inner_product_batch_4_bf16_patch_avx512(const float* __restrict x, const fl
const float* __restrict y1, const float* __restrict y2,
const float* __restrict y3, const size_t d, float& dis0, float& dis1,
float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
d0 += x[i] * bf16_float(y0[i]);
Expand All @@ -752,10 +737,8 @@ FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
void
fvec_L2sqr_batch_4_bf16_patch_avx512(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
float d0 = 0;
float d1 = 0;
float d2 = 0;
float d3 = 0;
float d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
const float q0 = x[i] - bf16_float(y0[i]);
Expand All @@ -776,5 +759,4 @@ fvec_L2sqr_batch_4_bf16_patch_avx512(const float* x, const float* y0, const floa
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

} // namespace faiss

#endif
5 changes: 1 addition & 4 deletions src/simd/distances_avx512.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.

#ifndef DISTANCES_AVX512_H
#define DISTANCES_AVX512_H
#pragma once

#include <cstddef>
#include <cstdint>
Expand Down Expand Up @@ -120,5 +119,3 @@ fvec_L2sqr_batch_4_bf16_patch_avx512(const float* x, const float* y0, const floa
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3);

} // namespace faiss

#endif /* DISTANCES_AVX512_H */
Loading
Loading