Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Int8Vector SIMD APIs for avx512/avx2/sse/ref #1098

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions src/simd/distances_avx.cc
Original file line number Diff line number Diff line change
Expand Up @@ -876,6 +876,95 @@ bf16_vec_L2sqr_batch_4_avx(const knowhere::bf16* x, const knowhere::bf16* y0, co
dis3 = _mm256_reduce_add_ps(msum_3);
}

///////////////////////////////////////////////////////////////////////////////
// int8

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
int8_vec_inner_product_avx(const int8_t* x, const int8_t* y, size_t d) {
int32_t res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; i++) {
res += (int32_t)x[i] * (int32_t)y[i];
}
return (float)res;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
int8_vec_L2sqr_avx(const int8_t* x, const int8_t* y, size_t d) {
int32_t res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; i++) {
const int32_t tmp = (int32_t)x[i] - (int32_t)y[i];
res += tmp * tmp;
}
return (float)res;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
int8_vec_norm_L2sqr_avx(const int8_t* x, size_t d) {
int32_t res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; i++) {
res += (int32_t)x[i] * (int32_t)x[i];
}
return (float)res;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
void
int8_vec_inner_product_batch_4_avx(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2,
const int8_t* y3, const size_t d, float& dis0, float& dis1, float& dis2,
float& dis3) {
int32_t d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
auto x_i = (int32_t)x[i];
d0 += x_i * (int32_t)y0[i];
d1 += x_i * (int32_t)y1[i];
d2 += x_i * (int32_t)y2[i];
d3 += x_i * (int32_t)y3[i];
}

dis0 = (float)d0;
dis1 = (float)d1;
dis2 = (float)d2;
dis3 = (float)d3;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
void
int8_vec_L2sqr_batch_4_avx(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2, const int8_t* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
int32_t d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
auto x_i = (int32_t)x[i];
const int32_t q0 = x_i - (int32_t)y0[i];
const int32_t q1 = x_i - (int32_t)y1[i];
const int32_t q2 = x_i - (int32_t)y2[i];
const int32_t q3 = x_i - (int32_t)y3[i];
d0 += q0 * q0;
d1 += q1 * q1;
d2 += q2 * q2;
d3 += q3 * q3;
}

dis0 = (float)d0;
dis1 = (float)d1;
dis2 = (float)d2;
dis3 = (float)d3;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

///////////////////////////////////////////////////////////////////////////////
// for cardinal

Expand Down
21 changes: 21 additions & 0 deletions src/simd/distances_avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,27 @@ bf16_vec_L2sqr_batch_4_avx(const knowhere::bf16* x, const knowhere::bf16* y0, co
const knowhere::bf16* y2, const knowhere::bf16* y3, const size_t d, float& dis0, float& dis1,
float& dis2, float& dis3);

///////////////////////////////////////////////////////////////////////////////
// int8

float
int8_vec_inner_product_avx(const int8_t* x, const int8_t* y, size_t d);

float
int8_vec_L2sqr_avx(const int8_t* x, const int8_t* y, size_t d);

float
int8_vec_norm_L2sqr_avx(const int8_t* x, size_t d);

void
int8_vec_inner_product_batch_4_avx(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2,
const int8_t* y3, const size_t d, float& dis0, float& dis1, float& dis2,
float& dis3);

void
int8_vec_L2sqr_batch_4_avx(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2, const int8_t* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3);

///////////////////////////////////////////////////////////////////////////////
// for cardinal

Expand Down
89 changes: 89 additions & 0 deletions src/simd/distances_avx512.cc
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,95 @@ bf16_vec_L2sqr_batch_4_avx512(const knowhere::bf16* x, const knowhere::bf16* y0,
dis3 = _mm512_reduce_add_ps(m512_res_3);
}

///////////////////////////////////////////////////////////////////////////////
// int8

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
int8_vec_inner_product_avx512(const int8_t* x, const int8_t* y, size_t d) {
int32_t res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; i++) {
res += (int32_t)x[i] * (int32_t)y[i];
}
return (float)res;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
int8_vec_L2sqr_avx512(const int8_t* x, const int8_t* y, size_t d) {
int32_t res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; i++) {
const int32_t tmp = (int32_t)x[i] - (int32_t)y[i];
res += tmp * tmp;
}
return (float)res;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
int8_vec_norm_L2sqr_avx512(const int8_t* x, size_t d) {
int32_t res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; i++) {
res += (int32_t)x[i] * (int32_t)x[i];
}
return (float)res;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
void
int8_vec_inner_product_batch_4_avx512(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2,
const int8_t* y3, const size_t d, float& dis0, float& dis1, float& dis2,
float& dis3) {
int32_t d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
auto x_i = (int32_t)x[i];
d0 += x_i * (int32_t)y0[i];
d1 += x_i * (int32_t)y1[i];
d2 += x_i * (int32_t)y2[i];
d3 += x_i * (int32_t)y3[i];
}

dis0 = (float)d0;
dis1 = (float)d1;
dis2 = (float)d2;
dis3 = (float)d3;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
void
int8_vec_L2sqr_batch_4_avx512(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2, const int8_t* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
int32_t d0 = 0, d1 = 0, d2 = 0, d3 = 0;

FAISS_PRAGMA_IMPRECISE_LOOP
for (size_t i = 0; i < d; ++i) {
auto x_i = (int32_t)x[i];
const int32_t q0 = x_i - (int32_t)y0[i];
const int32_t q1 = x_i - (int32_t)y1[i];
const int32_t q2 = x_i - (int32_t)y2[i];
const int32_t q3 = x_i - (int32_t)y3[i];
d0 += q0 * q0;
d1 += q1 * q1;
d2 += q2 * q2;
d3 += q3 * q3;
}

dis0 = (float)d0;
dis1 = (float)d1;
dis2 = (float)d2;
dis3 = (float)d3;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END

///////////////////////////////////////////////////////////////////////////////
// for cardinal

Expand Down
21 changes: 21 additions & 0 deletions src/simd/distances_avx512.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,27 @@ bf16_vec_L2sqr_batch_4_avx512(const knowhere::bf16* x, const knowhere::bf16* y0,
const knowhere::bf16* y2, const knowhere::bf16* y3, const size_t d, float& dis0,
float& dis1, float& dis2, float& dis3);

///////////////////////////////////////////////////////////////////////////////
// int8

float
int8_vec_inner_product_avx512(const int8_t* x, const int8_t* y, size_t d);

float
int8_vec_L2sqr_avx512(const int8_t* x, const int8_t* y, size_t d);

float
int8_vec_norm_L2sqr_avx512(const int8_t* x, size_t d);

void
int8_vec_inner_product_batch_4_avx512(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2,
const int8_t* y3, const size_t d, float& dis0, float& dis1, float& dis2,
float& dis3);

void
int8_vec_L2sqr_batch_4_avx512(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2, const int8_t* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3);

///////////////////////////////////////////////////////////////////////////////
// for cardinal

Expand Down
79 changes: 79 additions & 0 deletions src/simd/distances_neon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2113,6 +2113,85 @@ bf16_vec_L2sqr_batch_4_neon(const knowhere::bf16* x, const knowhere::bf16* y0, c
dis3 = vaddvq_f32(res.val[3]);
}

///////////////////////////////////////////////////////////////////////////////
// int8

float
int8_vec_inner_product_neon(const int8_t* x, const int8_t* y, size_t d) {
// TODO caiyd: use ref implementation temporarily
int32_t res = 0;
for (size_t i = 0; i < d; i++) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Duplicated code here
{
return int8_vec_inner_product_ref(x, y, d);
}

Copy link
Collaborator

@alexanderguzhva alexanderguzhva Feb 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, the duplicated code is correct, bcz it will be compiled with different compiler options

res += (int32_t)x[i] * (int32_t)y[i];
}
return (float)res;
}

float
int8_vec_L2sqr_neon(const int8_t* x, const int8_t* y, size_t d) {
// TODO caiyd: use ref implementation temporarily
int32_t res = 0;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above

for (size_t i = 0; i < d; i++) {
const int32_t tmp = (int32_t)x[i] - (int32_t)y[i];
res += tmp * tmp;
}
return (float)res;
}

float
int8_vec_norm_L2sqr_neon(const int8_t* x, size_t d) {
// TODO caiyd: use ref implementation temporarily
int32_t res = 0;
for (size_t i = 0; i < d; i++) {
res += (int32_t)x[i] * (int32_t)x[i];
}
return (float)res;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

implement a neon version.

}

void
int8_vec_inner_product_batch_4_neon(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2,
const int8_t* y3, const size_t d, float& dis0, float& dis1, float& dis2,
float& dis3) {
// TODO caiyd: use ref implementation temporarily
int32_t d0 = 0, d1 = 0, d2 = 0, d3 = 0;

for (size_t i = 0; i < d; ++i) {
auto x_i = (int32_t)x[i];
d0 += x_i * (int32_t)y0[i];
d1 += x_i * (int32_t)y1[i];
d2 += x_i * (int32_t)y2[i];
d3 += x_i * (int32_t)y3[i];
}

dis0 = (float)d0;
dis1 = (float)d1;
dis2 = (float)d2;
dis3 = (float)d3;
}

void
int8_vec_L2sqr_batch_4_neon(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2, const int8_t* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
// TODO caiyd: use ref implementation temporarily
int32_t d0 = 0, d1 = 0, d2 = 0, d3 = 0;

for (size_t i = 0; i < d; ++i) {
auto x_i = (int32_t)x[i];
const int32_t q0 = x_i - (int32_t)y0[i];
const int32_t q1 = x_i - (int32_t)y1[i];
const int32_t q2 = x_i - (int32_t)y2[i];
const int32_t q3 = x_i - (int32_t)y3[i];
d0 += q0 * q0;
d1 += q1 * q1;
d2 += q2 * q2;
d3 += q3 * q3;
}

dis0 = (float)d0;
dis1 = (float)d1;
dis2 = (float)d2;
dis3 = (float)d3;
}

///////////////////////////////////////////////////////////////////////////////
// for cardinal

Expand Down
21 changes: 21 additions & 0 deletions src/simd/distances_neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,27 @@ bf16_vec_L2sqr_batch_4_neon(const knowhere::bf16* x, const knowhere::bf16* y0, c
const knowhere::bf16* y2, const knowhere::bf16* y3, const size_t d, float& dis0,
float& dis1, float& dis2, float& dis3);

///////////////////////////////////////////////////////////////////////////////
// int8

float
int8_vec_inner_product_neon(const int8_t* x, const int8_t* y, size_t d);

float
int8_vec_L2sqr_neon(const int8_t* x, const int8_t* y, size_t d);

float
int8_vec_norm_L2sqr_neon(const int8_t* x, size_t d);

void
int8_vec_inner_product_batch_4_neon(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2,
const int8_t* y3, const size_t d, float& dis0, float& dis1, float& dis2,
float& dis3);

void
int8_vec_L2sqr_batch_4_neon(const int8_t* x, const int8_t* y0, const int8_t* y1, const int8_t* y2, const int8_t* y3,
const size_t d, float& dis0, float& dis1, float& dis2, float& dis3);

///////////////////////////////////////////////////////////////////////////////
// for cardinal

Expand Down
Loading
Loading