zilliztech · sre-ci-robot · Feb 19, 2025 · Feb 18, 2025 · alexanderguzhva · Feb 18, 2025
diff --git a/benchmark/hdf5/gen_fbin_file.cpp b/benchmark/hdf5/gen_fbin_file.cpp
@@ -176,7 +176,7 @@ class Create_FBIN : public Benchmark_hdf5, public ::testing::Test {
             // convert golden_ids to int32
             auto elem_cnt = result.value()->GetLims()[nq];
             std::vector<uint32_t> gt_ids_int(elem_cnt);
-            for (int32_t i = 0; i < elem_cnt; i++) {
+            for (size_t i = 0; i < elem_cnt; i++) {
                 gt_ids_int[i] = result.value()->GetIds()[i];
             }
 

diff --git a/cmake/utils/compile_flags.cmake b/cmake/utils/compile_flags.cmake
@@ -16,6 +16,7 @@ if(WITH_ASAN)
 endif()
 
 set(CMAKE_CXX_FLAGS "-Wall -fPIC ${CMAKE_CXX_FLAGS}")
+#set(CMAKE_CXX_FLAGS "-Wall -Werror -fPIC ${CMAKE_CXX_FLAGS}")
 
 if(__X86_64)
   set(CMAKE_CXX_FLAGS "-msse4.2 ${CMAKE_CXX_FLAGS}")

diff --git a/include/knowhere/comp/knowhere_check.h b/include/knowhere/comp/knowhere_check.h
@@ -19,7 +19,8 @@
 
 namespace knowhere {
 namespace KnowhereCheck {
-static bool
+
+__attribute__((unused)) static bool
 IndexTypeAndDataTypeCheck(const std::string& index_name, VecType data_type) {
     auto& static_index_table = std::get<0>(IndexFactory::StaticIndexTableInstance());
     auto key = std::pair<std::string, VecType>(index_name, data_type);
@@ -30,7 +31,7 @@ IndexTypeAndDataTypeCheck(const std::string& index_name, VecType data_type) {
     }
 }
 
-static bool
+__attribute__((unused)) static bool
 SupportMmapIndexTypeCheck(const std::string& index_name) {
     auto& mmap_index_table = std::get<1>(IndexFactory::StaticIndexTableInstance());
     if (mmap_index_table.find(index_name) != mmap_index_table.end()) {

diff --git a/include/knowhere/utils.h b/include/knowhere/utils.h
@@ -143,7 +143,13 @@ data_type_conversion(const DataSet& src, const std::optional<int64_t> start = st
 
     // map
     auto* des_data = new OutType[out_dim * count_rows];
-    std::memset(des_data, 0, sizeof(OutType) * out_dim * count_rows);
+
+    // only do memset() for intrinsic data types, such as float;
+    // for fp16/bf16, they will be initialized by the constructor
+    if constexpr (std::is_arithmetic_v<OutType>) {
+        std::memset(des_data, 0, sizeof(OutType) * out_dim * count_rows);
+    }
+
     auto* src_data = (const InType*)src.GetTensor();
     for (auto i = 0; i < count_rows; i++) {
         for (auto d = 0; d < in_dim; d++) {

diff --git a/src/simd/distances_avx.cc b/src/simd/distances_avx.cc
@@ -18,14 +18,14 @@
 #include <cassert>
 
 #include "faiss/impl/platform_macros.h"
-#include "simd_util.h"
 
 namespace faiss {
 
 #define ALIGNED(x) __attribute__((aligned(x)))
 
+namespace {
 // reads 0 <= d < 4 floats as __m128
-static inline __m128
+inline __m128
 masked_read(int d, const float* x) {
     assert(0 <= d && d < 4);
     ALIGNED(16) float buf[4] = {0, 0, 0, 0};
@@ -41,6 +41,46 @@ masked_read(int d, const float* x) {
     // cannot use AVX2 _mm_mask_set1_epi32
 }
 
+inline __m128i
+mm_masked_read_short(int d, const uint16_t* x) {
+    assert(0 <= d && d < 8);
+    ALIGNED(16) uint16_t buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+    switch (d) {
+        case 7:
+            buf[6] = x[6];
+        case 6:
+            buf[5] = x[5];
+        case 5:
+            buf[4] = x[4];
+        case 4:
+            buf[3] = x[3];
+        case 3:
+            buf[2] = x[2];
+        case 2:
+            buf[1] = x[1];
+        case 1:
+            buf[0] = x[0];
+    }
+    return _mm_loadu_si128((__m128i*)buf);
+}
+
+inline __m256
+_mm256_bf16_to_fp32(const __m128i& a) {
+    __m256i o = _mm256_slli_epi32(_mm256_cvtepu16_epi32(a), 16);
+    return _mm256_castsi256_ps(o);
+}
+
+inline float
+_mm256_reduce_add_ps(const __m256 res) {
+    const __m128 sum = _mm_add_ps(_mm256_castps256_ps128(res), _mm256_extractf128_ps(res, 1));
+    const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2));
+    const __m128 v1 = _mm_add_ps(sum, v0);
+    __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1));
+    const __m128 v3 = _mm_add_ps(v1, v2);
+    return _mm_cvtss_f32(v3);
+}
+}  // namespace
+
 // trust the compiler to unroll this properly
 FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 float

diff --git a/src/simd/distances_avx512.cc b/src/simd/distances_avx512.cc
@@ -19,12 +19,12 @@
 #include <string>
 
 #include "faiss/impl/platform_macros.h"
-#include "simd_util.h"
 
 namespace faiss {
 
+namespace {
 // reads 0 <= d < 4 floats as __m128
-static inline __m128
+inline __m128
 masked_read(int d, const float* x) {
     assert(0 <= d && d < 4);
     __attribute__((__aligned__(16))) float buf[4] = {0, 0, 0, 0};
@@ -40,6 +40,12 @@ masked_read(int d, const float* x) {
     // cannot use AVX2 _mm_mask_set1_epi32
 }
 
+inline __m512
+_mm512_bf16_to_fp32(const __m256i& x) {
+    return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(x), 16));
+}
+}  // namespace
+
 // trust the compiler to unroll this properly
 FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 float

diff --git a/src/simd/distances_neon.cc b/src/simd/distances_neon.cc
@@ -15,9 +15,51 @@
 #include <arm_neon.h>
 #include <math.h>
 
-#include "simd_util.h"
 namespace faiss {
 
+namespace {
+inline float32x4x4_t
+vcvt4_f32_f16(const float16x4x4_t a) {
+    float32x4x4_t c;
+    c.val[0] = vcvt_f32_f16(a.val[0]);
+    c.val[1] = vcvt_f32_f16(a.val[1]);
+    c.val[2] = vcvt_f32_f16(a.val[2]);
+    c.val[3] = vcvt_f32_f16(a.val[3]);
+    return c;
+}
+
+inline float32x4x2_t
+vcvt2_f32_f16(const float16x4x2_t a) {
+    float32x4x2_t c;
+    c.val[0] = vcvt_f32_f16(a.val[0]);
+    c.val[1] = vcvt_f32_f16(a.val[1]);
+    return c;
+}
+
+inline float32x4x4_t
+vcvt4_f32_half(const uint16x4x4_t x) {
+    float32x4x4_t c;
+    c.val[0] = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(x.val[0]), 16));
+    c.val[1] = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(x.val[1]), 16));
+    c.val[2] = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(x.val[2]), 16));
+    c.val[3] = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(x.val[3]), 16));
+    return c;
+}
+
+inline float32x4x2_t
+vcvt2_f32_half(const uint16x4x2_t x) {
+    float32x4x2_t c;
+    c.val[0] = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(x.val[0]), 16));
+    c.val[1] = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(x.val[1]), 16));
+    return c;
+}
+
+inline float32x4_t
+vcvt_f32_half(const uint16x4_t x) {
+    return vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(x), 16));
+}
+}  // namespace
+
 // The main goal is to reduce the original precision of floats to maintain consistency with the distance result
 // precision of the cardinal index.
 __attribute__((always_inline)) inline float32x4_t

diff --git a/src/simd/distances_sse.cc b/src/simd/distances_sse.cc
@@ -18,7 +18,6 @@
 #include <cstdint>
 
 #include "distances_ref.h"
-#include "simd_util.h"
 
 namespace faiss {
 
@@ -28,8 +27,9 @@ namespace faiss {
  * SSE and AVX implementations
  */
 
+namespace {
 // reads 0 <= d < 4 floats as __m128
-static inline __m128
+inline __m128
 masked_read(int d, const float* x) {
     assert(0 <= d && d < 4);
     ALIGNED(16) float buf[4] = {0, 0, 0, 0};
@@ -45,6 +45,36 @@ masked_read(int d, const float* x) {
     // cannot use AVX2 _mm_mask_set1_epi32
 }
 
+inline __m128i
+mm_masked_read_short(int d, const uint16_t* x) {
+    assert(0 <= d && d < 8);
+    ALIGNED(16) uint16_t buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+    switch (d) {
+        case 7:
+            buf[6] = x[6];
+        case 6:
+            buf[5] = x[5];
+        case 5:
+            buf[4] = x[4];
+        case 4:
+            buf[3] = x[3];
+        case 3:
+            buf[2] = x[2];
+        case 2:
+            buf[1] = x[1];
+        case 1:
+            buf[0] = x[0];
+    }
+    return _mm_loadu_si128((__m128i*)buf);
+}
+
+inline __m128
+_mm_bf16_to_fp32(const __m128i& a) {
+    auto o = _mm_slli_epi32(_mm_cvtepu16_epi32(a), 16);
+    return _mm_castsi128_ps(o);
+}
+}  // namespace
+
 float
 fvec_norm_L2sqr_sse(const float* x, size_t d) {
     __m128 mx;

diff --git a/src/simd/simd_util.h b/src/simd/simd_util.h
diff --git a/tests/ut/test_data_view_index.cc b/tests/ut/test_data_view_index.cc
@@ -170,7 +170,7 @@ TEST_CASE("Ensure topk test", "[float metrics]") {
         {knowhere::meta::TOPK, topk},
     };
     knowhere::ViewDataOp data_view = [&train_ds, data_size = sizeof(float) * dim](size_t id) {
-        auto data = train_ds->GetTensor();
+        auto data = (const char*)train_ds->GetTensor();
         return data + data_size * id;
     };
     auto data_view_pack = knowhere::Pack(data_view);
@@ -206,7 +206,6 @@ BaseTest(const knowhere::DataSetPtr train_ds, const knowhere::DataSetPtr query_d
     auto base = knowhere::ConvertToDataTypeIfNeeded<DataType>(train_ds);
     auto query = knowhere::ConvertToDataTypeIfNeeded<DataType>(query_ds);
     auto dim = base->GetDim();
-    auto nb = base->GetRows();
     auto nq = query->GetRows();
 
     auto knn_gt = knowhere::BruteForce::Search<DataType>(base, query, conf, nullptr);