diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
index d6870ae1c..73baae6be 100644
--- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -66,13 +66,13 @@ class hnsw_lib : public algo<T> {
   struct build_param {
     int m;
     int ef_construction;
-    int num_threads = omp_get_num_procs();
+    int num_threads = omp_get_max_threads();
   };
 
   using search_param_base = typename algo<T>::search_param;
   struct search_param : public search_param_base {
     int ef;
-    int num_threads = 1;
+    int num_threads = omp_get_max_threads();
   };
 
   hnsw_lib(Metric metric, int dim, const build_param& param);
@@ -175,12 +175,7 @@ void hnsw_lib<T>::set_search_param(const search_param_base& param_, const void*
   auto param     = dynamic_cast<const search_param&>(param_);
   appr_alg_->ef_ = param.ef;
   num_threads_   = param.num_threads;
-  // bench_mode_ = param.metric_objective;
   bench_mode_ = Mode::kLatency;  // TODO(achirkin): pass the benchmark mode in the algo parameters
-
-  // Create a pool if multiple query threads have been set and the pool hasn't been created already
-  bool create_pool = (bench_mode_ == Mode::kLatency && num_threads_ > 1 && !thread_pool_);
-  if (create_pool) { thread_pool_ = std::make_shared<fixed_thread_pool>(num_threads_); }
 }
 
 template <typename T>
@@ -192,7 +187,10 @@ void hnsw_lib<T>::search(
     get_search_knn_results(query + i * dim_, k, indices + i * k, distances + i * k);
   };
   if (bench_mode_ == Mode::kLatency && num_threads_ > 1) {
-    thread_pool_->submit(f, batch_size);
+#pragma omp parallel for num_threads(num_threads_)
+    for (int i = 0; i < batch_size; i++) {
+      f(i);
+    }
   } else {
     for (int i = 0; i < batch_size; i++) {
       f(i);
diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp
index 6ab8631d4..787646ee6 100644
--- a/cpp/src/neighbors/detail/hnsw.hpp
+++ b/cpp/src/neighbors/detail/hnsw.hpp
@@ -489,26 +489,15 @@ void search(raft::resources const& res,
   auto const* hnswlib_index =
     reinterpret_cast<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type> const*>(
       idx.get_index());
+  auto num_threads = params.num_threads == 0 ? omp_get_max_threads() : params.num_threads;
 
-  // when num_threads == 0, automatically maximize parallelism
-  if (params.num_threads) {
-#pragma omp parallel for num_threads(params.num_threads)
-    for (int64_t i = 0; i < queries.extent(0); ++i) {
-      get_search_knn_results(hnswlib_index,
-                             queries.data_handle() + i * queries.extent(1),
-                             neighbors.extent(1),
-                             neighbors.data_handle() + i * neighbors.extent(1),
-                             distances.data_handle() + i * distances.extent(1));
-    }
-  } else {
-#pragma omp parallel for
-    for (int64_t i = 0; i < queries.extent(0); ++i) {
-      get_search_knn_results(hnswlib_index,
-                             queries.data_handle() + i * queries.extent(1),
-                             neighbors.extent(1),
-                             neighbors.data_handle() + i * neighbors.extent(1),
-                             distances.data_handle() + i * distances.extent(1));
-    }
+#pragma omp parallel for num_threads(num_threads)
+  for (int64_t i = 0; i < queries.extent(0); ++i) {
+    get_search_knn_results(hnswlib_index,
+                           queries.data_handle() + i * queries.extent(1),
+                           neighbors.extent(1),
+                           neighbors.data_handle() + i * neighbors.extent(1),
+                           distances.data_handle() + i * distances.extent(1));
   }
 }
 
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
index 063502290..bf5cd35d3 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
@@ -8,7 +8,7 @@ groups:
       graph_degree: [32, 64, 96, 128]
       intermediate_graph_degree: [32, 64, 96, 128]
       graph_build_algo: ["NN_DESCENT"]
-      hierarchy: ["none", "cpu"]
+      hierarchy: ["none", "cpu", "gpu"]
       ef_construction: [64, 128, 256, 512]
     search:
       ef: [10, 20, 40, 60, 80, 120, 200, 400, 600, 800]
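
Not part of the diff above: a minimal standalone sketch of the OpenMP dispatch pattern both C++ files switch to, where a requested thread count of 0 means "use every available core" and the whole query batch runs in a single parallel for instead of going through a hand-rolled thread pool. The names run_batch and process_query are hypothetical stand-ins for the wrapped search call; omp_get_max_threads and omp_get_thread_num are the only real OpenMP calls used.

// Sketch only; compile with e.g. `g++ -fopenmp sketch.cpp`.
#include <omp.h>

#include <cstdint>
#include <cstdio>

// Stand-in for the per-query work (get_search_knn_results in the diff).
static void process_query(int64_t i)
{
  std::printf("query %lld handled by thread %d\n",
              static_cast<long long>(i), omp_get_thread_num());
}

// requested_threads == 0 resolves to all available threads, mirroring
// `params.num_threads == 0 ? omp_get_max_threads() : params.num_threads`.
void run_batch(int64_t batch_size, int requested_threads)
{
  int num_threads = requested_threads == 0 ? omp_get_max_threads() : requested_threads;
#pragma omp parallel for num_threads(num_threads)
  for (int64_t i = 0; i < batch_size; ++i) {
    process_query(i);  // queries are independent, so iterations may run concurrently
  }
}

int main()
{
  run_batch(8, 0);
  return 0;
}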