From 2c180f1ce1ecef5087bb270dc60143296de6838d Mon Sep 17 00:00:00 2001 From: rhdong Date: Mon, 10 Feb 2025 04:42:05 -0800 Subject: [PATCH] fix: initial graph logical error on small dataset --- .../neighbors/detail/cagra/cagra_build.cuh | 4 +-- cpp/src/neighbors/detail/cagra/graph_core.cuh | 4 +-- cpp/src/neighbors/detail/nn_descent.cuh | 28 +++++++++++-------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 4807d7642..4d559a662 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -574,10 +574,10 @@ index build( size_t intermediate_degree = params.intermediate_graph_degree; size_t graph_degree = params.graph_degree; if (intermediate_degree >= static_cast(dataset.extent(0))) { - intermediate_degree = dataset.extent(0) - 1; RAFT_LOG_WARN( "Intermediate graph degree cannot be larger than dataset size, reducing it to %lu", - intermediate_degree); + dataset.extent(0)); + intermediate_degree = dataset.extent(0) - 1; } if (intermediate_degree < graph_degree) { RAFT_LOG_WARN( diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 33a5e2c6e..4adaa7a93 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -1242,10 +1242,8 @@ void optimize( } if (pk != output_graph_degree) { RAFT_LOG_DEBUG( - "Couldn't find the output_graph_degree (%lu vs pk: %lu) smallest detourable count nodes " - "for " + "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " "node %lu in the rank-based node reranking process", - pk, output_graph_degree, i); invalid_neighbor_list = true; diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index b9dda3617..3d94b390f 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -973,7 +973,7 @@ int insert_to_ordered_list(InternalID_t* list, bool position_found = false; for (int i = 0; i < width; i++) { if (list[i].id() == neighb_id.id()) { - if (dist_list[i] == std::numeric_limits::max() && dist != dist_list[i]) { + if (dist_list[i] == std::numeric_limits::max()) { idx_insert = i; dist_list[i] = dist; } @@ -1050,30 +1050,34 @@ void GnndGraph::sample_graph_new(InternalID_t* new_neighbors, template void GnndGraph::init_random_graph() { - const bool small_dataset = (nrow <= 4 * segment_size); for (size_t seg_idx = 0; seg_idx < static_cast(num_segments); seg_idx++) { // random sequence (range: 0~nrow) // segment_x stores neighbors which id % num_segments == x - std::vector rand_seq(nrow / num_segments); + std::vector rand_seq((nrow + num_segments - 1) / num_segments); std::iota(rand_seq.begin(), rand_seq.end(), 0); auto gen = std::default_random_engine{seg_idx}; std::shuffle(rand_seq.begin(), rand_seq.end(), gen); #pragma omp parallel for for (size_t i = 0; i < nrow; i++) { - size_t base_idx = i * node_degree + seg_idx * segment_size; - auto h_neighbor_list = h_graph + base_idx; - auto h_dist_list = h_dists.data_handle() + base_idx; - size_t idx = base_idx; + size_t base_idx = i * node_degree + seg_idx * segment_size; + auto h_neighbor_list = h_graph + base_idx; + auto h_dist_list = h_dists.data_handle() + base_idx; + size_t idx = base_idx; + size_t self_in_this_seg = 0; for (size_t j = 0; j < static_cast(segment_size); j++) { Index_t id = rand_seq[idx % rand_seq.size()] * num_segments + seg_idx; - if ((size_t)id == i) { - idx = small_dataset ? (idx + 1) : (idx + segment_size); - id = rand_seq[idx % rand_seq.size()] * num_segments + seg_idx; + while ((size_t)id == i) { + idx++; + id = rand_seq[idx % rand_seq.size()] * num_segments + seg_idx; + self_in_this_seg = 1; } - h_neighbor_list[j].id_with_flag() = id; - h_dist_list[j] = std::numeric_limits::max(); + h_neighbor_list[j].id_with_flag() = + j < (rand_seq.size() - self_in_this_seg) && size_t(id) < nrow + ? id + : std::numeric_limits::max(); + h_dist_list[j] = std::numeric_limits::max(); idx++; } }