
Add log-level control in pylibwholegraph to avoid redundant log info.
Merge remote-tracking branch 'upstream/branch-24.02' into logging-level
linhu-nv committed Jan 29, 2024
2 parents 6792748 + 8cb7c5c commit eeb972d
Showing 10 changed files with 53 additions and 16 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -87,3 +87,4 @@ cpp/.idea/
cpp/cmake-build-debug/
pylibwholegraph/.idea/
pylibwholegraph/cmake-build-debug/
compile_commands.json
6 changes: 5 additions & 1 deletion ci/build_cpp.sh
@@ -3,7 +3,11 @@

set -euo pipefail

source rapids-env-update
rapids-configure-conda-channels

source rapids-configure-sccache

source rapids-date-string

export CMAKE_GENERATOR=Ninja

6 changes: 5 additions & 1 deletion ci/build_python.sh
@@ -3,7 +3,11 @@

set -euo pipefail

source rapids-env-update
rapids-configure-conda-channels

source rapids-configure-sccache

source rapids-date-string

export CMAKE_GENERATOR=Ninja

12 changes: 8 additions & 4 deletions cpp/src/wholememory_ops/functions/embedding_optimizer_func.cu
@@ -214,7 +214,8 @@ __global__ void sgd_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[embedding_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[embedding_idx];
float embedding_value = embedding_ptr[embedding_idx];
grad_value += weight_decay * embedding_value;
embedding_value -= lr * grad_value;
@@ -392,7 +393,8 @@ __global__ void lazy_adam_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[local_dim_idx + loop_start_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[local_dim_idx + loop_start_idx];
float embedding_value = embedding_ptr[embedding_idx];
if (AdamW) {
embedding_value -= lr * weight_decay * embedding_value;
@@ -644,7 +646,8 @@ __global__ void ada_grad_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[embedding_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[embedding_idx];
float embedding_value = embedding_ptr[embedding_idx];
grad_value = grad_value + weight_decay * embedding_value;
float state_sum = state_sum_ptr[embedding_idx];
@@ -841,7 +844,8 @@ __global__ void rms_prop_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[local_dim_idx + loop_start_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[local_dim_idx + loop_start_idx];
float embedding_value = embedding_ptr[embedding_idx];
grad_value = grad_value + weight_decay * embedding_value;
float v = v_ptr[embedding_idx];
@@ -29,7 +29,7 @@ class nvshmem_device_reference {
: pointer_(static_cast<DataTypeT*>(nvshmem_ref.pointer)),
typed_stride_(nvshmem_ref.stride / sizeof(DataTypeT))
{
assert(gref.stride % sizeof(DataTypeT) == 0);
assert(nvshmem_ref.stride % sizeof(DataTypeT) == 0);
}

__device__ nvshmem_device_reference() = delete;
1 change: 1 addition & 0 deletions cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu
@@ -185,6 +185,7 @@ wholememory_error_code_t wholememory_gather_nvshmem(
p_env_fns,
stream);
// unregister
WM_CUDA_CHECK(cudaStreamSynchronize(stream));
if (nvshmemx_buffer_unregister(temp_output_ptr) != 0) {
WHOLEMEMORY_ERROR("nvshmemx_buffer_unregister error in wholememory_gather_nvshmem");
}
@@ -149,7 +149,7 @@ struct EmbeddingBackwardTestParams {
wholememory_optimizer_type_t optimizer_type = WHOLEMEMORY_OPT_SGD;
float cache_ratio = 0.2;
bool use_cache = false;
int run_count = 1;
int run_count = 3;

float lr_ = 0.1;

@@ -428,7 +428,7 @@ void prepare_data_and_reference(
int64_t end_entry = (thread_rank + 1) * total_entry_count / thread_world_size;
CPUOptimizer cpu_optimizer(&params, start_entry, end_entry);
int embedding_dim = params.grad_description.sizes[1];
for (int step = 0; step <= params.run_count; step++) {
for (int step = 0; step < params.run_count; step++) {
int step_id = std::min(step, params.run_count - 1);
std::vector<int64_t> indices;
std::vector<std::vector<float>> grads;
@@ -625,7 +625,7 @@ TEST_P(WholeMemoryEmbeddingBackwardParameterTests, EmbeddingGatherGradientApplyT
EXPECT_EQ(cudaStreamSynchronize(nullptr), cudaSuccess);
EXPECT_EQ(wholememory_communicator_barrier(wm_comm), WHOLEMEMORY_SUCCESS);

for (int run = 0; run <= params.run_count; run++) {
for (int run = 0; run < params.run_count; run++) {
int step_id = std::min(run, params.run_count - 1);
auto& rank_indices_vec = step_rank_indices[step_id][world_rank];
auto& rank_grads_vec = step_rank_grads[step_id][world_rank];
@@ -737,6 +737,8 @@ INSTANTIATE_TEST_SUITE_P(
EmbeddingBackwardTestParams().set_use_cache().set_indice_count(10000127).set_optimizer_type(WHOLEMEMORY_OPT_ADAGRAD),
EmbeddingBackwardTestParams().set_use_cache().set_indice_count(10000127).set_optimizer_type(WHOLEMEMORY_OPT_LAZY_ADAM),
#endif
EmbeddingBackwardTestParams().set_entry_count(500).set_indice_count(400).set_embedding_dim(4),
EmbeddingBackwardTestParams().set_embedding_dim(3),
EmbeddingBackwardTestParams().set_use_cache().set_grad_stride(131),
EmbeddingBackwardTestParams().set_use_cache().set_grad_stride(131).set_optimizer_type(
WHOLEMEMORY_OPT_RMSPROP),
16 changes: 11 additions & 5 deletions dependencies.yaml
@@ -192,21 +192,23 @@ dependencies:
packages: []
test_cpp:
common:
- output_types: [conda, requirements]
- output_types: [conda]
packages:
- nccl
test_python:
common:
- output_types: [conda, requirements]
- output_types: [conda]
packages:
- c-compiler
- cxx-compiler
- nccl
- output_types: [conda, requirements]
packages:
- ninja
- numpy>=1.17
- pytest
- pytest-forked
- pytest-xdist
- nccl
specific:
- output_types: [conda, requirements]
matrices:
@@ -273,10 +275,12 @@ dependencies:
packages:
docs:
common:
- output_types: [conda]
packages:
- *doxygen
- output_types: [conda, requirements]
packages:
- breathe
- *doxygen
- graphviz
- ipython
- ipykernel
@@ -297,10 +301,12 @@
clang_tools:
common:
- output_types: [conda, requirements]
packages:
- gitpython
- output_types: conda
packages:
- clangxx==16.0.6
- clang-tools==16.0.6
- gitpython
python_build_wheel:
common:
- output_types: [pyproject]
13 changes: 13 additions & 0 deletions python/pylibwholegraph/pylibwholegraph/torch/comm.py
@@ -32,6 +32,19 @@
all_comm_local_size = 1


def reset_communicators():
    global all_comm_world_rank, all_comm_world_size, all_comm_local_rank, all_comm_local_size
    global global_communicators, local_node_communicator, local_device_communicator
    global_communicators = {}
    local_node_communicator = None
    local_device_communicator = None

    all_comm_world_rank = 0
    all_comm_world_size = 1
    all_comm_local_rank = 0
    all_comm_local_size = 1


def set_world_info(world_rank: int, world_size: int, local_rank: int, local_size: int):
"""
Set the global world's information. This is used for create common used communicators, like local node communicator,
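
The new reset_communicators() helper returns the module-level communicator caches and world-info globals shown at the top of this hunk to their initial values, so a process can tear its communicators down and start over. Below is a minimal sketch of the intent, assuming the module path pylibwholegraph.torch.comm implied by this file's location; the call sequence is illustrative, not taken from the repository:

    import pylibwholegraph.torch.comm as wm_comm

    # Drop any cached communicator objects and restore the default world info
    # (rank 0 in a world of size 1); afterwards the module behaves as if no
    # communicator had ever been created in this process.
    wm_comm.reset_communicators()

In practice user code should rarely need to call this directly, since finalize() now invokes it during shutdown (see the initialize.py hunk below).
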
4 changes: 3 additions & 1 deletion python/pylibwholegraph/pylibwholegraph/torch/initialize.py
@@ -15,7 +15,7 @@
import torch
import torch.utils.dlpack
import pylibwholegraph.binding.wholememory_binding as wmb
from .comm import set_world_info, get_global_communicator, get_local_node_communicator
from .comm import set_world_info, get_global_communicator, get_local_node_communicator, reset_communicators


def init(world_rank: int, world_size: int, local_rank: int, local_size: int, wm_log_level="info"):
@@ -80,3 +80,5 @@ def finalize():
    :return: None
    """
    wmb.finalize()
    reset_communicators()
    torch.distributed.destroy_process_group() if torch.distributed.is_initialized() else None
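
With this change finalize() shuts down in three steps: the WholeMemory core (wmb.finalize()), the Python-side communicator caches (reset_communicators()), and finally the torch.distributed process group, if one is still initialized. Combined with the wm_log_level argument of init() visible in the context line above, a typical lifecycle might look like the sketch below; the rank/size values and the "warn" level string are assumptions for illustration, since the accepted level names are not shown in this diff:

    import pylibwholegraph.torch.initialize as wm_init

    # The commit title refers to making this level configurable so that
    # redundant informational messages can be suppressed.
    wm_init.init(world_rank=0, world_size=1, local_rank=0, local_size=1,
                 wm_log_level="warn")

    # ... create communicators and embeddings, run the workload ...

    wm_init.finalize()  # wmb.finalize() -> reset_communicators() -> destroy_process_group()
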
