
Add log-level control in pylibwholegraph to avoid redundant log info.
Merge remote-tracking branch 'upstream/branch-24.02' into logging-level
linhu-nv committed Jan 29, 2024
2 parents 6792748 + 8cb7c5c commit eeb972d
Showing 10 changed files with 53 additions and 16 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -87,3 +87,4 @@ cpp/.idea/
cpp/cmake-build-debug/
pylibwholegraph/.idea/
pylibwholegraph/cmake-build-debug/
compile_commands.json
6 changes: 5 additions & 1 deletion ci/build_cpp.sh
@@ -3,7 +3,11 @@

set -euo pipefail

source rapids-env-update
rapids-configure-conda-channels

source rapids-configure-sccache

source rapids-date-string

export CMAKE_GENERATOR=Ninja

6 changes: 5 additions & 1 deletion ci/build_python.sh
@@ -3,7 +3,11 @@

set -euo pipefail

source rapids-env-update
rapids-configure-conda-channels

source rapids-configure-sccache

source rapids-date-string

export CMAKE_GENERATOR=Ninja

12 changes: 8 additions & 4 deletions cpp/src/wholememory_ops/functions/embedding_optimizer_func.cu
@@ -214,7 +214,8 @@ __global__ void sgd_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[embedding_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[embedding_idx];
float embedding_value = embedding_ptr[embedding_idx];
grad_value += weight_decay * embedding_value;
embedding_value -= lr * grad_value;
@@ -392,7 +393,8 @@ __global__ void lazy_adam_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[local_dim_idx + loop_start_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[local_dim_idx + loop_start_idx];
float embedding_value = embedding_ptr[embedding_idx];
if (AdamW) {
embedding_value -= lr * weight_decay * embedding_value;
@@ -644,7 +646,8 @@ __global__ void ada_grad_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[embedding_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[embedding_idx];
float embedding_value = embedding_ptr[embedding_idx];
grad_value = grad_value + weight_decay * embedding_value;
float state_sum = state_sum_ptr[embedding_idx];
@@ -841,7 +844,8 @@ __global__ void rms_prop_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[local_dim_idx + loop_start_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[local_dim_idx + loop_start_idx];
float embedding_value = embedding_ptr[embedding_idx];
grad_value = grad_value + weight_decay * embedding_value;
float v = v_ptr[embedding_idx];
@@ -29,7 +29,7 @@ class nvshmem_device_reference {
: pointer_(static_cast<DataTypeT*>(nvshmem_ref.pointer)),
typed_stride_(nvshmem_ref.stride / sizeof(DataTypeT))
{
assert(gref.stride % sizeof(DataTypeT) == 0);
assert(nvshmem_ref.stride % sizeof(DataTypeT) == 0);
}

__device__ nvshmem_device_reference() = delete;
1 change: 1 addition & 0 deletions cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu
@@ -185,6 +185,7 @@ wholememory_error_code_t wholememory_gather_nvshmem(
p_env_fns,
stream);
// unregister
WM_CUDA_CHECK(cudaStreamSynchronize(stream));
if (nvshmemx_buffer_unregister(temp_output_ptr) != 0) {
WHOLEMEMORY_ERROR("nvshmemx_buffer_unregister error in wholememory_gather_nvshmem");
}
@@ -149,7 +149,7 @@ struct EmbeddingBackwardTestParams {
wholememory_optimizer_type_t optimizer_type = WHOLEMEMORY_OPT_SGD;
float cache_ratio = 0.2;
bool use_cache = false;
int run_count = 1;
int run_count = 3;

float lr_ = 0.1;

@@ -428,7 +428,7 @@ void prepare_data_and_reference(
int64_t end_entry = (thread_rank + 1) * total_entry_count / thread_world_size;
CPUOptimizer cpu_optimizer(&params, start_entry, end_entry);
int embedding_dim = params.grad_description.sizes[1];
for (int step = 0; step <= params.run_count; step++) {
for (int step = 0; step < params.run_count; step++) {
int step_id = std::min(step, params.run_count - 1);
std::vector<int64_t> indices;
std::vector<std::vector<float>> grads;
@@ -625,7 +625,7 @@ TEST_P(WholeMemoryEmbeddingBackwardParameterTests, EmbeddingGatherGradientApplyT
EXPECT_EQ(cudaStreamSynchronize(nullptr), cudaSuccess);
EXPECT_EQ(wholememory_communicator_barrier(wm_comm), WHOLEMEMORY_SUCCESS);

for (int run = 0; run <= params.run_count; run++) {
for (int run = 0; run < params.run_count; run++) {
int step_id = std::min(run, params.run_count - 1);
auto& rank_indices_vec = step_rank_indices[step_id][world_rank];
auto& rank_grads_vec = step_rank_grads[step_id][world_rank];
@@ -737,6 +737,8 @@ INSTANTIATE_TEST_SUITE_P(
EmbeddingBackwardTestParams().set_use_cache().set_indice_count(10000127).set_optimizer_type(WHOLEMEMORY_OPT_ADAGRAD),
EmbeddingBackwardTestParams().set_use_cache().set_indice_count(10000127).set_optimizer_type(WHOLEMEMORY_OPT_LAZY_ADAM),
#endif
EmbeddingBackwardTestParams().set_entry_count(500).set_indice_count(400).set_embedding_dim(4),
EmbeddingBackwardTestParams().set_embedding_dim(3),
EmbeddingBackwardTestParams().set_use_cache().set_grad_stride(131),
EmbeddingBackwardTestParams().set_use_cache().set_grad_stride(131).set_optimizer_type(
WHOLEMEMORY_OPT_RMSPROP),
16 changes: 11 additions & 5 deletions dependencies.yaml
@@ -192,21 +192,23 @@ dependencies:
packages: []
test_cpp:
common:
- output_types: [conda, requirements]
- output_types: [conda]
packages:
- nccl
test_python:
common:
- output_types: [conda, requirements]
- output_types: [conda]
packages:
- c-compiler
- cxx-compiler
- nccl
- output_types: [conda, requirements]
packages:
- ninja
- numpy>=1.17
- pytest
- pytest-forked
- pytest-xdist
- nccl
specific:
- output_types: [conda, requirements]
matrices:
@@ -273,10 +275,12 @@ dependencies:
packages:
docs:
common:
- output_types: [conda]
packages:
- *doxygen
- output_types: [conda, requirements]
packages:
- breathe
- *doxygen
- graphviz
- ipython
- ipykernel
@@ -297,10 +301,12 @@
clang_tools:
common:
- output_types: [conda, requirements]
packages:
- gitpython
- output_types: conda
packages:
- clangxx==16.0.6
- clang-tools==16.0.6
- gitpython
python_build_wheel:
common:
- output_types: [pyproject]
13 changes: 13 additions & 0 deletions python/pylibwholegraph/pylibwholegraph/torch/comm.py
@@ -32,6 +32,19 @@
all_comm_local_size = 1


def reset_communicators():
    global all_comm_world_rank, all_comm_world_size, all_comm_local_rank, all_comm_local_size
    global global_communicators, local_node_communicator, local_device_communicator
    global_communicators = {}
    local_node_communicator = None
    local_device_communicator = None

    all_comm_world_rank = 0
    all_comm_world_size = 1
    all_comm_local_rank = 0
    all_comm_local_size = 1


def set_world_info(world_rank: int, world_size: int, local_rank: int, local_size: int):
"""
Set the global world's information. This is used for create common used communicators, like local node communicator,
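
The new reset_communicators() helper returns the module-level communicator caches and world-info globals shown at the top of this hunk to their initial values, so a process can tear its communicators down and start over. Below is a minimal sketch of the intent, assuming the module path pylibwholegraph.torch.comm implied by this file's location; the call sequence is illustrative, not taken from the repository:

    import pylibwholegraph.torch.comm as wm_comm

    # Drop any cached communicator objects and restore the default world info
    # (rank 0 in a world of size 1); afterwards the module behaves as if no
    # communicator had ever been created in this process.
    wm_comm.reset_communicators()

In practice user code should rarely need to call this directly, since finalize() now invokes it during shutdown (see the initialize.py hunk below).
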
4 changes: 3 additions & 1 deletion python/pylibwholegraph/pylibwholegraph/torch/initialize.py
@@ -15,7 +15,7 @@
import torch
import torch.utils.dlpack
import pylibwholegraph.binding.wholememory_binding as wmb
from .comm import set_world_info, get_global_communicator, get_local_node_communicator
from .comm import set_world_info, get_global_communicator, get_local_node_communicator, reset_communicators


def init(world_rank: int, world_size: int, local_rank: int, local_size: int, wm_log_level="info"):
@@ -80,3 +80,5 @@ def finalize():
    :return: None
    """
    wmb.finalize()
    reset_communicators()
    torch.distributed.destroy_process_group() if torch.distributed.is_initialized() else None
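
With this change finalize() shuts down in three steps: the WholeMemory core (wmb.finalize()), the Python-side communicator caches (reset_communicators()), and finally the torch.distributed process group, if one is still initialized. Combined with the wm_log_level argument of init() visible in the context line above, a typical lifecycle might look like the sketch below; the rank/size values and the "warn" level string are assumptions for illustration, since the accepted level names are not shown in this diff:

    import pylibwholegraph.torch.initialize as wm_init

    # The commit title refers to making this level configurable so that
    # redundant informational messages can be suppressed.
    wm_init.init(world_rank=0, world_size=1, local_rank=0, local_size=1,
                 wm_log_level="warn")

    # ... create communicators and embeddings, run the workload ...

    wm_init.finalize()  # wmb.finalize() -> reset_communicators() -> destroy_process_group()
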
