Skip to content

Commit

Permalink
TL/CUDA: Linear Broadcast for GPU (#948)
Browse files Browse the repository at this point in the history
Adding linear CUDA Broadcast implementation with Active set feature support.
It provides a functional improvement and parity with other communication libraries.
- Ability to place many ranks on single GPU
- No GPU blocking, communication initiated from host
- Active set can be used to emulate P2P send/receive on top of broadcast collective
  • Loading branch information
ikryukov authored Feb 11, 2025
1 parent 770c272 commit 08e7639
Show file tree
Hide file tree
Showing 21 changed files with 681 additions and 110 deletions.
8 changes: 7 additions & 1 deletion src/components/tl/cuda/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) Meta Platforms, Inc. and affiliates. 2022.
#

Expand Down Expand Up @@ -27,6 +27,11 @@ alltoallv = \
alltoallv/alltoallv.c \
alltoallv/alltoallv_ce.c

bcast = \
bcast/bcast.h \
bcast/bcast.c \
bcast/bcast_linear.c

reduce_scatter = \
reduce_scatter/reduce_scatter.h \
reduce_scatter/reduce_scatter.c \
Expand Down Expand Up @@ -54,6 +59,7 @@ sources = \
$(allgatherv) \
$(alltoall) \
$(alltoallv) \
$(bcast) \
$(reduce_scatter) \
$(reduce_scatterv)

Expand Down
4 changes: 2 additions & 2 deletions src/components/tl/cuda/allgather/allgather.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -44,7 +44,7 @@ ucc_status_t ucc_tl_cuda_allgather_init(ucc_base_coll_args_t *coll_args,
{
ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);

if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
return ucc_tl_cuda_allgather_linear_init(coll_args, tl_team, task_p);
} else {
return ucc_tl_cuda_allgather_ring_init(coll_args, tl_team, task_p);
Expand Down
4 changes: 2 additions & 2 deletions src/components/tl/cuda/allgather/allgather_linear.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand All @@ -15,7 +15,7 @@ ucc_status_t ucc_tl_cuda_allgather_linear_init(ucc_base_coll_args_t *coll_args,
ucc_tl_cuda_task_t *task;
ucc_status_t status;

if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
return UCC_ERR_NOT_SUPPORTED;
}
Expand Down
4 changes: 2 additions & 2 deletions src/components/tl/cuda/allgatherv/allgatherv.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -47,7 +47,7 @@ ucc_status_t ucc_tl_cuda_allgatherv_init(ucc_base_coll_args_t *coll_args,
{
ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);

if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
return ucc_tl_cuda_allgatherv_linear_init(coll_args, tl_team, task_p);
} else {
return ucc_tl_cuda_allgatherv_ring_init(coll_args, tl_team, task_p);
Expand Down
20 changes: 2 additions & 18 deletions src/components/tl/cuda/allgatherv/allgatherv_linear.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -55,22 +55,6 @@ enum
* other ranks to finish */
};

static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
int step_id)
{
ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);

return sync->seq_num[step_id];
}

static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
int step, int step_id)
{
ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);

sync->seq_num[step_id] = step;
}

ucc_status_t ucc_tl_cuda_allgatherv_linear_finalize(ucc_coll_task_t *coll_task)
{
ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
Expand Down Expand Up @@ -432,7 +416,7 @@ ucc_status_t ucc_tl_cuda_allgatherv_linear_init(ucc_base_coll_args_t *coll_args,
ucc_tl_cuda_task_t *task;
ucc_status_t status;

if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
return UCC_ERR_NOT_SUPPORTED;
}
Expand Down
28 changes: 28 additions & 0 deletions src/components/tl/cuda/bcast/bcast.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/**
* Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/

#include "bcast.h"
#include "components/mc/ucc_mc.h"

/* Descriptor table for the available CUDA bcast algorithms; indexed by the
 * UCC_TL_CUDA_BCAST_ALG_* enum and terminated by a NULL-named sentinel at
 * UCC_TL_CUDA_BCAST_ALG_LAST. */
ucc_base_coll_alg_info_t
ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1] = {
[UCC_TL_CUDA_BCAST_ALG_LINEAR] = {.id = UCC_TL_CUDA_BCAST_ALG_LINEAR,
.name = "linear",
.desc = "linear bcast algorithm"},
[UCC_TL_CUDA_BCAST_ALG_LAST] = {.id = 0, .name = NULL, .desc = NULL}};

/**
 * Entry point for the CUDA TL broadcast collective.
 *
 * Selects an implementation based on the team topology: a fully connected
 * topology dispatches to the linear algorithm; any other topology is not
 * supported by this TL.
 *
 * @param coll_args collective arguments describing the bcast operation
 * @param tl_team   base team handle (a ucc_tl_cuda_team_t underneath)
 * @param task_p    [out] initialized collective task on success
 *
 * @return UCC_OK (via linear init) on success, UCC_ERR_NOT_SUPPORTED when
 *         the topology is not fully connected, or the linear init's error.
 */
ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
                                    ucc_base_team_t *tl_team,
                                    ucc_coll_task_t **task_p)
{
    ucc_tl_cuda_team_t *cuda_team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);

    /* Guard clause: only a fully connected topology is handled. */
    if (!ucc_tl_cuda_team_topo_is_fully_connected(cuda_team->topo)) {
        return UCC_ERR_NOT_SUPPORTED;
    }
    return ucc_tl_cuda_bcast_linear_init(coll_args, tl_team, task_p);
}
43 changes: 43 additions & 0 deletions src/components/tl/cuda/bcast/bcast.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/**
* Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/

#ifndef BCAST_H_
#define BCAST_H_

#include "tl_cuda.h"
#include "tl_cuda_coll.h"

/* Identifiers for the CUDA bcast algorithms; _LAST doubles as the count
 * and as the "not found" sentinel returned by the string lookup below. */
enum
{
UCC_TL_CUDA_BCAST_ALG_LINEAR,
UCC_TL_CUDA_BCAST_ALG_LAST
};

extern ucc_base_coll_alg_info_t
ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1];

#define UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR "bcast:cuda:@0"

ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t *tl_team,
ucc_coll_task_t **task_p);

ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t *tl_team,
ucc_coll_task_t **task_p);

/**
 * Map an algorithm name to its enum identifier.
 *
 * Performs a case-insensitive scan of ucc_tl_cuda_bcast_algs.
 *
 * @param str algorithm name to look up (e.g. "linear")
 *
 * @return the matching UCC_TL_CUDA_BCAST_ALG_* value, or
 *         UCC_TL_CUDA_BCAST_ALG_LAST when no entry matches.
 */
static inline int ucc_tl_cuda_bcast_alg_from_str(const char *str)
{
    int alg_id = 0;

    /* Stop at the first case-insensitive match or at the sentinel index. */
    while (alg_id < UCC_TL_CUDA_BCAST_ALG_LAST &&
           0 != strcasecmp(str, ucc_tl_cuda_bcast_algs[alg_id].name)) {
        alg_id++;
    }
    return alg_id;
}

#endif
Loading

0 comments on commit 08e7639

Please sign in to comment.