Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/components/tl/cuda/alltoall/alltoall.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down
36 changes: 28 additions & 8 deletions src/components/tl/cuda/alltoall/alltoall_ce.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) Meta Platforms, Inc. and affiliates. 2022.
*
* See file LICENSE for terms.
Expand Down Expand Up @@ -32,12 +32,11 @@ size_t ucc_tl_cuda_alltoall_get_offset(const ucc_tl_cuda_task_t *task,
ucc_status_t ucc_tl_cuda_alltoall_ce_init(ucc_tl_cuda_task_t *task)
{
ucc_tl_cuda_team_t *team = TASK_TEAM(task);
ucc_tl_cuda_lib_t *lib = UCC_TL_CUDA_TEAM_LIB(team);
ucc_coll_args_t *args = &TASK_ARGS(task);
ucc_status_t status;
size_t data_len;

task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR;

task->alltoallv_ce.get_size = ucc_tl_cuda_alltoall_get_size;
task->alltoallv_ce.get_offset = ucc_tl_cuda_alltoall_get_offset;
task->alltoallv_ce.sdt = args->src.info.datatype;
Expand All @@ -55,17 +54,41 @@ ucc_status_t ucc_tl_cuda_alltoall_ce_init(ucc_tl_cuda_task_t *task)
status = ucc_tl_cuda_mem_info_get(args->src.info.buffer, data_len,
&task->alltoallv_ce.mem_info_src);
if (ucc_unlikely(status != UCC_OK)) {
goto exit_err;
return status;
}

if (team->topo->proxy_needed) {
status = ucc_tl_cuda_mem_info_get(args->dst.info.buffer, data_len,
&task->alltoallv_ce.mem_info_dst);
if (ucc_unlikely(status != UCC_OK)) {
goto exit_err;
return status;
}
}

if (ucc_tl_cuda_task_is_cl_hier(task)) {
tl_debug(lib, "CL hier does not support copy engine, fallback to executor");
task->alltoallv_ce.use_copy_engine = 0;
} else {
task->alltoallv_ce.use_copy_engine = lib->cfg.alltoall_use_copy_engine;
}

if (task->alltoallv_ce.use_copy_engine) {
tl_trace(lib, "ucc_tl_cuda_alltoall_ce_init: copy engine");
task->super.triggered_post = ucc_tl_cuda_alltoallv_ce_triggered_post;

status = ucc_ec_create_event(&task->alltoallv_ce.evtCompletion, UCC_EE_CUDA_STREAM);
if (ucc_unlikely(status != UCC_OK)) {
tl_error(lib, "failed to create CUDA event");
ucc_tl_cuda_task_put(task);
return status;
}
task->alltoallv_ce.copy_post = cuda_copy_post;
} else {
tl_trace(lib, "ucc_tl_cuda_alltoall_ce_init: executor");
task->alltoallv_ce.copy_post = ee_copy_post;
task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR;
}

task->super.post = ucc_tl_cuda_alltoallv_ce_start;
task->super.triggered_post_setup =
ucc_tl_cuda_alltoallv_ce_triggered_post_setup;
Expand All @@ -74,7 +97,4 @@ ucc_status_t ucc_tl_cuda_alltoall_ce_init(ucc_tl_cuda_task_t *task)
task->bar = TASK_BAR(task);

return UCC_OK;

exit_err:
return status;
}
2 changes: 1 addition & 1 deletion src/components/tl/cuda/alltoallv/alltoallv.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) Meta Platforms, Inc. and affiliates. 2022.
*
* See file LICENSE for terms.
Expand Down
45 changes: 44 additions & 1 deletion src/components/tl/cuda/alltoallv/alltoallv.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) Meta Platforms, Inc. and affiliates. 2022.
*
* See file LICENSE for terms.
Expand All @@ -24,4 +24,47 @@ ucc_status_t ucc_tl_cuda_alltoallv_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t *tl_team,
ucc_coll_task_t **task_p);

/**
* @brief Post a copy operation using CUDA copy engine
*
* This function posts a copy operation to be executed directly by the CUDA copy engine.
* The executor and task parameters are unused as the operation is handled by CUDA.
* The stream parameter is used to specify which CUDA stream should execute the copy.
*
* The signature deliberately matches ee_copy_post() so that either function can
* be stored in the same copy_post callback of the alltoallv CE task;
* ucc_tl_cuda_alltoall_ce_init() installs this one when the task's
* use_copy_engine setting is enabled.
*
* @param dst Destination buffer for the copy operation
* @param src Source buffer for the copy operation
* @param len Length of data to copy in bytes
* @param executor Unused - operation handled by CUDA copy engine
* @param task Unused - operation handled by CUDA copy engine
* @param stream CUDA stream to execute the copy operation
* @return UCC_OK on success, error code otherwise
*/
ucc_status_t cuda_copy_post(void *dst, void *src, size_t len,
ucc_ee_executor_t *executor,
ucc_ee_executor_task_t **task, cudaStream_t stream);

/**
* @brief Post a copy operation using the UCC executor
*
* This function posts a copy operation to be executed by the UCC executor.
* The executor and task parameters are used to track the operation's progress.
* The stream parameter is unused as the executor manages its own execution context.
*
* The signature deliberately matches cuda_copy_post() so that either function
* can be stored in the same copy_post callback of the alltoallv CE task;
* ucc_tl_cuda_alltoall_ce_init() installs this one, and sets
* UCC_COLL_TASK_FLAG_EXECUTOR on the task, when the task's use_copy_engine
* setting is disabled.
*
* @param dst Destination buffer for the copy operation
* @param src Source buffer for the copy operation
* @param len Length of data to copy in bytes
* @param executor UCC executor to handle the copy operation
* @param task Pointer to store the executor task handle
* @param stream Unused - executor manages its own execution context
* @return UCC_OK on success, error code otherwise
*/
ucc_status_t ee_copy_post(void *dst, void *src, size_t len,
ucc_ee_executor_t *executor,
ucc_ee_executor_task_t **task, cudaStream_t stream);

/**
* @brief Triggered-post handler for the alltoall(v) copy-engine task
*
* ucc_tl_cuda_alltoall_ce_init() installs this function as the task's
* super.triggered_post callback when the copy engine path is selected.
*
* NOTE(review): the implementation is not visible from this header; the
* parameter descriptions below are inferred from their types and names and
* should be confirmed against the definition.
*
* @param ee Execution engine handle (presumably the one that triggers the post)
* @param ev Event associated with the trigger (presumably the triggering event)
* @param coll_task Collective task to post
* @return UCC status code
*/
ucc_status_t ucc_tl_cuda_alltoallv_ce_triggered_post(ucc_ee_h ee, ucc_ev_t *ev,
ucc_coll_task_t *coll_task);

/**
* @brief Post the copies of an alltoall(v) CE task
*
* NOTE(review): the implementation is not visible from this header. Judging by
* the name, this presumably issues the task's copy operations as a batch;
* confirm the exact behavior and error semantics against the definition.
*
* @param task TL/CUDA task whose copies are to be posted
* @return UCC status code
*/
ucc_status_t ucc_tl_cuda_alltoallv_ce_post_batch_copies(ucc_tl_cuda_task_t *task);

#endif
Loading