Skip to content
valentin petrov edited this page Nov 5, 2021 · 9 revisions

TL implementation manual

Each TL is a dynamically loadable component that implements the collective primitives defined in the common interface ucc_tl_iface_t (src/components/tl/ucc_tl.h). TLs are discovered at runtime and used by CLs (higher-level layers that build collective schedules using TL backends).

Interface

TL interface can be split into groups: control and data.

/* Common TL interface: groups the control (lib/context/team) and data (coll)
   entry points that every TL provides. */
typedef struct ucc_tl_iface {
    ucc_component_iface_t          super;
    /* NOTE(review): presumably the registration entries for the lib-level and
       context-level config tables - confirm against ucc_tl.h */
    ucs_config_global_list_entry_t tl_lib_config;
    ucs_config_global_list_entry_t tl_context_config;
    /* control: tl library object constructor/destructor and get_lib_attr */
    ucc_base_lib_iface_t           lib;
    /* control: tl context object constructor/destructor and get_context_attr */
    ucc_base_context_iface_t       context;
    /* control: tl team constructor/destructor, team_create_test */
    ucc_base_team_iface_t          team;
    /* data: collective_init */
    ucc_base_coll_iface_t          coll;
    /* NOTE(review): presumably service collectives used internally - confirm */
    ucc_tl_service_coll_t          scoll;
    /* array sized by UCC_COLL_TYPE_NUM, i.e. one slot per collective type */
    ucc_base_coll_alg_info_t *     alg_info[UCC_COLL_TYPE_NUM];
} ucc_tl_iface_t;

Control includes: ucc_base_lib_iface_t, ucc_base_context_iface_t, ucc_base_team_iface_t

  • lib: tl library object constructor/destructor and get_lib_attr
  • context: tl context object constructor/destructor and get_context_attr
  • team: tl team constructor/destructor, team_create_test

Data includes: ucc_base_coll_iface_t

  • coll: collective_init

Code structure

TL code resides under src/components/tl/new and usually contains: tl_new.h, tl_new.c, tl_new_lib.c, tl_new_context.c, tl_new_team.c, tl_new_coll.c/h

tl_new.h:

  1. Main interface structure
/* Interface type of the "new" TL; embeds the common TL interface as `super`. */
typedef struct ucc_tl_new_iface {
    ucc_tl_iface_t super;
} ucc_tl_new_iface_t;
/* Extern iface should follow the pattern: ucc_tl_<tl_name> */
/* The object itself is instantiated in tl_new.c with
   UCC_TL_IFACE_DECLARE(new, NEW). */
extern ucc_tl_new_iface_t ucc_tl_new;

The actual ucc_tl_new object is instantiated in tl_new.c using the macro: UCC_TL_IFACE_DECLARE(new, NEW);

  2. TL configuration (env var parameters). Configs are allowed for lib and for context (note that there is a one-to-many relation between lib and ctx objects):

/* Lib-level configuration of the "new" TL: the common tl lib config followed
   by TL-specific parameters. */
typedef struct ucc_tl_new_lib_config {
    ucc_tl_lib_config_t super;
    /* filled from the EXAMPLE_LIB_PARAM entry of the lib config table */
    uint32_t            example_lib_param;
} ucc_tl_new_lib_config_t;
/* Context-level configuration of the "new" TL: the common tl context config
   followed by TL-specific parameters. */
typedef struct ucc_tl_new_context_config {
    ucc_tl_context_config_t super;
    /* filled from the EXAMPLE_CTX_PARAM entry of the context config table */
    uint32_t                example_ctx_param;
} ucc_tl_new_context_config_t;
  1. Lib structure:
typedef struct ucc_tl_new_lib {
    ucc_tl_lib_t            super;
    ucc_tl_new_lib_config_t cfg;
} ucc_tl_new_lib_t;
UCC_CLASS_DECLARE(ucc_tl_new_lib_t, const ucc_base_new_params_t *,
                  const ucc_base_config_t *);
  1. Context structure:
/* Context object of the "new" TL: the base tl context plus a private copy of
   the context-level configuration (copied in the constructor). */
typedef struct ucc_tl_new_context {
    ucc_tl_context_t            super;
    ucc_tl_new_context_config_t cfg;
} ucc_tl_new_context_t;
/* Declares the class together with the argument types of its constructor. */
UCC_CLASS_DECLARE(ucc_tl_new_context_t, const ucc_base_context_params_t *,
                  const ucc_base_config_t *);
  1. Team structure:
typedef struct ucc_tl_new_team {
    ucc_tl_team_t super;
} ucc_tl_new_team_t;
UCC_CLASS_DECLARE(ucc_tl_new_team_t, ucc_base_context_t *,
                  const ucc_base_team_params_t *);

tl_new.c

  1. Contains declaration of iface:
/* Instantiates the ucc_tl_new interface object that tl_new.h declares extern. */
UCC_TL_IFACE_DECLARE(new, NEW);
  1. config tables for lib and ctx:
/* Lib-level configuration table of the "new" TL.
   NOTE(review): entries look like {name, default, description, offset, type} -
   confirm against the ucc_config_field_t definition. */
static ucc_config_field_t ucc_tl_new_lib_config_table[] = {
    /* nested table: common tl lib parameters stored in the `super` member */
    {"", "", NULL, ucc_offsetof(ucc_tl_new_lib_config_t, super),
     UCC_CONFIG_TYPE_TABLE(ucc_tl_lib_config_table)},

    /* TL-specific parameter stored in example_lib_param */
    {"EXAMPLE_LIB_PARAM", "1",
     "param description",
     ucc_offsetof(ucc_tl_new_lib_config_t, example_lib_param),
     UCC_CONFIG_TYPE_UINT},
    /* terminating entry */
    {NULL}};
/* Context-level configuration table of the "new" TL.
   NOTE(review): entries look like {name, default, description, offset, type} -
   confirm against the ucc_config_field_t definition. */
static ucc_config_field_t ucc_tl_new_context_config_table[] = {
    /* nested table: common tl context parameters stored in the `super` member */
    {"", "", NULL, ucc_offsetof(ucc_tl_new_context_config_t, super),
     UCC_CONFIG_TYPE_TABLE(ucc_tl_context_config_table)},

    /* TL-specific parameter; the offset must be taken from the same struct
       type as the `super` entry above (ucc_tl_new_context_config_t) */
    {"EXAMPLE_CTX_PARAM", "1",
     "param description",
     ucc_offsetof(ucc_tl_new_context_config_t, example_ctx_param),
     UCC_CONFIG_TYPE_UINT},
    /* terminating entry */
    {NULL}};
  1. may contain a constructor (see tl_ucp.c as example)

tl_new_lib.c

  1. LIB constructor/destructor
/* Constructor of the "new" TL library object: initializes the base tl lib and
   stores a copy of the parsed lib configuration in self->cfg. */
UCC_CLASS_INIT_FUNC(ucc_tl_new_lib_t, const ucc_base_lib_params_t *params,
                    const ucc_base_config_t *config)
{
    const ucc_tl_new_lib_config_t *lib_cfg =
        ucc_derived_of(config, ucc_tl_new_lib_config_t);

    /* the base class must be initialized before self is used */
    UCC_CLASS_CALL_SUPER_INIT(ucc_tl_lib_t, &ucc_tl_new.super,
                              &lib_cfg->super);

    /* self->cfg has the same type as *lib_cfg, so the sizes are identical */
    memcpy(&self->cfg, lib_cfg, sizeof(self->cfg));

    tl_info(&self->super, "initialized lib object: %p", self);
    return UCC_OK;
}

/* Destructor of the "new" TL library object; only logs. */
UCC_CLASS_CLEANUP_FUNC(ucc_tl_new_lib_t)
{
    tl_info(&self->super, "finalizing lib object: %p", self);
}

/* Class definition: ucc_tl_new_lib_t derives from ucc_tl_lib_t. */
UCC_CLASS_DEFINE(ucc_tl_new_lib_t, ucc_tl_lib_t);
  1. ucc_tl_new_get_lib_attr: needs to set the supported THREAD_MODE, the bitmask of supported coll_types, and additional flags if required.
/* Fills the TL library attributes: supported thread mode, bitmask of supported
   collective types and additional base-lib flags.
   @param lib        tl library object (unused in this example)
   @param base_attr  attributes to fill; must point to a ucc_tl_lib_attr_t
   @return UCC_OK */
ucc_status_t ucc_tl_new_get_lib_attr(const ucc_base_lib_t *lib, /* NOLINT */
                                     ucc_base_lib_attr_t  *base_attr)
{
    ucc_tl_lib_attr_t *attr = ucc_derived_of(base_attr, ucc_tl_lib_attr_t);

    attr->super.attr.thread_mode = UCC_THREAD_SINGLE;
    attr->super.attr.coll_types  = UCC_COLL_TYPE_ALLREDUCE |
                                   UCC_COLL_TYPE_ALLTOALL;
    attr->super.flags            = UCC_BASE_LIB_FLAG_TEAM_ID_REQUIRED;
    return UCC_OK;
}

tl_new_context.c

  1. constructor/destructor Usually context contains some common resources of a TL, like mpool of tasks data structures. In such case those resources are initialized/released in the tl context constructor/destructor.
/* Constructor of the "new" TL context: initializes the base tl context and
   stores a copy of the parsed context configuration in self->cfg. Resources
   shared by the TL (e.g. a task mpool) would be created here as well. */
UCC_CLASS_INIT_FUNC(ucc_tl_new_context_t,
                    const ucc_base_context_params_t *params,
                    const ucc_base_config_t *config)
{
    ucc_tl_new_context_config_t *ctx_cfg =
        ucc_derived_of(config, ucc_tl_new_context_config_t);

    /* the base class must be initialized before self is used */
    UCC_CLASS_CALL_SUPER_INIT(ucc_tl_context_t, ctx_cfg->super.tl_lib,
                              params->context);

    /* self->cfg has the same type as *ctx_cfg, so the sizes are identical */
    memcpy(&self->cfg, ctx_cfg, sizeof(self->cfg));

    tl_info(self->super.super.lib, "initialized tl context: %p", self);
    return UCC_OK;
}

/* Destructor of the "new" TL context; only logs. Resources created in the
   constructor would be released here. */
UCC_CLASS_CLEANUP_FUNC(ucc_tl_new_context_t)
{
    tl_info(self->super.super.lib, "finalizing tl context: %p", self);
}

/* Class definition: ucc_tl_new_context_t derives from ucc_tl_context_t. */
UCC_CLASS_DEFINE(ucc_tl_new_context_t, ucc_tl_context_t);

  1. ucc_tl_new_get_context_attr: the main attribute of a context is its address. A TL may have no addressing information, in which case TL_ADDR_LEN is 0. Otherwise get_context_attr should implement ctx address packing (see below), so that the TL address automatically becomes part of the overall UCC EP address and the exchange of addresses happens automatically.
/* Reports the attributes of a TL context. Depending on the bits set in
   attr->attr.mask it writes the address length and/or copies the address
   itself; topo_required is always cleared.
   TL_ADDR_LEN and TL_ADDRESS are placeholders for the TL's own address data.
   @return UCC_OK */
ucc_status_t ucc_tl_new_get_context_attr(const ucc_base_context_t *context,
                                         ucc_base_ctx_attr_t      *attr)
{
    ucc_tl_new_context_t *ctx = ucc_derived_of(context, ucc_tl_new_context_t);

    attr->topo_required = 0;

    /* length of the packed TL address */
    if ((attr->attr.mask & UCC_CONTEXT_ATTR_FIELD_CTX_ADDR_LEN) != 0) {
        attr->attr.ctx_addr_len = TL_ADDR_LEN;
    }

    /* the packed TL address itself */
    if ((attr->attr.mask & UCC_CONTEXT_ATTR_FIELD_CTX_ADDR) != 0) {
        memcpy(attr->attr.ctx_addr, TL_ADDRESS, TL_ADDR_LEN);
    }

    return UCC_OK;
}

tl_new_team.c

  1. constructor/destructor Team constructor call is executed when user of UCC API calls ucc_team_create_post. The call must be non-blocking. If any exchange among participants is required during team creation, then it must be initiated and progressed from ucc_team_create_test in a non-blocking fashion (see tl_nccl as example of such oob exchange).
/* Constructor of the "new" TL team: initializes the base tl team and marks the
   team as still in progress. It only posts the creation and does not wait. */
UCC_CLASS_INIT_FUNC(ucc_tl_new_team_t, ucc_base_context_t *tl_context,
                    const ucc_base_team_params_t *params)
{
    ucc_tl_new_context_t *new_ctx =
        ucc_derived_of(tl_context, ucc_tl_new_context_t);

    UCC_CLASS_CALL_SUPER_INIT(ucc_tl_team_t, &new_ctx->super, params->team);

    /* switched to UCC_OK by ucc_tl_new_team_create_test */
    self->status = UCC_INPROGRESS;
    tl_info(tl_context->lib, "posted tl team: %p", self);
    return UCC_OK;
}

/* Destructor of the "new" TL team; only logs. */
UCC_CLASS_CLEANUP_FUNC(ucc_tl_new_team_t)
{
    tl_info(self->super.super.context->lib, "finalizing tl team: %p", self);
}

/* Delete function taking the base team type, then the class definition:
   ucc_tl_new_team_t derives from ucc_tl_team_t. */
UCC_CLASS_DEFINE_DELETE_FUNC(ucc_tl_new_team_t, ucc_base_team_t);
UCC_CLASS_DEFINE(ucc_tl_new_team_t, ucc_tl_team_t);

/* Destroys a team through the class delete function.
   @return UCC_OK */
ucc_status_t ucc_tl_new_team_destroy(ucc_base_team_t *tl_team)
{
    UCC_CLASS_DELETE_FUNC_NAME(ucc_tl_new_team_t)(tl_team);
    return UCC_OK;
}

/* Tests team creation for completion. In this stub nothing is exchanged, so
   the team is marked ready right away.
   @return UCC_OK */
ucc_status_t ucc_tl_new_team_create_test(ucc_base_team_t *tl_team)
{
    ucc_tl_new_team_t    *new_team = ucc_derived_of(tl_team, ucc_tl_new_team_t);
    ucc_tl_new_context_t *new_ctx  = UCC_TL_NEW_TEAM_CTX(new_team);

    /* ... TEST OOB Exchange for completion and proceed */

    tl_info(tl_team->context->lib, "initialized tl team: %p", new_team);
    new_team->status = UCC_OK;
    return new_team->status;
}
  1. ucc_tl_new_team_get_scores: the get_scores function defines how this TL will be selected over other TLs. The function must return the score data structure that describes which collectives are supported by the TL, which init function is to be used for them, and what score is assigned. Below is an example of a simple score map with the default TL score:
/* Builds the score map that tells UCC which collectives this TL supports,
   which init function serves them and with what score.
   @param tl_team  team the scores are built for
   @param score_p  [out] receives the score object on success
   @return UCC_OK on success; otherwise the failing status, in which case
           *score_p is left untouched and the partially built score is freed */
ucc_status_t ucc_tl_new_team_get_scores(ucc_base_team_t   *tl_team,
                                        ucc_coll_score_t **score_p)
{
    ucc_tl_new_team_t *team = ucc_derived_of(tl_team, ucc_tl_new_team_t);
    ucc_tl_new_lib_t  *lib  = UCC_TL_NEW_TEAM_LIB(team);
    ucc_coll_score_t  *score;
    ucc_status_t       status;

    /* There can be a different logic for different coll_type/mem_type.
       Right now just init everything the same way. */
    status = ucc_coll_score_build_default(tl_team, UCC_TL_NEW_DEFAULT_SCORE,
                              ucc_tl_new_coll_init, UCC_TL_NEW_SUPPORTED_COLLS,
                              NULL, 0, &score);
    if (UCC_OK != status) {
        return status;
    }

    /* Apply the user-provided score string on top of the defaults, if any */
    if (strlen(lib->super.super.score_str) > 0) {
        status = ucc_coll_score_update_from_str(
           lib->super.super.score_str, score, team->size, NULL,
           &team->super.super, UCC_TL_NEW_DEFAULT_SCORE, NULL);

        /* If INVALID_PARAM - User provided incorrect input - try to proceed */
        if ((status < 0) && (status != UCC_ERR_INVALID_PARAM) &&
            (status != UCC_ERR_NOT_SUPPORTED)) {
            goto err;
        }
    }
    *score_p = score;
    return UCC_OK;

err:
    /* the default score was already built - release it before failing */
    ucc_coll_score_free(score);
    return status;
}

First, the function defines the default range with the default score UCC_TL_NEW_DEFAULT_SCORE (the score must be defined in tl_new.h and must differ from the default scores of other TLs). Second, the function updates the score by parsing the UCC_TL_NEW_TUNE parameter.

tl_new_coll.h

this file usually contains the definition of TL task data structure, like

/* TL-level request (task) of the "new" TL; embeds ucc_coll_task_t as `super`
   and adds TL-specific state. */
typedef struct ucc_tl_new_task {
    ucc_coll_task_t   super;
    /* placeholder for per-task data needed by the TL */
    uint32_t          tl_specific_task_field;
} ucc_tl_new_task_t;

For example you can look at tl_ucp_coll.h

tl_new_coll.c

This one contains the data path (the fast one). In UCC, the data path consists of 4 phases:

  1. Request initialization. The user fills the ucc_coll_args_t structure that describes the collective and calls ucc_collective_init(args, team, &req). The user gets a ucc_coll_req_t request as output. At this stage the buffer ownership is still on the user's side.
  2. User posts the collective. There are 2 ways to do that: (i) post for immediate execution using ucc_collective_post(req); or (ii) post for potentially deferred execution using ucc_collective_triggered_post(ee, req). After that the buffer ownership goes to UCC.
  3. During the 3rd phase the user must check the completion of the request using ucc_collective_test(req). At the same time the user must guarantee progress of the ucc context for the given request, e.g.:
while (UCC_OK != (status = ucc_collective_test(req))) {
    if (status < 0) {
        /* handle error */
        ucc_collective_finalize(req);
        break;
    }
    ucc_context_progress(ucc_context);
}
  1. When the request is completed user cleans it up calling ucc_collective_finalize(req);

These phases are translated to the TL level as follows. When ucc_collective_init is called by the user, the CORE level of UCC does the basic CL/TL selection and eventually calls the TL init function, i.e. the function that the TL provided in its coll_score data structure (see above). In this example it is ucc_tl_new_coll_init.

This function must allocate TL level request: ucc_tl_new_task_t and return it. This request (task) always inherits from the ucc_coll_task_t (which in turn inherits from ucc_coll_req_t - what user has to deal with). It has 4 function pointers that TL should set:

  • post, function that is called once when the collective starts
  • triggered_post, function that is called when user calls ucc_collective_triggered_post. TL can use default common implementation ucc_collective_triggered_post from CORE if it is required. TL/NCCL, for example, does not need that since it can use CUDA streams natively.
  • finalize, used to cleanup TL task
  • progress, a function executed multiple times until request completes. This function is called as part of ucc_context_progress implementation.

Simple stub for ucc_tl_new_coll_init:

/* Allocates and initializes a TL task for the requested collective.
   @param coll_args  collective arguments
   @param team       base team the collective runs on
   @param task_h     [out] receives the initialized task on success
   @return UCC_OK on success, UCC_ERR_NO_MEMORY if no task could be taken from
           the mpool, UCC_ERR_NOT_SUPPORTED for an unsupported coll_type, or
           the status of the algorithm-specific init function; on any error
           *task_h is left untouched */
ucc_status_t ucc_tl_new_coll_init(ucc_base_coll_args_t *coll_args,
                                    ucc_base_team_t *team,
                                    ucc_coll_task_t **task_h)
{
    ucc_tl_new_context_t *ctx  = ucc_derived_of(team->context,  ucc_tl_new_context_t);
    ucc_tl_new_task_t *task;
    ucc_status_t         status;

    /* having mpool of tasks on TL context is common. see tl/sharp for example */
    task = ucc_mpool_get(&ctx->req_mp);
    if (!task) {
        /* the pool may be exhausted - do not dereference a NULL task */
        tl_error(team->context->lib, "failed to allocate coll task");
        return UCC_ERR_NO_MEMORY;
    }
    /* main task init fn */
    ucc_coll_task_init(&task->super, coll_args, team);
    /* setting finalize callback - it will return task to mpool */
    task->super.finalize       = ucc_tl_new_coll_finalize;
    /* taking triggered post implementation from core */
    task->super.triggered_post = ucc_collective_triggered_post;

    /* Checking supported coll_types and performing alg specific
       task initialization. Main thing is to set alg specific post/progress functions */
    switch (coll_args->args.coll_type)
    {
    case UCC_COLL_TYPE_ALLREDUCE:
        status = ucc_tl_new_allreduce_init(task);
        break;
    case UCC_COLL_TYPE_BARRIER:
        status = ucc_tl_new_barrier_init(task);
        break;
    default:
        tl_error(UCC_TASK_LIB(task),
                 "collective %d is not supported by tl",
                 coll_args->args.coll_type);
        status = UCC_ERR_NOT_SUPPORTED;
        break;
    }
    if (status != UCC_OK) {
        goto free_task;
    }

    tl_info(UCC_TASK_LIB(task), "init coll task %p", task);
    *task_h = &task->super;
    return status;

free_task:
    /* give the task back to the mpool it was taken from */
    ucc_mpool_put(task);
    return status;
}

Example of the ucc_tl_new_allreduce_init function:

ucc_status_t ucc_tl_new_allreduce_init(ucc_tl_sharp_task_t *task)
{
    ucc_coll_args_t *args = &TASK_ARGS(task);

    task->super.post     = ucc_tl_new_allreduce_start;
    task->super.progress = ucc_tl_new_collective_progress;
    return UCC_OK;
}

Example of start function:

/* Post function of the allreduce task: marks the request as in progress and
   runs the first progress step. If the collective is still in progress after
   that step, the task is enqueued into the progress queue of the core
   context; otherwise the task is completed right away.
   @param coll_task  task being started
   @return UCC_OK when the task was enqueued, otherwise the result of
           ucc_task_complete() */
ucc_status_t ucc_tl_new_allreduce_start(ucc_coll_task_t *coll_task)
{
    ucc_tl_new_task_t *task = ucc_derived_of(coll_task, ucc_tl_new_task_t);
    ucc_tl_new_team_t *team = TASK_TEAM(task);

    task->super.super.status = UCC_INPROGRESS;
    if (UCC_INPROGRESS == ucc_tl_new_allreduce_progress(coll_task)) {
        ucc_progress_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super);
        return UCC_OK;
    }

    return ucc_task_complete(coll_task);
}

Key moments in the start function:

  • Request (task) status is set to UCC_INPROGRESS
  • If collective is not completed immediately then it must be enqueued into the progress queue of the current context
  • Otherwise (and always when collective completes) tl must call ucc_task_complete(task).

The progress function:

/* Progress function of the allreduce task; executed repeatedly until the
   request completes. COLLECTIVE_IS_DONE is a placeholder for the TL's own
   completion check.
   @param coll_task  task being progressed
   @return current status of the request; UCC_OK once the collective is done */
ucc_status_t ucc_tl_new_allreduce_progress(ucc_coll_task_t *coll_task)
{
    ucc_tl_new_task_t *new_task = ucc_derived_of(coll_task, ucc_tl_new_task_t);

    /* DO progressing of your collective.
       For example:
         - tl/ucp, which implements collectives using p2p, would do all of the
           algorithm logic here: posting send/recv, polling their completions etc
         - tl/sharp, which uses 3rd party library libsharp_coll, can just test
           the libsharp_coll request here
         - tl/nccl similarly checks the state of NCCL coll (e.g. by querying the
           event written to the stream right after coll start)
    */

    if (!(COLLECTIVE_IS_DONE)) {
        /* not finished yet: report the status unchanged */
        return coll_task->super.status;
    }

    /* mark request as done */
    coll_task->super.status = UCC_OK;
    return coll_task->super.status;
}
Clone this wiki locally