Bug7263/pmix setup app api v3 #12

Open · wants to merge 7 commits into base: slurm/slurm-20.11
3 changes: 2 additions & 1 deletion auxdir/x_ac_pmix.m4
@@ -181,7 +181,8 @@ AC_DEFUN([X_AC_PMIX],

 AM_CONDITIONAL(HAVE_PMIX, [test $_x_ac_pmix_v1_found = "1"] ||
               [test $_x_ac_pmix_v2_found = "1"] ||
-              [test $_x_ac_pmix_v3_found = "1"] )
+              [test $_x_ac_pmix_v3_found = "1"] ||
+              [test $_x_ac_pmix_v4_found = "1"])
 AM_CONDITIONAL(HAVE_PMIX_V1, [test $_x_ac_pmix_v1_found = "1"])
 AM_CONDITIONAL(HAVE_PMIX_V2, [test $_x_ac_pmix_v2_found = "1"])
 AM_CONDITIONAL(HAVE_PMIX_V3, [test $_x_ac_pmix_v3_found = "1"])
3 changes: 2 additions & 1 deletion configure
@@ -21933,7 +21933,8 @@ $as_echo "$as_me: WARNING: unable to locate pmix installation" >&2;}

 if test $_x_ac_pmix_v1_found = "1" ||
    test $_x_ac_pmix_v2_found = "1" ||
-   test $_x_ac_pmix_v3_found = "1" ; then
+   test $_x_ac_pmix_v3_found = "1" ||
+   test $_x_ac_pmix_v4_found = "1"; then
   HAVE_PMIX_TRUE=
   HAVE_PMIX_FALSE='#'
 else
20 changes: 20 additions & 0 deletions src/common/env.c
@@ -87,6 +87,7 @@ strong_alias(env_array_overwrite, slurm_env_array_overwrite);
 strong_alias(env_array_overwrite_fmt, slurm_env_array_overwrite_fmt);
 strong_alias(env_array_overwrite_het_fmt, slurm_env_array_overwrite_het_fmt);
 strong_alias(env_unset_environment, slurm_env_unset_environment);
+strong_alias(env_get_val_maxlen, slurm_env_get_val_maxlen);

 #define ENV_BUFSIZE (256 * 1024)
 #define MAX_ENV_STRLEN (32 * 4096) /* Needed for CPU_BIND and MEM_BIND on
@@ -2301,3 +2302,22 @@ extern char *find_quote_token(char *tmp, char *sep, char **last)

 	}
 }
+
+/*
+ * Get the maximum size of the value that can be set for the environment
+ * variable named env_name.
+ */
+uint32_t
+env_get_val_maxlen(const char *env_name)
+{
+	if (!env_name)
+		return MAX_ENV_STRLEN;
+	/* Account for the '=' delimiter, the terminating '\0', and a value of
+	 * at least one byte. From setenvf():
+	 *         size = strlen(name) + strlen(value) + 2;
+	 *     [*] if (size >= MAX_ENV_STRLEN)
+	 * Thus the space available for the value is:
+	 *     MAX_ENV_STRLEN - strlen(env_name) - 2 - 1 (so that [*] stays false)
+	 */
+	return MAX_ENV_STRLEN - strlen(env_name) - 3;
+}
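
For context, a minimal sketch of how a caller might pair the new env_get_val_maxlen() with setenvf() so that an oversized value is truncated rather than rejected; the helper name set_env_clamped and the truncation policy are illustrative assumptions, not part of this patch.

```c
/* Illustrative only: clamp a value to what setenvf() will accept,
 * using env_get_val_maxlen() from this patch. Truncating the tail is
 * a hypothetical policy chosen for the example, not Slurm behavior. */
#include <inttypes.h>
#include <string.h>
#include "src/common/env.h"

static int set_env_clamped(char ***env, const char *name, char *value)
{
	uint32_t maxlen = env_get_val_maxlen(name);

	if (strlen(value) > maxlen)
		value[maxlen] = '\0';	/* keep strlen(value) <= maxlen */

	/* The internal size check in setenvf() now stays below MAX_ENV_STRLEN */
	return setenvf(env, name, "%s", value);
}
```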
6 changes: 6 additions & 0 deletions src/common/env.h
@@ -375,4 +375,10 @@ extern void set_env_from_opts(slurm_opt_t *opt, char ***dest,
  */
 extern char *find_quote_token(char *tmp, char *sep, char **last);

+/*
+ * Get the maximum size of the value that can be set for the environment
+ * variable named env_name.
+ */
+uint32_t env_get_val_maxlen(const char *env_name);
+
 #endif
1 change: 1 addition & 0 deletions src/common/slurm_xlator.h
@@ -290,6 +290,7 @@
 #define env_array_overwrite		slurm_env_array_overwrite
 #define env_array_overwrite_fmt	slurm_env_array_overwrite_fmt
 #define env_array_overwrite_het_fmt	slurm_env_array_overwrite_het_fmt
+#define env_get_val_maxlen		slurm_env_get_val_maxlen

 /* read_config.[ch] functions */
 #define destroy_config_key_pair	slurm_destroy_config_key_pair
42 changes: 8 additions & 34 deletions src/plugins/mpi/pmix/mpi_pmix.c
@@ -2,7 +2,7 @@
 ** mpi_pmix.c - Main plugin callbacks for PMIx support in Slurm
 *****************************************************************************
 * Copyright (C) 2014-2015 Artem Polyakov. All rights reserved.
-* Copyright (C) 2015-2017 Mellanox Technologies. All rights reserved.
+* Copyright (C) 2015-2020 Mellanox Technologies. All rights reserved.
 * Written by Artem Y. Polyakov <[email protected], [email protected]>.
 *
 * This file is part of Slurm, a resource management program.
@@ -83,14 +83,14 @@ const char plugin_type[] = "mpi/pmix_v1";
 const char plugin_type[] = "mpi/pmix_v2";
 #elif (HAVE_PMIX_VER == 3)
 const char plugin_type[] = "mpi/pmix_v3";
+#elif (HAVE_PMIX_VER == 4)
+const char plugin_type[] = "mpi/pmix_v4";
 #endif

 const uint32_t plugin_version = SLURM_VERSION_NUMBER;

 void *libpmix_plug = NULL;

-char *process_mapping = NULL;
-
 static void _libpmix_close(void *lib_plug)
 {
 	xassert(lib_plug);
@@ -108,6 +108,8 @@ static void *_libpmix_open(void)
 	xstrfmtcat(full_path, "%s/", PMIXP_V2_LIBPATH);
 #elif defined PMIXP_V3_LIBPATH
 	xstrfmtcat(full_path, "%s/", PMIXP_V3_LIBPATH);
+#elif defined PMIXP_V4_LIBPATH
+	xstrfmtcat(full_path, "%s/", PMIXP_V4_LIBPATH);
 #endif
 	xstrfmtcat(full_path, "libpmix.so");

@@ -204,44 +206,16 @@ extern int p_mpi_hook_slurmstepd_task(
 extern mpi_plugin_client_state_t *p_mpi_hook_client_prelaunch(
 	const mpi_plugin_client_info_t *job, char ***env)
 {
-	static pthread_mutex_t setup_mutex = PTHREAD_MUTEX_INITIALIZER;
-	static pthread_cond_t setup_cond = PTHREAD_COND_INITIALIZER;
-	static bool setup_done = false;
-	uint32_t nnodes, ntasks, **tids;
-	uint16_t *task_cnt;
-
-	PMIXP_DEBUG("setup process mapping in srun");
-	if ((job->het_job_id == NO_VAL) || (job->het_job_task_offset == 0)) {
-		nnodes = job->step_layout->node_cnt;
-		ntasks = job->step_layout->task_cnt;
-		task_cnt = job->step_layout->tasks;
-		tids = job->step_layout->tids;
-		process_mapping = pack_process_mapping(nnodes, ntasks,
-						       task_cnt, tids);
-		slurm_mutex_lock(&setup_mutex);
-		setup_done = true;
-		slurm_cond_broadcast(&setup_cond);
-		slurm_mutex_unlock(&setup_mutex);
-	} else {
-		slurm_mutex_lock(&setup_mutex);
-		while (!setup_done)
-			slurm_cond_wait(&setup_cond, &setup_mutex);
-		slurm_mutex_unlock(&setup_mutex);
-	}
-
-	if (!process_mapping) {
-		PMIXP_ERROR("Cannot create process mapping");
+	if (SLURM_SUCCESS != pmixp_srun_init(job, env)) {
+		PMIXP_ERROR("pmixp_srun_init() failed");
 		return NULL;
 	}
-	setenvf(env, PMIXP_SLURM_MAPPING_ENV, "%s", process_mapping);

 	/* only return NULL on error */
 	return (void *)0xdeadbeef;
 }

 extern int p_mpi_hook_client_fini(void)
 {
-	xfree(process_mapping);
-
-	return SLURM_SUCCESS;
+	return pmixp_srun_finalize();
 }
104 changes: 11 additions & 93 deletions src/plugins/mpi/pmix/pmixp_client.c
@@ -2,7 +2,7 @@
 ** pmix_client.c - PMIx client communication code
 *****************************************************************************
 * Copyright (C) 2014-2015 Artem Polyakov. All rights reserved.
-* Copyright (C) 2015-2017 Mellanox Technologies. All rights reserved.
+* Copyright (C) 2015-2020 Mellanox Technologies. All rights reserved.
 * Written by Artem Polyakov <[email protected], [email protected]>.
 *
 * This file is part of Slurm, a resource management program.
@@ -48,6 +48,7 @@
 #include <sys/stat.h>
 #include <sys/types.h>

+#include <pmix.h>
 #include <pmix_server.h>

 #ifdef HAVE_HWLOC
@@ -357,108 +358,25 @@ static void _set_topology(List lresp)
 	return;
 }

-/*
- * Estimate the size of a buffer capable of holding the proc map for this job.
- * PMIx proc map string format:
- *
- *    xx,yy,...,zz;ll,mm,...,nn;...;aa,bb,...,cc;
- *    - n0 ranks -;- n1 ranks -;...;- nX ranks -;
- *
- * To roughly estimate the size of the string we leverage the following
- * dependency: for any rank \in [0; nspace->ntasks - 1]
- *    num_digits_10(rank) <= num_digits_10(nspace->ntasks)
- *
- * So we can say that the cumulative number "digits_cnt" of all symbols
- * comprising all rank numbers in the namespace is:
- *    digits_size <= num_digits_10(nspace->ntasks) * nspace->ntasks
- * Every rank is followed either by a comma, a semicolon, or the terminating
- * '\0', thus each rank requires at most num_digits_10(nspace_ntasks) + 1.
- * So we need at most: (num_digits_10(nspace->ntasks) + 1) * nspace->ntasks.
- *
- * Considering a 1.000.000 core system with 64PPN.
- * The size of the intermediate buffer will be:
- * - num_digits_10(1.000.000) = 7
- * - (7 + 1) * 1.000.000 ~= 8MB
- */
-static size_t _proc_map_buffer_size(uint32_t ntasks)
-{
-	return (pmixp_count_digits_base10(ntasks) + 1) * ntasks;
-}
-
-/* Build a sequence of ranks sorted by nodes */
-static void _build_node2task_map(pmixp_namespace_t *nsptr, uint32_t *node2tasks)
-{
-	uint32_t *node_offs = xcalloc(nsptr->nnodes, sizeof(*node_offs));
-	uint32_t *node_tasks = xcalloc(nsptr->nnodes, sizeof(*node_tasks));
-
-	/* Build the offsets structure needed to fill the node-to-tasks map */
-	for (int i = 1; i < nsptr->nnodes; i++)
-		node_offs[i] = node_offs[i - 1] + nsptr->task_cnts[i - 1];
-
-	xassert(nsptr->ntasks == (node_offs[nsptr->nnodes - 1] +
-				  nsptr->task_cnts[nsptr->nnodes - 1]));
-
-	/* Fill the node-to-task map */
-	for (int i = 0; i < nsptr->ntasks; i++) {
-		int node = nsptr->task_map[i], offset;
-		xassert(node < nsptr->nnodes);
-		offset = node_offs[node] + node_tasks[node]++;
-		xassert(nsptr->task_cnts[node] >= node_tasks[node]);
-		node2tasks[offset] = i;
-	}
-
-	/* Cleanup service structures */
-	xfree(node_offs);
-	xfree(node_tasks);
-}
-
 static int _set_mapsinfo(List lresp)
 {
 	pmix_info_t *kvp;
-	char *regexp, *input, *map = NULL, *pos = NULL;
+	char *regexp;
 	pmixp_namespace_t *nsptr = pmixp_nspaces_local();
-	hostlist_t hl = nsptr->hl;
-	int rc, i, j;
-	int count = hostlist_count(hl);
-	uint32_t *node2tasks = NULL, *cur_task = NULL;
-
-	input = hostlist_deranged_string_malloc(hl);
-	rc = PMIx_generate_regex(input, &regexp);
-	free(input);
-	if (PMIX_SUCCESS != rc) {
+
+	if (NULL == (regexp = pmixp_info_get_node_map(nsptr->hl))) {
 		return SLURM_ERROR;
 	}
 	PMIXP_KVP_CREATE(kvp, PMIX_NODE_MAP, regexp, PMIX_STRING);
 	regexp = NULL;
 	list_append(lresp, kvp);

-	/* Preallocate the buffer to avoid constant xremalloc() calls. */
-	map = xmalloc(_proc_map_buffer_size(nsptr->ntasks));
-
-	/* Build a node-to-tasks map that can be traversed in O(n) steps */
-	node2tasks = xcalloc(nsptr->ntasks, sizeof(*node2tasks));
-	_build_node2task_map(nsptr, node2tasks);
-	cur_task = node2tasks;
-
-	for (i = 0; i < nsptr->nnodes; i++) {
-		char *sep = "";
-		/* For each node, provide IDs of the tasks residing on it */
-		for (j = 0; j < nsptr->task_cnts[i]; j++){
-			xstrfmtcatat(map, &pos, "%s%u", sep, *(cur_task++));
-			sep = ",";
-		}
-		if (i < (count - 1)) {
-			xstrfmtcatat(map, &pos, ";");
-		}
-	}
-	rc = PMIx_generate_ppn(map, &regexp);
-	xfree(map);
-	xfree(node2tasks);
-
-	if (PMIX_SUCCESS != rc) {
+	if (NULL == (regexp = pmixp_info_get_proc_map(nsptr->hl, nsptr->nnodes,
+						      nsptr->ntasks,
+						      nsptr->task_cnts,
+						      nsptr->task_map))) {
 		return SLURM_ERROR;
 	}
-
 	PMIXP_KVP_CREATE(kvp, PMIX_PROC_MAP, regexp, PMIX_STRING);
 	regexp = NULL;
 	list_append(lresp, kvp);
@@ -494,7 +412,7 @@ static void _set_localinfo(List lresp)
 	list_append(lresp, kvp);
 }

-extern int pmixp_libpmix_init(void)
+extern int pmixp_stepd_libpmix_init(void)
 {
 	int rc;
 	mode_t rights = (S_IRUSR | S_IWUSR | S_IXUSR) |
@@ -530,7 +448,7 @@ extern int pmixp_libpmix_init(void)
 	return 0;
 }

-extern int pmixp_libpmix_finalize(void)
+extern int pmixp_stepd_libpmix_finalize(void)
 {
 	int rc = SLURM_SUCCESS, rc1;

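
For reference, the node map and proc map strings handled above are produced by two PMIx helpers that the removed code called directly and that the new pmixp_info_get_node_map()/pmixp_info_get_proc_map() functions presumably wrap. The standalone sketch below feeds them the input formats described in the removed comment; the host names and rank layout are invented for illustration.

```c
/* Illustrative only: the PMIx helpers behind PMIX_NODE_MAP and PMIX_PROC_MAP.
 * Host names and the rank layout are made-up example data. */
#include <stdio.h>
#include <stdlib.h>
#include <pmix.h>

int main(void)
{
	char *node_regex = NULL, *ppn_regex = NULL;

	/* Node map input: comma-separated hostnames */
	if (PMIx_generate_regex("node01,node02,node03", &node_regex) != PMIX_SUCCESS)
		return 1;

	/* Proc map input: per-node rank lists, ',' within a node, ';' between
	 * nodes (ranks 0,1 on node01; 2,3 on node02; 4,5 on node03) */
	if (PMIx_generate_ppn("0,1;2,3;4,5", &ppn_regex) != PMIX_SUCCESS)
		return 1;

	printf("PMIX_NODE_MAP: %s\nPMIX_PROC_MAP: %s\n", node_regex, ppn_regex);
	free(node_regex);
	free(ppn_regex);
	return 0;
}
```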
12 changes: 9 additions & 3 deletions src/plugins/mpi/pmix/pmixp_client.h
@@ -2,7 +2,7 @@
 ** pmix_client.h - PMIx client communication code
 *****************************************************************************
 * Copyright (C) 2014-2015 Artem Polyakov. All rights reserved.
-* Copyright (C) 2015 Mellanox Technologies. All rights reserved.
+* Copyright (C) 2015-2020 Mellanox Technologies. All rights reserved.
 * Written by Artem Polyakov <[email protected], [email protected]>.
 *
 * This file is part of Slurm, a resource management program.
@@ -86,8 +86,8 @@
 		xfree(kvp); \
 	}

-int pmixp_libpmix_init(void);
-int pmixp_libpmix_finalize(void);
+int pmixp_stepd_libpmix_init(void);
+int pmixp_stepd_libpmix_finalize(void);
 int pmixp_libpmix_job_set(void);
 void pmix_libpmix_task_set(int rank, char ***env);
 void pmix_client_new_conn(int fd);
@@ -108,4 +108,10 @@ int pmixp_lib_fence(const pmixp_proc_t procs[], size_t nprocs,
 		    bool collect, char *data, size_t ndata,
 		    void *cbfunc, void *cbdata);

+int pmixp_srun_libpmix_init(const mpi_plugin_client_info_t *job, char ***env);
+int pmixp_srun_libpmix_finalize(void);
+int pmixp_libpmix_local_setup(char ***env);
+int pmixp_libpmix_setup_application(const mpi_plugin_client_info_t *job,
+				    char ***env);
+
 #endif /* PMIXP_CLIENT_H */
19 changes: 17 additions & 2 deletions src/plugins/mpi/pmix/pmixp_client_v1.c
@@ -2,7 +2,7 @@
 ** pmix_client_v1.c - PMIx v1 client communication code
 *****************************************************************************
 * Copyright (C) 2014-2015 Artem Polyakov. All rights reserved.
-* Copyright (C) 2015-2018 Mellanox Technologies. All rights reserved.
+* Copyright (C) 2015-2020 Mellanox Technologies. All rights reserved.
 * Written by Artem Polyakov <[email protected], [email protected]>,
 * Boris Karasev <[email protected], [email protected]>.
 *
@@ -223,7 +223,7 @@ int pmixp_lib_init(void)

 #ifdef PMIX_SERVER_TMPDIR
 	PMIXP_KVP_ADD(kvp, PMIX_SERVER_TMPDIR,
-		      pmixp_info_tmpdir_lib(), PMIX_STRING);
+		      pmixp_info_tmpdir_lib(), PMIX_STRING);
 #endif

 	/* setup the server library */
@@ -252,3 +252,18 @@
 	}
 	return rc;
 }
+
+int pmixp_srun_libpmix_init(const mpi_plugin_client_info_t *job, char ***env)
+{
+	return SLURM_SUCCESS;
+}
+
+int pmixp_srun_libpmix_finalize(void)
+{
+	return SLURM_SUCCESS;
+}
+
+int pmixp_libpmix_local_setup(char ***env)
+{
+	return SLURM_SUCCESS;
+}