From 2f4ba1b8aa23a6ea0f418c4a93640aeaaedb05e4 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Fri, 18 Oct 2024 11:23:24 -0600 Subject: [PATCH] ofi/common: fix code that broke sessions With the sessions initialization model (section 11.3 of the MPI 4 standard), MPI may be initialized and finalized any number of times. This patch refactors code that was assuming a one-shot init/finalize sequence for initializing Open MPI and its MCA param space. The underlying problem with the replaced code was that when an app calls MPI_Session_finalize and there are no more sessions active, the MCA param space is destroyed. So if one does not build Open MPI to use dynamically loaded components, and a component is using static variables in a way that assumes the MCA param space is always preserved if a static variable is set to some value, then things break if a subsequent MPI_Session_init is invoked. Related to #12869 Signed-off-by: Howard Pritchard --- opal/mca/common/ofi/common_ofi.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 78df6003212..f6195b41af3 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -324,10 +324,11 @@ int opal_common_ofi_providers_subset_of_list(struct fi_info *provider_list, char int opal_common_ofi_mca_register(const mca_base_component_t *component) { - static int include_index = -1; - static int exclude_index = -1; - static int verbose_index = -1; - static int accelerator_rank_index = -1; + int include_index; + int exclude_index; + int verbose_index; + int accelerator_rank_index; + int param; int ret; if (fi_version() < FI_VERSION(1, 0)) { @@ -336,7 +337,8 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) OPAL_THREAD_LOCK(&opal_common_ofi_mutex); - if (0 > include_index) { + param = mca_base_var_find("opal", "opal_common", "ofi", "provider_include"); + if (0 > param) { /* 
this monkey business is needed because of the way the MCA VARs stuff tries to handle * pointers to strings when when destructing the MCA var database. If you don't do @@ -359,9 +361,12 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) ret = include_index; goto err; } + } else { + include_index = param; } - if (0 > exclude_index) { + param = mca_base_var_find("opal", "opal_common", "ofi", "provider_exclude"); + if (0 > param) { if (NULL == opal_common_ofi.prov_exclude) { opal_common_ofi.prov_exclude = (char **) malloc(sizeof(char *)); assert(NULL != opal_common_ofi.prov_exclude); @@ -378,9 +383,12 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) ret = exclude_index; goto err; } + } else { + exclude_index = param; } - if (0 > verbose_index) { + param = mca_base_var_find("opal", "opal_common", "ofi", "verbose"); + if (0 > param) { verbose_index = mca_base_var_register("opal", "opal_common", "ofi", "verbose", "Verbose level of the OFI components", MCA_BASE_VAR_TYPE_INT, NULL, 0, @@ -391,9 +399,13 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) ret = verbose_index; goto err; } + } else { + verbose_index = param; } - if (0 > accelerator_rank_index) { + + param = mca_base_var_find("opal", "opal_common", "ofi", "accelerator_rank"); + if (0 > param) { accelerator_rank_index = mca_base_var_register("opal", "opal_common", "ofi", "accelerator_rank", "Process rank(non-negative) on the selected accelerator device", @@ -404,6 +416,8 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) ret = accelerator_rank_index; goto err; } + } else { + accelerator_rank_index = param; } if (component) {