Skip to content

Commit

Permalink
Support ParallelCluster 3.11.1 (#270)
Browse files Browse the repository at this point in the history
Require at least 4 GB of memory; otherwise the instance doesn't have enough memory.

Update Lambdas to Python 3.12 from 3.9.

Fix bug in Xio configuration.

Resolves #268

Fix a bug in the ansible task that updates slurm.conf: it didn't correctly
detect changes and therefore failed to restart slurmctld.

Resolves #267
  • Loading branch information
cartalla authored Oct 23, 2024
1 parent 1ebf9a9 commit ada1a31
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 24 deletions.
36 changes: 18 additions & 18 deletions source/cdk/cdk_slurm_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -903,9 +903,10 @@ def update_config_for_exostellar(self):
if self.slurm_compute_node_sg_id:
if self.slurm_compute_node_sg_id not in self.config['slurm']['Xio']['WorkerSecurityGroupIds']:
self.config['slurm']['Xio']['WorkerSecurityGroupIds'].append(self.slurm_compute_node_sg_id)
if self.res_dcv_security_group_id:
if self.res_dcv_security_group_id not in self.config['slurm']['Xio']['WorkerSecurityGroupIds']:
self.config['slurm']['Xio']['WorkerSecurityGroupIds'].append(self.res_dcv_security_group_id)
if 'RESStackName' in self.config:
if self.res_dcv_security_group_id:
if self.res_dcv_security_group_id not in self.config['slurm']['Xio']['WorkerSecurityGroupIds']:
self.config['slurm']['Xio']['WorkerSecurityGroupIds'].append(self.res_dcv_security_group_id)

# Get values from stack outputs
ems_ip_address = None
Expand Down Expand Up @@ -1376,8 +1377,9 @@ def check_regions_config(self):
self.instance_types = sorted(self.instance_types)

# Filter the instance types by architecture due to PC limitation to 1 architecture
# Also require at least 2 GB of memory.
# Also require at least 4 GB of memory.
# Also filter by the CPU vendor from the config
MIN_COMPUTE_NODE_GB = 4
cluster_architecture = self.config['slurm']['ParallelClusterConfig']['Architecture']
logger.info(f"ParallelCluster Architecture: {cluster_architecture}")
filtered_instance_types = []
Expand All @@ -1387,7 +1389,7 @@ def check_regions_config(self):
logger.warning(f"Excluding {instance_type} because architecture ({instance_architecture}) != {cluster_architecture}")
continue
mem_gb = int(self.plugin.get_MemoryInMiB(self.cluster_region, instance_type) / 1024)
if mem_gb < 2:
if mem_gb < MIN_COMPUTE_NODE_GB:
logger.warning(f"Excluding {instance_type} because has less than 2 GiB of memory.")
continue
cpu_vendor = self.plugin.get_cpu_vendor(self.cluster_region, instance_type)
Expand Down Expand Up @@ -1425,9 +1427,7 @@ def create_parallel_cluster_lambdas(self):
aws_lambda.Architecture.X86_64,
],
compatible_runtimes = [
aws_lambda.Runtime.PYTHON_3_9,
# aws_lambda.Runtime.PYTHON_3_10, # Doesn't work: No module named 'rpds.rpds'
# aws_lambda.Runtime.PYTHON_3_11, # Doesn't work: No module named 'rpds.rpds'
aws_lambda.Runtime.PYTHON_3_12,
],
)

Expand All @@ -1437,7 +1437,7 @@ def create_parallel_cluster_lambdas(self):
function_name=f"{self.stack_name}-CreateBuildFiles",
description="Create ParallelCluster build configuration files",
memory_size=2048,
runtime=aws_lambda.Runtime.PYTHON_3_9,
runtime=aws_lambda.Runtime.PYTHON_3_12,
architecture=aws_lambda.Architecture.X86_64,
timeout=Duration.minutes(2),
log_retention=logs.RetentionDays.INFINITE,
Expand Down Expand Up @@ -1499,7 +1499,7 @@ def create_parallel_cluster_lambdas(self):
function_name=f"{self.stack_name}-CreateParallelClusterConfig",
description="Create ParallelCluster config",
memory_size=2048,
runtime=aws_lambda.Runtime.PYTHON_3_9,
runtime=aws_lambda.Runtime.PYTHON_3_12,
architecture=aws_lambda.Architecture.X86_64,
timeout=Duration.minutes(15),
log_retention=logs.RetentionDays.INFINITE,
Expand Down Expand Up @@ -1547,7 +1547,7 @@ def create_parallel_cluster_lambdas(self):
function_name=f"{self.stack_name}-CreateParallelCluster",
description="Create ParallelCluster",
memory_size=2048,
runtime=aws_lambda.Runtime.PYTHON_3_9,
runtime=aws_lambda.Runtime.PYTHON_3_12,
architecture=aws_lambda.Architecture.X86_64,
timeout=Duration.minutes(15),
log_retention=logs.RetentionDays.INFINITE,
Expand Down Expand Up @@ -1846,7 +1846,7 @@ def create_parallel_cluster_lambdas(self):
function_name=f"{self.stack_name}-CreateHeadNodeARecord",
description="Create head node A record",
memory_size=2048,
runtime=aws_lambda.Runtime.PYTHON_3_9,
runtime=aws_lambda.Runtime.PYTHON_3_12,
architecture=aws_lambda.Architecture.X86_64,
timeout=Duration.minutes(15),
log_retention=logs.RetentionDays.INFINITE,
Expand Down Expand Up @@ -1893,7 +1893,7 @@ def create_parallel_cluster_lambdas(self):
function_name=f"{self.stack_name}-UpdateHeadNode",
description="Update head node",
memory_size=2048,
runtime=aws_lambda.Runtime.PYTHON_3_9,
runtime=aws_lambda.Runtime.PYTHON_3_12,
architecture=aws_lambda.Architecture.X86_64,
timeout=Duration.minutes(15),
log_retention=logs.RetentionDays.INFINITE,
Expand Down Expand Up @@ -1935,7 +1935,7 @@ def create_parallel_cluster_lambdas(self):
function_name=f"{self.stack_name}-ConfigUsersGroupsJson",
description="Configure users and groups json file",
memory_size=2048,
runtime=aws_lambda.Runtime.PYTHON_3_9,
runtime=aws_lambda.Runtime.PYTHON_3_12,
architecture=aws_lambda.Architecture.X86_64,
timeout=Duration.minutes(15),
log_retention=logs.RetentionDays.INFINITE,
Expand Down Expand Up @@ -1983,7 +1983,7 @@ def create_parallel_cluster_lambdas(self):
function_name=f"{self.stack_name}-ConfigExternalLoginNodes",
description="Configure external login nodes",
memory_size=2048,
runtime=aws_lambda.Runtime.PYTHON_3_9,
runtime=aws_lambda.Runtime.PYTHON_3_12,
architecture=aws_lambda.Architecture.X86_64,
timeout=Duration.minutes(15),
log_retention=logs.RetentionDays.INFINITE,
Expand Down Expand Up @@ -2030,7 +2030,7 @@ def create_parallel_cluster_lambdas(self):
function_name=f"{self.stack_name}-DeconfigUsersGroupsJson",
description="Deconfigure RES users and groups json file",
memory_size=2048,
runtime=aws_lambda.Runtime.PYTHON_3_9,
runtime=aws_lambda.Runtime.PYTHON_3_12,
architecture=aws_lambda.Architecture.X86_64,
timeout=Duration.minutes(15),
log_retention=logs.RetentionDays.INFINITE,
Expand Down Expand Up @@ -2072,7 +2072,7 @@ def create_parallel_cluster_lambdas(self):
function_name=f"{self.stack_name}-DeconfigExternalLoginNodes",
description="Deconfigure external login nodes",
memory_size=2048,
runtime=aws_lambda.Runtime.PYTHON_3_9,
runtime=aws_lambda.Runtime.PYTHON_3_12,
architecture=aws_lambda.Architecture.X86_64,
timeout=Duration.minutes(15),
log_retention=logs.RetentionDays.INFINITE,
Expand Down Expand Up @@ -2114,7 +2114,7 @@ def create_callSlurmRestApiLambda(self):
function_name=f"{self.stack_name}-CallSlurmRestApiLambda",
description="Example showing how to call Slurm REST API",
memory_size=128,
runtime=aws_lambda.Runtime.PYTHON_3_9,
runtime=aws_lambda.Runtime.PYTHON_3_12,
architecture=aws_lambda.Architecture.ARM_64,
timeout=Duration.minutes(1),
log_retention=logs.RetentionDays.INFINITE,
Expand Down
11 changes: 11 additions & 0 deletions source/cdk/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@
# 3.11.0:
# * Add support for ap-southeast-3
# * login node enhancements
# 3.11.1:
# * Disable Pyxis Spack plugin by default
# * Upgrade Python runtime to 3.12
# * Upgrade libjwt to version 1.17.0.
MIN_PARALLEL_CLUSTER_VERSION = parse_version('3.6.0')
# Update source/resources/default_config.yml with latest version when this is updated.
PARALLEL_CLUSTER_VERSIONS = [
Expand All @@ -106,14 +110,17 @@
'3.10.0',
'3.10.1',
'3.11.0',
'3.11.1',
]
PARALLEL_CLUSTER_ENROOT_VERSIONS = {
# This can be found on the head node by running 'yum info enroot'
'3.11.0': '3.4.1', # confirmed
'3.11.1': '3.4.1', # confirmed
}
PARALLEL_CLUSTER_PYXIS_VERSIONS = {
# This can be found on the head node at /opt/parallelcluster/sources
'3.11.0': '0.20.0', # confirmed
'3.11.1': '0.20.0', # confirmed
}
PARALLEL_CLUSTER_MUNGE_VERSIONS = {
# This can be found on the head node at /opt/parallelcluster/sources
Expand All @@ -131,6 +138,7 @@
'3.10.0': '0.5.16', # confirmed
'3.10.1': '0.5.16', # confirmed
'3.11.0': '0.5.16', # confirmed
'3.11.1': '0.5.16', # confirmed
}
PARALLEL_CLUSTER_PYTHON_VERSIONS = {
# This can be found on the head node at /opt/parallelcluster/pyenv/versions
Expand All @@ -147,6 +155,7 @@
'3.10.0': '3.9.19', # confirmed
'3.10.1': '3.9.19', # confirmed
'3.11.0': '3.9.20', # confirmed
'3.11.1': '3.9.20', # confirmed
}
PARALLEL_CLUSTER_SLURM_VERSIONS = {
# This can be found on the head node at /etc/chef/local-mode-cache/cache/
Expand All @@ -163,6 +172,7 @@
'3.10.0': '23.11.7', # confirmed
'3.10.1': '23.11.7', # confirmed
'3.11.0': '23.11.10', # confirmed
'3.11.1': '23.11.10', # confirmed
}
PARALLEL_CLUSTER_PC_SLURM_VERSIONS = {
# This can be found on the head node at /etc/chef/local-mode-cache/cache/
Expand All @@ -179,6 +189,7 @@
'3.10.0': '23-11-7-1', # confirmed
'3.10.1': '23-11-7-1', # confirmed
'3.11.0': '23-11-10-1', # confirmed
'3.11.1': '23-11-10-1', # confirmed
}
SLURM_REST_API_VERSIONS = {
'23-02-2-1': '0.0.39',
Expand Down
4 changes: 4 additions & 0 deletions source/resources/lambdas/CreateBuildFiles/CreateBuildFiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,10 @@ def lambda_handler(event, context):
else:
raise KeyError(error_message)

if requestType == 'Delete':
cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name)
return

ami_builds = json.loads(environ['AmiBuildsJson'])
assets_bucket = environ['AssetsBucket']
assets_base_key = environ['AssetsBaseKey']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ fi
export PATH=/usr/sbin:$PATH

echo "Creating users and groups"
$config_bin_dir/create_users_groups.py -i $config_dir/users_groups.json
if [[ -e $config_dir/users_groups.json ]]; then
$config_bin_dir/create_users_groups.py -i $config_dir/users_groups.json
fi

# ansible_compute_node_vars_yml_s3_url="s3://$assets_bucket/$assets_base_key/config/ansible/ansible_compute_node_vars.yml"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,14 @@
cmd: |
set -ex
conf_files=$(find /opt/slurm -name '*.conf')
conf_files=$(find /opt/slurm/etc -name '*.conf')
backup_suffix=".$(date '+%Y-%m-%dT%H:%M:%S')~"
num_changed=0
for conf_file in ${conf_files[*]}; do
sed --in-place=$backup_suffix 's%/opt/slurm/etc%/opt/slurm/{{ cluster_name }}/etc%' $conf_file
sed --in-place=$backup_suffix 's%/opt/slurm/lib%/opt/slurm/{{ cluster_name }}/lib%' $conf_file
sed --in-place=$backup_suffix \
-e 's%/opt/slurm/etc%/opt/slurm/{{ cluster_name }}/etc%' \
-e 's%/opt/slurm/lib%/opt/slurm/{{ cluster_name }}/lib%' \
$conf_file
backup_conf_file="${conf_file}${backup_suffix}"
if diff -q $backup_conf_file $conf_file; then
Expand All @@ -56,6 +58,12 @@
else
echo "No conf files changed."
fi
register: change_slurm_conf_result

- name: Show change_slurm_conf_result
debug:
msg: |
{{ change_slurm_conf_result }}
- name: Fix permissions on config dir so users can access it to get the modulefiles
file:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
import json
import logging
import logging.handlers
import os
import pycurl
import requests
import yaml

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@
cmd: |
set -ex
yum -y install python3.11-pip
python3.11 -m pip install requests PyYaml
{{ exostellar_dir }}/configure_xio.py
- name: Create {{ exostellar_dir }}/xspot.slurm.conf
Expand Down

0 comments on commit ada1a31

Please sign in to comment.