diff --git a/cli/src/pcluster/aws/ec2.py b/cli/src/pcluster/aws/ec2.py index d17a492f17..8862315e60 100644 --- a/cli/src/pcluster/aws/ec2.py +++ b/cli/src/pcluster/aws/ec2.py @@ -162,6 +162,22 @@ def describe_image(self, ami_id): return ImageInfo(images[0]) raise AWSClientError(function_name="describe_images", message=f"Image {ami_id} not found") + @AWSExceptionHandler.handle_client_exception + @Cache.cached + def describe_launch_template_version(self, launch_template_id, version): + """Describe a specific launch template version and return its LaunchTemplateData.""" + response = self._client.describe_launch_template_versions( + LaunchTemplateId=launch_template_id, + Versions=[str(version)], + ) + versions = response.get("LaunchTemplateVersions", []) + if not versions: + raise AWSClientError( + function_name="describe_launch_template_versions", + message=f"Launch template {launch_template_id} version {version} not found", + ) + return versions[0].get("LaunchTemplateData", {}) + @AWSExceptionHandler.handle_client_exception @Cache.cached def describe_images(self, ami_ids, filters, owners): diff --git a/cli/src/pcluster/config/cluster_config.py b/cli/src/pcluster/config/cluster_config.py index 7ab12723bc..5274dd9d7b 100644 --- a/cli/src/pcluster/config/cluster_config.py +++ b/cli/src/pcluster/config/cluster_config.py @@ -158,6 +158,7 @@ InstanceTypePlacementGroupValidator, InstanceTypeValidator, KeyPairValidator, + LaunchTemplateOverridesValidator, PlacementGroupCapacityReservationValidator, PlacementGroupCapacityTypeValidator, PlacementGroupNamingValidator, @@ -1631,6 +1632,7 @@ def __init__( self.managed_head_node_security_group = None self.managed_compute_security_group = None self.instance_types_data_version = "" + self.run_instances_overrides_version = "" def _register_validators(self, context: ValidatorContext = None): # noqa: D102 #pylint: disable=unused-argument self._register_validator(RegionValidator, region=self.region) @@ -2222,6 +2224,15 @@ def 
scheduler_resources(self): return str(files(__package__).parent / "resources" / "batch") +class LaunchTemplateOverrides(Resource): + """Represent the LaunchTemplateOverrides configuration for a compute resource.""" + + def __init__(self, launch_template_id: str = None, version: int = None, **kwargs): + super().__init__(**kwargs) + self.launch_template_id = Resource.init_param(launch_template_id) + self.version = Resource.init_param(version) + + class _BaseSlurmComputeResource(BaseComputeResource): """Represent the Slurm Compute Resource.""" @@ -2240,6 +2251,7 @@ def __init__( tags: List[Tag] = None, static_node_priority: int = None, dynamic_node_priority: int = None, + launch_specification_overrides=None, **kwargs, ): super().__init__(**kwargs) @@ -2260,6 +2272,7 @@ def __init__( self.tags = tags self.static_node_priority = Resource.init_param(static_node_priority, default=1) self.dynamic_node_priority = Resource.init_param(dynamic_node_priority, default=1000) + self.launch_specification_overrides = launch_specification_overrides @abstractmethod def is_flexible(self) -> bool: @@ -2362,6 +2375,15 @@ def _register_validators(self, context: ValidatorContext = None): ec2memory=min_memory, instance_type=smallest_type, ) + if self.launch_specification_overrides: + self._register_validator( + LaunchTemplateOverridesValidator, + launch_template_id=self.launch_specification_overrides.launch_template_id, + version=self.launch_specification_overrides.version, + instance_types=self.instance_types, + max_network_cards=self.max_network_cards, + is_flexible=self.is_flexible(), + ) def is_flexible(self): """Return True because the ComputeResource can contain multiple instance types.""" @@ -2449,6 +2471,15 @@ def _register_validators(self, context: ValidatorContext = None): ec2memory=self._instance_type_info.ec2memory_size_in_mib(), instance_type=self.instance_type, ) + if self.launch_specification_overrides: + self._register_validator( + LaunchTemplateOverridesValidator, + 
launch_template_id=self.launch_specification_overrides.launch_template_id, + version=self.launch_specification_overrides.version, + instance_types=self.instance_types, + max_network_cards=self.max_network_cards, + is_flexible=self.is_flexible(), + ) @property def architecture(self) -> str: @@ -2975,6 +3006,40 @@ def get_instance_types_data(self): result[instance_type] = instance_type_info.instance_type_data return result + def get_run_instances_overrides(self): + """ + Build run_instances_overrides data from LaunchTemplateOverrides config. + + Iterates all queues and compute resources. For each compute resource that has + launch_specification_overrides configured, fetches the launch template data. + + Returns a dict keyed by {queue_name} -> {compute_resource_name} -> {launch_template_data}. + Returns empty dict if no overrides are configured. + """ + overrides = {} + for queue in self.scheduling.queues: + for compute_resource in queue.compute_resources: + if not compute_resource.launch_specification_overrides: + continue + + lt_overrides = compute_resource.launch_specification_overrides + lt_id = lt_overrides.launch_template_id + lt_version = lt_overrides.version + + LOGGER.info( + "Fetching launch template %s version %s for queue %s, compute resource %s", + lt_id, + lt_version, + queue.name, + compute_resource.name, + ) + lt_data = AWSApi.instance().ec2.describe_launch_template_version(lt_id, lt_version) + + if lt_data: + overrides.setdefault(queue.name, {})[compute_resource.name] = lt_data + + return overrides + @property def login_nodes_ami(self): """Get the image id of the LoginNodes.""" diff --git a/cli/src/pcluster/constants.py b/cli/src/pcluster/constants.py index 748b598a03..188f9dbc17 100644 --- a/cli/src/pcluster/constants.py +++ b/cli/src/pcluster/constants.py @@ -232,6 +232,7 @@ "custom_artifacts_name": "artifacts.zip", "scheduler_resources_name": "scheduler_resources.zip", "change_set_name": "change-set.json", + "run_instances_overrides_name": 
"run_instances_overrides.json", } PCLUSTER_TAG_VALUE_REGEX = r"^([\w\+\-\=\.\_\:\@/]{0,256})$" diff --git a/cli/src/pcluster/models/cluster.py b/cli/src/pcluster/models/cluster.py index e5ffa4afcb..25c585aa1e 100644 --- a/cli/src/pcluster/models/cluster.py +++ b/cli/src/pcluster/models/cluster.py @@ -374,6 +374,7 @@ def create( artifact_dir_generated = True self._upload_config() self._upload_instance_types_data() + self._upload_run_instances_overrides() LOGGER.info("Generation and upload completed successfully") # Create template if not provided by the user @@ -558,6 +559,25 @@ def _upload_instance_types_data(self): e, f"Unable to upload instance types data to the S3 bucket {self.bucket.name} due to exception: {e}" ) + def _upload_run_instances_overrides(self): + """Upload run_instances_overrides.json to the cluster S3 bucket.""" + try: + overrides = self.config.get_run_instances_overrides() + LOGGER.info("Uploading run_instances_overrides.json to S3...") + result = self.bucket.upload_config( + config=overrides, + config_name=PCLUSTER_S3_ARTIFACTS_DICT.get("run_instances_overrides_name"), + format=S3FileFormat.JSON, + ) + self.config.run_instances_overrides_version = result.get("VersionId") + LOGGER.info("run_instances_overrides.json uploaded successfully.") + except Exception as e: + raise _cluster_error_mapper( + e, + f"Unable to upload run_instances_overrides.json to the S3 bucket {self.bucket.name} " + f"due to exception: {e}", + ) + def _upload_change_set(self, changes=None): """Upload change set.""" if changes: @@ -924,6 +944,7 @@ def update( self._add_tags() self._upload_config() self._upload_instance_types_data() + self._upload_run_instances_overrides() self._upload_change_set(changes) # Create template if not provided by the user diff --git a/cli/src/pcluster/schemas/cluster_schema.py b/cli/src/pcluster/schemas/cluster_schema.py index b417305deb..06c3de53e9 100644 --- a/cli/src/pcluster/schemas/cluster_schema.py +++ 
b/cli/src/pcluster/schemas/cluster_schema.py @@ -64,6 +64,7 @@ Image, Imds, IntelSoftware, + LaunchTemplateOverrides, LocalStorage, LoginNodes, LoginNodesIam, @@ -1536,6 +1537,25 @@ def make_resource(self, data, **kwargs): return BaseTag(**data) +class LaunchTemplateOverridesSchema(BaseSchema): + """Represent the schema of the LaunchTemplateOverrides section.""" + + launch_template_id = fields.Str( + required=True, + validate=validate.Regexp(r"^lt-[a-zA-Z0-9]+$"), + metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}, + ) + version = fields.Int( + required=True, + metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}, + ) + + @post_load + def make_resource(self, data, **kwargs): + """Generate resource.""" + return LaunchTemplateOverrides(**data) + + class SlurmComputeResourceSchema(_ComputeResourceSchema): """Represent the schema of the Slurm ComputeResource.""" @@ -1576,6 +1596,9 @@ class SlurmComputeResourceSchema(_ComputeResourceSchema): validate=validate.Range(min=MIN_SLURM_NODE_PRIORITY, max=MAX_SLURM_NODE_PRIORITY), metadata={"update_policy": UpdatePolicy.SUPPORTED}, ) + launch_specification_overrides = fields.Nested( + LaunchTemplateOverridesSchema, metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY} + ) @validates_schema def no_coexist_instance_type_flexibility(self, data, **kwargs): diff --git a/cli/src/pcluster/templates/cluster_stack.py b/cli/src/pcluster/templates/cluster_stack.py index c27b06e34d..b1f152c7a1 100644 --- a/cli/src/pcluster/templates/cluster_stack.py +++ b/cli/src/pcluster/templates/cluster_stack.py @@ -1419,10 +1419,13 @@ def _add_head_node(self): ), "cluster_config_version": self.config.config_version, "instance_types_data_version": self.config.instance_types_data_version, + "run_instances_overrides_version": self.config.run_instances_overrides_version, "change_set_s3_key": f"{self.bucket.artifact_directory}/configs/" f"{PCLUSTER_S3_ARTIFACTS_DICT.get('change_set_name')}", "instance_types_data_s3_key": 
f"{self.bucket.artifact_directory}/configs/" f"{PCLUSTER_S3_ARTIFACTS_DICT.get('instance_types_data_name')}", + "run_instances_overrides_s3_key": f"{self.bucket.artifact_directory}/configs/" + f"{PCLUSTER_S3_ARTIFACTS_DICT.get('run_instances_overrides_name')}", "custom_node_package": self.config.custom_node_package or "", "custom_awsbatchcli_package": self.config.custom_aws_batch_cli_package or "", "head_node_imds_secured": str(self.config.head_node.imds.secured).lower(), diff --git a/cli/src/pcluster/validators/ec2_validators.py b/cli/src/pcluster/validators/ec2_validators.py index 26ffc06923..5820240cd5 100644 --- a/cli/src/pcluster/validators/ec2_validators.py +++ b/cli/src/pcluster/validators/ec2_validators.py @@ -870,3 +870,53 @@ def _validate(self, cluster_ultraserver_capacity_block_dict): f"The following capacity blocks have invalid block sizes: {'; '.join(invalid_capacity_blocks)}.", FailureLevel.ERROR, ) + + +class LaunchTemplateOverridesValidator(Validator): + """Validate the launch template overrides configuration.""" + + def _validate(self, launch_template_id, version, instance_types, max_network_cards, is_flexible): + try: + lt_data = AWSApi.instance().ec2.describe_launch_template_version(launch_template_id, str(version)) + except AWSClientError as e: + self._add_failure( + f"Unable to retrieve launch template {launch_template_id} version {version}. {str(e)}", + FailureLevel.ERROR, + ) + return + + # Check for properties not in allow list + allow_list = {"InstanceType", "NetworkInterfaces"} + denied_found = [prop for prop in lt_data if prop not in allow_list] + if denied_found: + self._add_failure( + f"Launch template {launch_template_id} contains unsupported properties: " + f"{', '.join(sorted(denied_found))}. 
Only NetworkInterfaces, InstanceType " + f"are supported in the override launch template.", + FailureLevel.ERROR, + ) + + # Validate network interface count does not exceed max supported + network_interfaces = lt_data.get("NetworkInterfaces", []) + if network_interfaces and len(network_interfaces) > max_network_cards: + self._add_failure( + f"Launch template {launch_template_id} configures {len(network_interfaces)} network interfaces, " + f"but the instance type supports a maximum of {max_network_cards}.", + FailureLevel.ERROR, + ) + + # Validate instance type in LT matches the compute resource if specified + lt_instance_type = lt_data.get("InstanceType") + if lt_instance_type and lt_instance_type not in instance_types: + self._add_failure( + f"Instance type '{lt_instance_type}' in launch template {launch_template_id} does not match " + f"the compute resource instance type(s): {', '.join(instance_types)}.", + FailureLevel.ERROR, + ) + + # Reject use with flexible instance types (reported as an error, not a warning) + if is_flexible: + self._add_failure( + "LaunchTemplateOverrides cannot be used with flexible instance types.", + FailureLevel.ERROR, + ) diff --git a/cli/tests/pcluster/example_configs/slurm.full.yaml b/cli/tests/pcluster/example_configs/slurm.full.yaml index 3ef5184589..cd79a63989 100644 --- a/cli/tests/pcluster/example_configs/slurm.full.yaml +++ b/cli/tests/pcluster/example_configs/slurm.full.yaml @@ -180,6 +180,9 @@ Scheduling: HttpProxyAddress: https://proxy-address:port ComputeResources: - Name: compute-resource-1 + LaunchSpecificationOverrides: + LaunchTemplateId: lt-0ab6123b7f1111111 + Version: "2" InstanceType: c4.2xlarge - Name: compute-resource-2 InstanceType: c5.2xlarge diff --git a/tests/integration-tests/tests/multiple_nics/test_multiple_nics.py b/tests/integration-tests/tests/multiple_nics/test_multiple_nics.py index 6073392e66..f14705949e 100644 --- a/tests/integration-tests/tests/multiple_nics/test_multiple_nics.py +++ 
b/tests/integration-tests/tests/multiple_nics/test_multiple_nics.py @@ -14,8 +14,111 @@ import boto3 import pytest from assertpy import assert_that +from cfn_stacks_factory import CfnStack from remote_command_executor import RemoteCommandExecutor -from utils import get_compute_nodes_instance_ids +from troposphere import GetAtt, Output, Ref, Template +from troposphere.ec2 import LaunchTemplate, LaunchTemplateData, NetworkInterfaces, SecurityGroup +from utils import generate_stack_name, get_compute_nodes_instance_ids + + +@pytest.fixture(scope="class") +def override_resources_stack(cfn_stacks_factory, request, region, vpc_stack, instance): + """ + Create a CFN stack with security groups and launch templates for LaunchSpecificationOverrides testing. + + Creates 3 launch templates for different use cases: + - EFA enabled + override SG on primary NIC (Use Case 1/2) + - EFA enabled + override SG on secondary NIC (Use Case 1/2 with non-primary card) + - EFA disabled + override InterfaceType + SG on primary NIC (Use Case 3/4) + """ + template = Template() + template.set_version() + template.set_description("Launch templates for LaunchSpecificationOverrides integration test") + + private_subnet_id = vpc_stack.cfn_outputs.get( + "PrivateSubnetId", vpc_stack.get_private_subnet() + ) + + # Security group used by all override launch templates + override_sg = template.add_resource( + SecurityGroup( + "OverrideSecurityGroup", + GroupDescription="Security group for LaunchSpecificationOverrides test", + VpcId=vpc_stack.cfn_outputs["VpcId"], + ) + ) + + # LT 1: EFA enabled, override SG on primary NIC only + lt_efa_primary = template.add_resource( + LaunchTemplate( + "LtEfaOverridePrimary", + LaunchTemplateData=LaunchTemplateData( + NetworkInterfaces=[ + NetworkInterfaces( + DeviceIndex=0, + NetworkCardIndex=0, + Groups=[Ref(override_sg)], + ), + ], + ), + ) + ) + + # LT 2: EFA enabled, override SG on secondary NIC (requires InstanceType) + lt_efa_secondary = template.add_resource( + 
LaunchTemplate( + "LtEfaOverrideSecondary", + LaunchTemplateData=LaunchTemplateData( + InstanceType=instance, + NetworkInterfaces=[ + NetworkInterfaces( + DeviceIndex=1, + NetworkCardIndex=1, + Groups=[Ref(override_sg)], + InterfaceType="efa-only", + SubnetId=private_subnet_id, + ), + ], + ), + ) + ) + + # LT 3: EFA disabled, override InterfaceType to efa + SG on primary NIC + lt_no_efa = template.add_resource( + LaunchTemplate( + "LtNoEfaOverride", + LaunchTemplateData=LaunchTemplateData( + NetworkInterfaces=[ + NetworkInterfaces( + DeviceIndex=0, + NetworkCardIndex=0, + Groups=[Ref(override_sg)], + InterfaceType="efa", + ), + ], + ), + ) + ) + + # Outputs + template.add_output(Output("OverrideSecurityGroupId", Value=Ref(override_sg))) + template.add_output(Output("LtEfaOverridePrimaryId", Value=Ref(lt_efa_primary))) + template.add_output(Output("LtEfaOverridePrimaryVersion", Value=GetAtt(lt_efa_primary, "LatestVersionNumber"))) + template.add_output(Output("LtEfaOverrideSecondaryId", Value=Ref(lt_efa_secondary))) + template.add_output(Output("LtEfaOverrideSecondaryVersion", Value=GetAtt(lt_efa_secondary, "LatestVersionNumber"))) + template.add_output(Output("LtNoEfaOverrideId", Value=Ref(lt_no_efa))) + template.add_output(Output("LtNoEfaOverrideVersion", Value=GetAtt(lt_no_efa, "LatestVersionNumber"))) + + stack = CfnStack( + name=generate_stack_name("integ-tests-lt-override", request.config.getoption("stackname_suffix")), + region=region, + template=template.to_json(), + ) + cfn_stacks_factory.create_stack(stack) + + yield stack + + cfn_stacks_factory.delete_stack(stack.name, region) @pytest.mark.usefixtures("os", "instance", "scheduler") @@ -25,16 +128,35 @@ def test_multiple_nics( test_datadir, clusters_factory, scheduler_commands_factory, + override_resources_stack, ): - cluster_config = pcluster_config_reader() + outputs = override_resources_stack.cfn_outputs + override_sg_id = outputs["OverrideSecurityGroupId"] + + cluster_config = pcluster_config_reader( + 
lt_efa_override_primary=outputs["LtEfaOverridePrimaryId"], + lt_efa_override_primary_version=outputs["LtEfaOverridePrimaryVersion"], + lt_efa_override_secondary=outputs["LtEfaOverrideSecondaryId"], + lt_efa_override_secondary_version=outputs["LtEfaOverrideSecondaryVersion"], + lt_no_efa_override=outputs["LtNoEfaOverrideId"], + lt_no_efa_override_version=outputs["LtNoEfaOverrideVersion"], + ) cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = scheduler_commands_factory(remote_command_executor) _test_head_node_nics(remote_command_executor, region) _test_compute_node_nics(cluster, region, remote_command_executor, scheduler_commands) + _test_overrides_file_exists(remote_command_executor) + _test_override_primary_nic_sg(cluster, region, "cr-efa-override-primary", override_sg_id) + _test_override_secondary_nic_sg(cluster, region, "cr-efa-override-secondary", override_sg_id) + _test_override_no_efa_primary_nic(cluster, region, "cr-no-efa-override", override_sg_id) +# --------------------------------------------------------------------------- +# Existing NIC tests (unchanged) +# --------------------------------------------------------------------------- + def _get_private_ip_addresses(instance_id): ec2_client = boto3.client("ec2") instance_info = ec2_client.describe_instances(InstanceIds=[instance_id])["Reservations"][0]["Instances"][0] @@ -46,7 +168,6 @@ def _get_private_ip_addresses(instance_id): def _test_head_node_nics(remote_command_executor, region): - # On the head node we just check that all the private IPs have been assigned to NICs token = remote_command_executor.run_remote_command( "sudo curl --retry 3 --retry-delay 0 --fail -s -X PUT 'http://169.254.169.254/latest/api/token' " "-H 'X-aws-ec2-metadata-token-ttl-seconds: 300'" @@ -67,7 +188,6 @@ def _test_head_node_nics(remote_command_executor, region): def _test_compute_node_nics(cluster, region, remote_command_executor, scheduler_commands): 
compute_instance_id = get_compute_nodes_instance_ids(cluster.cfn_name, region)[0] - # Get compute node's IP addresses compute_ip_addresses = _get_private_ip_addresses(compute_instance_id) logging.info("Compute node IP addresses: %s", compute_ip_addresses) for ip_address in compute_ip_addresses: @@ -75,20 +195,16 @@ def _test_compute_node_nics(cluster, region, remote_command_executor, scheduler_ def _test_compute_node_nic(ip_address, remote_command_executor, scheduler_commands): - # ping test from head node result = remote_command_executor.run_remote_command("ping -c 5 {0}".format(ip_address)) assert_that(result.stdout).matches(".*5 packets transmitted, 5 received, 0% packet loss,.*") - # ssh test from head node result = remote_command_executor.run_remote_command( "ssh -o StrictHostKeyChecking=no -q {0} echo Hello".format(ip_address) ) assert_that(result.stdout).matches("Hello") - # ping test from compute node results = {} sites = ["amazon.com", "google.com", "github.com"] for site in sites: results[site] = _check_ping(scheduler_commands, remote_command_executor, ip_address, site) - assert any(results.values()), f"Ping test failed for all sites. 
Results: {results}" @@ -100,3 +216,112 @@ def _check_ping(scheduler_commands, remote_command_executor, ip_address, site): scheduler_commands.wait_job_completed(job_id) result = remote_command_executor.run_remote_command(f"cat /shared/ping_{ip_address}_{site}.out") return "5 packets transmitted, 5 received, 0% packet loss" in result.stdout + + +# --------------------------------------------------------------------------- +# LaunchSpecificationOverrides verification helpers +# --------------------------------------------------------------------------- + +def _find_instance_by_compute_resource(cluster, region, compute_resource_name): + """Find the EC2 instance launched by a specific compute resource.""" + ec2_client = boto3.client("ec2", region_name=region) + compute_instance_ids = get_compute_nodes_instance_ids(cluster.cfn_name, region) + for instance_id in compute_instance_ids: + instance_info = ec2_client.describe_instances(InstanceIds=[instance_id])["Reservations"][0]["Instances"][0] + tags = {tag["Key"]: tag["Value"] for tag in instance_info.get("Tags", [])} + if tags.get("parallelcluster:compute-resource-name") == compute_resource_name: + return instance_info + return None + + +def _get_nic_by_device_index(instance_info, device_index): + """Get a NIC from instance info by device index.""" + for nic in instance_info["NetworkInterfaces"]: + if nic["Attachment"]["DeviceIndex"] == device_index: + return nic + return None + + +def _get_sg_ids(nic): + """Extract security group IDs from a NIC.""" + return [sg["GroupId"] for sg in nic["Groups"]] + + +def _test_overrides_file_exists(remote_command_executor): + """Verify pcluster_run_instances_overrides.json exists on head node.""" + result = remote_command_executor.run_remote_command( + "sudo cat /opt/slurm/etc/pcluster/pcluster_run_instances_overrides.json" + ) + assert_that(result.stdout).is_not_empty() + logging.info("pcluster_run_instances_overrides.json content: %s", result.stdout) + + +def 
_test_override_primary_nic_sg(cluster, region, compute_resource_name, expected_sg_id): + """ + Use Case 1/2: EFA enabled, override SG on primary NIC. + + Verify the override security group is applied to the primary NIC (DeviceIndex 0). + """ + instance_info = _find_instance_by_compute_resource(cluster, region, compute_resource_name) + assert_that(instance_info).described_as( + f"Expected to find instance for compute resource {compute_resource_name}" + ).is_not_none() + + primary_nic = _get_nic_by_device_index(instance_info, 0) + assert_that(primary_nic).is_not_none() + sg_ids = _get_sg_ids(primary_nic) + logging.info( + "CR %s primary NIC SGs: %s (expected %s)", compute_resource_name, sg_ids, expected_sg_id + ) + assert_that(sg_ids).contains(expected_sg_id) + + +def _test_override_secondary_nic_sg(cluster, region, compute_resource_name, expected_sg_id): + """ + Use Case 1/2 (non-primary card): EFA enabled, override SG on secondary NIC. + + Verify the override security group is applied to a secondary NIC (DeviceIndex 1). + """ + instance_info = _find_instance_by_compute_resource(cluster, region, compute_resource_name) + assert_that(instance_info).described_as( + f"Expected to find instance for compute resource {compute_resource_name}" + ).is_not_none() + + secondary_nic = _get_nic_by_device_index(instance_info, 1) + assert_that(secondary_nic).described_as( + f"Expected to find secondary NIC (DeviceIndex 1) on {compute_resource_name}" + ).is_not_none() + sg_ids = _get_sg_ids(secondary_nic) + logging.info( + "CR %s secondary NIC SGs: %s (expected %s)", compute_resource_name, sg_ids, expected_sg_id + ) + assert_that(sg_ids).contains(expected_sg_id) + + +def _test_override_no_efa_primary_nic(cluster, region, compute_resource_name, expected_sg_id): + """ + Use Case 3/4: EFA disabled, override InterfaceType + SG on primary NIC. + + Verify the override security group is applied and the InterfaceType is overridden to efa. 
+ """ + instance_info = _find_instance_by_compute_resource(cluster, region, compute_resource_name) + assert_that(instance_info).described_as( + f"Expected to find instance for compute resource {compute_resource_name}" + ).is_not_none() + + primary_nic = _get_nic_by_device_index(instance_info, 0) + assert_that(primary_nic).is_not_none() + + # Verify override SG + sg_ids = _get_sg_ids(primary_nic) + logging.info( + "CR %s (no-efa) primary NIC SGs: %s (expected %s)", compute_resource_name, sg_ids, expected_sg_id + ) + assert_that(sg_ids).contains(expected_sg_id) + + # Verify InterfaceType was overridden to efa (EFA was disabled in cluster config) + interface_type = primary_nic.get("InterfaceType", "") + logging.info( + "CR %s (no-efa) primary NIC InterfaceType: %s (expected efa)", compute_resource_name, interface_type + ) + assert_that(interface_type).is_equal_to("efa") diff --git a/tests/integration-tests/tests/multiple_nics/test_multiple_nics/test_multiple_nics/pcluster.config.yaml b/tests/integration-tests/tests/multiple_nics/test_multiple_nics/test_multiple_nics/pcluster.config.yaml index 15a74a8efa..66d9c057eb 100644 --- a/tests/integration-tests/tests/multiple_nics/test_multiple_nics/test_multiple_nics/pcluster.config.yaml +++ b/tests/integration-tests/tests/multiple_nics/test_multiple_nics/test_multiple_nics/pcluster.config.yaml @@ -20,6 +20,7 @@ Scheduling: {% if scheduler == "awsbatch" %}AwsBatchQueues:{% else %}SlurmQueues:{% endif %} - Name: queue-0 ComputeResources: + # Baseline: EFA enabled, no override - Name: compute-resource-0 {% if scheduler == "awsbatch" %} InstanceTypes: @@ -33,6 +34,35 @@ Scheduling: Efa: Enabled: true {% endif %} + {% if scheduler != "awsbatch" and lt_efa_override_primary is defined %} + # Use Case 1/2: EFA enabled, override SG on primary NIC + - Name: cr-efa-override-primary + InstanceType: {{ instance }} + MinCount: 1 + Efa: + Enabled: true + LaunchSpecificationOverrides: + LaunchTemplateId: {{ lt_efa_override_primary }} + 
Version: "{{ lt_efa_override_primary_version }}" + # Use Case 1/2: EFA enabled, override SG on secondary NIC + - Name: cr-efa-override-secondary + InstanceType: {{ instance }} + MinCount: 1 + Efa: + Enabled: true + LaunchSpecificationOverrides: + LaunchTemplateId: {{ lt_efa_override_secondary }} + Version: "{{ lt_efa_override_secondary_version }}" + # Use Case 3/4: EFA disabled, override InterfaceType + SG on primary NIC + - Name: cr-no-efa-override + InstanceType: {{ instance }} + MinCount: 1 + Efa: + Enabled: false + LaunchSpecificationOverrides: + LaunchTemplateId: {{ lt_no_efa_override }} + Version: "{{ lt_no_efa_override_version }}" + {% endif %} Networking: SubnetIds: - {{ private_subnet_id }}