From 60492128dc04f12adeba9efa691a5b1167128134 Mon Sep 17 00:00:00 2001 From: Molly He Date: Wed, 19 Nov 2025 19:55:56 -0800 Subject: [PATCH 1/5] set template-version flag to optional for cluster create, add support for efa for pytorch job, remove default request and limits when instance type is none --- .../v1_1/model.py | 44 +++++--- .../v1_1/template.py | 10 +- .../hyperpod/cli/commands/cluster_stack.py | 19 +++- src/sagemaker/hyperpod/cli/commands/init.py | 5 + .../test_pytorch_job_template_model.py | 104 ++++++++++++++++++ 5 files changed, 166 insertions(+), 16 deletions(-) create mode 100644 test/unit_tests/training/test_pytorch_job_template_model.py diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py index abfe0f53..01cf8075 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py @@ -333,19 +333,37 @@ def build_dict(**kwargs): return {k: v for k, v in kwargs.items() if v is not None} # Build resources - if self.instance_type is None: - requests_value = limits_value = {"nvidia.com/gpu": "0"} - else: - requests_value = build_dict( - accelerators=str(self.accelerators) if self.accelerators else None, - vcpu=str(self.vcpu) if self.vcpu else None, - memory=str(self.memory) if self.memory else None - ) - limits_value = build_dict( - accelerators=str(self.accelerators_limit) if self.accelerators_limit else None, - vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, - memory=str(self.memory_limit) if self.memory_limit else None - ) + requests_value = {} + limits_value = {} + + # Add GPU resources (respect accelerators regardless of instance_type) + if self.accelerators: + requests_value["nvidia.com/gpu"] = str(self.accelerators) + if self.accelerators_limit: + limits_value["nvidia.com/gpu"] = str(self.accelerators_limit) + + # Add CPU resources + if self.vcpu: + requests_value["cpu"] = str(self.vcpu) + if self.vcpu_limit: + limits_value["cpu"] = str(self.vcpu_limit) + + # Add memory resources + if self.memory: + requests_value["memory"] = f"{self.memory}Gi" + if self.memory_limit: + limits_value["memory"] = f"{self.memory_limit}Gi" + + # Add EFA for multi-node jobs + if self.node_count and self.node_count > 1: + requests_value["vpc.amazonaws.com/efa"] = "1" + limits_value["vpc.amazonaws.com/efa"] = "1" + + # Set default GPU to "0" only if no resources specified at all + if not requests_value: + requests_value = {"nvidia.com/gpu": "0"} + if not limits_value: + limits_value = {"nvidia.com/gpu": "0"} # Build container container_kwargs = build_dict( diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py index 4348d6cc..98b55475 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py @@ -84,7 +84,7 @@ {%- endfor %} {%- endif %} resources: -{%- if accelerators or vcpu or memory %} +{%- if accelerators or vcpu or memory or (node_count and node_count > 1) %} requests: {%- if accelerators %} nvidia.com/gpu: {{ accelerators }} @@ -95,11 +95,14 @@ {%- if memory %} memory: {{ memory }}Gi {%- endif %} +{%- if (node_count and node_count > 1) %} + vpc.amazonaws.com/efa: 1 +{%- endif %} {%- else %} requests: nvidia.com/gpu: "0" {%- endif %} -{%- if 
accelerators_limit or vcpu_limit or memory_limit %} +{%- if accelerators_limit or vcpu_limit or memory_limit or (node_count and node_count > 1) %} limits: {%- if accelerators_limit %} nvidia.com/gpu: {{ accelerators_limit }} @@ -110,6 +113,9 @@ {%- if memory_limit %} memory: {{ memory_limit }}Gi {%- endif %} +{%- if (node_count and node_count > 1) %} + vpc.amazonaws.com/efa: 1 +{%- endif %} {%- else %} limits: nvidia.com/gpu: "0" diff --git a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py index 2a278086..f53d544e 100644 --- a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py +++ b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py @@ -30,6 +30,19 @@ logger = logging.getLogger(__name__) +def get_newest_template_version() -> int: + """Get the newest available template version. + + Returns: + int: The newest template version number + + TODO: Implement logic to fetch the actual newest template version + from the template registry or remote source. + """ + # Placeholder implementation - currently returns 1 as the latest version + return 1 + + def parse_status_list(ctx, param, value): """Parse status list from string format like "['CREATE_COMPLETE', 'UPDATE_COMPLETE']" """ if not value: @@ -79,7 +92,6 @@ def create_cluster_stack(config_file, region, template_version, debug): return # Load config to get template and version - config_dir = Path(config_file).parent data, template, version = load_config(config_dir) @@ -95,6 +107,11 @@ def create_cluster_stack(config_file, region, template_version, debug): model_instance = model_class(**filtered_config) config = model_instance.to_config(region=region) + # Use newest template version if not provided + if template_version is None: + template_version = get_newest_template_version() + logger.info(f"No template version specified, using newest version: {template_version}") + # Create the cluster stack stack_id = HpClusterStack(**config).create(region, template_version) diff --git a/src/sagemaker/hyperpod/cli/commands/init.py b/src/sagemaker/hyperpod/cli/commands/init.py index 66ce7068..eb445d47 100644 --- a/src/sagemaker/hyperpod/cli/commands/init.py +++ b/src/sagemaker/hyperpod/cli/commands/init.py @@ -11,6 +11,7 @@ CFN ) from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack +from sagemaker.hyperpod.cli.commands.cluster_stack import get_newest_template_version from sagemaker.hyperpod.cli.init_utils import ( generate_click_command, save_config_yaml, @@ -375,6 +376,10 @@ def _default_create(region, template_version): # Pass region to to_domain for cluster stack template if template == "cluster-stack": config = template_model.to_config(region=region) + # Use newest template version if not provided + if template_version is None: + template_version = get_newest_template_version() + click.secho(f"No template version specified, using newest version: {template_version}", fg="yellow") HpClusterStack(**config).create(region, template_version) else: # Create from k8s.yaml diff --git a/test/unit_tests/training/test_pytorch_job_template_model.py b/test/unit_tests/training/test_pytorch_job_template_model.py new file mode 100644 index 00000000..b7a3490e --- /dev/null +++ b/test/unit_tests/training/test_pytorch_job_template_model.py @@ -0,0 +1,104 @@ +import unittest +from hyperpod_pytorch_job_template.v1_1.model import PyTorchJobConfig + + +class TestPyTorchJobConfigEFA(unittest.TestCase): + """Test EFA resource allocation in PyTorchJobConfig""" + + def 
test_single_node_no_efa(self): + """Test that single-node jobs don't get EFA resources""" + config = PyTorchJobConfig( + job_name="test-single-node", + image="pytorch:latest", + node_count=1, + accelerators=2, + instance_type="ml.p4d.24xlarge" + ) + + job = config.to_domain() + container = job.replicaSpecs[0].template.spec.containers[0] + + # Should not have EFA resources + self.assertNotIn("vpc.amazonaws.com/efa", container.resources.requests) + self.assertNotIn("vpc.amazonaws.com/efa", container.resources.limits) + + # Should have GPU resources + self.assertEqual(container.resources.requests["nvidia.com/gpu"], "2") + + def test_multi_node_with_efa(self): + """Test that multi-node jobs automatically get EFA resources""" + config = PyTorchJobConfig( + job_name="test-multi-node", + image="pytorch:latest", + node_count=4, + accelerators=8, + instance_type="ml.p4d.24xlarge" + ) + + job = config.to_domain() + container = job.replicaSpecs[0].template.spec.containers[0] + + # Should have EFA resources + self.assertEqual(container.resources.requests["vpc.amazonaws.com/efa"], "1") + self.assertEqual(container.resources.limits["vpc.amazonaws.com/efa"], "1") + + # Should also have GPU resources + self.assertEqual(container.resources.requests["nvidia.com/gpu"], "8") + + def test_no_node_count_no_efa(self): + """Test that jobs without node_count don't get EFA resources""" + config = PyTorchJobConfig( + job_name="test-no-node-count", + image="pytorch:latest", + accelerators=1, + instance_type="ml.g5.xlarge" + ) + + job = config.to_domain() + container = job.replicaSpecs[0].template.spec.containers[0] + + # Should not have EFA resources + self.assertNotIn("vpc.amazonaws.com/efa", container.resources.requests) + self.assertNotIn("vpc.amazonaws.com/efa", container.resources.limits) + + def test_multi_node_with_memory_and_cpu(self): + """Test EFA with other resource types""" + config = PyTorchJobConfig( + job_name="test-multi-resources", + image="pytorch:latest", + node_count=2, + accelerators=4, + vcpu=16.0, + memory=64.0, + instance_type="ml.p4d.24xlarge" + ) + + job = config.to_domain() + container = job.replicaSpecs[0].template.spec.containers[0] + + # Should have all resources including EFA + self.assertEqual(container.resources.requests["vpc.amazonaws.com/efa"], "1") + self.assertEqual(container.resources.requests["nvidia.com/gpu"], "4") + self.assertEqual(container.resources.requests["cpu"], "16.0") + self.assertEqual(container.resources.requests["memory"], "64.0Gi") + + def test_accelerators_without_instance_type(self): + """Test that accelerators work without instance_type (fixes the main issue)""" + config = PyTorchJobConfig( + job_name="test-no-instance-type", + image="pytorch:latest", + accelerators=4 + # No instance_type specified + ) + + job = config.to_domain() + container = job.replicaSpecs[0].template.spec.containers[0] + + # Should respect accelerators value even without instance_type + self.assertEqual(container.resources.requests["nvidia.com/gpu"], "4") + # Limits should default to "0" since accelerators_limit not specified + self.assertEqual(container.resources.limits["nvidia.com/gpu"], "0") + + +if __name__ == '__main__': + unittest.main() From 4845a7cb117f5b129ddb7d9e82a09767df2be6ce Mon Sep 17 00:00:00 2001 From: Molly He Date: Thu, 20 Nov 2025 13:22:45 -0800 Subject: [PATCH 2/5] fix gpu allocation validation error --- src/sagemaker/hyperpod/training/quota_allocation_util.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/src/sagemaker/hyperpod/training/quota_allocation_util.py b/src/sagemaker/hyperpod/training/quota_allocation_util.py index b3f1a4c1..6b4f081d 100644 --- a/src/sagemaker/hyperpod/training/quota_allocation_util.py +++ b/src/sagemaker/hyperpod/training/quota_allocation_util.py @@ -12,6 +12,7 @@ # language governing permissions and limitations under the License. import logging import re +import traceback from sagemaker.hyperpod.cli.constants.command_constants import NVIDIA_GPU_RESOURCE_LIMIT_KEY, NEURON_RESOURCE_LIMIT_KEY from sagemaker.hyperpod.cli.utils import ( setup_logger @@ -139,11 +140,12 @@ } def _has_compute_resource_quota_allocation_resources(memory_in_gib: Optional[float], vcpu: Optional[float], accelerators: Optional[int]) -> bool: - return ( + result = ( (memory_in_gib is not None) or (vcpu is not None ) or - (accelerators is not None) + (accelerators is not None and accelerators > 0) # Fix: treat accelerators=0 as not specified ) + return result # Gets resources from compute quotas that user provided; if not all provided, calculates defaults. def _get_resources_from_compute_quotas(instance_type: str, From 3b15a6859dc9dff82846ad3aa474039fe7fbd65a Mon Sep 17 00:00:00 2001 From: Molly He Date: Thu, 20 Nov 2025 13:22:45 -0800 Subject: [PATCH 3/5] remove redundant --- src/sagemaker/hyperpod/training/quota_allocation_util.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/sagemaker/hyperpod/training/quota_allocation_util.py b/src/sagemaker/hyperpod/training/quota_allocation_util.py index 6b4f081d..dd55147a 100644 --- a/src/sagemaker/hyperpod/training/quota_allocation_util.py +++ b/src/sagemaker/hyperpod/training/quota_allocation_util.py @@ -10,9 +10,7 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -import logging import re -import traceback from sagemaker.hyperpod.cli.constants.command_constants import NVIDIA_GPU_RESOURCE_LIMIT_KEY, NEURON_RESOURCE_LIMIT_KEY from sagemaker.hyperpod.cli.utils import ( setup_logger @@ -140,12 +138,11 @@ } def _has_compute_resource_quota_allocation_resources(memory_in_gib: Optional[float], vcpu: Optional[float], accelerators: Optional[int]) -> bool: - result = ( + return ( (memory_in_gib is not None) or (vcpu is not None ) or (accelerators is not None and accelerators > 0) # Fix: treat accelerators=0 as not specified ) - return result # Gets resources from compute quotas that user provided; if not all provided, calculates defaults. 
def _get_resources_from_compute_quotas(instance_type: str, From 6ee057e965f19d44aa698dce650fffbb4c27b2b6 Mon Sep 17 00:00:00 2001 From: Molly He Date: Thu, 20 Nov 2025 14:15:18 -0800 Subject: [PATCH 4/5] fix unit test and expand logic to memory and vcpu field --- src/sagemaker/hyperpod/training/quota_allocation_util.py | 6 +++--- test/unit_tests/cli/test_quota_allocation_util.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/sagemaker/hyperpod/training/quota_allocation_util.py b/src/sagemaker/hyperpod/training/quota_allocation_util.py index dd55147a..d34fff12 100644 --- a/src/sagemaker/hyperpod/training/quota_allocation_util.py +++ b/src/sagemaker/hyperpod/training/quota_allocation_util.py @@ -139,9 +139,9 @@ def _has_compute_resource_quota_allocation_resources(memory_in_gib: Optional[float], vcpu: Optional[float], accelerators: Optional[int]) -> bool: return ( - (memory_in_gib is not None) or - (vcpu is not None ) or - (accelerators is not None and accelerators > 0) # Fix: treat accelerators=0 as not specified + (memory_in_gib is not None and memory_in_gib > 0) or + (vcpu is not None and vcpu > 0) or + (accelerators is not None and accelerators > 0) ) # Gets resources from compute quotas that user provided; if not all provided, calculates defaults. diff --git a/test/unit_tests/cli/test_quota_allocation_util.py b/test/unit_tests/cli/test_quota_allocation_util.py index fb87aaa5..b1c43598 100644 --- a/test/unit_tests/cli/test_quota_allocation_util.py +++ b/test/unit_tests/cli/test_quota_allocation_util.py @@ -52,10 +52,10 @@ class TestQuotaAllocationUtil: (16.0, None, 2, True), (None, 4.0, 2, True), (16.0, 4.0, 2, True), - # Zero values - (0, None, None, True), - (None, 0, None, True), - (None, None, 0, True), + # Zero values - all zeros treated as not specified for consistency + (0, None, None, False), + (None, 0, None, False), + (None, None, 0, False), ] ) def test_has_gpu_quota_allocation_resources(self, memory_in_gib, vcpu, accelerators, expected): From c81683827930b848a34090b87613fdd0155703f6 Mon Sep 17 00:00:00 2001 From: Molly He Date: Fri, 21 Nov 2025 10:53:59 -0800 Subject: [PATCH 5/5] Follow up on merge conflict in release --- .../v1_1/model.py | 51 ++++++++++--------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py index 9011c44e..c2ab6012 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py @@ -373,32 +373,33 @@ def build_dict(**kwargs): return {k: v for k, v in kwargs.items() if v is not None} # Build resources - if self.instance_type is None: - requests_value = limits_value = {"nvidia.com/gpu": "0"} + if self.accelerator_partition_type: + partition_resource_key = f"nvidia.com/{self.accelerator_partition_type}" + requests_value = build_dict( + **{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {}, + vcpu=str(self.vcpu) if self.vcpu else None, + memory=str(self.memory) if self.memory else None, + **{"vpc.amazonaws.com/efa": "1"} if self.instance_type and "p4d" in self.instance_type else {} + ) + limits_value = build_dict( + **{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {}, + vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, + 
memory=str(self.memory_limit) if self.memory_limit else None, + **{"vpc.amazonaws.com/efa": "1"} if self.instance_type and "p4d" in self.instance_type else {} + ) else: - if self.accelerator_partition_type: - partition_resource_key = f"nvidia.com/{self.accelerator_partition_type}" - requests_value = build_dict( - **{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {}, - vcpu=str(self.vcpu) if self.vcpu else None, - memory=str(self.memory) if self.memory else None - ) - limits_value = build_dict( - **{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {}, - vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, - memory=str(self.memory_limit) if self.memory_limit else None - ) - else: - requests_value = build_dict( - accelerators=str(self.accelerators) if self.accelerators else None, - vcpu=str(self.vcpu) if self.vcpu else None, - memory=str(self.memory) if self.memory else None - ) - limits_value = build_dict( - accelerators=str(self.accelerators_limit) if self.accelerators_limit else None, - vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, - memory=str(self.memory_limit) if self.memory_limit else None - ) + requests_value = build_dict( + **{"nvidia.com/gpu": str(self.accelerators)} if self.accelerators else {}, + vcpu=str(self.vcpu) if self.vcpu else None, + memory=str(self.memory) if self.memory else None, + **{"vpc.amazonaws.com/efa": "1"} if self.instance_type and "p4d" in self.instance_type else {} + ) + limits_value = build_dict( + **{"nvidia.com/gpu": str(self.accelerators_limit)} if self.accelerators_limit else {}, + vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, + memory=str(self.memory_limit) if self.memory_limit else None, + **{"vpc.amazonaws.com/efa": "1"} if self.instance_type and "p4d" in self.instance_type else {} + ) # Build container container_kwargs = build_dict(
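
For reference, a minimal usage sketch of the multi-node EFA behavior exercised by the new unit tests in PATCH 1/5 (illustrative only, not part of the patch series; field names and expected values follow that patch's v1_1 model, which PATCH 5/5 later reworks around instance types):

# Illustrative sketch only -- mirrors test_multi_node_with_efa from PATCH 1/5.
# Assumes the hyperpod_pytorch_job_template v1_1 model as modified by that patch.
from hyperpod_pytorch_job_template.v1_1.model import PyTorchJobConfig

config = PyTorchJobConfig(
    job_name="efa-demo",
    image="pytorch:latest",
    node_count=4,      # node_count > 1 triggers the EFA request/limit
    accelerators=8,    # honored even though instance_type is optional
)

job = config.to_domain()
resources = job.replicaSpecs[0].template.spec.containers[0].resources

# Per the patched build logic: GPUs come from `accelerators`, plus one EFA
# device per container for multi-node jobs; limits contain only the EFA entry
# because no *_limit fields were set.
print(resources.requests)  # {'nvidia.com/gpu': '8', 'vpc.amazonaws.com/efa': '1'}
print(resources.limits)    # {'vpc.amazonaws.com/efa': '1'}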