From 60492128dc04f12adeba9efa691a5b1167128134 Mon Sep 17 00:00:00 2001 From: Molly He Date: Wed, 19 Nov 2025 19:55:56 -0800 Subject: [PATCH 1/5] set template-version flag to optional for cluster create, add support for efa for pytorch job, remove default request and limits when instance type is none --- .../v1_1/model.py | 44 +++++--- .../v1_1/template.py | 10 +- .../hyperpod/cli/commands/cluster_stack.py | 19 +++- src/sagemaker/hyperpod/cli/commands/init.py | 5 + .../test_pytorch_job_template_model.py | 104 ++++++++++++++++++ 5 files changed, 166 insertions(+), 16 deletions(-) create mode 100644 test/unit_tests/training/test_pytorch_job_template_model.py diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py index abfe0f53..01cf8075 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py @@ -333,19 +333,37 @@ def build_dict(**kwargs): return {k: v for k, v in kwargs.items() if v is not None} # Build resources - if self.instance_type is None: - requests_value = limits_value = {"nvidia.com/gpu": "0"} - else: - requests_value = build_dict( - accelerators=str(self.accelerators) if self.accelerators else None, - vcpu=str(self.vcpu) if self.vcpu else None, - memory=str(self.memory) if self.memory else None - ) - limits_value = build_dict( - accelerators=str(self.accelerators_limit) if self.accelerators_limit else None, - vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, - memory=str(self.memory_limit) if self.memory_limit else None - ) + requests_value = {} + limits_value = {} + + # Add GPU resources (respect accelerators regardless of instance_type) + if self.accelerators: + requests_value["nvidia.com/gpu"] = str(self.accelerators) + if self.accelerators_limit: + limits_value["nvidia.com/gpu"] = str(self.accelerators_limit) + + # Add CPU resources + if self.vcpu: + requests_value["cpu"] = str(self.vcpu) + if self.vcpu_limit: + limits_value["cpu"] = str(self.vcpu_limit) + + # Add memory resources + if self.memory: + requests_value["memory"] = f"{self.memory}Gi" + if self.memory_limit: + limits_value["memory"] = f"{self.memory_limit}Gi" + + # Add EFA for multi-node jobs + if self.node_count and self.node_count > 1: + requests_value["vpc.amazonaws.com/efa"] = "1" + limits_value["vpc.amazonaws.com/efa"] = "1" + + # Set default GPU to "0" only if no resources specified at all + if not requests_value: + requests_value = {"nvidia.com/gpu": "0"} + if not limits_value: + limits_value = {"nvidia.com/gpu": "0"} # Build container container_kwargs = build_dict( diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py index 4348d6cc..98b55475 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py @@ -84,7 +84,7 @@ {%- endfor %} {%- endif %} resources: -{%- if accelerators or vcpu or memory %} +{%- if accelerators or vcpu or memory or (node_count and node_count > 1) %} requests: {%- if accelerators %} nvidia.com/gpu: {{ accelerators }} @@ -95,11 +95,14 @@ {%- if memory %} memory: {{ memory }}Gi {%- endif %} +{%- if (node_count and node_count > 1) %} + vpc.amazonaws.com/efa: 1 +{%- endif %} {%- else %} requests: nvidia.com/gpu: "0" {%- endif %} -{%- if 
accelerators_limit or vcpu_limit or memory_limit %} +{%- if accelerators_limit or vcpu_limit or memory_limit or (node_count and node_count > 1) %} limits: {%- if accelerators_limit %} nvidia.com/gpu: {{ accelerators_limit }} @@ -110,6 +113,9 @@ {%- if memory_limit %} memory: {{ memory_limit }}Gi {%- endif %} +{%- if (node_count and node_count > 1) %} + vpc.amazonaws.com/efa: 1 +{%- endif %} {%- else %} limits: nvidia.com/gpu: "0" diff --git a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py index 2a278086..f53d544e 100644 --- a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py +++ b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py @@ -30,6 +30,19 @@ logger = logging.getLogger(__name__) +def get_newest_template_version() -> int: + """Get the newest available template version. + + Returns: + int: The newest template version number + + TODO: Implement logic to fetch the actual newest template version + from the template registry or remote source. + """ + # Placeholder implementation - currently returns 1 as the latest version + return 1 + + def parse_status_list(ctx, param, value): """Parse status list from string format like "['CREATE_COMPLETE', 'UPDATE_COMPLETE']" """ if not value: @@ -79,7 +92,6 @@ def create_cluster_stack(config_file, region, template_version, debug): return # Load config to get template and version - config_dir = Path(config_file).parent data, template, version = load_config(config_dir) @@ -95,6 +107,11 @@ def create_cluster_stack(config_file, region, template_version, debug): model_instance = model_class(**filtered_config) config = model_instance.to_config(region=region) + # Use newest template version if not provided + if template_version is None: + template_version = get_newest_template_version() + logger.info(f"No template version specified, using newest version: {template_version}") + # Create the cluster stack stack_id = HpClusterStack(**config).create(region, template_version) diff --git a/src/sagemaker/hyperpod/cli/commands/init.py b/src/sagemaker/hyperpod/cli/commands/init.py index 66ce7068..eb445d47 100644 --- a/src/sagemaker/hyperpod/cli/commands/init.py +++ b/src/sagemaker/hyperpod/cli/commands/init.py @@ -11,6 +11,7 @@ CFN ) from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack +from sagemaker.hyperpod.cli.commands.cluster_stack import get_newest_template_version from sagemaker.hyperpod.cli.init_utils import ( generate_click_command, save_config_yaml, @@ -375,6 +376,10 @@ def _default_create(region, template_version): # Pass region to to_domain for cluster stack template if template == "cluster-stack": config = template_model.to_config(region=region) + # Use newest template version if not provided + if template_version is None: + template_version = get_newest_template_version() + click.secho(f"No template version specified, using newest version: {template_version}", fg="yellow") HpClusterStack(**config).create(region, template_version) else: # Create from k8s.yaml diff --git a/test/unit_tests/training/test_pytorch_job_template_model.py b/test/unit_tests/training/test_pytorch_job_template_model.py new file mode 100644 index 00000000..b7a3490e --- /dev/null +++ b/test/unit_tests/training/test_pytorch_job_template_model.py @@ -0,0 +1,104 @@ +import unittest +from hyperpod_pytorch_job_template.v1_1.model import PyTorchJobConfig + + +class TestPyTorchJobConfigEFA(unittest.TestCase): + """Test EFA resource allocation in PyTorchJobConfig""" + + def 
test_single_node_no_efa(self): + """Test that single-node jobs don't get EFA resources""" + config = PyTorchJobConfig( + job_name="test-single-node", + image="pytorch:latest", + node_count=1, + accelerators=2, + instance_type="ml.p4d.24xlarge" + ) + + job = config.to_domain() + container = job.replicaSpecs[0].template.spec.containers[0] + + # Should not have EFA resources + self.assertNotIn("vpc.amazonaws.com/efa", container.resources.requests) + self.assertNotIn("vpc.amazonaws.com/efa", container.resources.limits) + + # Should have GPU resources + self.assertEqual(container.resources.requests["nvidia.com/gpu"], "2") + + def test_multi_node_with_efa(self): + """Test that multi-node jobs automatically get EFA resources""" + config = PyTorchJobConfig( + job_name="test-multi-node", + image="pytorch:latest", + node_count=4, + accelerators=8, + instance_type="ml.p4d.24xlarge" + ) + + job = config.to_domain() + container = job.replicaSpecs[0].template.spec.containers[0] + + # Should have EFA resources + self.assertEqual(container.resources.requests["vpc.amazonaws.com/efa"], "1") + self.assertEqual(container.resources.limits["vpc.amazonaws.com/efa"], "1") + + # Should also have GPU resources + self.assertEqual(container.resources.requests["nvidia.com/gpu"], "8") + + def test_no_node_count_no_efa(self): + """Test that jobs without node_count don't get EFA resources""" + config = PyTorchJobConfig( + job_name="test-no-node-count", + image="pytorch:latest", + accelerators=1, + instance_type="ml.g5.xlarge" + ) + + job = config.to_domain() + container = job.replicaSpecs[0].template.spec.containers[0] + + # Should not have EFA resources + self.assertNotIn("vpc.amazonaws.com/efa", container.resources.requests) + self.assertNotIn("vpc.amazonaws.com/efa", container.resources.limits) + + def test_multi_node_with_memory_and_cpu(self): + """Test EFA with other resource types""" + config = PyTorchJobConfig( + job_name="test-multi-resources", + image="pytorch:latest", + node_count=2, + accelerators=4, + vcpu=16.0, + memory=64.0, + instance_type="ml.p4d.24xlarge" + ) + + job = config.to_domain() + container = job.replicaSpecs[0].template.spec.containers[0] + + # Should have all resources including EFA + self.assertEqual(container.resources.requests["vpc.amazonaws.com/efa"], "1") + self.assertEqual(container.resources.requests["nvidia.com/gpu"], "4") + self.assertEqual(container.resources.requests["cpu"], "16.0") + self.assertEqual(container.resources.requests["memory"], "64.0Gi") + + def test_accelerators_without_instance_type(self): + """Test that accelerators work without instance_type (fixes the main issue)""" + config = PyTorchJobConfig( + job_name="test-no-instance-type", + image="pytorch:latest", + accelerators=4 + # No instance_type specified + ) + + job = config.to_domain() + container = job.replicaSpecs[0].template.spec.containers[0] + + # Should respect accelerators value even without instance_type + self.assertEqual(container.resources.requests["nvidia.com/gpu"], "4") + # Limits should default to "0" since accelerators_limit not specified + self.assertEqual(container.resources.limits["nvidia.com/gpu"], "0") + + +if __name__ == '__main__': + unittest.main() From 4845a7cb117f5b129ddb7d9e82a09767df2be6ce Mon Sep 17 00:00:00 2001 From: Molly He Date: Thu, 20 Nov 2025 13:22:45 -0800 Subject: [PATCH 2/5] fix gpu allocation validation error --- src/sagemaker/hyperpod/training/quota_allocation_util.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/src/sagemaker/hyperpod/training/quota_allocation_util.py b/src/sagemaker/hyperpod/training/quota_allocation_util.py index b3f1a4c1..6b4f081d 100644 --- a/src/sagemaker/hyperpod/training/quota_allocation_util.py +++ b/src/sagemaker/hyperpod/training/quota_allocation_util.py @@ -12,6 +12,7 @@ # language governing permissions and limitations under the License. import logging import re +import traceback from sagemaker.hyperpod.cli.constants.command_constants import NVIDIA_GPU_RESOURCE_LIMIT_KEY, NEURON_RESOURCE_LIMIT_KEY from sagemaker.hyperpod.cli.utils import ( setup_logger @@ -139,11 +140,12 @@ } def _has_compute_resource_quota_allocation_resources(memory_in_gib: Optional[float], vcpu: Optional[float], accelerators: Optional[int]) -> bool: - return ( + result = ( (memory_in_gib is not None) or (vcpu is not None ) or - (accelerators is not None) + (accelerators is not None and accelerators > 0) # Fix: treat accelerators=0 as not specified ) + return result # Gets resources from compute quotas that user provided; if not all provided, calculates defaults. def _get_resources_from_compute_quotas(instance_type: str, From 3b15a6859dc9dff82846ad3aa474039fe7fbd65a Mon Sep 17 00:00:00 2001 From: Molly He Date: Thu, 20 Nov 2025 13:22:45 -0800 Subject: [PATCH 3/5] remove redundant --- src/sagemaker/hyperpod/training/quota_allocation_util.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/sagemaker/hyperpod/training/quota_allocation_util.py b/src/sagemaker/hyperpod/training/quota_allocation_util.py index 6b4f081d..dd55147a 100644 --- a/src/sagemaker/hyperpod/training/quota_allocation_util.py +++ b/src/sagemaker/hyperpod/training/quota_allocation_util.py @@ -10,9 +10,7 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -import logging import re -import traceback from sagemaker.hyperpod.cli.constants.command_constants import NVIDIA_GPU_RESOURCE_LIMIT_KEY, NEURON_RESOURCE_LIMIT_KEY from sagemaker.hyperpod.cli.utils import ( setup_logger @@ -140,12 +138,11 @@ } def _has_compute_resource_quota_allocation_resources(memory_in_gib: Optional[float], vcpu: Optional[float], accelerators: Optional[int]) -> bool: - result = ( + return ( (memory_in_gib is not None) or (vcpu is not None ) or (accelerators is not None and accelerators > 0) # Fix: treat accelerators=0 as not specified ) - return result # Gets resources from compute quotas that user provided; if not all provided, calculates defaults. 
def _get_resources_from_compute_quotas(instance_type: str, From 6ee057e965f19d44aa698dce650fffbb4c27b2b6 Mon Sep 17 00:00:00 2001 From: Molly He Date: Thu, 20 Nov 2025 14:15:18 -0800 Subject: [PATCH 4/5] fix unit test and expand logic to memory and vcpu field --- src/sagemaker/hyperpod/training/quota_allocation_util.py | 6 +++--- test/unit_tests/cli/test_quota_allocation_util.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/sagemaker/hyperpod/training/quota_allocation_util.py b/src/sagemaker/hyperpod/training/quota_allocation_util.py index dd55147a..d34fff12 100644 --- a/src/sagemaker/hyperpod/training/quota_allocation_util.py +++ b/src/sagemaker/hyperpod/training/quota_allocation_util.py @@ -139,9 +139,9 @@ def _has_compute_resource_quota_allocation_resources(memory_in_gib: Optional[float], vcpu: Optional[float], accelerators: Optional[int]) -> bool: return ( - (memory_in_gib is not None) or - (vcpu is not None ) or - (accelerators is not None and accelerators > 0) # Fix: treat accelerators=0 as not specified + (memory_in_gib is not None and memory_in_gib > 0) or + (vcpu is not None and vcpu > 0) or + (accelerators is not None and accelerators > 0) ) # Gets resources from compute quotas that user provided; if not all provided, calculates defaults. diff --git a/test/unit_tests/cli/test_quota_allocation_util.py b/test/unit_tests/cli/test_quota_allocation_util.py index fb87aaa5..b1c43598 100644 --- a/test/unit_tests/cli/test_quota_allocation_util.py +++ b/test/unit_tests/cli/test_quota_allocation_util.py @@ -52,10 +52,10 @@ class TestQuotaAllocationUtil: (16.0, None, 2, True), (None, 4.0, 2, True), (16.0, 4.0, 2, True), - # Zero values - (0, None, None, True), - (None, 0, None, True), - (None, None, 0, True), + # Zero values - all zeros treated as not specified for consistency + (0, None, None, False), + (None, 0, None, False), + (None, None, 0, False), ] ) def test_has_gpu_quota_allocation_resources(self, memory_in_gib, vcpu, accelerators, expected): From c81683827930b848a34090b87613fdd0155703f6 Mon Sep 17 00:00:00 2001 From: Molly He Date: Fri, 21 Nov 2025 10:53:59 -0800 Subject: [PATCH 5/5] Follow up on merge conflict in release --- .../v1_1/model.py | 51 ++++++++++--------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py index 9011c44e..c2ab6012 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py @@ -373,32 +373,33 @@ def build_dict(**kwargs): return {k: v for k, v in kwargs.items() if v is not None} # Build resources - if self.instance_type is None: - requests_value = limits_value = {"nvidia.com/gpu": "0"} + if self.accelerator_partition_type: + partition_resource_key = f"nvidia.com/{self.accelerator_partition_type}" + requests_value = build_dict( + **{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {}, + vcpu=str(self.vcpu) if self.vcpu else None, + memory=str(self.memory) if self.memory else None, + **{"vpc.amazonaws.com/efa": "1"} if self.instance_type and "p4d" in self.instance_type else {} + ) + limits_value = build_dict( + **{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {}, + vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, + 
memory=str(self.memory_limit) if self.memory_limit else None, + **{"vpc.amazonaws.com/efa": "1"} if self.instance_type and "p4d" in self.instance_type else {} + ) else: - if self.accelerator_partition_type: - partition_resource_key = f"nvidia.com/{self.accelerator_partition_type}" - requests_value = build_dict( - **{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {}, - vcpu=str(self.vcpu) if self.vcpu else None, - memory=str(self.memory) if self.memory else None - ) - limits_value = build_dict( - **{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {}, - vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, - memory=str(self.memory_limit) if self.memory_limit else None - ) - else: - requests_value = build_dict( - accelerators=str(self.accelerators) if self.accelerators else None, - vcpu=str(self.vcpu) if self.vcpu else None, - memory=str(self.memory) if self.memory else None - ) - limits_value = build_dict( - accelerators=str(self.accelerators_limit) if self.accelerators_limit else None, - vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, - memory=str(self.memory_limit) if self.memory_limit else None - ) + requests_value = build_dict( + **{"nvidia.com/gpu": str(self.accelerators)} if self.accelerators else {}, + vcpu=str(self.vcpu) if self.vcpu else None, + memory=str(self.memory) if self.memory else None, + **{"vpc.amazonaws.com/efa": "1"} if self.instance_type and "p4d" in self.instance_type else {} + ) + limits_value = build_dict( + **{"nvidia.com/gpu": str(self.accelerators_limit)} if self.accelerators_limit else {}, + vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, + memory=str(self.memory_limit) if self.memory_limit else None, + **{"vpc.amazonaws.com/efa": "1"} if self.instance_type and "p4d" in self.instance_type else {} + ) # Build container container_kwargs = build_dict(
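
For reference, a minimal usage sketch of the multi-node EFA behavior exercised by the new unit tests in PATCH 1/5 (illustrative only, not part of the patch series; field names and expected values follow that patch's v1_1 model, which PATCH 5/5 later reworks around instance types):

# Illustrative sketch only -- mirrors test_multi_node_with_efa from PATCH 1/5.
# Assumes the hyperpod_pytorch_job_template v1_1 model as modified by that patch.
from hyperpod_pytorch_job_template.v1_1.model import PyTorchJobConfig

config = PyTorchJobConfig(
    job_name="efa-demo",
    image="pytorch:latest",
    node_count=4,      # node_count > 1 triggers the EFA request/limit
    accelerators=8,    # honored even though instance_type is optional
)

job = config.to_domain()
resources = job.replicaSpecs[0].template.spec.containers[0].resources

# Per the patched build logic: GPUs come from `accelerators`, plus one EFA
# device per container for multi-node jobs; limits contain only the EFA entry
# because no *_limit fields were set.
print(resources.requests)  # {'nvidia.com/gpu': '8', 'vpc.amazonaws.com/efa': '1'}
print(resources.limits)    # {'vpc.amazonaws.com/efa': '1'}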