Skip to content

Commit 41ae2b7

Browse files
committed
Remove hardcoded training image reference
- Already defined as a const
1 parent 723fb6c commit 41ae2b7

File tree

2 files changed

+56
-60
lines changed

2 files changed

+56
-60
lines changed

pipeline.yaml

+54-56
Original file line number | Diff line number | Diff line change
@@ -747,16 +747,15 @@ deploymentSpec:
747747
\ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\
748748
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
749749
\ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\
750-
Unsupported value of {phase_num=}\")\n\n image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\"\
751-
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
752-
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
753-
\ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\
754-
\"\n pytorchReplicaSpecs:\n Master:\n replicas:\
755-
\ 1\n restartPolicy: OnFailure\n template:\n \
756-
\ metadata:\n annotations:\n \
757-
\ sidecar.istio.io/inject: 'false'\n spec:\n \
758-
\ containers:\n - args:\n \
759-
\ - |\n echo \"Running phase {phase_num}\"\
750+
Unsupported value of {phase_num=}\")\n\n manifest = inspect.cleandoc(\n\
751+
\ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n\
752+
\ metadata:\n name: {name}\n spec:\n nprocPerNode:\
753+
\ \\\"{nproc_per_node}\\\"\n pytorchReplicaSpecs:\n \
754+
\ Master:\n replicas: 1\n restartPolicy: OnFailure\n\
755+
\ template:\n metadata:\n annotations:\n\
756+
\ sidecar.istio.io/inject: 'false'\n spec:\n\
757+
\ containers:\n - args:\n \
758+
\ - |\n echo \"Running phase {phase_num}\"\
760759
\n echo \"Using {path_to_model} model for training\"\
761760
\n echo \"Using {path_to_data} data for training\"\
762761
\n mkdir -p /output/phase_{phase_num}/model;\n\
@@ -781,23 +780,23 @@ deploymentSpec:
781780
\ --checkpoint_at_epoch\n \
782781
\ command:\n - /bin/bash\n \
783782
\ - '-c'\n - '--'\n image:\
784-
\ {image}\n name: pytorch\n volumeMounts:\n\
785-
\ - mountPath: /input_data\n \
786-
\ name: input-data\n readOnly: true\n \
787-
\ - mountPath: /input_model\n \
788-
\ name: model\n readOnly: true\n \
789-
\ - mountPath: /output\n name: output\n\
790-
\ env:\n - name: NNODES\n \
791-
\ value: \\\"{nnodes}\\\"\n \
792-
\ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\
793-
\"\n - name: XDG_CACHE_HOME\n \
794-
\ value: /tmp\n - name: TRITON_CACHE_DIR\n\
783+
\ {RHELAI_IMAGE}\n name: pytorch\n \
784+
\ volumeMounts:\n - mountPath: /input_data\n\
785+
\ name: input-data\n readOnly:\
786+
\ true\n - mountPath: /input_model\n \
787+
\ name: model\n readOnly: true\n \
788+
\ - mountPath: /output\n \
789+
\ name: output\n env:\n - name:\
790+
\ NNODES\n value: \\\"{nnodes}\\\"\n \
791+
\ - name: NPROC_PER_NODE\n value:\
792+
\ \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\
795793
\ value: /tmp\n - name:\
796-
\ HF_HOME\n value: /tmp\n \
797-
\ - name: TRANSFORMERS_CACHE\n value: /tmp\n\
798-
\ resources:\n requests:\n \
799-
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
800-
\ limits:\n \"nvidia.com/gpu\"\
794+
\ TRITON_CACHE_DIR\n value: /tmp\n \
795+
\ - name: HF_HOME\n value: /tmp\n \
796+
\ - name: TRANSFORMERS_CACHE\n \
797+
\ value: /tmp\n resources:\n \
798+
\ requests:\n \"nvidia.com/gpu\": {nproc_per_node}\n\
799+
\ limits:\n \"nvidia.com/gpu\"\
801800
: {nproc_per_node}\n volumes:\n - name:\
802801
\ input-data\n persistentVolumeClaim:\n \
803802
\ claimName: {input_pvc_name}\n - name: model\n\
@@ -831,8 +830,8 @@ deploymentSpec:
831830
\ \\\n --checkpoint_at_epoch\n \
832831
\ command:\n - /bin/bash\n \
833832
\ - '-c'\n - '--'\n \
834-
\ image: {image}\n name: pytorch\n \
835-
\ volumeMounts:\n - mountPath: /input_data\n\
833+
\ image: {RHELAI_IMAGE}\n name: pytorch\n \
834+
\ volumeMounts:\n - mountPath: /input_data\n\
836835
\ name: input-data\n readOnly:\
837836
\ true\n - mountPath: /input_model\n \
838837
\ name: model\n readOnly: true\n \
@@ -951,16 +950,15 @@ deploymentSpec:
951950
\ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\
952951
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
953952
\ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\
954-
Unsupported value of {phase_num=}\")\n\n image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\"\
955-
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
956-
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
957-
\ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\
958-
\"\n pytorchReplicaSpecs:\n Master:\n replicas:\
959-
\ 1\n restartPolicy: OnFailure\n template:\n \
960-
\ metadata:\n annotations:\n \
961-
\ sidecar.istio.io/inject: 'false'\n spec:\n \
962-
\ containers:\n - args:\n \
963-
\ - |\n echo \"Running phase {phase_num}\"\
953+
Unsupported value of {phase_num=}\")\n\n manifest = inspect.cleandoc(\n\
954+
\ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n\
955+
\ metadata:\n name: {name}\n spec:\n nprocPerNode:\
956+
\ \\\"{nproc_per_node}\\\"\n pytorchReplicaSpecs:\n \
957+
\ Master:\n replicas: 1\n restartPolicy: OnFailure\n\
958+
\ template:\n metadata:\n annotations:\n\
959+
\ sidecar.istio.io/inject: 'false'\n spec:\n\
960+
\ containers:\n - args:\n \
961+
\ - |\n echo \"Running phase {phase_num}\"\
964962
\n echo \"Using {path_to_model} model for training\"\
965963
\n echo \"Using {path_to_data} data for training\"\
966964
\n mkdir -p /output/phase_{phase_num}/model;\n\
@@ -985,23 +983,23 @@ deploymentSpec:
985983
\ --checkpoint_at_epoch\n \
986984
\ command:\n - /bin/bash\n \
987985
\ - '-c'\n - '--'\n image:\
988-
\ {image}\n name: pytorch\n volumeMounts:\n\
989-
\ - mountPath: /input_data\n \
990-
\ name: input-data\n readOnly: true\n \
991-
\ - mountPath: /input_model\n \
992-
\ name: model\n readOnly: true\n \
993-
\ - mountPath: /output\n name: output\n\
994-
\ env:\n - name: NNODES\n \
995-
\ value: \\\"{nnodes}\\\"\n \
996-
\ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\
997-
\"\n - name: XDG_CACHE_HOME\n \
998-
\ value: /tmp\n - name: TRITON_CACHE_DIR\n\
986+
\ {RHELAI_IMAGE}\n name: pytorch\n \
987+
\ volumeMounts:\n - mountPath: /input_data\n\
988+
\ name: input-data\n readOnly:\
989+
\ true\n - mountPath: /input_model\n \
990+
\ name: model\n readOnly: true\n \
991+
\ - mountPath: /output\n \
992+
\ name: output\n env:\n - name:\
993+
\ NNODES\n value: \\\"{nnodes}\\\"\n \
994+
\ - name: NPROC_PER_NODE\n value:\
995+
\ \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\
999996
\ value: /tmp\n - name:\
1000-
\ HF_HOME\n value: /tmp\n \
1001-
\ - name: TRANSFORMERS_CACHE\n value: /tmp\n\
1002-
\ resources:\n requests:\n \
1003-
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
1004-
\ limits:\n \"nvidia.com/gpu\"\
997+
\ TRITON_CACHE_DIR\n value: /tmp\n \
998+
\ - name: HF_HOME\n value: /tmp\n \
999+
\ - name: TRANSFORMERS_CACHE\n \
1000+
\ value: /tmp\n resources:\n \
1001+
\ requests:\n \"nvidia.com/gpu\": {nproc_per_node}\n\
1002+
\ limits:\n \"nvidia.com/gpu\"\
10051003
: {nproc_per_node}\n volumes:\n - name:\
10061004
\ input-data\n persistentVolumeClaim:\n \
10071005
\ claimName: {input_pvc_name}\n - name: model\n\
@@ -1035,8 +1033,8 @@ deploymentSpec:
10351033
\ \\\n --checkpoint_at_epoch\n \
10361034
\ command:\n - /bin/bash\n \
10371035
\ - '-c'\n - '--'\n \
1038-
\ image: {image}\n name: pytorch\n \
1039-
\ volumeMounts:\n - mountPath: /input_data\n\
1036+
\ image: {RHELAI_IMAGE}\n name: pytorch\n \
1037+
\ volumeMounts:\n - mountPath: /input_data\n\
10401038
\ name: input-data\n readOnly:\
10411039
\ true\n - mountPath: /input_model\n \
10421040
\ name: model\n readOnly: true\n \

training/components.py

+2-4
Original file line number | Diff line number | Diff line change
@@ -167,8 +167,6 @@ def list_phase1_final_model():
167167
else:
168168
raise RuntimeError(f"Unsupported value of {phase_num=}")
169169

170-
image = "registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1"
171-
172170
manifest = inspect.cleandoc(
173171
f"""
174172
apiVersion: kubeflow.org/v1
@@ -218,7 +216,7 @@ def list_phase1_final_model():
218216
- /bin/bash
219217
- '-c'
220218
- '--'
221-
image: {image}
219+
image: {RHELAI_IMAGE}
222220
name: pytorch
223221
volumeMounts:
224222
- mountPath: /input_data
@@ -296,7 +294,7 @@ def list_phase1_final_model():
296294
- /bin/bash
297295
- '-c'
298296
- '--'
299-
image: {image}
297+
image: {RHELAI_IMAGE}
300298
name: pytorch
301299
volumeMounts:
302300
- mountPath: /input_data

0 commit comments

Comments
 (0)