@@ -747,16 +747,15 @@ deploymentSpec:
747
747
\ path_to_data = \" /input_data/knowledge/data.jsonl\"\n elif phase_num\
748
748
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
749
749
\ = \" /input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\" \
750
- Unsupported value of {phase_num=}\" )\n\n image = \" registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\" \
751
- \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
752
- \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
753
- \ name: {name}\n spec:\n nprocPerNode: \\\" {nproc_per_node}\\ \
754
- \"\n pytorchReplicaSpecs:\n Master:\n replicas:\
755
- \ 1\n restartPolicy: OnFailure\n template:\n \
756
- \ metadata:\n annotations:\n \
757
- \ sidecar.istio.io/inject: 'false'\n spec:\n \
758
- \ containers:\n - args:\n \
759
- \ - |\n echo \" Running phase {phase_num}\" \
750
+ Unsupported value of {phase_num=}\" )\n\n manifest = inspect.cleandoc(\n \
751
+ \ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n \
752
+ \ metadata:\n name: {name}\n spec:\n nprocPerNode:\
753
+ \ \\\" {nproc_per_node}\\\"\n pytorchReplicaSpecs:\n \
754
+ \ Master:\n replicas: 1\n restartPolicy: OnFailure\n \
755
+ \ template:\n metadata:\n annotations:\n \
756
+ \ sidecar.istio.io/inject: 'false'\n spec:\n \
757
+ \ containers:\n - args:\n \
758
+ \ - |\n echo \" Running phase {phase_num}\" \
760
759
\n echo \" Using {path_to_model} model for training\" \
761
760
\n echo \" Using {path_to_data} data for training\" \
762
761
\n mkdir -p /output/phase_{phase_num}/model;\n \
@@ -781,23 +780,23 @@ deploymentSpec:
781
780
\ --checkpoint_at_epoch\n \
782
781
\ command:\n - /bin/bash\n \
783
782
\ - '-c'\n - '--'\n image:\
784
- \ {image}\n name: pytorch\n volumeMounts:\n \
785
- \ - mountPath: /input_data\n \
786
- \ name: input-data\n readOnly: true\n \
787
- \ - mountPath: /input_model\n \
788
- \ name: model\n readOnly: true\n \
789
- \ - mountPath: /output\n name: output\n \
790
- \ env:\n - name: NNODES\n \
791
- \ value: \\\" {nnodes}\\\"\n \
792
- \ - name: NPROC_PER_NODE\n value: \\\" {nproc_per_node}\\ \
793
- \"\n - name: XDG_CACHE_HOME\n \
794
- \ value: /tmp\n - name: TRITON_CACHE_DIR\n \
783
+ \ {RHELAI_IMAGE}\n name: pytorch\n \
784
+ \ volumeMounts:\n - mountPath: /input_data\n \
785
+ \ name: input-data\n readOnly:\
786
+ \ true\n - mountPath: /input_model\n \
787
+ \ name: model\n readOnly: true\n \
788
+ \ - mountPath: /output\n \
789
+ \ name: output\n env:\n - name:\
790
+ \ NNODES\n value: \\\" {nnodes}\\\"\n \
791
+ \ - name: NPROC_PER_NODE\n value:\
792
+ \ \\\" {nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n \
795
793
\ value: /tmp\n - name:\
796
- \ HF_HOME\n value: /tmp\n \
797
- \ - name: TRANSFORMERS_CACHE\n value: /tmp\n \
798
- \ resources:\n requests:\n \
799
- \ \" nvidia.com/gpu\" : {nproc_per_node}\n \
800
- \ limits:\n \" nvidia.com/gpu\" \
794
+ \ TRITON_CACHE_DIR\n value: /tmp\n \
795
+ \ - name: HF_HOME\n value: /tmp\n \
796
+ \ - name: TRANSFORMERS_CACHE\n \
797
+ \ value: /tmp\n resources:\n \
798
+ \ requests:\n \" nvidia.com/gpu\" : {nproc_per_node}\n \
799
+ \ limits:\n \" nvidia.com/gpu\" \
801
800
: {nproc_per_node}\n volumes:\n - name:\
802
801
\ input-data\n persistentVolumeClaim:\n \
803
802
\ claimName: {input_pvc_name}\n - name: model\n \
@@ -831,8 +830,8 @@ deploymentSpec:
831
830
\ \\\n --checkpoint_at_epoch\n \
832
831
\ command:\n - /bin/bash\n \
833
832
\ - '-c'\n - '--'\n \
834
- \ image: {image }\n name: pytorch\n \
835
- \ volumeMounts:\n - mountPath: /input_data\n \
833
+ \ image: {RHELAI_IMAGE }\n name: pytorch\n \
834
+ \ volumeMounts:\n - mountPath: /input_data\n \
836
835
\ name: input-data\n readOnly:\
837
836
\ true\n - mountPath: /input_model\n \
838
837
\ name: model\n readOnly: true\n \
@@ -951,16 +950,15 @@ deploymentSpec:
951
950
\ path_to_data = \" /input_data/knowledge/data.jsonl\"\n elif phase_num\
952
951
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
953
952
\ = \" /input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\" \
954
- Unsupported value of {phase_num=}\" )\n\n image = \" registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\" \
955
- \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
956
- \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
957
- \ name: {name}\n spec:\n nprocPerNode: \\\" {nproc_per_node}\\ \
958
- \"\n pytorchReplicaSpecs:\n Master:\n replicas:\
959
- \ 1\n restartPolicy: OnFailure\n template:\n \
960
- \ metadata:\n annotations:\n \
961
- \ sidecar.istio.io/inject: 'false'\n spec:\n \
962
- \ containers:\n - args:\n \
963
- \ - |\n echo \" Running phase {phase_num}\" \
953
+ Unsupported value of {phase_num=}\" )\n\n manifest = inspect.cleandoc(\n \
954
+ \ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n \
955
+ \ metadata:\n name: {name}\n spec:\n nprocPerNode:\
956
+ \ \\\" {nproc_per_node}\\\"\n pytorchReplicaSpecs:\n \
957
+ \ Master:\n replicas: 1\n restartPolicy: OnFailure\n \
958
+ \ template:\n metadata:\n annotations:\n \
959
+ \ sidecar.istio.io/inject: 'false'\n spec:\n \
960
+ \ containers:\n - args:\n \
961
+ \ - |\n echo \" Running phase {phase_num}\" \
964
962
\n echo \" Using {path_to_model} model for training\" \
965
963
\n echo \" Using {path_to_data} data for training\" \
966
964
\n mkdir -p /output/phase_{phase_num}/model;\n \
@@ -985,23 +983,23 @@ deploymentSpec:
985
983
\ --checkpoint_at_epoch\n \
986
984
\ command:\n - /bin/bash\n \
987
985
\ - '-c'\n - '--'\n image:\
988
- \ {image}\n name: pytorch\n volumeMounts:\n \
989
- \ - mountPath: /input_data\n \
990
- \ name: input-data\n readOnly: true\n \
991
- \ - mountPath: /input_model\n \
992
- \ name: model\n readOnly: true\n \
993
- \ - mountPath: /output\n name: output\n \
994
- \ env:\n - name: NNODES\n \
995
- \ value: \\\" {nnodes}\\\"\n \
996
- \ - name: NPROC_PER_NODE\n value: \\\" {nproc_per_node}\\ \
997
- \"\n - name: XDG_CACHE_HOME\n \
998
- \ value: /tmp\n - name: TRITON_CACHE_DIR\n \
986
+ \ {RHELAI_IMAGE}\n name: pytorch\n \
987
+ \ volumeMounts:\n - mountPath: /input_data\n \
988
+ \ name: input-data\n readOnly:\
989
+ \ true\n - mountPath: /input_model\n \
990
+ \ name: model\n readOnly: true\n \
991
+ \ - mountPath: /output\n \
992
+ \ name: output\n env:\n - name:\
993
+ \ NNODES\n value: \\\" {nnodes}\\\"\n \
994
+ \ - name: NPROC_PER_NODE\n value:\
995
+ \ \\\" {nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n \
999
996
\ value: /tmp\n - name:\
1000
- \ HF_HOME\n value: /tmp\n \
1001
- \ - name: TRANSFORMERS_CACHE\n value: /tmp\n \
1002
- \ resources:\n requests:\n \
1003
- \ \" nvidia.com/gpu\" : {nproc_per_node}\n \
1004
- \ limits:\n \" nvidia.com/gpu\" \
997
+ \ TRITON_CACHE_DIR\n value: /tmp\n \
998
+ \ - name: HF_HOME\n value: /tmp\n \
999
+ \ - name: TRANSFORMERS_CACHE\n \
1000
+ \ value: /tmp\n resources:\n \
1001
+ \ requests:\n \" nvidia.com/gpu\" : {nproc_per_node}\n \
1002
+ \ limits:\n \" nvidia.com/gpu\" \
1005
1003
: {nproc_per_node}\n volumes:\n - name:\
1006
1004
\ input-data\n persistentVolumeClaim:\n \
1007
1005
\ claimName: {input_pvc_name}\n - name: model\n \
@@ -1035,8 +1033,8 @@ deploymentSpec:
1035
1033
\ \\\n --checkpoint_at_epoch\n \
1036
1034
\ command:\n - /bin/bash\n \
1037
1035
\ - '-c'\n - '--'\n \
1038
- \ image: {image }\n name: pytorch\n \
1039
- \ volumeMounts:\n - mountPath: /input_data\n \
1036
+ \ image: {RHELAI_IMAGE }\n name: pytorch\n \
1037
+ \ volumeMounts:\n - mountPath: /input_data\n \
1040
1038
\ name: input-data\n readOnly:\
1041
1039
\ true\n - mountPath: /input_model\n \
1042
1040
\ name: model\n readOnly: true\n \
0 commit comments