@@ -35,7 +35,7 @@ inputs:
3535 default : 2
3636 required : false
3737 type : string
38- MAIN_CONTAINER_NAME :
38+ MAIN_CONTAINER :
3939 description : ' Name of the main container in an XPK JobSet (fixed)'
4040 default : gpu-image
4141 required : false
@@ -65,7 +65,7 @@ inputs:
6565 required : false
6666 default : ' exit \$EXIT_CODE'
6767 type : string
68- WORKLOAD_NAME_PREFIX :
68+ WORKLOAD_PREFIX :
6969 description : ' Workload name prefix for XPK, also used to name uploaded artifact'
7070 required : false
7171 default : ' xpk'
@@ -137,35 +137,35 @@ runs:
137137 shell : bash -x -e -u {0}
138138 if : steps.cluster-online.outputs.proceed == 'true'
139139 run : |
140- WORKLOAD_NAME ="${{ inputs.WORKLOAD_NAME_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
140+ WORKLOAD ="${{ inputs.WORKLOAD_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
141141 DATE=$(date +'%Y-%m-%d')
142- GCS_ARTIFACT_PATH="gs://${{ inputs.GCS_BUCKET }}/${{ inputs.WORKLOAD_NAME_PREFIX }}/${DATE}/${WORKLOAD_NAME }"
142+ GCS_ARTIFACT_PATH="gs://${{ inputs.GCS_BUCKET }}/${{ inputs.WORKLOAD_PREFIX }}/${DATE}/${WORKLOAD }"
143143
144- echo "WORKLOAD_NAME =${WORKLOAD_NAME }" >> ${GITHUB_ENV}
144+ echo "WORKLOAD =${WORKLOAD }" >> ${GITHUB_ENV}
145145 echo "DATE=${DATE}" >> ${GITHUB_ENV}
146146 echo "GCS_ARTIFACT_PATH=${GCS_ARTIFACT_PATH}" >> ${GITHUB_ENV}
147147
- name: Setup environment
  shell: bash -x -e -u {0}
  if: steps.cluster-online.outputs.proceed == 'true'
  run: |
    # Per-run working directory holding the virtualenv and the xpk checkout.
    xpk_dir=${WORKLOAD}/xpk

    mkdir -p ${WORKLOAD}
    uv venv --verbose --python=${{ inputs.XPK_PYTHON }} --directory=${WORKLOAD}
    source ${WORKLOAD}/.venv/bin/activate

    # install xpk
    git clone --depth=1 --branch=${{ inputs.XPK_VERSION }} https://github.com/AI-Hypercomputer/xpk.git ${xpk_dir}

    # Rewrite the Makefile's editable pip install to a regular install
    # performed through the uv binary found on PATH.
    sed 's@pip install -e \.@'$(which uv)' pip install \.@g' -i ${xpk_dir}/Makefile

    # Install from inside the checkout, then return to the previous directory.
    cd ${xpk_dir} && sudo make install
    cd -
161161
- name: Show environment
  shell: bash -x -e -u {0}
  if: steps.cluster-online.outputs.proceed == 'true'
  run: |
    # Print tool versions for the job log.
    gcloud version

    # python and xpk are reported with the per-run virtualenv activated.
    source ${WORKLOAD}/.venv/bin/activate
    python --version
    xpk version
171171
@@ -174,7 +174,7 @@ runs:
174174 if : steps.cluster-online.outputs.proceed == 'true'
175175 run : |
176176 PATCH_PATH=.github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}
177- ls ${PATCH_PATH}/*.patch | xargs -I {} git apply --unsafe-paths {} --directory ${WORKLOAD_NAME }/xpk
177+ ls ${PATCH_PATH}/*.patch | xargs -I {} git apply --unsafe-paths {} --directory ${WORKLOAD }/xpk
178178
179179 - name : Set workload commands
180180 shell : bash -x -e -u {0}
@@ -207,14 +207,14 @@ runs:
207207 shell : bash -x -e -u {0}
208208 if : steps.cluster-online.outputs.proceed == 'true'
209209 run : |
210- source ${WORKLOAD_NAME }/.venv/bin/activate
211- cd ${WORKLOAD_NAME }/xpk
210+ source ${WORKLOAD }/.venv/bin/activate
211+ cd ${WORKLOAD }/xpk
212212
213213 args=(
214214 --project=${{ inputs.GCP_PROJECT }}
215215 --cluster=${{ inputs.GKE_CLUSTER }}
216216 --zone=${{ inputs.GCP_REGION }}
217- --workload=${WORKLOAD_NAME }
217+ --workload=${WORKLOAD }
218218 --docker-image=${{ inputs.IMAGE }}
219219 --device-type=${{ inputs.CLUSTER_DEVICE }}
220220 --num-nodes=${{ inputs.NUM_NODES }}
@@ -254,52 +254,54 @@ runs:
254254 START=$(date +%s)
255255 JOBSET_ACTIVE=false
256256 while ! ${JOBSET_ACTIVE} || [ -z ${JOBSET_ACTIVE} ]; do
257- JOBSET_ACTIVE=$(kubectl get jobset -o json | jq -r '.items[] | select(.metadata.name == "'${WORKLOAD_NAME }'").status.replicatedJobsStatus[0] | .active == 1')
257+ JOBSET_ACTIVE=$(kubectl get jobset -o json | jq -r '.items[] | select(.metadata.name == "'${WORKLOAD }'").status.replicatedJobsStatus[0] | .active == 1')
258258 NOW=$(date +%s)
259259 ELAPSED=$(( NOW - START ))
260260 if (( ELAPSED > POLL_TIMEOUT )) ; then
261- echo "Timeout after waiting for JobSet ${WORKLOAD_NAME } to become active in cluster ${{ inputs.GKE_CLUSTER }}"
261+ echo "Timeout after waiting for JobSet ${WORKLOAD } to become active in cluster ${{ inputs.GKE_CLUSTER }}"
262262 exit 1
263263 fi
264- echo "Waiting for JobSet ${WORKLOAD_NAME } to become active in cluster ${{ inputs.GKE_CLUSTER }}"
264+ echo "Waiting for JobSet ${WORKLOAD } to become active in cluster ${{ inputs.GKE_CLUSTER }}"
265265 sleep 5
266266 done
267267
268- echo "JobSet ${WORKLOAD_NAME } has just become active in cluster ${{ inputs.GKE_CLUSTER }}"
268+ echo "JobSet ${WORKLOAD } has just become active in cluster ${{ inputs.GKE_CLUSTER }}"
269269
- name: Set JobSet Pods names
  shell: bash -u {0}
  if: steps.cluster-online.outputs.proceed == 'true'
  run: |
    # Names of all pods whose jobset-name label matches this workload,
    # joined onto a single line.
    JOBSET_PODS=$(kubectl get pods -o json | jq -r '.items[].metadata | select(.labels."jobset.sigs.k8s.io/jobset-name" == "'${WORKLOAD}'") | .name' | tr '\n' ' ')

    # GITHUB_ENV values are plain strings, not bash arrays: export a
    # space-separated list. Wrapping the list in "( ... )" would make the
    # parentheses show up as bogus pod names when later steps word-split
    # ${JOBSET_PODS[@]} in their for-loops.
    echo "JOBSET_PODS=${JOBSET_PODS}" >> ${GITHUB_ENV}
275275
- name: Wait for JobSet Pod readiness
  shell: bash -u {0}
  if: steps.cluster-online.outputs.proceed == 'true'
  run: |
    # Poll every pod of the JobSet in turn until its main container reports
    # ready, or until that container is seen terminated with reason "Error".
    for jobset_pod in ${JOBSET_PODS[@]}; do
      POD_READY=false
      # An empty POD_READY means the container status is not published yet,
      # so keep waiting. Operands are quoted so the tests stay well-formed
      # when the value is empty.
      while [ "${POD_READY}" == "false" ] || [ -z "${POD_READY}" ]; do
        echo "Waiting for pod ${jobset_pod} in JobSet ${WORKLOAD} to become ready"
        sleep 10

        POD_READY=$(kubectl get pod ${jobset_pod} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER }}'").ready')

        if [ "${POD_READY}" == "false" ]; then
          POD_ERROR=$(kubectl get pod ${jobset_pod} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER }}'") | .state | ( has("terminated") and (.terminated.reason == "Error" ))')
          # Compare as a string instead of executing the value as a command:
          # an empty result must not be mistaken for an error.
          if [ "${POD_ERROR}" == "true" ]; then
            echo "There was an issue starting the JobSet ${WORKLOAD} on ${{ inputs.GKE_CLUSTER }}"
            # Stop polling this pod; the outer loop moves on to the next one.
            break
          fi
        fi
      done
    done
293298
- name: Stream logs from JobSet Pods
  shell: bash -u {0}
  if: steps.cluster-online.outputs.proceed == 'true'
  run: |
    # Follow the main container's log of every pod in the background,
    # echoing it to the job output and appending it to a per-pod log file.
    # The container name comes from the MAIN_CONTAINER input (default
    # gpu-image) so it matches the container checked for readiness.
    for jobset_pod in ${JOBSET_PODS[@]}; do
      kubectl logs --pod-running-timeout=1m -f --prefix=true --timestamps=true -c ${{ inputs.MAIN_CONTAINER }} ${jobset_pod} 2>&1 | tee -a ${WORKLOAD}/${jobset_pod}.log &
    done
    # Block until all background log streams have ended.
    wait < <(jobs -p)
305307
@@ -309,11 +311,11 @@ runs:
309311 run : |
310312 parse_pod_exit_code() {
311313 local pod=$1
312- MAYBE_JOBSET_EXIT_CODE="$(tail -n 1 ${WORKLOAD_NAME }/${pod}.log | awk '{ print $3 }' )"
314+ MAYBE_JOBSET_EXIT_CODE="$(tail -n 1 ${WORKLOAD }/${pod}.log | awk '{ print $3 }' )"
313315 echo ${MAYBE_JOBSET_EXIT_CODE} | grep -E 'EXIT\_CODE=[0-9]+$' > /dev/null
314316
315317 if [ $? -ne 0 ]; then
316- echo "The JobSet ${WORKLOAD_NAME } on ${{ inputs.GKE_CLUSTER }} did not complete as expected "
318+ echo "The JobSet ${WORKLOAD } on ${{ inputs.GKE_CLUSTER }} did not complete as expected "
317319 echo "JOBSET_EXIT_CODE=1" >> ${GITHUB_ENV}
318320 exit 1
319321 fi
@@ -340,27 +342,27 @@ runs:
340342 shell : bash -x -u {0}
341343 if : steps.cluster-online.outputs.proceed == 'true'
342344 run : |
343- kubectl delete jobset --wait ${WORKLOAD_NAME } || echo "JobSet ${WORKLOAD_NAME } does not exist in ${{ inputs.GKE_CLUSTER }}"
345+ kubectl delete jobset --wait ${WORKLOAD } || echo "JobSet ${WORKLOAD } does not exist in ${{ inputs.GKE_CLUSTER }}"
344346
- name: Download artifacts from GCS to runner
  shell: bash -x -u {0}
  if: steps.cluster-online.outputs.proceed == 'true'
  run: |
    # Gather everything to be uploaded under one output directory.
    output_dir=${WORKLOAD}/output
    mkdir -p ${output_dir}

    # Artifacts written to the bucket path for this run.
    gsutil cp -r ${GCS_ARTIFACT_PATH} ${output_dir}

    # Per-pod logs captured on the runner.
    cp ${WORKLOAD}/*.log ${output_dir}
352354
# Publish the collected output directory as an artifact named after the
# workload prefix.
- name: Upload artifacts to GitHub Actions from runner
  uses: actions/upload-artifact@v4
  with:
    name: "${{ inputs.WORKLOAD_PREFIX }}"
    path: "${{ env.WORKLOAD }}/output/*"
358360
- name: Clean up xpk environment from runner
  shell: bash -x -u {0}
  if: steps.cluster-online.outputs.proceed == 'true'
  run: |
    # Remove the per-run working directory; sudo is used because xpk was
    # installed into it via "sudo make install".
    sudo rm -rf ${WORKLOAD}
364366
365367 - name : Generate sitrep
366368 id : sitrep
@@ -370,8 +372,8 @@ runs:
370372 source .github/workflows/scripts/to_json.sh
371373 badge_label="${{ matrix.test }}"
372374
373- summary="${{ inputs.WORKLOAD_NAME_PREFIX }}"
374- badge_label="${{ inputs.WORKLOAD_NAME_PREFIX }}"
375+ summary="${{ inputs.WORKLOAD_PREFIX }}"
376+ badge_label="${{ inputs.WORKLOAD_PREFIX }}"
375377
376378 if [[ -z "${JOBSET_EXIT_CODE}" ]]; then
377379 badge_color=gray
@@ -397,6 +399,6 @@ runs:
397399 if : ${{ always() }}
398400 uses : actions/upload-artifact@v4
399401 with :
400- name : ${{ inputs.WORKLOAD_NAME_PREFIX }}-sitrep
402+ name : ${{ inputs.WORKLOAD_PREFIX }}-sitrep
401403 path : |
402404 sitrep.json
0 commit comments