Skip to content

Commit fca94d5

Browse files
committed
Set to check status of all jobset pods
1 parent 630807b commit fca94d5

File tree

3 files changed

+58
-56
lines changed

3 files changed

+58
-56
lines changed

.github/actions/gke-xpk/action.yml

Lines changed: 50 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ inputs:
3535
default: 2
3636
required: false
3737
type: string
38-
MAIN_CONTAINER_NAME:
38+
MAIN_CONTAINER:
3939
description: 'Name of the main container in an XPK JobSet (fixed)'
4040
default: gpu-image
4141
required: false
@@ -65,7 +65,7 @@ inputs:
6565
required: false
6666
default: 'exit \$EXIT_CODE'
6767
type: string
68-
WORKLOAD_NAME_PREFIX:
68+
WORKLOAD_PREFIX:
6969
description: 'Workload name prefix for XPK, also used to name uploaded artifact'
7070
required: false
7171
default: 'xpk'
@@ -137,35 +137,35 @@ runs:
137137
shell: bash -x -e -u {0}
138138
if: steps.cluster-online.outputs.proceed == 'true'
139139
run: |
140-
WORKLOAD_NAME="${{ inputs.WORKLOAD_NAME_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
140+
WORKLOAD="${{ inputs.WORKLOAD_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
141141
DATE=$(date +'%Y-%m-%d')
142-
GCS_ARTIFACT_PATH="gs://${{ inputs.GCS_BUCKET }}/${{ inputs.WORKLOAD_NAME_PREFIX }}/${DATE}/${WORKLOAD_NAME}"
142+
GCS_ARTIFACT_PATH="gs://${{ inputs.GCS_BUCKET }}/${{ inputs.WORKLOAD_PREFIX }}/${DATE}/${WORKLOAD}"
143143
144-
echo "WORKLOAD_NAME=${WORKLOAD_NAME}" >> ${GITHUB_ENV}
144+
echo "WORKLOAD=${WORKLOAD}" >> ${GITHUB_ENV}
145145
echo "DATE=${DATE}" >> ${GITHUB_ENV}
146146
echo "GCS_ARTIFACT_PATH=${GCS_ARTIFACT_PATH}" >> ${GITHUB_ENV}
147147
148148
- name: Setup environment
149149
shell: bash -x -e -u {0}
150150
if: steps.cluster-online.outputs.proceed == 'true'
151151
run: |
152-
mkdir -p ${WORKLOAD_NAME}
153-
uv venv --verbose --python=${{ inputs.XPK_PYTHON }} --directory=${WORKLOAD_NAME}
154-
source ${WORKLOAD_NAME}/.venv/bin/activate
152+
mkdir -p ${WORKLOAD}
153+
uv venv --verbose --python=${{ inputs.XPK_PYTHON }} --directory=${WORKLOAD}
154+
source ${WORKLOAD}/.venv/bin/activate
155155
156156
# install xpk
157-
git clone --depth=1 --branch=${{ inputs.XPK_VERSION }} https://github.com/AI-Hypercomputer/xpk.git ${WORKLOAD_NAME}/xpk
157+
git clone --depth=1 --branch=${{ inputs.XPK_VERSION }} https://github.com/AI-Hypercomputer/xpk.git ${WORKLOAD}/xpk
158158
159-
sed 's@pip install -e \.@'$(which uv)' pip install \.@g' -i ${WORKLOAD_NAME}/xpk/Makefile
160-
cd ${WORKLOAD_NAME}/xpk && sudo make install; cd -
159+
sed 's@pip install -e \.@'$(which uv)' pip install \.@g' -i ${WORKLOAD}/xpk/Makefile
160+
cd ${WORKLOAD}/xpk && sudo make install; cd -
161161
162162
- name: Show environment
163163
shell: bash -x -e -u {0}
164164
if: steps.cluster-online.outputs.proceed == 'true'
165165
run: |
166166
gcloud version
167167
168-
source ${WORKLOAD_NAME}/.venv/bin/activate
168+
source ${WORKLOAD}/.venv/bin/activate
169169
python --version
170170
xpk version
171171

@@ -174,7 +174,7 @@ runs:
174174
if: steps.cluster-online.outputs.proceed == 'true'
175175
run: |
176176
PATCH_PATH=.github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}
177-
ls ${PATCH_PATH}/*.patch | xargs -I {} git apply --unsafe-paths {} --directory ${WORKLOAD_NAME}/xpk
177+
ls ${PATCH_PATH}/*.patch | xargs -I {} git apply --unsafe-paths {} --directory ${WORKLOAD}/xpk
178178
179179
- name: Set workload commands
180180
shell: bash -x -e -u {0}
@@ -207,14 +207,14 @@ runs:
207207
shell: bash -x -e -u {0}
208208
if: steps.cluster-online.outputs.proceed == 'true'
209209
run: |
210-
source ${WORKLOAD_NAME}/.venv/bin/activate
211-
cd ${WORKLOAD_NAME}/xpk
210+
source ${WORKLOAD}/.venv/bin/activate
211+
cd ${WORKLOAD}/xpk
212212
213213
args=(
214214
--project=${{ inputs.GCP_PROJECT }}
215215
--cluster=${{ inputs.GKE_CLUSTER }}
216216
--zone=${{ inputs.GCP_REGION }}
217-
--workload=${WORKLOAD_NAME}
217+
--workload=${WORKLOAD}
218218
--docker-image=${{ inputs.IMAGE }}
219219
--device-type=${{ inputs.CLUSTER_DEVICE }}
220220
--num-nodes=${{ inputs.NUM_NODES }}
@@ -254,52 +254,54 @@ runs:
254254
START=$(date +%s)
255255
JOBSET_ACTIVE=false
256256
while ! ${JOBSET_ACTIVE} || [ -z ${JOBSET_ACTIVE} ]; do
257-
JOBSET_ACTIVE=$(kubectl get jobset -o json | jq -r '.items[] | select(.metadata.name == "'${WORKLOAD_NAME}'").status.replicatedJobsStatus[0] | .active == 1')
257+
JOBSET_ACTIVE=$(kubectl get jobset -o json | jq -r '.items[] | select(.metadata.name == "'${WORKLOAD}'").status.replicatedJobsStatus[0] | .active == 1')
258258
NOW=$(date +%s)
259259
ELAPSED=$(( NOW - START ))
260260
if (( ELAPSED > POLL_TIMEOUT )) ; then
261-
echo "Timeout after waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
261+
echo "Timeout after waiting for JobSet ${WORKLOAD} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
262262
exit 1
263263
fi
264-
echo "Waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
264+
echo "Waiting for JobSet ${WORKLOAD} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
265265
sleep 5
266266
done
267267
268-
echo "JobSet ${WORKLOAD_NAME} has just become active in cluster ${{ inputs.GKE_CLUSTER }}"
268+
echo "JobSet ${WORKLOAD} has just become active in cluster ${{ inputs.GKE_CLUSTER }}"
269269

270-
- name: Set JobSet Pod name
270+
- name: Set JobSet Pods names
271271
shell: bash -u {0}
272272
if: steps.cluster-online.outputs.proceed == 'true'
273273
run: |
274-
echo "POD=$(kubectl get pods -o json | jq -r '.items[] | select(.metadata.labels."'jobset.sigs.k8s.io/jobset-name'" == "'${WORKLOAD_NAME}'") | .metadata.name ' | sort | head -n1 )" >> ${GITHUB_ENV}
274+
# GITHUB_ENV stores plain strings, not bash arrays: export the pod names as a
# space-separated list WITHOUT array parentheses. Later steps iterate it via an
# unquoted ${JOBSET_PODS[@]}, which word-splits the string into one pod per item;
# literal "(" / ")" in the value would otherwise end up glued to pod names.
echo "JOBSET_PODS=$(kubectl get pods -o json | jq -r '.items[].metadata | select(.labels."jobset.sigs.k8s.io/jobset-name" == "'${WORKLOAD}'") | .name' | tr '\n' ' ')" >> ${GITHUB_ENV}
275275
276276
- name: Wait for JobSet Pod readiness
277277
shell: bash -u {0}
278278
if: steps.cluster-online.outputs.proceed == 'true'
279279
run: |
280-
POD_READY=false
281-
while ! ${POD_READY} || [ -z ${POD_READY} ]; do
282-
echo "Waiting for pod ${POD} in JobSet ${WORKLOAD_NAME} to become ready"
283-
sleep 10
280+
for jobset_pod in ${JOBSET_PODS[@]}; do
281+
POD_READY=false
282+
while [ ${POD_READY} == "false" ] || [ -z ${POD_READY} ]; do
283+
echo "Waiting for pod ${jobset_pod} in JobSet ${WORKLOAD} to become ready"
284+
sleep 10
284285
285-
POD_ERROR=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'") | .state | ( has("terminated") and (.terminated.reason == "Error" ))')
286-
if ${POD_ERROR} ; then
287-
echo "There was an issue starting the JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }}"
288-
break
289-
fi
286+
POD_READY=$(kubectl get pod ${jobset_pod} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER }}'").ready')
287+
288+
if [ ${POD_READY} == "false" ]; then
289+
POD_ERROR=$(kubectl get pod ${jobset_pod} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER }}'") | .state | ( has("terminated") and (.terminated.reason == "Error" ))')
290+
if ${POD_ERROR} ; then
291+
echo "There was an issue starting the JobSet ${WORKLOAD} on ${{ inputs.GKE_CLUSTER }}"
292+
break
293+
fi
294+
fi
290295

291-
POD_READY=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'").ready')
296+
done;
292297
done;
293298

294299
- name: Stream logs from JobSet Pods
295300
shell: bash -u {0}
296301
if: steps.cluster-online.outputs.proceed == 'true'
297302
run: |
298-
JOBSET_PODS=($(kubectl get pods -o json | jq -r '.items[].metadata | select(.labels."jobset.sigs.k8s.io/jobset-name" == "'${WORKLOAD_NAME}'") | .name' | tr '\n' ' '))
299-
echo "JOBSET_PODS=${JOBSET_PODS[@]}" >> ${GITHUB_ENV}
300-
301303
for jobset_pod in ${JOBSET_PODS[@]}; do
302-
kubectl logs --pod-running-timeout=1m -f --prefix=true --timestamps=true -c gpu-image ${jobset_pod} 2>&1 | tee -a ${WORKLOAD_NAME}/${jobset_pod}.log &
304+
kubectl logs --pod-running-timeout=1m -f --prefix=true --timestamps=true -c gpu-image ${jobset_pod} 2>&1 | tee -a ${WORKLOAD}/${jobset_pod}.log &
303305
done
304306
wait < <(jobs -p)
305307
@@ -309,11 +311,11 @@ runs:
309311
run: |
310312
parse_pod_exit_code() {
311313
local pod=$1
312-
MAYBE_JOBSET_EXIT_CODE="$(tail -n 1 ${WORKLOAD_NAME}/${pod}.log | awk '{ print $3 }' )"
314+
MAYBE_JOBSET_EXIT_CODE="$(tail -n 1 ${WORKLOAD}/${pod}.log | awk '{ print $3 }' )"
313315
echo ${MAYBE_JOBSET_EXIT_CODE} | grep -E 'EXIT\_CODE=[0-9]+$' > /dev/null
314316
315317
if [ $? -ne 0 ]; then
316-
echo "The JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }} did not complete as expected "
318+
echo "The JobSet ${WORKLOAD} on ${{ inputs.GKE_CLUSTER }} did not complete as expected "
317319
echo "JOBSET_EXIT_CODE=1" >> ${GITHUB_ENV}
318320
exit 1
319321
fi
@@ -340,27 +342,27 @@ runs:
340342
shell: bash -x -u {0}
341343
if: steps.cluster-online.outputs.proceed == 'true'
342344
run: |
343-
kubectl delete jobset --wait ${WORKLOAD_NAME} || echo "JobSet ${WORKLOAD_NAME} does not exist in ${{ inputs.GKE_CLUSTER }}"
345+
kubectl delete jobset --wait ${WORKLOAD} || echo "JobSet ${WORKLOAD} does not exist in ${{ inputs.GKE_CLUSTER }}"
344346
345347
- name: Download artifacts from GCS to runner
346348
shell: bash -x -u {0}
347349
if: steps.cluster-online.outputs.proceed == 'true'
348350
run: |
349-
mkdir -p ${WORKLOAD_NAME}/output
350-
gsutil cp -r ${GCS_ARTIFACT_PATH} ${WORKLOAD_NAME}/output
351-
cp ${WORKLOAD_NAME}/*.log ${WORKLOAD_NAME}/output
351+
mkdir -p ${WORKLOAD}/output
352+
gsutil cp -r ${GCS_ARTIFACT_PATH} ${WORKLOAD}/output
353+
cp ${WORKLOAD}/*.log ${WORKLOAD}/output
352354
353355
- name: Upload artifacts to GitHub Actions from runner
354356
uses: actions/upload-artifact@v4
355357
with:
356-
name: ${{ inputs.WORKLOAD_NAME_PREFIX }}
357-
path: ${{ env.WORKLOAD_NAME }}/output/*
358+
name: ${{ inputs.WORKLOAD_PREFIX }}
359+
path: ${{ env.WORKLOAD }}/output/*
358360

359361
- name: Clean up xpk environment from runner
360362
shell: bash -x -u {0}
361363
if: steps.cluster-online.outputs.proceed == 'true'
362364
run: |
363-
sudo rm -rf ${WORKLOAD_NAME}
365+
sudo rm -rf ${WORKLOAD}
364366
365367
- name: Generate sitrep
366368
id: sitrep
@@ -370,8 +372,8 @@ runs:
370372
source .github/workflows/scripts/to_json.sh
371373
badge_label="${{ matrix.test }}"
372374
373-
summary="${{ inputs.WORKLOAD_NAME_PREFIX }}"
374-
badge_label="${{ inputs.WORKLOAD_NAME_PREFIX }}"
375+
summary="${{ inputs.WORKLOAD_PREFIX }}"
376+
badge_label="${{ inputs.WORKLOAD_PREFIX }}"
375377
376378
if [[ -z "${JOBSET_EXIT_CODE}" ]]; then
377379
badge_color=gray
@@ -397,6 +399,6 @@ runs:
397399
if: ${{ always() }}
398400
uses: actions/upload-artifact@v4
399401
with:
400-
name: ${{ inputs.WORKLOAD_NAME_PREFIX }}-sitrep
402+
name: ${{ inputs.WORKLOAD_PREFIX }}-sitrep
401403
path: |
402404
sitrep.json

.github/workflows/_test_maxtext_gke_xpk.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
runs-on: gke-a3mega
1515

1616
env:
17-
WORKLOAD_NAME_PREFIX: gke-maxtext-train
17+
WORKLOAD_PREFIX: gke-maxtext-train
1818
MAXTEXT_MODEL: llama2-7b
1919
MAXTEXT_ATTENTION_TYPE: cudnn_flash_te
2020
MAXTEXT_REMAT_POLICY: minimal_flash
@@ -30,7 +30,7 @@ jobs:
3030
uses: ./.github/actions/gke-xpk
3131
with:
3232
IMAGE: ${{ env.MAXTEXT_IMAGE }}
33-
WORKLOAD_NAME_PREFIX: ${{ env.WORKLOAD_NAME_PREFIX }}
33+
WORKLOAD_PREFIX: ${{ env.WORKLOAD_PREFIX }}
3434
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
3535
NVCR_TOKEN: ${{ secrets.NVCR_TOKEN }}
3636
COMMAND: |

.github/workflows/_test_nccl_gke.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ jobs:
5252
env:
5353
BASE_IMAGE: ${{ needs.build-nccl-gke.outputs.DOCKER_TAG_FINAL }}
5454
TEST_NAME: ${{ matrix.test }}
55-
WORKLOAD_NAME_PREFIX: nccl-gke
55+
WORKLOAD_PREFIX: nccl-gke
5656
NHOSTS: 2
5757
NCCL_MINBYTES: 8
5858
NCCL_MAXBYTES: 16G
@@ -66,24 +66,24 @@ jobs:
6666
id: workload-name
6767
run: |
6868
TEST_NAME=$(echo "${{ matrix.test }}" | sed 's/_perf_mpi//g' | sed 's/_/-/g')
69-
WORKLOAD_PREFIX="${{ env.WORKLOAD_NAME_PREFIX }}-${TEST_NAME}"
69+
WORKLOAD_PREFIX="${WORKLOAD_PREFIX}-${TEST_NAME}"
7070
7171
echo "WORKLOAD_PREFIX=${WORKLOAD_PREFIX}" >> ${GITHUB_OUTPUT}
7272
7373
- name: Create NCCL test Services on cluster
7474
run: |
75-
SERVICE_MANIFEST=".github/gke-workflow/gke/nccl-svc-${WORKLOAD_NAME}-${{ matrix.test }}.yaml"
76-
WORKLOAD_NAME="${{ steps.workload-name.outputs.WORKLOAD_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
75+
# WORKLOAD must be assigned before it is interpolated into the manifest path;
# otherwise ${WORKLOAD} expands to an empty string in SERVICE_MANIFEST.
WORKLOAD="${{ steps.workload-name.outputs.WORKLOAD_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
76+
SERVICE_MANIFEST=".github/gke-workflow/gke/nccl-svc-${WORKLOAD}-${{ matrix.test }}.yaml"
7777
echo "SERVICE_MANIFEST=${SERVICE_MANIFEST}" >> ${GITHUB_ENV}
7878
79-
cat .github/gke-workflow/gke/nccl-svc.yml | yq '.spec.selector."jobset.sigs.k8s.io/jobset-name" = "'${WORKLOAD_NAME}'"' --yaml-output | tee ${SERVICE_MANIFEST}
79+
cat .github/gke-workflow/gke/nccl-svc.yml | yq '.spec.selector."jobset.sigs.k8s.io/jobset-name" = "'${WORKLOAD}'"' --yaml-output | tee ${SERVICE_MANIFEST}
8080
kubectl apply -f ${SERVICE_MANIFEST}
8181
8282
- name: Run XPK workload on cluster
8383
uses: ./.github/actions/gke-xpk
8484
with:
8585
IMAGE: ${{ env.BASE_IMAGE }}
86-
WORKLOAD_NAME_PREFIX: ${{ steps.workload-name.outputs.WORKLOAD_PREFIX }}
86+
WORKLOAD_PREFIX: ${{ steps.workload-name.outputs.WORKLOAD_PREFIX }}
8787
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
8888
NVCR_TOKEN: ${{ secrets.NVCR_TOKEN }}
8989
COMMAND: |

0 commit comments

Comments
 (0)