Skip to content

Commit

Permalink
ADLR/megatron-lm!1908 - ci: Refactor gitlab-ci
Browse files Browse the repository at this point in the history
  • Loading branch information
ko3n1g committed Aug 12, 2024
1 parent 153f762 commit 15b7cfb
Show file tree
Hide file tree
Showing 11 changed files with 352 additions and 440 deletions.
465 changes: 38 additions & 427 deletions .gitlab-ci.yml

Large diffs are not rendered by default.

58 changes: 58 additions & 0 deletions .gitlab/stages/00.pre.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
include:
- template: Security/Secret-Detection.gitlab-ci.yml

# Pushes the current branch to the GitHub copy of the repository.
mirror_to_github:
  rules:
    # Runs only for refs that GitLab reports as protected; skipped otherwise.
    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
    - when: never
  tags: [mcore-docker-node-small]
  stage: .pre
  image: python:3.10
  variables:
    # Start from a fresh clone rather than a reused working copy.
    GIT_STRATEGY: "clone"
  script:
    - git checkout "$CI_COMMIT_BRANCH"
    # NOTE(review): the credential part of this URL was garbled in the reviewed
    # copy. GH_TOKEN is the assumed name of the CI/CD variable holding the
    # GitHub token - confirm against the project's CI/CD settings.
    # `|| true` keeps the job going when the remote already exists.
    - git remote add github "https://ko3n1g:${GH_TOKEN}@github.com/NVIDIA/Megatron-LM.git" || true
    - git push -u github "$CI_COMMIT_BRANCH"

# Runs gitlab-mr-labeler on merge request pipelines. The labels present on the
# MR before the labeler runs are saved to a file and sent back afterwards via
# `add_labels`.
label_merge_request:
  stage: .pre
  image: golang:1.22
  tags: [mcore-docker-node-small]
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
    - when: never
  before_script:
    # Build the labeler from its `nv` branch and install gojq for JSON parsing.
    - git clone -b nv https://${GITLAB_ENDPOINT}/okoenig/gitlab-mr-labeler.git
    - cd gitlab-mr-labeler
    - go install .
    - cd ..
    - go install github.com/itchyny/gojq/cmd/gojq@latest
    # Snapshot the MR's current labels as a sourceable `LABELS=...` file.
    - |
      echo LABELS=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" | gojq '.labels | join(",")') > labels
  script:
    - gitlab-mr-labeler -f .gitlab/labeler-config.yml -t ${PROJECT_ACCESS_TOKEN_MCORE} --debug true
  after_script:
    # Send the labels recorded in before_script back to the MR.
    - |
      source labels
      curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data-urlencode "add_labels=$LABELS" -X PUT
# Fails merge request pipelines whose MR has no milestone assigned.
check_milestone:
  stage: .pre
  image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci:buildcache
  tags: [mcore-docker-node-small]
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
    - when: never
  script:
    - env
    # Read the MR's `milestone` field from the GitLab API; jq prints the
    # literal `null` when no milestone is set.
    - |
      MILESTONE=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" | jq '.milestone')
      if [[ "$MILESTONE" == "null" ]]; then
        echo Please assign a Milestone to this MR!
        exit 1
      fi
150 changes: 150 additions & 0 deletions .gitlab/stages/01.tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# Shared template for jobs in the `test` stage.
# A job runs when FUNCTIONAL_TEST is "no" or "yes", or CONVERGENCE_TEST is
# "yes". It may fail without failing the pipeline when the MR target branch is
# not protected.
.tests_common:
  stage: test
  rules:
    - if: >-
        ($FUNCTIONAL_TEST == "no" || $FUNCTIONAL_TEST == "yes" || $CONVERGENCE_TEST == "yes")
        && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
      allow_failure: true
    - if: >-
        $FUNCTIONAL_TEST == "no" || $FUNCTIONAL_TEST == "yes" || $CONVERGENCE_TEST == "yes"
    - when: never

include:
- template: Security/Secret-Detection.gitlab-ci.yml

# Builds and pushes the CI container images, one parallel job per matrix entry.
build_image:
  image: docker:26.1.4-dind
  tags:
    - 8xL40S-builder
  timeout: 45m
  retry:
    max: 2
  variables:
    # Dockerfile stage passed to `docker build --target`.
    STAGE: main
  parallel:
    matrix:
      # IMAGE is the *name* of the variable holding the image repository; the
      # script dereferences it with `eval`. FILE is the Dockerfile and
      # BASE_IMAGE is forwarded as the FROM_IMAGE_NAME build argument.
      - IMAGE: CI_MCORE_IMAGE
        FILE: Dockerfile.ci
        BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
      - IMAGE: CI_NEMO_IMAGE
        FILE: Dockerfile.ci
        BASE_IMAGE: nvcr.io/nvidian/nemo:nightly
      - IMAGE: LINTING_IMAGE
        FILE: Dockerfile.linting
        BASE_IMAGE: python:3.10
  before_script:
    # Log in to both the NGC registry and the project registry.
    - echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
    - echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin
  # Script outline:
  #   1. resolve IMAGE and prune docker data older than 96h,
  #   2. build (with `--pull` on the default branch) and push the image tagged
  #      with the pipeline id,
  #   3. on the default branch also publish the `buildcache` tag,
  #   4. on `core_r*` branches also publish a `v<release>-<pipeline id>` tag.
  script:
    - |
      set -x
      eval "IMAGE=\$$IMAGE"
      docker system prune -a --filter "until=96h" -f
      if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then
        ADDITIONAL_PARAMS="--pull"
      fi
      docker build \
        --secret id=JET_INDEX_URLS \
        --target $STAGE \
        -f $FILE \
        -t ${IMAGE}:${CI_PIPELINE_ID} \
        --build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \
        --cache-to type=inline \
        --cache-from type=registry,ref=${IMAGE}:buildcache \
        --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \
        ${ADDITIONAL_PARAMS} .
      docker push ${IMAGE}:${CI_PIPELINE_ID}
      if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then
        docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:buildcache
        docker push ${IMAGE}:buildcache
      fi
      if [[ $CI_COMMIT_BRANCH == core_r* ]]; then
        docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:v${CI_COMMIT_BRANCH#core_r}-${CI_PIPELINE_ID}
        docker push ${IMAGE}:v${CI_COMMIT_BRANCH#core_r}-${CI_PIPELINE_ID}
      fi

unit_tests:
  # Runs the unit tests twice against the code baked into the CI image: once
  # with the test suite of the current commit (TAG=latest) and once with the
  # test suite of a historic ref. The second run is a backwards-compatibility
  # check that helps keep interfaces stable.
  extends: [.tests_common]
  image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
  needs: [build_image]
  timeout: 180m
  parallel:
    matrix:
      - TAG: latest
      - TAG: db5c60ae3fe5247f16ec0536bbf41ee5c7fb9c4a
  tags: [8xL40S]
  variables:
    # Fresh, full-history clone so the historic ref can be checked out.
    GIT_STRATEGY: clone
    GIT_DEPTH: 0
  before_script:
    # For the historic run, replace the image's tests with those of $TAG.
    - |
      if [[ $TAG != latest ]]; then
        git checkout $TAG
        rm -rf /opt/megatron-lm/tests
        cp -r tests/ /opt/megatron-lm
      fi
  script:
    - |
      cd /opt/megatron-lm
      # The historic suite must skip tests marked `internal`. Build the option
      # as an array so the expression "not internal" stays a single argument.
      # The previous backtick-wrapped command substitution executed the echoed
      # text as a command and therefore never passed the filter to pytest.
      MARKER_ARGS=()
      if [[ $TAG != latest ]]; then
        MARKER_ARGS=(-m "not internal")
      fi
      for i in $(seq $UNIT_TEST_REPEAT); do
        # Fresh four-digit seed per repetition for the random test order.
        SEED=$((RANDOM % 9000 + 1000));
        timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest \
          --random-order --random-order-seed ${SEED} -xvs \
          --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail \
          "${MARKER_ARGS[@]}" tests/unit_tests
      done
  artifacts:
    paths:
      - coverage

# Checks that the documentation builds with this checkout embedded in the
# documentation repository. Failures do not fail the pipeline, and the job is
# not created for `main`.
docs_build_test:
  image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/python-format:0.0.1
  tags:
    - mcore-docker-node-small
  allow_failure: true
  except: [main]
  script:
    # Work from the parent directory so the checkout can be moved as a whole.
    - cd ..
    - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git
    - mv megatron-lm/ documentation/
    - cd documentation/
    - ./repo docs

# Runs the auto-formatter in check-only mode inside the linting image.
formatting:
  extends:
    - .tests_common
  stage: test
  needs:
    - build_image
  image: ${LINTING_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - mcore-docker-node-small
  script:
    # Fetch `main` before running the check.
    # NOTE(review): presumably autoformat.sh diffs against origin/main - confirm.
    - git fetch origin main
    - CHECK_ONLY=true bash tools/autoformat.sh

# Runs the copyright check script inside the MCore CI image.
copyright:
  extends:
    - .tests_common
  needs:
    - build_image
  image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - mcore-docker-node-small
  script:
    # Fetch `main` before running the check.
    - git fetch origin main
    - bash tools/copyright.sh

# Job-level settings for `secret_detection` on merge request pipelines.
# The scan covers the commits between the MR diff base and the current commit,
# and the job fails when the report lists at least one vulnerability.
secret_detection:
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
  tags:
    - mcore-docker-node-small
  allow_failure: false
  variables:
    GIT_DEPTH: 0
    SECRET_DETECTION_LOG_OPTIONS: ${CI_MERGE_REQUEST_DIFF_BASE_SHA}..${CI_COMMIT_SHA}
  script:
    - apk add jq
    - /analyzer run
    # Print the report and fail when it contains any finding.
    - |
      if [[ $(jq '.vulnerabilities | length > 0' gl-secret-detection-report.json) == true ]]; then
        echo "Atleast one vulnerability has been found"
        jq '.' gl-secret-detection-report.json
        exit 1
      fi
48 changes: 35 additions & 13 deletions jet-tests.yml → .gitlab/stages/02.functional-tests.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Shared template for jobs in the `functional_tests` stage.
# A job runs only when FUNCTIONAL_TEST is "yes". It may fail without failing
# the pipeline when the MR target branch is not protected.
# The rendered diff showed the removed pre-change rule lines next to their
# replacements; only the post-change rules are kept, so the stale first rule
# no longer shadows the `allow_failure` rule.
.jet_common:
  stage: functional_tests
  rules:
    - if: '$FUNCTIONAL_TEST == "yes" && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"'
      allow_failure: true
    - if: '$FUNCTIONAL_TEST == "yes"'
    - when: never

default:
Expand All @@ -21,12 +21,36 @@ jet-configure:
name: mikefarah/yq:4.35.2
entrypoint: [""]
extends: [.jet_common, .jet-configure]
tags:
- mcore-docker-node-small
tags: [mcore-docker-node-small]
script:
- set -x
- JET_FILTER=${JET_CUSTOM_FILTER:-False}
- echo "_JET_FILTER=$JET_FILTER" | tee -a jet.env
- |
JET_CUSTOM_FILTER="type == 'basic'"
if [[ $FUNCTIONAL_TEST_CLUSTER == dgxh100_eos ]]; then
JET_CI_BRANCH=mcore/eos
PLATFORM=dgx_h100
elif [[ $FUNCTIONAL_TEST_CLUSTER == dgxa100_dracooci ]]; then
JET_CI_BRANCH=mcore/draco-oci
PLATFORM=dgx_a100
elif [[ $FUNCTIONAL_TEST_CLUSTER == dgxa100_dracooci-ord ]]; then
JET_CI_BRANCH=mcore/draco-oci-ord
PLATFORM=dgx_a100
fi
# Add platform
JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and '$PLATFORM' in spec.platforms"
# Add scope
JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and '$FUNCTIONAL_TEST_SCOPE' in spec.scope"
if [[ "$JET_CUSTOM_FILTER" == "type == 'basic'" ]]; then
JET_CUSTOM_FILTER="False"
fi
echo "JET_CI_BRANCH=$JET_CI_BRANCH" | tee -a jet.env
echo "JET_CUSTOM_FILTER=$JET_CUSTOM_FILTER" | tee -a jet.env
- |
IMAGE=${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} yq '. |=
(
Expand All @@ -50,21 +74,20 @@ jet-configure:
max: 2
when: job_execution_timeout


# Reuses the `build_image` job with the `.jet_common` settings, overriding
# STAGE to `jet`.
jet-build:
  extends:
    - build_image
    - .jet_common
  variables:
    STAGE: jet

jet-trigger:
extends: [.jet_common, .jet-trigger]
needs: [metadata, jet-configure, jet-build]
needs: [jet-configure, jet-build]
trigger:
project: dl/jet/ci
branch: $JET_CI_BRANCH
strategy: depend
variables:
JET_WORKLOADS_FILTER: '$_JET_FILTER'
JET_WORKLOADS_FILTER: '$JET_CUSTOM_FILTER'
JET_CUSTOM_CONFIG: |
retrier:
enabled: true
Expand All @@ -74,7 +97,6 @@ jet-trigger:
environment: jet-auto-retrier
builds:
jet_flavour: # An empty mapping will disable building the JET flavor
inherit:
variables: true

Expand All @@ -97,10 +119,10 @@ jet-results-summary:
paths:
- scripts
rules:
- if: '$FUNCTIONAL_TEST == "yes" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/ )'
- if: '$FUNCTIONAL_TEST == "yes" && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"'
allow_failure: true
when: always
- if: '$FUNCTIONAL_TEST == "yes"'
allow_failure: false
when: always
- when: never

Expand All @@ -117,7 +139,7 @@ jet-results-notify:
- export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK}
- export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
- export GITLAB_ENDPOINT
- export CONTEXT=$SCOPE
- export CONTEXT=$FUNCTIONAL_TEST_SCOPE
- export DATE=$(date +"%Y-%m-%d")
- bash tests/functional_tests/shell_test_utils/notify.sh ${CI_PIPELINE_ID}
artifacts:
Expand Down
50 changes: 50 additions & 0 deletions .gitlab/stages/03.convergence-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Launches the release convergence runs, one job per matrix entry, when
# CONVERGENCE_TEST is "yes". Each entry names a settings snippet (SETTINGS) and
# the runner tag (TAG) to execute on.
convergence-test:
  rules:
    - if: '$CONVERGENCE_TEST == "yes"'
    - when: never
  stage: convergence_tests
  needs: [build_image]
  tags:
    - ${TAG}
  timeout: 7d
  parallel:
    matrix:
      - SETTINGS: RELEASE_BERT
        TAG: mcore-ssh-node-A
      - SETTINGS: RELEASE_GPT
        TAG: mcore-ssh-node-B
      - SETTINGS: RELEASE_MOE
        TAG: mcore-ssh-node-B
  before_script: |
    python -m venv local/venv
    source local/venv/bin/activate
    pip install jet-api --upgrade $JET_INDEX_URLS
  script:
    - |
      set -x
      export MCORE_RELEASE_NUM=${CI_COMMIT_BRANCH#core_r}
      export IMAGE_TAG=v${MCORE_RELEASE_NUM}-${CI_PIPELINE_ID}
      export WANDB_API_KEY=${WANDB_API_KEY}
      export GITLAB_TOKEN=${PAT}
      # Look up the id of the project snippet whose title equals $SETTINGS.
      SETTINGS_ID=$(curl \
        --request GET "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/snippets" \
        --header "PRIVATE-TOKEN: $PROJECT_ACCESS_TOKEN_MCORE" \
        | jq --arg TITLE "$SETTINGS" '
            .[]
            | select(.title == $TITLE)
            | .id
          ' \
        | tr -d '"')
      # Download that snippet. The id resolved above is used here; the
      # previous code referenced DATA_BLEND_ID, which is never set in this job.
      SETTINGS=$(curl \
        --request GET "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/snippets/${SETTINGS_ID}/raw" \
        --header "PRIVATE-TOKEN: $PROJECT_ACCESS_TOKEN_MCORE"
      )
      # Write the settings to the same file that is sourced on the next line;
      # the previous code wrote settings.txt but sourced settings.sh.
      echo "$SETTINGS" > settings.sh
      source settings.sh
      # NOTE(review): DATA_PATH and TRAINING_PARAMS_PATH are presumably
      # exported by the sourced settings - confirm against the snippets.
      yq '.MODEL_ARGS."--data-path" = env(DATA_PATH)' -i $TRAINING_PARAMS_PATH
      env
      bash tests/functional_tests/shell_test_utils/run_ci_test_locally.sh
15 changes: 15 additions & 0 deletions .gitlab/stages/04.publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Manually triggered job that builds the wheels with CPython 3.10 and 3.11,
# repairs them with auditwheel and uploads the result to PyPI.
# Offered only on `core_r*` branches when PUBLISH is "yes".
publish-wheel:
  stage: publish
  image: quay.io/pypa/manylinux_2_28_x86_64
  rules:
    - if: '$CI_COMMIT_BRANCH =~ /^core_r/ && $PUBLISH == "yes"'
      when: manual
    - when: never
  before_script:
    - pip install twine
  script:
    - /opt/python/cp310-cp310/bin/python -m build
    - /opt/python/cp311-cp311/bin/python -m build
    # Repaired wheels are uploaded from wheelhouse/.
    - auditwheel repair dist/*.whl
    - twine upload --repository pypi wheelhouse/*

4 changes: 4 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# content of pytest.ini
[pytest]
# Custom markers registered for this project. Continuation lines must be
# indented so that they stay part of the `markers` value.
markers =
    internal: mark a test as a test to private/internal functions.
Loading

0 comments on commit 15b7cfb

Please sign in to comment.