forked from NVIDIA/Megatron-LM
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ADLR/megatron-lm!1908 - ci: Refactor gitlab-ci
- Loading branch information
Showing
11 changed files
with
352 additions
and
440 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
include: | ||
- template: Security/Secret-Detection.gitlab-ci.yml | ||
|
||
# Pushes the current branch to the GitHub copy of the repository.
mirror_to_github:
  rules:
    # Runs only for protected refs; every other pipeline skips the job.
    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
    - when: never
  tags: [mcore-docker-node-small]
  stage: .pre
  image: python:3.10
  variables:
    GIT_STRATEGY: "clone"
  script:
    - git checkout $CI_COMMIT_BRANCH
    # NOTE(review): the credential part of this URL was destroyed by e-mail
    # obfuscation in the scraped page; GH_TOKEN is the assumed CI variable
    # name -- confirm against the project's CI/CD settings.
    # `|| true` keeps the job going when the remote already exists.
    - git remote add github https://ko3n1g:$GH_TOKEN@github.com/NVIDIA/Megatron-LM.git || true
    - git push -u github $CI_COMMIT_BRANCH
|
||
# Applies labels to a merge request with gitlab-mr-labeler, then re-adds the
# labels the MR carried before the labeler ran.
label_merge_request:
  stage: .pre
  image: golang:1.22
  tags: [mcore-docker-node-small]
  rules:
    # Merge-request pipelines only.
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - when: never
  before_script:
    # Build the labeler from its `nv` branch.
    - git clone -b nv https://${GITLAB_ENDPOINT}/okoenig/gitlab-mr-labeler.git
    - cd gitlab-mr-labeler
    - go install .
    - cd ..
    - go install github.com/itchyny/gojq/cmd/gojq@latest
    # Snapshot the MR's current labels into a sourceable file named `labels`.
    - |
      echo LABELS=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" | gojq '.labels | join(",")') > labels
  script:
    - gitlab-mr-labeler -f .gitlab/labeler-config.yml -t ${PROJECT_ACCESS_TOKEN_MCORE} --debug true
  after_script:
    # Add the snapshotted labels back through the merge request API.
    - |
      source labels
      curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data-urlencode "add_labels=$LABELS" -X PUT
# Fails a merge-request pipeline when the MR has no milestone assigned.
check_milestone:
  stage: .pre
  image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci:buildcache
  tags: [mcore-docker-node-small]
  rules:
    # Merge-request pipelines only.
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - when: never
  script:
    # Prints the job environment to the log.
    - env
    # Fetch the MR from the API and keep only its `milestone` field.
    - |
      MILESTONE=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" | jq '.milestone')
    # jq prints the literal text `null` when the field is unset.
    - |
      if [[ "$MILESTONE" == "null" ]]; then
        echo Please assign a Milestone to this MR!
        exit 1
      fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
# Hidden template shared by the test-stage jobs through `extends`.
.tests_common:
  stage: test
  rules:
    # Same trigger as the rule below, but failures are tolerated when the
    # merge request does not target a protected branch.
    - if: ($FUNCTIONAL_TEST == "no" || $FUNCTIONAL_TEST == "yes" || $CONVERGENCE_TEST == "yes") && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
      allow_failure: true
    # Any explicit FUNCTIONAL_TEST choice, or a convergence run, enables the job.
    - if: $FUNCTIONAL_TEST == "no" || $FUNCTIONAL_TEST == "yes" || $CONVERGENCE_TEST == "yes"
    - when: never
|
||
include: | ||
- template: Security/Secret-Detection.gitlab-ci.yml | ||
|
||
# Builds and pushes the CI container images, one matrix entry per image.
build_image:
  image: docker:26.1.4-dind
  tags:
    - 8xL40S-builder
  timeout: 45m
  retry:
    max: 2
  variables:
    # Dockerfile stage passed to `docker build --target`.
    STAGE: main
  parallel:
    matrix:
      # IMAGE holds the NAME of the CI variable that contains the image
      # repository; the script dereferences it with `eval`.
      - IMAGE: CI_MCORE_IMAGE
        FILE: Dockerfile.ci
        BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
      - IMAGE: CI_NEMO_IMAGE
        FILE: Dockerfile.ci
        BASE_IMAGE: nvcr.io/nvidian/nemo:nightly
      - IMAGE: LINTING_IMAGE
        FILE: Dockerfile.linting
        BASE_IMAGE: python:3.10
  before_script:
    # Log in to both registries: nvcr.io for base images, the project
    # registry for pushing the results.
    - echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
    - echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin
  script:
    # Steps: dereference IMAGE, prune old docker data, build (with --pull on
    # the default branch), push the pipeline-tagged image, then add the
    # `buildcache` tag on the default branch and a versioned tag on core_r*
    # release branches.
    - |
      set -x
      eval "IMAGE=\$$IMAGE"
      docker system prune -a --filter "until=96h" -f
      if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then
        ADDITIONAL_PARAMS="--pull"
      fi
      docker build \
        --secret id=JET_INDEX_URLS \
        --target $STAGE \
        -f $FILE \
        -t ${IMAGE}:${CI_PIPELINE_ID} \
        --build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \
        --cache-to type=inline \
        --cache-from type=registry,ref=${IMAGE}:buildcache \
        --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \
        ${ADDITIONAL_PARAMS} .
      docker push ${IMAGE}:${CI_PIPELINE_ID}
      if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then
        docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:buildcache
        docker push ${IMAGE}:buildcache
      fi
      if [[ $CI_COMMIT_BRANCH == core_r* ]]; then
        docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:v${CI_COMMIT_BRANCH#core_r}-${CI_PIPELINE_ID}
        docker push ${IMAGE}:v${CI_COMMIT_BRANCH#core_r}-${CI_PIPELINE_ID}
      fi
|
||
unit_tests:
  # This job runs both test suite of ToT and of a historic ref against
  # the current code. This is a form of backwards compatibility testing
  # and helps in providing stable interfaces.
  extends: [.tests_common]
  image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
  needs: [build_image]
  timeout: 180m
  parallel:
    matrix:
      # `latest` runs the tests shipped in the image; any other value is a
      # git ref whose tests/ directory replaces the one in the image.
      - TAG: latest
      - TAG: db5c60ae3fe5247f16ec0536bbf41ee5c7fb9c4a
  tags: [8xL40S]
  variables:
    GIT_STRATEGY: clone
    GIT_DEPTH: 0
  before_script:
    # For a historic ref, swap that ref's test suite into /opt/megatron-lm.
    - |
      if [[ $TAG != latest ]]; then
        git checkout $TAG
        rm -rf /opt/megatron-lm/tests
        cp -r tests/ /opt/megatron-lm
      fi
  script:
    - |
      cd /opt/megatron-lm
      # Historic test suites skip tests marked `internal`. The marker
      # expression lives in an array so that "not internal" reaches pytest
      # as ONE argument. The previous `$(...)` wrapped in backticks executed
      # the echoed text as a command and never passed the filter at all.
      MARKER_ARGS=()
      if [[ $TAG != latest ]]; then
        MARKER_ARGS=(-m "not internal")
      fi
      for i in $(seq $UNIT_TEST_REPEAT); do
        # Fresh four-digit seed per repetition for pytest-random-order.
        SEED=$((RANDOM % 9000 + 1000));
        timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${MARKER_ARGS[@]}" tests/unit_tests
      done
  artifacts:
    paths:
      - coverage
|
||
# Builds the documentation with this checkout placed inside the separate
# documentation repository. Failures do not fail the pipeline.
docs_build_test:
  image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/python-format:0.0.1
  tags:
    - mcore-docker-node-small
  allow_failure: true
  # Not run on main.
  except:
    - main
  script:
    - cd ..
    # Start from a clean clone of the documentation repository.
    - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git
    # Move this checkout into the documentation tree before building.
    - mv megatron-lm/ documentation/
    - cd documentation/
    - ./repo docs
|
||
# Runs the auto-formatter in check-only mode inside the linting image.
formatting:
  extends: [.tests_common]
  stage: test
  needs: [build_image]
  image: ${LINTING_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - mcore-docker-node-small
  script:
    # Fetch main before running the check.
    - git fetch origin main
    - CHECK_ONLY=true bash tools/autoformat.sh
|
||
# Runs the copyright check script inside the MCore CI image.
copyright:
  extends: [.tests_common]
  needs: [build_image]
  image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - mcore-docker-node-small
  script:
    # Fetch main before running the check.
    - git fetch origin main
    - bash tools/copyright.sh
|
||
# Secret scan over the commits of a merge request; any finding fails the job.
# NOTE(review): presumably overrides the job of the same name from the
# included Security/Secret-Detection template -- confirm.
secret_detection:
  tags:
    - mcore-docker-node-small
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
  allow_failure: false
  variables:
    GIT_DEPTH: 0
    # Restrict the scan to the commit range between the MR diff base and HEAD.
    SECRET_DETECTION_LOG_OPTIONS: ${CI_MERGE_REQUEST_DIFF_BASE_SHA}..${CI_COMMIT_SHA}
  script:
    - apk add jq
    - /analyzer run
    # Print the full report and fail when it lists at least one vulnerability.
    - |
      if [[ $(cat gl-secret-detection-report.json | jq '.vulnerabilities | length > 0') == true ]]; then
        echo "Atleast one vulnerability has been found"
        cat gl-secret-detection-report.json | jq '.'
        exit 1
      fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# Long-running convergence runs, one per release configuration. Each matrix
# entry names the project snippet holding its settings (SETTINGS) and the
# runner tag it must be scheduled on (TAG).
convergence-test:
  rules:
    - if: $CONVERGENCE_TEST == "yes"
    - when: never
  stage: convergence_tests
  needs: [build_image]
  tags:
    - ${TAG}
  timeout: 7d
  parallel:
    matrix:
      - SETTINGS: RELEASE_BERT
        TAG: mcore-ssh-node-A
      - SETTINGS: RELEASE_GPT
        TAG: mcore-ssh-node-B
      - SETTINGS: RELEASE_MOE
        TAG: mcore-ssh-node-B
  before_script: |
    python -m venv local/venv
    source local/venv/bin/activate
    pip install jet-api --upgrade $JET_INDEX_URLS
  script:
    - |
      set -x
      # Release number is the branch name with its `core_r` prefix removed.
      export MCORE_RELEASE_NUM=${CI_COMMIT_BRANCH#core_r}
      export IMAGE_TAG=v${MCORE_RELEASE_NUM}-${CI_PIPELINE_ID}
      export WANDB_API_KEY=${WANDB_API_KEY}
      export GITLAB_TOKEN=${PAT}
      # Look up the id of the project snippet whose title equals $SETTINGS.
      SETTINGS_ID=$(curl \
                      --request GET "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/snippets" \
                      --header "PRIVATE-TOKEN: $PROJECT_ACCESS_TOKEN_MCORE" \
                    | jq --arg TITLE "$SETTINGS" '
                        .[]
                        | select(.title == $TITLE)
                        | .id
                      ' \
                    | tr -d '"')
      # Download that snippet's raw content. The id looked up above is used
      # here; the previous ${DATA_BLEND_ID} was never defined in this job.
      SETTINGS=$(curl \
                  --request GET "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/snippets/${SETTINGS_ID}/raw" \
                  --header "PRIVATE-TOKEN: $PROJECT_ACCESS_TOKEN_MCORE"
                )
      # Write and source the SAME file; it used to be written as
      # settings.txt while settings.sh was sourced.
      echo "$SETTINGS" > settings.sh
      source settings.sh
      yq '.MODEL_ARGS."--data-path" = env(DATA_PATH)' -i $TRAINING_PARAMS_PATH
      env
      bash tests/functional_tests/shell_test_utils/run_ci_test_locally.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Manually triggered wheel build and upload for release branches.
publish-wheel:
  stage: publish
  image: quay.io/pypa/manylinux_2_28_x86_64
  rules:
    # Offered as a manual job on core_r* branches when PUBLISH is "yes".
    - if: $CI_COMMIT_BRANCH =~ /^core_r/ && $PUBLISH == "yes"
      when: manual
    - when: never
  before_script:
    - pip install twine
  script:
    # Build once per interpreter (CPython 3.10 and 3.11).
    - /opt/python/cp310-cp310/bin/python -m build
    - /opt/python/cp311-cp311/bin/python -m build
    # auditwheel writes the repaired wheels to wheelhouse/, which is uploaded.
    - auditwheel repair dist/*.whl
    - twine upload --repository pypi wheelhouse/*
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# pytest.ini -- registers the custom markers used by the test suite.
[pytest]
markers =
    internal: mark a test as a test to private/internal functions.
Oops, something went wrong.