diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e2f7725fb1..c99b97f697 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -13,22 +13,28 @@ workflow: FUNCTIONAL_TEST: "no" - if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - FUNCTIONAL_TEST: "yes" - FUNCTIONAL_TEST_SCOPE: mr UNIT_TEST_REPEAT: 5 UNIT_TEST_TIMEOUT: 50 + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: mr + FUNCTIONAL_TEST_CLUSTER_A100: "" + FUNCTIONAL_TEST_CLUSTER_H100: "" - if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - FUNCTIONAL_TEST: "yes" - FUNCTIONAL_TEST_SCOPE: nightly UNIT_TEST_REPEAT: 5 UNIT_TEST_TIMEOUT: 50 + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: nightly + FUNCTIONAL_TEST_CLUSTER_A100: "" + FUNCTIONAL_TEST_CLUSTER_H100: "" - if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - FUNCTIONAL_TEST: "yes" - FUNCTIONAL_TEST_SCOPE: weekly UNIT_TEST_REPEAT: 5 UNIT_TEST_TIMEOUT: 50 + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: weekly + FUNCTIONAL_TEST_CLUSTER_A100: "" + FUNCTIONAL_TEST_CLUSTER_H100: "" - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: FUNCTIONAL_TEST: "no" @@ -58,29 +64,23 @@ variables: - "mr" - "nightly" - "weekly" + - "pre-release" + - "release" description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)" - FUNCTIONAL_TEST_CLUSTER: + FUNCTIONAL_TEST_CLUSTER_A100: value: "dgxa100_dracooci" options: - "dgxa100_dracooci" - "dgxa100_dracooci-ord" - - "dgxh100_eos" - description: '"dgxa100_dracooci" for OCI-IAD, "dgxh100_eos" for EOS' - CONVERGENCE_TEST: - value: "no" + description: 'Cluster for A100 workloads' + FUNCTIONAL_TEST_CLUSTER_H100: + value: "dgxh100_eos" options: - - "yes" - - "no" - description: To run a convergence test - CONVERGENCE_TEST_SCOPE: - value: "release" - options: - - "release" - - "pre-release" - description: "Test suite to run (only 
for CONVERGENCE_TEST=yes)" - CONVERGENCE_TEST_RUN_NAME: - value: "pre-release-$$CI_PIPELINE_ID" - description: "Run directory of convergence test" + - "dgxh100_coreweave" + - "dgxh100_eos" + description: 'Cluster for H100 workloads' + FUNCTIONAL_TEST_NAME: + description: "Name of functional test run (only for pre-release and release)" PUBLISH: value: "no" options: @@ -96,6 +96,7 @@ variables: # CI wide variables CI_MCORE_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci + CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci LINTING_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_linting UNIT_TEST_TIMEOUT: 15 @@ -105,5 +106,4 @@ include: - .gitlab/stages/00.pre.yml - .gitlab/stages/01.tests.yml - .gitlab/stages/02.functional-tests.yml - - .gitlab/stages/03.convergence-tests.yml - - .gitlab/stages/04.publish.yml + - .gitlab/stages/03.publish.yml diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index e0b5c579c1..a91436be87 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -76,9 +76,10 @@ clean_docker_node: matrix: - node: 8xL40S - node: mcore-docker-node-small + - node: mcore-docker-node-jet script: - export DOCKER_HOST='unix:///var/run/docker.sock' - - docker system prune -a --filter "until=48h" -f || true + - docker system prune -a --filter "until=36h" -f || true maybe_cherry_pick_commit: rules: @@ -101,8 +102,13 @@ maybe_cherry_pick_commit: - git config --global user.email "mcore-bot@nvidia.com" - git config --global user.name "Mcore Bot" - | - LABELS=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}" | jq '.labels | join(",")' | tr -d '"') - + MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}") + + LABELS=$(echo -E $MR 
| jq '.labels | join(",")' | tr -d '"') + AUTHOR_ID=$(echo -E $MR | jq '.author.id' | tr -d '"') + AUTHOR_NAME=$(echo -E $MR | jq '.author.username' | tr -d '"') + TITLE=$(echo -E $MR | jq '.title' | tr -d '"') + MILESTONE_ID=$(echo -E $MR | jq '.milestone.id' | tr -d '"') TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'core_[^,]*') if [[ $TARGET_BRANCHES == "" ]]; then @@ -134,8 +140,11 @@ maybe_cherry_pick_commit: --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \ -d "source_branch=cherry-pick-$MR_ID-$RELEASE_BRANCH" \ -d "target_branch=$RELEASE_BRANCH" \ - -d "title=Cherry-pick $MR_ID into $RELEASE_BRANCH" \ - -d "labels=cherry-pick" + -d "title=Cherry pick \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\`" \ + -d "labels=cherry-pick" \ + -d "reviewer_ids=$AUTHOR_ID" \ + -d "milestone_id=$MILESTONE_ID" \ + -d "description=[🤖]: Hi @$AUTHOR_NAME 👋,

we've cherry picked \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\` for you! 🚀

Please review and approve this cherry pick by your convenience\!" else URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/merge_requests/$MR_ID diff --git a/.gitlab/stages/01.tests.yml b/.gitlab/stages/01.tests.yml index b3cefc0fde..dc59e026ac 100644 --- a/.gitlab/stages/01.tests.yml +++ b/.gitlab/stages/01.tests.yml @@ -21,6 +21,10 @@ build_image: FILE: Dockerfile.ci BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3 TAG: mcore-docker-node-large + - IMAGE: CI_MCORE_DEV_IMAGE + FILE: Dockerfile.ci.dev + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3 + TAG: mcore-docker-node-large - IMAGE: CI_NEMO_IMAGE FILE: Dockerfile.ci BASE_IMAGE: nvcr.io/nvidian/nemo:nightly @@ -35,48 +39,44 @@ build_image: variables: STAGE: main script: + - apk add bash - | - set -x - env - eval "IMAGE=\$$IMAGE" - - docker system prune -a --filter "until=24h" -f || true - - if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then - ADDITIONAL_PARAMS="--pull" - fi - - docker pull ${IMAGE}:${CI_PIPELINE_ID} || true - docker pull ${IMAGE}:${CI_MERGE_REQUEST_IID:-noop} || true - docker pull ${IMAGE}:buildcache || true - - docker build \ - --secret id=JET_INDEX_URLS \ - --target $STAGE \ - -f $FILE \ - -t ${IMAGE}:${CI_PIPELINE_ID} \ - -t ${IMAGE}:${CI_MERGE_REQUEST_IID:-noop} \ - --build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \ - --cache-to type=inline \ - --cache-from type=registry,ref=${IMAGE}:buildcache \ - --cache-from type=registry,ref=${IMAGE}:${CI_PIPELINE_ID} \ - --cache-from type=registry,ref=${IMAGE}:${CI_MERGE_REQUEST_IID:-noop} \ - --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \ - ${ADDITIONAL_PARAMS} . 
- - docker push ${IMAGE}:${CI_PIPELINE_ID} - docker push ${IMAGE}:${CI_MERGE_REQUEST_IID:-noop} - - if [[ "$CI_COMMIT_BRANCH" == "ci-nightly-a100" ]]; then - docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:nightly - docker push ${IMAGE}:nightly - fi + bash -c ' + set -x + env + eval "IMAGE=\$$IMAGE" + + docker system prune -a --filter "until=24h" -f || true + + docker buildx create --name container --driver=docker-container + + ADDITIONAL_PARAMS=() + + if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then + ADDITIONAL_PARAMS+=("--pull") + ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main") + fi - if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then - docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:buildcache - docker push ${IMAGE}:buildcache - fi + if [[ "$CI_COMMIT_BRANCH" == "ci-nightly-a100" ]]; then + ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly") + fi + DOCKER_BUILDKIT=1 docker build \ + --secret id=JET_INDEX_URLS \ + --target $STAGE \ + -f $FILE \ + -t ${IMAGE}:${CI_PIPELINE_ID} \ + --builder=container \ + --build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \ + --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \ + --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \ + --cache-from type=registry,ref=${IMAGE}-buildcache:main \ + --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \ + --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \ + --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \ + --push \ + ${ADDITIONAL_PARAMS[@]} . + ' retry: max: 2 @@ -85,13 +85,17 @@ unit_tests: # the current code. This is a form of backwards compatibility testing # and helps in providing stable interfaces. 
extends: [.test_mr_rules] - image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} + image: ${IMAGE}:${CI_PIPELINE_ID} needs: [build_image] timeout: 180m parallel: matrix: - TAG: latest - - TAG: 8fc755388a03bae05cb740857008b8916e01a63c + IMAGE: ${CI_MCORE_IMAGE} + # - TAG: latest + # IMAGE: ${CI_MCORE_DEV_IMAGE} + - TAG: core_r0.9.0 + IMAGE: ${CI_MCORE_IMAGE} tags: [8xL40S] variables: GIT_STRATEGY: clone @@ -112,11 +116,14 @@ unit_tests: for i in $(seq $UNIT_TEST_REPEAT); do SEED=$((RANDOM % 9000 + 1000)); - SKIPPED=() + ARGS=() if [[ $TAG != latest ]]; then - SKIPPED+=(-m "not internal") + ARGS+=(-m "not internal") + fi + if [[ $IMAGE == ${CI_MCORE_DEV_IMAGE} ]]; then + ARGS+=(-m "experimental") fi - timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${SKIPPED[@]}" tests/unit_tests + timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests done artifacts: paths: @@ -125,10 +132,30 @@ unit_tests: - if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" allow_failure: true when: always - - if: '$TAG != "latest"' - allow_failure: true - when: always +unit-tests-results-notify: + extends: [.test_mr_rules] + image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} + needs: [unit_tests] + tags: + - mcore-docker-node-small + script: + - env + - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} + - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + - export GITLAB_ENDPOINT + - export DATE=$(date +"%Y-%m-%d") + - bash tests/functional_tests/shell_test_utils/notify_unit_tests.sh ${CI_PIPELINE_ID} + artifacts: + when: always + paths: + - scripts + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "ci-unit-test-extended" + when: always + 
- when: never + docs_build_test: extends: [.test_mr_rules] image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml index 0c30857409..531527b8b4 100644 --- a/.gitlab/stages/02.functional-tests.yml +++ b/.gitlab/stages/02.functional-tests.yml @@ -16,91 +16,68 @@ include: ref: main file: downstreams.yml -jet-configure: - image: - name: mikefarah/yq:4.35.2 - entrypoint: [""] - extends: [.jet_common, .jet-configure] +jet-build: + extends: [build_image, .jet_common] + variables: + STAGE: jet + +jet-generate: + needs: [jet-build] + extends: [.jet_common] + image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} tags: [mcore-docker-node-small] - script: + before_script: + - git rm -r tests/functional_tests/local_recipes || true + - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes + - ls tests/functional_tests/local_recipes + script: - set -x - | - if [[ "$CI_PIPELINE_SOURCE" == "merge_request_event" && "$CI_MERGE_REQUEST_LABELS" == "*H100*" ]]; then - FUNCTIONAL_TEST_CLUSTER=$DEFAULT_H100_CLUSTER - fi + A100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_A100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER) + H100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_H100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER) - | - JET_CUSTOM_FILTER="type == 'basic'" - - if [[ $FUNCTIONAL_TEST_CLUSTER == dgxh100_eos ]]; then - JET_CI_BRANCH=mcore/eos - PLATFORM=dgx_h100 - elif [[ $FUNCTIONAL_TEST_CLUSTER == dgxa100_dracooci ]]; then - JET_CI_BRANCH=mcore/draco-oci - PLATFORM=dgx_a100 - elif [[ $FUNCTIONAL_TEST_CLUSTER == dgxa100_dracooci-ord ]]; then - JET_CI_BRANCH=mcore/draco-oci-ord - PLATFORM=dgx_a100 - fi - - # Add platform - JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and '$PLATFORM' in spec.platforms" - - # Add scope - JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and 
'$FUNCTIONAL_TEST_SCOPE' in spec.scope" - - if [[ "$JET_CUSTOM_FILTER" == "type == 'basic'" ]]; then - JET_CUSTOM_FILTER="False" + if [[ "$FUNCTIONAL_TEST_SCOPE" == "release" || "$FUNCTIONAL_TEST_SCOPE" == "pre-release" ]]; then + RELEASE_ARGS=( + "--run-name" + $FUNCTIONAL_TEST_NAME + "--wandb-experiment" + $(echo $FUNCTIONAL_TEST_NAME | tr '/' '-') + ) + else + RELEASE_ARGS=() fi - echo "JET_CI_BRANCH=$JET_CI_BRANCH" | tee -a jet.env - echo "JET_CUSTOM_FILTER=$JET_CUSTOM_FILTER" | tee -a jet.env - - | - IMAGE=${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} yq '. |= - ( - select(.spec.name == "mcore-pyt") - | .spec.source.image = env(IMAGE) - ) - ' -i tests/functional_tests/jet_recipes/_build-pyt.yaml - - IMAGE=${CI_NEMO_IMAGE}:${CI_PIPELINE_ID} yq '. |= - ( - select(.spec.name == "mcore-nemo") - | .spec.source.image = env(IMAGE) - ) - ' -i tests/functional_tests/jet_recipes/_build-pyt.yaml + export PYTHONPATH=$(pwd) + python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \ + --scope $FUNCTIONAL_TEST_SCOPE \ + --a100-cluster $A100_CLUSTER \ + --h100-cluster $H100_CLUSTER \ + --container-tag ${CI_PIPELINE_ID} \ + --container-image ${CI_MCORE_IMAGE} \ + --container-image-dev ${CI_MCORE_DEV_IMAGE} \ + --output-path "jet-trigger-job.yaml" \ + ${RELEASE_ARGS[@]} artifacts: - reports: - dotenv: jet.env paths: - - tests/functional_tests/jet_recipes - retry: - max: 2 - when: job_execution_timeout - -jet-build: - extends: [build_image, .jet_common] - variables: - STAGE: jet + - jet-trigger-job.yaml + - tests/functional_tests/local_recipes jet-trigger: - extends: [.jet_common, .jet-trigger] - needs: [jet-configure, jet-build] + stage: functional_tests + needs: [jet-generate] + extends: [.jet_common] trigger: - project: dl/jet/ci - branch: $JET_CI_BRANCH + include: + - artifact: jet-trigger-job.yaml + job: jet-generate strategy: depend variables: - JET_WORKLOADS_FILTER: '$JET_CUSTOM_FILTER' - JET_CUSTOM_CONFIG: | - retrier: - enabled: true - max_retries: 2 - 
retry_on: ['1.2', '1.2.*'] # All infra related issues - waiting_time: 60 - environment: jet-auto-retrier - builds: - jet_flavour: # An empty mapping will disable building the JET flavor + RO_API_TOKEN: $PAT + CONTAINER_TAG: $CI_PIPELINE_ID + CI_MCORE_IMAGE: $CI_MCORE_IMAGE + GITLAB_ENDPOINT: $GITLAB_ENDPOINT + PARENT_PIPELINE_ID: $CI_PIPELINE_ID inherit: variables: true diff --git a/.gitlab/stages/03.convergence-tests.yml b/.gitlab/stages/03.convergence-tests.yml deleted file mode 100644 index 5c7bd6a7a3..0000000000 --- a/.gitlab/stages/03.convergence-tests.yml +++ /dev/null @@ -1,86 +0,0 @@ -.common_release: - stage: convergence_tests - needs: [build_image] - timeout: 7d - before_script: - - git rm -r tests/functional_tests/local_recipes || true - - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes - - ls tests/functional_tests/local_recipes - - INSTALL_DIR=$(pwd)/local - - rm -rf "$INSTALL_DIR" - - mkdir -p "$INSTALL_DIR" - - wget "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-$(uname --machine).sh" -O "$INSTALL_DIR/miniconda.sh" - - bash "$INSTALL_DIR/miniconda.sh" -b -u -p "$INSTALL_DIR" - - rm -rf "$INSTALL_DIR/miniconda.sh" - - source $INSTALL_DIR/bin/activate - - pip install jet-api --upgrade $JET_INDEX_URLS - variables: - GIT_STRATEGY: clone - GIT_SUBMODULE_STRATEGY: none - script: - - | - env - set -x - - export IMAGE_TAG=${CI_PIPELINE_ID} - export WANDB_API_KEY - CONVERGENCE_TEST_RUN_NAME=$(eval echo $CONVERGENCE_TEST_RUN_NAME) - - if [[ -z $CONVERGENCE_TEST_RUN_NAME ]]; then - echo Please assign a CONVERGENCE_TEST_RUN_NAME - fi - - export RUN_NAME=$CONVERGENCE_TEST_RUN_NAME/$MODEL/$VARIANT - export WANDB_EXPERIMENT=$CONVERGENCE_TEST_RUN_NAME_$MODEL_$VARIANT - - bash ./tests/functional_tests/local_recipes/$MODEL/$VARIANT.sh - artifacts: - paths: - - ./golden_values.json - retry: - max: 2 - -release-test: - rules: - - if: 
$CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "release" - extends: [.common_release] - tags: - - ${TAG} - parallel: - matrix: - - MODEL: bert - VARIANT: bert_release - TAG: mcore-ssh-node-B - - MODEL: gpt - VARIANT: gpt3_15b_8t_release - TAG: mcore-ssh-node-B - - MODEL: mixtral - VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release - TAG: mcore-ssh-node-B - - MODEL: mixtral - VARIANT: mixtral_8x7b_tp1pp4ep8vpp8_release - TAG: mcore-ssh-agent-C - - MODEL: mixtral - VARIANT: mixtral_8x22b_tp2pp8ep8vpp1_release - TAG: mcore-ssh-agent-C - - MODEL: t5 - VARIANT: t5_release - TAG: mcore-ssh-agent-C - -pre-release-test: - rules: - - if: $CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "pre-release" - extends: [.common_release] - tags: - - ${TAG} - parallel: - matrix: - - MODEL: bert - VARIANT: bert_release - TAG: mcore-ssh-node-B - - MODEL: gpt - VARIANT: gpt3_15b_8t_release_sm - TAG: mcore-ssh-node-B - - MODEL: mixtral - VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release_sm - TAG: mcore-ssh-node-B diff --git a/.gitlab/stages/04.publish.yml b/.gitlab/stages/03.publish.yml similarity index 100% rename from .gitlab/stages/04.publish.yml rename to .gitlab/stages/03.publish.yml diff --git a/Dockerfile.ci b/Dockerfile.ci index dfcc7381f7..fa13c48fd4 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -1,88 +1,62 @@ -# syntax=docker/dockerfile:experimental +# syntax=docker/dockerfile:1.3-labs ARG FROM_IMAGE_NAME -FROM $FROM_IMAGE_NAME as main -ENV DEBIAN_FRONTEND=noninteractive - -RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ - /etc/apt/apt.conf.d/docker-clean - -RUN apt-get update && \ - apt-get install -y --no-install-recommends gettext && \ - apt-get clean - -RUN wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \ -chmod a+x /usr/local/bin/yq - -##### For Mamba begin ##### -RUN pip uninstall -y triton && \ - pip install triton==2.1.0 +FROM $FROM_IMAGE_NAME as build_causal_conv1d +WORKDIR /opt +RUN 
CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1 -# The causal-conv1d and mamba-ssm packages below are built from scratch here -# (which takes significant time) because there are no wheels available on PyPI -# for these relatively newer versions of the packages that are compatible with -# the older NGC-variant PyTorch version (e.g. version 2.2.0.dev231106) that we -# are using (in the NGC base container). Generally, if the package is not -# compatible with the PyTorch version, then it will generate a Python import -# error. The package authors tend to only release wheels for new versions of -# these pacakges which are compatible with the versions of regular PyTorch and -# NGC-variant PyTorch that are newer at the time of release. So, to use newer -# versions of these packages with relatively older versions of the NGC PyTorch -# container, we tend to have to build the packages from scratch. +FROM $FROM_IMAGE_NAME as build_grouped_gemm +WORKDIR /opt +RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2 -RUN cd /tmp && \ - pip uninstall -y causal-conv1d && \ - git clone https://github.com/Dao-AILab/causal-conv1d.git && \ - cd causal-conv1d && \ - git checkout v1.2.2.post1 && \ - CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install . && \ - cd .. && \ - rm -rf causal-conv1d +FROM $FROM_IMAGE_NAME as build_mamba_ssm +WORKDIR /opt +RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.0.3 -RUN cd /tmp && \ - pip uninstall -y mamba-ssm && \ - git clone https://github.com/state-spaces/mamba.git && \ - cd mamba && \ - git checkout v2.0.3 && \ - MAMBA_FORCE_BUILD=TRUE pip install . && \ - cd .. 
&& \ - rm -rf mamba -##### For Mamba end ##### - -##### For JET-API start ##### -RUN apt-get update && \ - apt-get install -y python3-venv && \ - apt-get clean -y && \ - python -m venv /opt/jet -##### For JET-API end ##### - -RUN pip3 install --no-cache-dir \ - einops \ - flask-restful \ - nltk \ - pytest \ - pytest-cov \ - pytest_mock \ - pytest-random-order \ - sentencepiece \ - wrapt \ - git+https://github.com/fanshiqing/grouped_gemm@v1.1.2 \ - zarr \ - tensorstore==0.1.45 \ - wandb - -COPY . /workspace/megatron-lm - -COPY . /workspace/megatron-lm -RUN cp -r /workspace/megatron-lm /opt && \ - pip install /opt/megatron-lm +FROM $FROM_IMAGE_NAME as main +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get install -y --no-install-recommends gettext python3-venv && \ + apt-get clean && \ + python -m venv /opt/jet && \ + wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \ + chmod a+x /usr/local/bin/yq + +COPY --from=build_causal_conv1d /opt/causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_grouped_gemm /opt/grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_mamba_ssm /opt/mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl ./ + +RUN pip3 install --no-cache-dir --upgrade-strategy only-if-needed -v \ +einops \ +flask-restful \ +nltk \ +pytest \ +pytest-cov \ +pytest_mock \ +pytest-random-order \ +sentencepiece \ +wrapt \ +zarr \ +wandb \ +triton==2.1.0 \ +causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl \ +mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl \ +grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl \ +tensorstore==0.1.45 && \ +rm *.whl + +# Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker +COPY . 
/opt/megatron-lm +RUN pip install /opt/megatron-lm +ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH" ##### For NVIDIANS only ##### FROM main as jet ARG CACHEBUST=0 RUN --mount=type=secret,id=JET_INDEX_URLS \ JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ + pip install jet-client --upgrade $JET_INDEX_URLS && \ /opt/jet/bin/pip install jet-api --upgrade $JET_INDEX_URLS ENV PATH="$PATH:/opt/jet/bin" ### \ No newline at end of file diff --git a/Dockerfile.ci.dev b/Dockerfile.ci.dev new file mode 100644 index 0000000000..fa13c48fd4 --- /dev/null +++ b/Dockerfile.ci.dev @@ -0,0 +1,62 @@ +# syntax=docker/dockerfile:1.3-labs + +ARG FROM_IMAGE_NAME +FROM $FROM_IMAGE_NAME as build_causal_conv1d +WORKDIR /opt +RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1 + +FROM $FROM_IMAGE_NAME as build_grouped_gemm +WORKDIR /opt +RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2 + +FROM $FROM_IMAGE_NAME as build_mamba_ssm +WORKDIR /opt +RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.0.3 + +FROM $FROM_IMAGE_NAME as main +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends gettext python3-venv && \ + apt-get clean && \ + python -m venv /opt/jet && \ + wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \ + chmod a+x /usr/local/bin/yq + +COPY --from=build_causal_conv1d /opt/causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_grouped_gemm /opt/grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_mamba_ssm /opt/mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl ./ + +RUN pip3 install --no-cache-dir --upgrade-strategy only-if-needed -v \ +einops \ +flask-restful \ +nltk \ +pytest \ +pytest-cov \ +pytest_mock \ +pytest-random-order \ +sentencepiece \ +wrapt \ +zarr \ +wandb \ +triton==2.1.0 \ 
+causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl \ +mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl \ +grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl \ +tensorstore==0.1.45 && \ +rm *.whl + +# Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker +COPY . /opt/megatron-lm +RUN pip install /opt/megatron-lm +ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH" + +##### For NVIDIANS only ##### +FROM main as jet +ARG CACHEBUST=0 +RUN --mount=type=secret,id=JET_INDEX_URLS \ + JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ + pip install jet-client --upgrade $JET_INDEX_URLS && \ + /opt/jet/bin/pip install jet-api --upgrade $JET_INDEX_URLS +ENV PATH="$PATH:/opt/jet/bin" +### \ No newline at end of file diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md index 01e55c4a23..11601fd44f 100644 --- a/docs/llama_mistral.md +++ b/docs/llama_mistral.md @@ -282,6 +282,104 @@ If loading for either inference or finetuning, use the following arguments: --bf16 \ ``` +# Llama-3.1 + +Llama-3 checkpoints can be loaded into Megatron for inference and for finetuning. Loading these checkpoints consists of several steps: + +1. Get access to download the checkpoints (weights and tokenizer). +2. Convert the checkpoints from Huggingface format to Megatron format. +3. (Optional) Validate converted checkpoints +4. Setup arguments for launching the model. + +The following sections detail these steps. + +## Contents + * [Download Huggingface checkpoints](#download-huggingface-checkpoints) + * [Convert checkpoint format](#convert-checkpoint-format) + * [Huggingface format](#huggingface-format) + * [Validate checkpoint](#optional-validate-checkpoint) + * [Launch model](#launch-model) + +## Download Huggingface checkpoints + +Users must first apply for access to download the Llama-3 checkpoints from [Huggingface](https://huggingface.co/meta-llama). 
+ +## Convert checkpoint format + +We recommend passing `--dtype bf16` for training or finetuning. Inference can be done in bfloat16 or float16. + +### Huggingface format + +The HF checkpoints can be converted to Megatron format by using Megatron's own Llama-3 checkpoint converter for HF format (see script `tools/checkpoint/loader_llama_mistral.py`). One important argument that must be set correctly is the tensor parallel size (`TP`) for each model. The following table shows these values: + +| Model size | Tensor parallel size (`TP`) | +| ---------- | --------------------------- | +| 8B | 1 | +| 70B | 8 | + +Using these values for `TP`, along with the path to the Llama-3 tokenizer model (automatically downloaded with original checkpoint download; see `${TOKENIZER_MODEL}` below), run the following command from the root of your Megatron source code to convert from HF format to Megatron format: + +``` +$>: python tools/checkpoint/convert.py \ + > --bf16 \ + > --model-type GPT \ + > --loader llama_mistral \ + > --saver mcore \ + > --target-tensor-parallel-size ${TP} \ + > --checkpoint-type hf \ + > --load-dir ${HF_FORMAT_DIR} \ + > --save-dir ${MEGATRON_FORMAT_DIR} \ + > --tokenizer-model ${TOKENIZER_MODEL} \ + > --model-size llama3.1-8B +``` + +Valid values for `--model-size` are `llama3.1-8B` and `llama3.1-70B` (for pretrained-only models), and `llama3.1-8Bf` and `llama3.1-70Bf` (for chat-finetuned models). + +After this conversion, we are ready to load the checkpoints into a Megatron GPT model. + +## (Optional) Validate checkpoints + +A Megatron-LM text generation server for Llama3.1 can be launched using the script `examples/llama_mistral/run_text_generation_llama3.1.sh <PATH_TO_CONVERTED_CORE_CHECKPOINT> <PATH_TO_DOWNLOADED_HUGGINGFACE_CHECKPOINT>`. + +Once running, query the server with `curl 'http://<TEXT_GENERATION_SERVER_IP>:5000/api' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' -d '{"prompts":["<SOME_PROMPT>"], "tokens_to_generate":100, "top_k":1}'`. 
+ +A reference generation for comparison can be obtained from the Huggingface transformers library by running `python examples/llama_mistral/huggingface_reference.py --model_path <PATH_TO_DOWNLOADED_HUGGINGFACE_CHECKPOINT> --prompt <SOME_PROMPT>`. + +## Launch model + +If loading for either inference or finetuning, use the following arguments: + +``` +--tensor-model-parallel-size ${TP} \ +--pipeline-model-parallel-size 1 \ +--seq-length 8192 \ +--max-position-embeddings 131072 \ +--tokenizer-type HuggingFaceTokenizer \ +--tokenizer-model ${TOKENIZER_MODEL} \ +--load ${CHECKPOINT_DIR} \ +--exit-on-missing-checkpoint \ +--use-checkpoint-args \ +--no-load-optim \ +--no-load-rng \ +--untie-embeddings-and-output-weights \ +--normalization RMSNorm \ +--position-embedding-type rope \ +--no-masked-softmax-fusion \ +--attention-softmax-in-fp32 \ +--disable-bias-linear \ +--transformer-impl transformer_engine \ +--group-query-attention 8 \ +--attention-dropout 0.0 \ +--hidden-dropout 0.0 \ +--rotary-base 500000 \ +--rotary-percent 1.0 \ +--use-rope-scaling \ +--ffn-hidden-size 14336 \ +--num-attention-heads 32 \ +--swiglu \ +--bf16 \ +``` + # Mistral-7b Megatron currently supports loading the v0.3 release of Mistral-7b (which does not use sliding window attention and offers a larger 32768 vocabulary) for inference and finetuning. Loading these checkpoints consists of several steps: @@ -372,3 +470,11 @@ If loading for either inference or finetuning, use the following arguments: *Note: Experimental* Many models such as Yi-34B use the Llama architecture and may be converted from HuggingFace to Megatron using the commands in [Llama3](#llama-3). + +# Known numerical differences + +It is not expected that the Megatron and Huggingface implementations of llama3.x and mistral models will produce numerically identical results. There are multiple points where small numerical differences are expected. This is a non-exhaustive list: + +1. 
TransformerEngine (TE) uses the model params_dtype inside RMSNorm whereas the Huggingface implementation uses fp32. For details, see: https://github.com/NVIDIA/TransformerEngine/issues/1132 +2. Huggingface `transformers` implements the q, k and v projections in self-attention as separate GEMMs whereas mcore combines them into a single GEMM for efficiency. This leads to small numerical differences. + diff --git a/examples/export/README.md b/examples/export/README.md new file mode 100644 index 0000000000..ddb8216f94 --- /dev/null +++ b/examples/export/README.md @@ -0,0 +1,10 @@ +# Megatron Core Export + +This module is used to export Megatron Core models to different inference frameworks. +Currently we support TRTLLM export. In the future we will be adding support for VLLM etc. + +## PTQ AND EXPORT +Follow the instructions in [ptq_and_trtllm_export](./ptq_and_trtllm_export) to do post-training quantization, followed by an export to TRTLLM format. + +## TRTLLM EXPORT +Follow the instructions in [trtllm_export](./trtllm_export/) to do export to TRTLLM checkpoint format alone. \ No newline at end of file diff --git a/examples/inference/quantization/README.md b/examples/export/ptq_and_trtllm_export/README.md similarity index 82% rename from examples/inference/quantization/README.md rename to examples/export/ptq_and_trtllm_export/README.md index e167b60e1c..c5255f7ccf 100644 --- a/examples/inference/quantization/README.md +++ b/examples/export/ptq_and_trtllm_export/README.md @@ -74,7 +74,7 @@ cd ../.. Now launch the PTQ + TensorRT-LLM export script, ```sh -bash examples/inference/quantization/ptq_trtllm_minitron_8b ./Minitron-8B-Base None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b ./Minitron-8B-Base None ``` By default, `cnn_dailymail` is used for calibration. The `GPTModel` will have quantizers for simulating the quantization effect. 
The checkpoint will be saved optionally (with quantizers as additional states) and can @@ -104,12 +104,12 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer nvidia/Minitron-8B-Base +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer nvidia/Minitron-8B-Base ``` ### mistral-12B FP8 Quantization and TensorRT-LLM Deployment @@ -139,7 +139,7 @@ huggingface-cli login Now launch the PTQ + TensorRT-LLM checkpoint export script, ```sh -bash examples/inference/quantization/ptq_trtllm_mistral_12b.sh ./Mistral-NeMo-12B-Base None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh ./Mistral-NeMo-12B-Base None ``` Then build TensorRT engine and run text generation example using the newly built TensorRT engine @@ -149,12 +149,12 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer mistralai/Mistral-Nemo-Base-2407 +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer mistralai/Mistral-Nemo-Base-2407 ``` @@ -165,7 +165,7 @@ python examples/inference/quantization/trtllm_text_generation.py --tokenizer mis > that we support. 
```sh -bash examples/inference/quantization/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR} +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR} ``` The script expect `${CHECKPOINT_DIR}` to have the following structure: @@ -184,8 +184,23 @@ The script expect `${CHECKPOINT_DIR}` to have the following structure: In short, other than the converted llama megatron checkpoint, also put the Hugging Face checkpoint inside as the source of the tokenizer. +Then build TensorRT engine and run text generation example using the newly built TensorRT engine + +```sh +export trtllm_options=" \ + --checkpoint_dir /tmp/trtllm_ckpt \ + --output_dir /tmp/trtllm_engine \ + --max_input_len 2048 \ + --max_seq_len 512 \ + --max_batch_size 8 " + +trtllm-build ${trtllm_options} + +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Llama-2-7b +``` + ### llama3-8b / llama3.1-8b INT8 SmoothQuant and TensorRT-LLM Deployment -> **NOTE:** For llama3.1, the missing rope_scaling parameter will be fixed in modelopt-0.17 and trtllm-0.12. +> **NOTE:** For llama3.1, the missing rope_scaling parameter will be fixed in modelopt-0.19 and trtllm-0.13. > **NOTE:** There are two ways to acquire the checkpoint. 
Users can follow > the instruction in `docs/llama2.md` to convert the checkpoint to megatron legacy `GPTModel` format and @@ -199,16 +214,23 @@ If users choose to download the model from NGC, first extract the sharded checkp tar -xvf 8b_pre_trained_bf16.nemo ``` +> **NOTE:** You need a token generated from huggingface.co/settings/tokens and access to meta-llama/Llama-3.1-8B or meta-llama/Llama-3-8B on huggingface + +```sh +pip install -U "huggingface_hub[cli]" +huggingface-cli login +``` + Now launch the PTQ + TensorRT-LLM checkpoint export script for llama-3, ```sh -bash examples/inference/quantization/ptq_trtllm_llama3_8b.sh ./llama-3-8b-nemo_v1.0 None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh ./llama-3-8b-nemo_v1.0 None ``` or llama-3.1 ```sh -bash examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh ./llama-3_1-8b-nemo_v1.0 None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh ./llama-3_1-8b-nemo_v1.0 None ``` Then build TensorRT engine and run text generation example using the newly built TensorRT engine @@ -218,14 +240,14 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3-8B +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3-8B # For llama-3 -python examples/inference/quantization/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B #For llama-3.1 ``` \ No newline at end of file diff --git a/examples/inference/quantization/ptq_trtllm_llama2_7b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh similarity index 88% rename from 
examples/inference/quantization/ptq_trtllm_llama2_7b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh index 8c4777f07a..ebcc448955 100644 --- a/examples/inference/quantization/ptq_trtllm_llama2_7b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh @@ -66,7 +66,7 @@ options=" \ --tokenizer-model ${TOKENIZER_MODEL} \ --save-interval 1000000 \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} + --load ${CHECKPOINT_LOAD_DIR} \ --fp16" # Precompile CUDA extentions @@ -76,7 +76,5 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} -# This script is using mpi4py which will fork multiple processes. -python examples/inference/quantization/trtllm_text_generation.py ${trtllm_options} diff --git a/examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh similarity index 91% rename from examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh index d22ae4d472..a6251663f7 100644 --- a/examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh @@ -63,9 +63,10 @@ options=" \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model meta-llama/Meta-Llama-3.1-8B \ --save-interval 1000000 \ + --use-rope-scaling \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} - --rotary-base 500000 + --load ${CHECKPOINT_LOAD_DIR} \ + --rotary-base 500000 \ --fp16" # Precompile CUDA extentions @@ -75,4 +76,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ 
launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/inference/quantization/ptq_trtllm_llama3_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh similarity index 92% rename from examples/inference/quantization/ptq_trtllm_llama3_8b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh index 11ab023fad..f181c8c2dd 100644 --- a/examples/inference/quantization/ptq_trtllm_llama3_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh @@ -64,8 +64,8 @@ options=" \ --tokenizer-model meta-llama/Meta-Llama-3-8B \ --save-interval 1000000 \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} - --rotary-base 500000 + --load ${CHECKPOINT_LOAD_DIR} \ + --rotary-base 500000 \ --fp16" # Precompile CUDA extentions @@ -75,4 +75,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/inference/quantization/ptq_trtllm_minitron_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh similarity index 94% rename from examples/inference/quantization/ptq_trtllm_minitron_8b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh index 8c7bc0cb82..31ec192fd5 100644 --- a/examples/inference/quantization/ptq_trtllm_minitron_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh @@ -71,4 +71,4 @@ python -c "import 
modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/inference/quantization/ptq_trtllm_mistral_12b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh similarity index 94% rename from examples/inference/quantization/ptq_trtllm_mistral_12b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh index 17ded50d1e..3eb02d2e1d 100644 --- a/examples/inference/quantization/ptq_trtllm_mistral_12b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh @@ -72,4 +72,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/inference/quantization/text_generation_ptq.py b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py similarity index 96% rename from examples/inference/quantization/text_generation_ptq.py rename to examples/export/ptq_and_trtllm_export/text_generation_ptq.py index 13b327b25a..340c9c90f7 100644 --- a/examples/inference/quantization/text_generation_ptq.py +++ b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py @@ -6,12 +6,11 @@ import sys from pathlib import Path -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../"))) import modelopt.torch.quantization as mtq import torch 
from datasets import load_dataset -from modelopt.torch.utils.distributed import set_data_parallel_group, set_tensor_parallel_group from tqdm import tqdm # [ModelOpt]: changing the default model provider to the ModelOpt version @@ -179,10 +178,6 @@ def hf_dataset_forword_loop_func(model): if args.calib_dataset is not None: ptq_forward_loop_func = hf_dataset_forword_loop_func - # Setting data parallel and tensor parallel group - set_data_parallel_group(mpu.get_data_parallel_group()) - set_tensor_parallel_group(mpu.get_tensor_model_parallel_group()) - if args.export_quant_cfg in QUANT_CFG_CHOICES: mtq_config = QUANT_CFG_CHOICES[args.export_quant_cfg] if "*output_layer*" not in mtq_config["quant_cfg"]: diff --git a/examples/inference/quantization/trtllm_text_generation.py b/examples/export/ptq_and_trtllm_export/trtllm_text_generation.py similarity index 100% rename from examples/inference/quantization/trtllm_text_generation.py rename to examples/export/ptq_and_trtllm_export/trtllm_text_generation.py diff --git a/examples/export/trtllm_export/README.md b/examples/export/trtllm_export/README.md new file mode 100644 index 0000000000..52cad78583 --- /dev/null +++ b/examples/export/trtllm_export/README.md @@ -0,0 +1,161 @@ +# Megatron Core To TRTLLM Export Documentation +This guide will walk you through how you can use the megatron core export for exporting models to trtllm format + +### Contents +- [Megatron Core To TRTLLM Export Documentation](#megatron-core-to-trtllm-export-documentation) +- [Contents](#contents) + - [1. Quick Start](#1-quick-start) + - [1.1 Understanding The Code](#11-understanding-the-code) + - [1.2 Running The Code](#12-running-the-code) + - [2. GPU Export](#2-gpu-export) + - [3. Future work](#4-future-work) + +#### 1. Quick Start +This will walk you through the flow of converting an mcore gpt model to trtllm format using single device mode. 
The file can be found at [gpt_single_device_cpu_export.py](./single_device_export/gpt_single_device_cpu_export.py) + +NOTE: For faster performance, if your entire model fits into GPU memory, first transfer the model state dict to the GPU and then call the `get_trtllm_pretrained_config_and_model_weights` function. + +
+ +##### 1.1 Understanding The Code +***STEP 1 - We initialize model parallel and other default arguments*** +We initialize tp and pp to 1 so that we can get the full model state dict on CPU +```python + initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1) +``` + +***STEP 2 - We load the model using the model_provider_function*** +NOTE: We create a simple gpt model + +```python + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, # Needs to be at least 32 times num_attn_heads + num_attention_heads=2, + use_cpu_initialization=True, + pipeline_dtype=torch.float32, + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=100, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + # Optionally you can also load a model using this code + # sharded_state_dict=gpt_model.sharded_state_dict(prefix='') + # checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + # gpt_model.load_state_dict(checkpoint) + +``` + +***STEP 3 - Instantiate the TRTLLM Helper*** +We instantiate the [TRTLLM Helper](../../../megatron/core/export/trtllm/trtllm_helper.py). For the GPT model we instantiate trtllm_helper as shown below. 
+```python + if hasattr(gpt_model, "rotary_pos_emb"): + seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor + + trtllm_helper = TRTLLMHelper( + transformer_config=gpt_model.config, + model_type=ModelType.gpt, + position_embedding_type = gpt_model.position_embedding_type, + max_position_embeddings = gpt_model.max_position_embeddings, + rotary_percentage = gpt_model.rotary_percent, + rotary_base = gpt_model.rotary_base, + moe_tp_mode = 2, + multi_query_mode = False, + activation = "gelu", + seq_len_interpolation_factor = seq_len_interpolation_factor, + share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights + ) +``` + +***STEP 4 - Get the TRTLLM Weights and configs*** +To convert model weights to trtllm weights and configs, we use the [single_device_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py). We pass the model state dict and the export config as inputs. In this example we use an inference tp size of 2 for the export. + +```python + model_state_dict={} + for key , val in gpt_model.state_dict().items(): + # val is None for _extra_state layers. We filter it out + if val is not None: + model_state_dict[key] = val + + export_config = ExportConfig(inference_tp_size = 2) + weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict= model_state_dict, + dtype = DataType.bfloat16, + export_config=export_config + ) +``` + +***STEP 5 - Build the TRTLLM Engine*** +The following code is used to build the TRTLLM Engine. 
+ +```python + for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list): + trtllm_helper.build_and_save_engine( + max_input_len=256, + max_output_len=256, + max_batch_size=8, + engine_dir='/opt/megatron-lm/engine', + trtllm_model_weights=trtllm_model_weights, + trtllm_model_config=trtllm_model_config, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank=64, + lora_target_modules=None, + max_prompt_embedding_table_size=0, + paged_kv_cache=True, + remove_input_padding=True, + paged_context_fmha=False, + use_refit=False, + max_num_tokens=None, + max_seq_len=512, + opt_num_tokens=None, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=False, + gpt_attention_plugin="auto", + gemm_plugin="auto", + ) +``` +
+ +##### 1.2 Running The Code +An example run script is shown below. + +``` +# In a workstation +MLM_PATH=/path/to/megatron-lm +CONTAINER_IMAGE=gitlab-master.nvidia.com:5005/dl/joc/nemo-ci/trtllm_0.12/train:pipe.17669124-x86 + +docker run -it --gpus=all --ipc=host -v $MLM_PATH/:/opt/megatron-lm $CONTAINER_IMAGE bash + +# Inside the container run the following. + +cd /opt/megatron-lm/ + +CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1 examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py +``` + +
+ +#### 2. GPU Export +You can use the [gpt_distributed_gpu_export.py](./distributed_export/gpt_distributed_gpu_export.py) to run a more optimized, on-device distributed version of trtllm export. Internally this uses the [distributed_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py) to convert model weights on device. +In the single device version you collect all the model weights on CPU/GPU, convert them to trtllm format, and then store the engine back on disk. In the GPU version you load each individual state dict on the GPUs, convert it on the device itself and store the engine on disk. + +To run the GPU version + +``` +CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc-per-node 2 examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py +``` + +
+ +#### 3. Future work +The following are planned for the future releases . +* Pipeline parallellism for export (Work in progress) +* GPU Export for more models (Work in progress for some models) +* Refit functionality +* VLLM Support \ No newline at end of file diff --git a/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py b/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py new file mode 100644 index 0000000000..57d44f9f62 --- /dev/null +++ b/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py @@ -0,0 +1,117 @@ +import os +import torch +from megatron.core import parallel_state +from megatron.core import dist_checkpointing +from megatron.core.export.model_type import ModelType +from megatron.core.export.data_type import DataType +from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec + + +_SEQUENCE_LENGTH = 64 +_VOCAB_SIZE = 256 + +def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1): + parallel_state.destroy_model_parallel() + + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron core distributed training initialization + parallel_state.initialize_model_parallel(tensor_model_parallel_size = tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size) + +def model_provider(): + """Build the model.""" + + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, + num_attention_heads=2, + use_cpu_initialization=True, + 
pipeline_dtype=torch.float32 + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=_VOCAB_SIZE, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + return gpt_model + +def load_distributed_checkpoint(checkpoint_path, gpt_model): + sharded_state_dict=gpt_model.sharded_state_dict(prefix='') + checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + gpt_model.load_state_dict(checkpoint) + return gpt_model + +if __name__ == "__main__": + initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1) + model_parallel_cuda_manual_seed(123) + + gpt_model = model_provider() + device = torch.device("cuda") + gpt_model.to(device) + + # Optionally you can also load a gpt model from ckpt_path using this code below + # gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path) + + seq_len_interpolation_factor = None + if hasattr(gpt_model, "rotary_pos_emb"): + seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor + + trtllm_helper = TRTLLMHelper( + transformer_config=gpt_model.config, + model_type=ModelType.gpt, + position_embedding_type = gpt_model.position_embedding_type, + max_position_embeddings = gpt_model.max_position_embeddings, + rotary_percentage = gpt_model.rotary_percent, + rotary_base = gpt_model.rotary_base, + moe_tp_mode = 2, + multi_query_mode = False, + activation = "gelu", + seq_len_interpolation_factor = seq_len_interpolation_factor, + share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights + ) + + + trtllm_model_weights, trtllm_model_config = trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict= gpt_model.state_dict(), + dtype = DataType.bfloat16, + on_device_distributed_conversion=True, + vocab_size=_VOCAB_SIZE, + gpus_per_node=2, + ) + + trtllm_helper.build_and_save_engine( + max_input_len=256, + 
max_output_len=256, + max_batch_size=8, + engine_dir='/opt/megatron-lm/engine', + trtllm_model_weights=trtllm_model_weights[0], + trtllm_model_config=trtllm_model_config[0], + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank=64, + lora_target_modules=None, + max_prompt_embedding_table_size=0, + paged_kv_cache=True, + remove_input_padding=True, + paged_context_fmha=False, + use_refit=False, + max_num_tokens=None, + max_seq_len=512, + opt_num_tokens=None, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=False, + gpt_attention_plugin="auto", + gemm_plugin="auto", + ) diff --git a/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py b/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py new file mode 100644 index 0000000000..587e7cfdd3 --- /dev/null +++ b/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py @@ -0,0 +1,118 @@ +import os +import torch +from megatron.core import parallel_state +from megatron.core import dist_checkpointing +from megatron.core.export.model_type import ModelType +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec + + +_SEQUENCE_LENGTH = 64 + + +def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1): + parallel_state.destroy_model_parallel() + + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron 
core distributed training initialization + parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size) + +def model_provider(): + """Build the model.""" + + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, # Needs to be atleast 32 times num_attn_heads + num_attention_heads=2, + use_cpu_initialization=True, + pipeline_dtype=torch.float32, + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=100, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + return gpt_model + +def load_distributed_checkpoint(checkpoint_path, gpt_model): + sharded_state_dict=gpt_model.sharded_state_dict(prefix='') + checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + gpt_model.load_state_dict(checkpoint) + return gpt_model + +if __name__ == "__main__": + # Need to use TP1 PP1 for export on single device + initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1) + model_parallel_cuda_manual_seed(123) + + gpt_model = model_provider() + + # Optionally you can also load a gpt model from ckpt_path using this code below + # gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path) + + seq_len_interpolation_factor = None + if hasattr(gpt_model, "rotary_pos_emb"): + seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor + + trtllm_helper = TRTLLMHelper( + transformer_config=gpt_model.config, + model_type=ModelType.gpt, + position_embedding_type = gpt_model.position_embedding_type, + max_position_embeddings = gpt_model.max_position_embeddings, + rotary_percentage = gpt_model.rotary_percent, + rotary_base = gpt_model.rotary_base, + moe_tp_mode = 2, + multi_query_mode = False, + activation = "gelu", + seq_len_interpolation_factor = seq_len_interpolation_factor, + 
share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights + ) + + + export_config = ExportConfig(inference_tp_size = 2) + # NOTE : For faster performance, if your entire model will fit in gpu memory, transfer model state dict to GPU and then call this api + weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict= gpt_model.state_dict(), + dtype = DataType.bfloat16, + export_config=export_config + ) + + for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list): + trtllm_helper.build_and_save_engine( + max_input_len=256, + max_output_len=256, + max_batch_size=8, + engine_dir='/opt/megatron-lm/engine', + trtllm_model_weights=trtllm_model_weights, + trtllm_model_config=trtllm_model_config, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank=64, + lora_target_modules=None, + max_prompt_embedding_table_size=0, + paged_kv_cache=True, + remove_input_padding=True, + paged_context_fmha=False, + use_refit=False, + max_num_tokens=None, + max_seq_len=512, + opt_num_tokens=None, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=False, + gpt_attention_plugin="auto", + gemm_plugin="auto", + ) \ No newline at end of file diff --git a/examples/inference/llama_mistral/huggingface_reference.py b/examples/inference/llama_mistral/huggingface_reference.py index 7b583612a5..9d8f4465f6 100644 --- a/examples/inference/llama_mistral/huggingface_reference.py +++ b/examples/inference/llama_mistral/huggingface_reference.py @@ -20,5 +20,6 @@ for key in inputs: inputs[key] = inputs[key].cuda() # top_k, top_p and do_sample are set for greedy argmax based sampling + outputs = model.generate(**inputs, max_length=100, do_sample=False, top_p=0, top_k=0, temperature=1.0) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) \ No newline at end of file diff --git a/examples/inference/llama_mistral/run_text_generation_llama3.1.sh 
b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh new file mode 100755 index 0000000000..06584f0917 --- /dev/null +++ b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# This example will start serving the Llama3.1-8B model +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_APPLY_QK_LAYER_SCALING=0 + +DISTRIBUTED_ARGS="--nproc_per_node 1 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr 0.0.0.0 \ + --master_port 6000" + +# Ensure CHECKPOINT and TOKENIZER_MODEL are provided +if [ -z "$1" ] || [ -z "$2" ]; then + echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments." + echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model" + exit 1 +fi + +# Assign command-line arguments to variables +CHECKPOINT=$1 +TOKENIZER_MODEL=$2 + +pip install flask-restful + +torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ + --use-checkpoint-args \ + --disable-bias-linear \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --transformer-impl transformer_engine \ + --normalization RMSNorm \ + --group-query-attention \ + --num-query-groups 8 \ + --no-masked-softmax-fusion \ + --attention-softmax-in-fp32 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --rotary-percent 1.0 \ + --rotary-base 500000 \ + --use-rope-scaling \ + --use-rotary-position-embeddings \ + --swiglu \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 32 \ + --hidden-size 4096 \ + --ffn-hidden-size 14336 \ + --load ${CHECKPOINT} \ + --num-attention-heads 32 \ + --max-position-embeddings 131072 \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length 8192 diff --git a/examples/inference/t5/simple_t5_batch_inference.py b/examples/inference/t5/simple_t5_batch_inference.py new file mode 100644 index 0000000000..3f4557d3c2 --- /dev/null +++ 
b/examples/inference/t5/simple_t5_batch_inference.py @@ -0,0 +1,157 @@ +import os +import sys +from argparse import Namespace + +import torch + +import pretrain_t5 +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.engines.abstract_engine import AbstractEngine +from megatron.core.inference.engines.mcore_engine import MCoreEngine +from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( + T5InferenceWrapper, +) +from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import ( + EncoderDecoderTextGenerationController, +) +from megatron.core.transformer.module import MegatronModule +from pretrain_t5 import model_provider + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) +) + +from typing import List + +from megatron.core import mpu +from megatron.training import get_args, get_model, get_tokenizer +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron + + +def add_text_generate_args(parser): + """Text generation arguments.""" + group = parser.add_argument_group(title='text generation') + + group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') + group.add_argument("--top_k", type=int, default=1, help='Top k sampling.') + group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') + group.add_argument( + "--return-log-probs", + action='store_true', + default=False, + help='Return the log probabilities of the final output tokens', + ) + group.add_argument( + "--num-tokens-to-generate", + type=int, + default=30, + help='Number of tokens to generate for each prompt', + ) + 
group.add_argument( + "--encoder-prompts", + metavar='N', + type=str, + nargs='+', + help='Encoder input prompts with each prompt within quotes and seperated by space', + ) + group.add_argument( + "--max-batch-size", type=int, default=1, help='Max number of prompts to process at once' + ) + return parser + + +def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine: + """Utility to get the relevant backend for running inference + + This function will automatically chose the TRTLLMBackend when possible, and if not revert to Mcore backend if the user does not specify any backends. TRT LLM Backend is not implmented yet. + + Args: + args (Namespace): The user arguments parsed from command line + model (MegatronModule): The megatron model . + + Returns: + AbstractBackend: The chosen backend + """ + tokenizer = get_tokenizer() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=args.hidden_size, + inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, + fp32_residual_connection=args.fp32_residual_connection, + params_dtype=args.params_dtype, + padded_vocab_size=args.padded_vocab_size, + ) + + inference_wrapped_model = T5InferenceWrapper(model, inference_wrapper_config) + text_generation_controller = EncoderDecoderTextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer + ) + return MCoreEngine( + text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size + ) + + +def main(): + """Main program.""" + + # Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file) + # Micro batch size is not needed to be set by user. 
(It is calculated based on inference-batch-times-seqlen-threshold argument) + initialize_megatron( + extra_args_provider=add_text_generate_args, + args_defaults={ + 'no_load_rng': True, + 'no_load_optim': True, + 'micro_batch_size': 1, + 'exit_on_missing_checkpoint': True, + }, + ) + + # Set up model and load checkpoint + model = get_model(model_provider, wrap_with_ddp=False) + load_checkpoint(model, None, None) + model = model[0] + + args = get_args() + + inference_engine = get_inference_engine(args, model) + + common_inference_params = CommonInferenceParams( + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + return_log_probs=args.return_log_probs, + num_tokens_to_generate=args.num_tokens_to_generate, + ) + + tokenizer = get_tokenizer() + decoder_prompts = [""] * len( + args.encoder_prompts + ) # for T5, the prompt is provided as encoder input, hence decoder_prompts is empty + args.prompts = decoder_prompts + + results: List[InferenceRequest] = inference_engine.generate( + prompts=args.prompts, + add_BOS=True, + encoder_prompts=args.encoder_prompts, + common_inference_params=common_inference_params, + ) + + if torch.distributed.get_rank() == 0: + for idx, result in enumerate(results): + print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ') + result = { + 'id': result.request_id, + 'input_prompt': result.prompt, + 'generated_text': result.generated_text, + 'generated_tokens': result.generated_tokens, + } + print(result) + + +if __name__ == "__main__": + main() diff --git a/examples/multimodal/config.py b/examples/multimodal/config.py index f8c3714eb3..cf48b131a7 100644 --- a/examples/multimodal/config.py +++ b/examples/multimodal/config.py @@ -1,7 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+from dataclasses import dataclass + import torch -from megatron.training.activations import quick_gelu, squared_relu +from megatron.training.activations import fast_gelu, quick_gelu, squared_relu def get_language_model_config(config): @@ -75,7 +77,26 @@ def get_vision_model_config(config, apply_query_key_layer_scaling): config.gated_linear_unit = False config.activation_func = quick_gelu config.kv_channels = 64 + config.num_query_groups = 16 + config.layernorm_zero_centered_gamma = False + config.apply_query_key_layer_scaling = apply_query_key_layer_scaling + config.bias_activation_fusion = False + config.bias_dropout_fusion = False + config.attention_softmax_in_fp32 = True + config.normalization = 'LayerNorm' + config.apply_rope_fusion = False + elif config.vision_model_type == "siglip": + config.num_layers = 27 config.num_attention_heads = 16 + config.add_bias_linear = True + config.add_qkv_bias = True + config.hidden_size = 1152 + config.hidden_dropout = 0.0 + config.attention_dropout = 0.0 + config.ffn_hidden_size = 4304 + config.gated_linear_unit = False + config.activation_func = fast_gelu + config.kv_channels = 72 config.num_query_groups = 16 config.layernorm_zero_centered_gamma = False config.apply_query_key_layer_scaling = apply_query_key_layer_scaling @@ -84,6 +105,8 @@ def get_vision_model_config(config, apply_query_key_layer_scaling): config.attention_softmax_in_fp32 = True config.normalization = 'LayerNorm' config.apply_rope_fusion = False + config.qk_layernorm = False + config.layernorm_epsilon = 1e-6 return config @@ -107,3 +130,26 @@ def get_vision_projection_config(config, hidden_size): config.activation_func = torch.nn.functional.gelu return config + + +@dataclass +class EvaluationConfig: + """Evaluation related configuration.""" + task: str + + temperature: float = 1.0 + top_p: float = 0.0 + top_k: int = 0 + + out_seq_length: int = 32 + + output_path: str = "" + + input_image_path: str = "" + gt_path: str = "" + + num_partitions: int = 1 + 
partition_id: int = 0 + num_samples_per_partition: int = 0 + + prompt_format: str = "mistral" diff --git a/examples/multimodal/dataloader_provider.py b/examples/multimodal/dataloader_provider.py index 33bcf1bf1f..4bd1b29e51 100644 --- a/examples/multimodal/dataloader_provider.py +++ b/examples/multimodal/dataloader_provider.py @@ -4,7 +4,7 @@ import torch from dataset_helpers import TaskEncoder, print_error_handler -from megatron.core import mpu +from megatron.core import parallel_state from megatron.energon import ( LimitDataset, RepeatDataset, @@ -71,9 +71,9 @@ def train_valid_test_dataloaders_provider(train_val_test_num_samples): worker_debug_path = None worker_log_level = 0 - rank = mpu.get_data_parallel_rank() - world_size = mpu.get_data_parallel_world_size() - data_parallel_group = mpu.get_data_parallel_group() + rank = parallel_state.get_data_parallel_rank() + world_size = parallel_state.get_data_parallel_world_size() + data_parallel_group = parallel_state.get_data_parallel_group() worker_config = WorkerConfig( rank=rank, @@ -88,7 +88,7 @@ def train_valid_test_dataloaders_provider(train_val_test_num_samples): train_dataloader = get_savable_loader(train_ds, worker_config=worker_config) if args.load is not None: if getattr(args, "dataloader_save", None): - dp_rank = mpu.get_data_parallel_rank() + dp_rank = parallel_state.get_data_parallel_rank() data_save_name = get_checkpoint_name( args.dataloader_save, args.iteration, diff --git a/examples/multimodal/evaluate_textvqa.py b/examples/multimodal/evaluate_textvqa.py index b80974a893..7d0a059f4d 100644 --- a/examples/multimodal/evaluate_textvqa.py +++ b/examples/multimodal/evaluate_textvqa.py @@ -1,16 +1,23 @@ import argparse import glob import json +import os from evaluate_vqav2 import compute_vqa_accuracy def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" - output_file_path = input_path + "-TextVQA-merged.json" + # Single input file. 
+ if os.path.exists(input_path): + input_file_paths = [input_path] + output_file_path = input_path.replace(".jsonl", "-merged.json") + # Directory of partitioned input files. + else: + pattern = input_path + "-TextVQA-[0-9].*jsonl" + input_file_paths = glob.glob(pattern) - pattern = input_path + "-TextVQA-[0-9].*jsonl" - input_file_paths = glob.glob(pattern) + output_file_path = input_path + "-TextVQA-merged.json" results = [] @@ -35,7 +42,8 @@ def merge_input_files(input_path): def textvqa_eval(input_path): """Run TextVQA evaluation.""" result_file_path = merge_input_files(input_path) - compute_vqa_accuracy(result_file_path) + avg_acc = compute_vqa_accuracy(result_file_path) + return avg_acc if __name__ == "__main__": @@ -43,4 +51,6 @@ def textvqa_eval(input_path): parser.add_argument('--input-path', type=str, help="Path to input file(s)") args = parser.parse_args() - textvqa_eval(args.input_path) + avg_acc = textvqa_eval(args.input_path) + + print(f"===== TextVQA Accuracy {avg_acc:.2f}% =====") diff --git a/examples/multimodal/evaluate_vqav2.py b/examples/multimodal/evaluate_vqav2.py index 5d9dfe7844..cf10a0549d 100644 --- a/examples/multimodal/evaluate_vqav2.py +++ b/examples/multimodal/evaluate_vqav2.py @@ -55,7 +55,7 @@ def compute_vqa_accuracy(result_file, use_chartqa_metric=False): # "We consider an answer to be correct if it is within 5% of the gold answer. # For non-numeric answers, we still need an exact match to consider an answer to be correct." if use_chartqa_metric: - acc = 0. + acc = 0.0 assert len(gt) == 1, "expected exactly one groundtruth answer." 
gt = gt[0] @@ -74,13 +74,15 @@ def compute_vqa_accuracy(result_file, use_chartqa_metric=False): all_acc.append(acc) acc_avg = sum(all_acc) / len(all_acc) * 100 - print(f"===== Accuracy {acc_avg:.2f}% =====") + + return acc_avg def vqav2_eval(input_path): """Run VQAv2 evaluation.""" result_file = merge_input_files(input_path) - compute_vqa_accuracy(result_file) + avg_acc = compute_vqa_accuracy(result_file) + return avg_acc if __name__ == "__main__": @@ -88,4 +90,6 @@ def vqav2_eval(input_path): parser.add_argument('--input-path', type=str, help="Path to input file(s)") args = parser.parse_args() - vqav2_eval(args.input_path) + avg_acc = vqav2_eval(args.input_path) + + print(f"===== VQAv2 Accuracy {avg_acc:.2f}% =====") diff --git a/examples/multimodal/model.py b/examples/multimodal/model.py new file mode 100644 index 0000000000..b4bab73cfb --- /dev/null +++ b/examples/multimodal/model.py @@ -0,0 +1,150 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import warnings +from copy import deepcopy + +import torch +from config import get_language_model_config, get_vision_model_config, get_vision_projection_config +from layer_specs import get_layer_spec, get_layer_spec_te, get_mlp_module_spec + +from megatron.core.models.multimodal.llava_model import LLaVAModel +from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings +from megatron.training import get_args, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args + + +def model_provider( + pre_process=True, post_process=True, add_encoder=True, add_decoder=True, parallel_output=True +) -> LLaVAModel: + """Builds the model. + + Args: + pre_process (bool): Include the embedding layer in the gpt decoder (used with pipeline parallelism). Defaults to True. + post_process (bool): Include an output layer and a layernorm in the gpt decoder (used with pipeline parallelism). Defaults to True. 
+ add_encoder (bool): Construct the encoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the encoder + will live on only a subset of the pipeline stages (specifically, only the first stage). + add_decoder (bool): Construct the decoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the decoder + will live on only a subset of the pipeline stages (specifically, every stage after the first one). + parallel_output (bool): Enable parallel model output. + + Returns: + model: A multimodal model. + """ + args = get_args() + + use_te = args.use_te + + print_rank_0('building a multimodal model ...') + + num_image_embeddings = get_num_image_embeddings( + args.img_h, args.img_w, args.patch_dim, args.vision_model_type, + args.disable_vision_class_token, 1 + ) + old_seq_length = args.seq_length + args.seq_length = args.encoder_seq_length = num_image_embeddings + if torch.distributed.get_rank() == 0 and old_seq_length != args.seq_length: + warnings.warn( + f"Changed seq_length and encoder_seq_length (vision model sequence length) from {old_seq_length} to num_image_tokens ({num_image_embeddings})" + ) + + max_num_image_embeddings = (args.max_num_tiles + int(args.use_thumbnail)) * num_image_embeddings + + assert ( + args.decoder_seq_length is not None + ), "Please provide --decoder-seq-length to set the language model sequence length" + assert ( + args.decoder_seq_length > max_num_image_embeddings + ), "Language model sequence length must be greater than the maximum number of image embeddings" + if args.decoder_seq_length > args.max_position_embeddings: + args.max_position_embeddings = args.decoder_seq_length + warnings.warn( + f"Expanded max_position_embeddings to {args.max_position_embeddings} to accommodate the maximum language model sequence length" + ) + + base_config = core_transformer_config_from_args(get_args()) + base_config.language_model_type = args.language_model_type + 
base_config.vision_model_type = args.vision_model_type + base_config.calculate_per_token_loss = True + + language_config = deepcopy(base_config) + language_config = get_language_model_config(language_config) + + if use_te: + language_transformer_layer_spec = get_layer_spec_te( + is_vit=False + ) # TENorm detects LayerNorm/RMS automatically. + else: + language_transformer_layer_spec = get_layer_spec( + is_vit=False, normalization=language_config.normalization + ) + + vision_config = deepcopy(base_config) + vision_config = get_vision_model_config( + vision_config, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling + ) + + vision_model_type = args.vision_model_type + if vision_model_type in ["clip", "siglip"]: + if use_te: + vision_transformer_layer_spec = get_layer_spec_te( + is_vit=True + ) # TENorm detects LayerNorm/RMS automatically. + else: + vision_transformer_layer_spec = get_layer_spec( + is_vit=True, normalization=vision_config.normalization + ) + else: + raise RuntimeError("unsupported vision model type", vision_model_type) + + vision_projection_config = deepcopy(base_config) + vision_projection_config = get_vision_projection_config( + vision_projection_config, language_config.hidden_size + ) + + if args.encoder_pipeline_model_parallel_size > 0: + assert ( + args.encoder_pipeline_model_parallel_size == 1 + ), "vision model and projection can only live on 1 pipeline stage." 
+ vision_config.pipeline_model_parallel_size = args.encoder_pipeline_model_parallel_size + vision_projection_config.pipeline_model_parallel_size = ( + args.encoder_pipeline_model_parallel_size + ) + if args.encoder_tensor_model_parallel_size > 0: + vision_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size + vision_projection_config.tensor_model_parallel_size = ( + args.encoder_tensor_model_parallel_size + ) + + vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules + + model = LLaVAModel( + language_transformer_config=language_config, + language_transformer_layer_spec=language_transformer_layer_spec, + language_vocab_size=args.padded_vocab_size, + language_max_sequence_length=args.decoder_seq_length, + vision_transformer_config=vision_config, + vision_transformer_layer_spec=vision_transformer_layer_spec, + drop_vision_class_token=args.disable_vision_class_token, + vision_projection_config=vision_projection_config, + vision_projection_layer_spec=vision_projection_layer_spec, + vision_projection_type="mlp", + allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint, + parallel_output=parallel_output, + language_position_embedding_type=args.position_embedding_type, + language_rotary_percent=args.rotary_percent, + pre_process=pre_process, + post_process=post_process, + add_encoder=add_encoder, + add_decoder=add_decoder, + img_h=args.img_h, + img_w=args.img_w, + patch_dim=args.patch_dim, + language_rotary_base=args.rotary_base, + ) + + model.freeze( + freeze_language_model=args.freeze_LM, + freeze_vision_model=args.freeze_ViT, + freeze_vision_projection=False, + ) + + return model diff --git a/examples/multimodal/multimodal_args.py b/examples/multimodal/multimodal_args.py new file mode 100644 index 0000000000..a7cb4235e3 --- /dev/null +++ b/examples/multimodal/multimodal_args.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ + +def add_multimodal_extra_args(parser): + """Extra arguments.""" + group = parser.add_argument_group(title='multimodal arguments') + group.add_argument('--dataset-config', type=str, default=None) + group.add_argument("--prompt-path", type=str, default=None) + group.add_argument('--freeze-LM', action='store_true', default=False) + group.add_argument('--freeze-ViT', action='store_true', default=False) + group.add_argument('--language-model-type', type=str, required=True) + group.add_argument('--vision-model-type', type=str, default="clip") + group.add_argument("--disable-vision-class-token", action="store_true", default=False) + group.add_argument( + "--allow-missing-vision-projection-checkpoint", action="store_true", default=False + ) + group.add_argument("--use-te", action="store_true", default=False) + group.add_argument( + "--dataloader-save", type=str, default=None, help="Energon dataloader state save path" + ) + group.add_argument( + "--use-tiling", action="store_true", default=False, help="Use input image tiling" + ) + group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles") + group.add_argument( + "--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile" + ) + group.add_argument( + "--dataloader-seq-length", + type=int, + help="Make dataloader to produce sequences of specific length.", + ) + group.add_argument( + "--num-frames", + type=int, + default=1, + help="Number of frames to regularly sample from the video as input to the model.", + ) + group.add_argument( + "--online-evaluation-config", type=str, help="Config file for online evaluation." 
+ ) + + return parser diff --git a/examples/multimodal/pretrain_mistral_clip.sh b/examples/multimodal/pretrain_mistral_clip.sh index da72c335c0..b06dbfe53c 100755 --- a/examples/multimodal/pretrain_mistral_clip.sh +++ b/examples/multimodal/pretrain_mistral_clip.sh @@ -32,7 +32,6 @@ fi CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml" -DATA_VALID="${SOURCE}/examples/multimodal/pretrain_dataset.yaml" DEBUG=0 if [[ $DEBUG -eq 1 ]]; then @@ -96,7 +95,6 @@ OPTIONS=" \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ --data-path ${DATA_TRAIN} \ - --valid-path ${DATA_VALID} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ --save-interval 1000 \ --save ${FINETUNE_DIR} \ diff --git a/examples/multimodal/run_text_generation.py b/examples/multimodal/run_text_generation.py index 391f3071d0..6cf5fd6232 100644 --- a/examples/multimodal/run_text_generation.py +++ b/examples/multimodal/run_text_generation.py @@ -1,13 +1,14 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. """Generate text using a vision language model.""" import glob +import itertools import json import logging import os +import re import sys from collections import defaultdict from functools import partial -import itertools # Add megatron to the path. 
sys.path.append( @@ -17,7 +18,8 @@ import datasets import numpy as np import torch -from torchvision.io import read_video +import yaml +from config import EvaluationConfig from dataset_helpers import tokenizer_image_token from image_processing import get_visual_transform from MMMU.mmmu.utils.data_utils import ( @@ -27,10 +29,14 @@ process_single_sample, ) from MMMU.mmmu.utils.eval_utils import parse_multi_choice_response +from model import model_provider +from multimodal_args import add_multimodal_extra_args from PIL import Image -from train import add_multimodal_extra_args, get_num_image_embeddings, model_provider +from torchvision.io import read_video +from megatron.core import parallel_state from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN_INDEX +from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings from megatron.inference.text_generation.api import generate_and_post_process from megatron.inference.text_generation.forward_step import ForwardStep from megatron.training import get_args, get_model, get_tokenizer, print_rank_0 @@ -48,14 +54,12 @@ def add_text_generation_args(parser): group.add_argument( "--out-seq-length", type=int, default=1024, help='Length of the output generated text.' ) - group.add_argument("--output-path", type=str, required=True, help='Output file path') - group.add_argument('--input-image-path', type=str, required=True, help="Input image directory") - group.add_argument('--input-metadata-path', type=str, help="Input metadata path") + group.add_argument("--output-path", type=str, help='Output file path') + group.add_argument('--input-image-path', type=str, help="Input image directory") group.add_argument( '--num-partitions', type=int, default=0, help="Number of partitions for inputs." 
) group.add_argument('--partition-id', type=int, default=0, help="Partition index") - group.add_argument("--drop-vision-class-token", action="store_true", default=False) group.add_argument("--gt-path", type=str, help="Optional ground truth file") group.add_argument( "--task", @@ -69,10 +73,11 @@ def add_text_generation_args(parser): group.add_argument( "--prompt-format", type=str, - required=True, + default="mistral", choices=["llama3", "mistral"], help="Prompting format to use", ) + group.add_argument("--config-path", type=str, help="Config file to use.") # Add common multimodal arguments needed for e.g. building the model. parser = add_multimodal_extra_args(parser) @@ -85,61 +90,30 @@ def _get_partition_bounds( ): if num_samples_per_partition == 0: samples_per_partition = [ - int(x) for x in np.linspace(0, total_num_samples, num_partitions+1)] - return samples_per_partition[partition_id], samples_per_partition[partition_id+1] + int(x) for x in np.linspace(0, total_num_samples, num_partitions + 1) + ] + return samples_per_partition[partition_id], samples_per_partition[partition_id + 1] return num_samples_per_partition * partition_id, num_samples_per_partition * (partition_id + 1) -def get_evaluation_dataset( - task, - input_image_path, - gt_path, - img_h, - img_w, - use_tiling, - max_num_tiles, - use_thumbnail, - num_samples_per_partition, - num_partitions, - partition_id, - num_frames, -): - """Build evaluation dataset.""" - images = [] - tile_counts = [] - questions, answers = [], [] - samples, sample_ids = [], [] - - if task == "TextVQA": - samples = json.load(open(gt_path, encoding='utf-8'))['data'] - - # Optionally, process only a subset of the input files. 
- if num_partitions > 0: - lb, ub = _get_partition_bounds( - len(samples), num_samples_per_partition, num_partitions, partition_id - ) - samples = samples[lb:ub] - - for i in range(len(samples)): - sample = samples[i] - - img_file = "{}/{}.jpg".format(input_image_path, sample["image_id"]) - if not os.path.exists(img_file): - img_file = img_file.replace('.jpg', '.png') - - img = Image.open(img_file) - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) - - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) - - questions.append(sample["question"]) - answers.append(sample["answers"]) - sample_ids.append(sample["question_id"]) - elif task == "VQAv2": +class VQADataset(torch.utils.data.Dataset): + def __init__( + self, + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + keys, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ): samples = json.load(open(gt_path, encoding='utf-8')) + if "data" in samples: + samples = samples["data"] # Optionally, process only a subset of the input files. 
if num_partitions > 0: @@ -148,50 +122,72 @@ def get_evaluation_dataset( ) samples = samples[lb:ub] - for i in range(len(samples)): - sample = samples[i] + self._keys = keys + self._samples = samples + self._input_image_path = input_image_path + self._img_h = img_h + self._img_w = img_w + self._use_tiling = use_tiling + self._max_num_tiles = max_num_tiles + self._use_thumbnail = use_thumbnail - img_file = "{}/{}".format(input_image_path, sample["image"]) + def __len__(self): + return len(self._samples) - img = Image.open(img_file) - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) + def __getitem__(self, idx): + sample = self._samples[idx] - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) + img_file = "{}/{}".format(self._input_image_path, sample[self._keys["image_id"]]) + if not os.path.exists(img_file): + img_file += ".jpg" - questions.append(sample["question"]) - answers.append(sample["answer"]) - sample_ids.append(sample["question_id"]) - elif task == "ChartQA": - samples = json.load(open(gt_path, encoding='utf-8')) + if not os.path.exists(img_file): + img_file = img_file.replace('.jpg', '.png') - # Optionally, process only a subset of the input files. - if num_partitions > 0: - lb, ub = _get_partition_bounds( - len(samples), num_samples_per_partition, num_partitions, partition_id - ) - samples = samples[lb:ub] + img = Image.open(img_file) + imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) + tile_count = torch.tensor([len(imgs)], dtype=torch.int) - for i in range(len(samples)): - sample = samples[i] + sample_id = idx + if "sample_id" in self._keys: + sample_id = sample[self._keys["sample_id"]] - img_file = "{}/{}".format(input_image_path, sample["imgname"]) + metadata = "" # Not used. 
- img = Image.open(img_file) - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) + return ( + torch.stack(imgs), + tile_count, + sample_id, + sample[self._keys["question"]], + sample[self._keys["answer"]], + metadata, + ) - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) - questions.append(sample["query"]) - answers.append(sample["label"]) - sample_ids.append(i) - elif task == "captioning": +class CaptioningDataset(torch.utils.data.Dataset): + def __init__( + self, + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ): image_files = sorted(glob.glob(input_image_path + "/*")) + # Optionally, process only a subset of the input files. if num_partitions > 0: lb, ub = _get_partition_bounds( @@ -204,20 +200,54 @@ def get_evaluation_dataset( for gt in gts["annotations"]: answers[gt["image_id"]].append(gt['caption']) - # Run image preprocessing. 
- for i in range(len(image_files)): - image_file = image_files[i] - img = Image.open(image_file) - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) + self._image_files = image_files + self._answers = answers + self._img_h = img_h + self._img_w = img_w + self._use_tiling = use_tiling + self._max_num_tiles = max_num_tiles + self._use_thumbnail = use_thumbnail + + def __len__(self): + return len(self._image_files) + + def __getitem__(self, idx): + img_file = self._image_files[idx] + image_id = int(img_file.split("_")[-1].split(".")[0]) + + img = Image.open(img_file) + imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) + tile_count = torch.tensor([len(imgs)], dtype=torch.int) - image_id = int(image_file.split("_")[-1].split(".")[0]) - sample_ids.append(image_id) - elif task == 'MMMU': + question = "" # Fixed for all samples. + metadata = "" # Not used. + + return torch.stack(imgs), tile_count, image_id, question, self._answers[image_id], metadata + + +class MMMUDataset(torch.utils.data.Dataset): + def __init__( + self, + input_image_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + single_image, + ): # The following downloads the MMMU dataset from HuggingFace and uses the API from the MMMU github repo to run MMMU evaluation. all_mmmu_datasets = [] @@ -225,9 +255,22 @@ def get_evaluation_dataset( assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE." for subject in CAT_SHORT2LONG.values(): - subject_dataset = datasets.load_dataset( - "MMMU/MMMU", subject, split=datasets.Split.VALIDATION, cache_dir=hf_datasets_cache - ) + # Use a local copy of the dataset if exists (can be faster) or the HF one. 
+ if os.path.exists(input_image_path): + subject_dataset = datasets.load_dataset( + os.path.join(input_image_path, subject), + split=datasets.Split.VALIDATION, + cache_dir=hf_datasets_cache, + verification_mode="no_checks", + ) + else: + subject_dataset = datasets.load_dataset( + "MMMU/MMMU", + subject, + split=datasets.Split.VALIDATION, + cache_dir=hf_datasets_cache, + ) + all_mmmu_datasets.append(subject_dataset) dataset = datasets.concatenate_datasets(all_mmmu_datasets) @@ -235,14 +278,11 @@ def get_evaluation_dataset( dataset = [s for s in dataset if s['id'].startswith("val")] # Optionally, process only a subset of the input files. - start_idx = 0 - end_idx = len(dataset) if num_partitions > 0: - start_idx, end_idx = _get_partition_bounds( + lb, ub = _get_partition_bounds( len(dataset), num_samples_per_partition, num_partitions, partition_id ) - - end_idx = min(len(dataset), end_idx) + dataset = dataset[lb:ub] # Using the LLaVA config from the MMMU repo. config = load_yaml("examples/multimodal/MMMU/mmmu/configs/llava1.5.yaml") @@ -251,30 +291,119 @@ def get_evaluation_dataset( assert len(v) == 1, "only one value supported." config[k] = v[0] - for idx in range(start_idx, end_idx): - sample = dataset[idx] + self._config = config + + self._dataset = dataset + + self._img_h = img_h + self._img_w = img_w + self._use_tiling = use_tiling + self._max_num_tiles = max_num_tiles + self._use_thumbnail = use_thumbnail + self._single_image = single_image + + def __len__(self): + return len(self._dataset) + + def __getitem__(self, idx): + sample = self._dataset[idx] + + # Use the single image approach from the MMMU repo. 
+ if self._single_image: sample = process_single_sample(sample) - sample = construct_prompt(sample, config) + sample = construct_prompt(sample, self._config) img = sample["image"] - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False + sample_imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, ) + sample_num_tiles = [len(sample_imgs)] + else: + sample = construct_prompt(sample, self._config) + + sample_imgs = [] + sample_num_tiles = [] + + img_indices = re.findall(r"" + + img = sample[img_key] + assert img is not None, f"{img_str} is in prompt but not in sample images" + + # Note: Only replace the current image tag. + sample["final_input_prompt"] = sample["final_input_prompt"].replace( + img_str, "", 1 + ) + + imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + adjusted_max_num_tiles, + self._use_thumbnail, + augment=False, + ) # List of tiles. + + sample_imgs.extend(imgs) + sample_num_tiles.append(len(imgs)) + + # Sanity check. + for i in range(1, 8): + assert ( + f"" not in sample["final_input_prompt"] + ), "prompt contains unhandled image tags" + + # MMMU specific metadata. 
+ metadata = {"question_type": sample["question_type"]} + if sample["question_type"] == "multiple-choice": + metadata["index2ans"] = sample["index2ans"] + metadata["all_choices"] = sample["all_choices"] + + prompt = sample['final_input_prompt'] + if self._single_image: + for i in range(8): + prompt = prompt.replace(f"", "") + prompt = f"\n{prompt}" - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) + tile_count = torch.tensor(sample_num_tiles, dtype=torch.int) - sample_ids.append(sample['id']) + return ( + torch.stack(sample_imgs), + tile_count, + sample["id"], + prompt, + sample["answer"], + metadata, + ) - # TODO: Support multiple input images and the original image position. Note: is added back in the prompt construction below. - prompt = sample['final_input_prompt'] - for i in range(8): - prompt = prompt.replace(f"", "") - questions.append(prompt) - answers.append(sample['answer']) - samples.append(sample) - elif task == "VideoMME": +class VideoMMMEDataset(torch.utils.data.Dataset): + def __init__( + self, + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + num_frames, + ): ground_truth_original = json.load(open(gt_path)) ground_truth = [] for gt in ground_truth_original: @@ -286,135 +415,295 @@ def get_evaluation_dataset( continue gt["video_path"] = video_path ground_truth.append(gt) - + ground_truth = sorted(ground_truth, key=lambda gt: gt["video_path"]) print_rank_0(f"Found {len(ground_truth)} videos to process.") if num_partitions > 0: start_idx, end_idx = _get_partition_bounds( - len(ground_truth), num_samples_per_partition, - num_partitions, partition_id + len(ground_truth), num_samples_per_partition, num_partitions, partition_id ) ground_truth = ground_truth[start_idx:end_idx] - # Run image preprocessing. 
- for idx, gt in enumerate(ground_truth): - print_rank_0(f"Processing input video: {idx} / {len(ground_truth)}") - video, _, _ = read_video( - gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec') - video = video.numpy() - selected_frames = torch.linspace( - 0, video.shape[0] - 1, num_frames).long() - video_frames = video[selected_frames] - if num_frames == 1: - video_frames = video_frames[None] - - imgs = list(itertools.chain.from_iterable( + self._ground_truth = ground_truth + self._img_h = img_h + self._img_w = img_w + self._use_tiling = use_tiling + self._max_num_tiles = max_num_tiles + self._use_thumbnail = use_thumbnail + self._num_frames = num_frames + + def __len__(self): + return len(self._ground_truth) + + def __getitem__(self, idx): + gt = self._ground_truth[idx] + + video, _, _ = read_video(gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec') + video = video.numpy() + selected_frames = torch.linspace(0, video.shape[0] - 1, self._num_frames).long() + video_frames = video[selected_frames] + if self._num_frames == 1: + video_frames = video_frames[None] + + imgs = list( + itertools.chain.from_iterable( get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, - use_thumbnail, augment=False) for img in video_frames)) - - for question in gt["questions"]: - # Very hacky, but we essentially re-create gt holding only the - # question of interest. This is the make this generation script - # compatible with the Video MME evaluation script. 
- question_dict = { - "video_id": gt["video_id"], - "duration_category": gt["duration_category"], - "video_category": gt["video_category"], - "video_subcategory": gt["video_subcategory"], - "url": gt["url"], - "questions": [question] - } - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) - questions.append(question_dict) - sample_ids.append(question["question_id"]) + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) + for img in video_frames + ) + ) + + for question in gt["questions"]: + # Very hacky, but we essentially re-create gt holding only the + # question of interest. This is the make this generation script + # compatible with the Video MME evaluation script. + question_dict = { + "video_id": gt["video_id"], + "duration_category": gt["duration_category"], + "video_category": gt["video_category"], + "video_subcategory": gt["video_subcategory"], + "url": gt["url"], + "questions": [question], + } + + num_tiles = torch.tensor([len(imgs)], dtype=torch.int) + + answer = "" + metadata = "" + + return ( + torch.stack(imgs), + num_tiles, + question["question_id"], + question_dict, + answer, + metadata, + ) + + +def get_evaluation_dataloader( + task, + input_image_path, + gt_path, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + num_samples_per_partition, + num_partitions, + partition_id, + num_frames, + num_workers, +): + """Build evaluation dataset.""" + if task == "TextVQA": + keys = { + "image_id": "image_id", + "sample_id": "question_id", + "question": "question", + "answer": "answers", + } + + dataset = VQADataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + keys, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ) + elif task == "VQAv2": + keys = { + "image_id": "image", + "sample_id": "question_id", + "question": "question", + "answer": "answer", + } + + dataset = 
VQADataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + keys, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ) + elif task == "ChartQA": + keys = {"image_id": "imgname", "question": "query", "answer": "label"} + + dataset = VQADataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + keys, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ) + elif task == "captioning": + dataset = CaptioningDataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ) + elif task == 'MMMU': + # Note: single_image=True uses only one image like in the MMMU repo example. + # single_image=False uses all images in the sample. + dataset = MMMUDataset( + input_image_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + single_image=True, + ) + elif task == "VideoMME": + dataset = VideoMMMEDataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + num_frames, + ) else: - raise NotImplementedError("unsupported task") + raise NotImplementedError(f"unsupported task {task}") - return images, tile_counts, samples, sample_ids, questions, answers + dp_rank = parallel_state.get_data_parallel_rank() + dp_world_size = parallel_state.get_data_parallel_world_size() + sampler = torch.utils.data.DistributedSampler( + dataset, shuffle=False, num_replicas=dp_world_size, rank=dp_rank + ) + # TODO: Batched inference is not supported yet. 
+ dataloader = torch.utils.data.DataLoader( + dataset, batch_size=None, num_workers=num_workers, sampler=sampler, pin_memory=True + ) + + return dataloader -def generate_samples(model): + +def generate_samples(model, config: EvaluationConfig): """Text generation using a trained vision language model.""" args = get_args() - images, tile_counts, samples, sample_ids, questions, answers = get_evaluation_dataset( - args.task, - args.input_image_path, - args.gt_path, + + rank = torch.distributed.get_rank() + + dataloader = get_evaluation_dataloader( + config.task, + config.input_image_path, + config.gt_path, args.img_h, args.img_w, args.use_tiling, args.max_num_tiles, args.use_thumbnail, - args.num_samples_per_partition, - args.num_partitions, - args.partition_id, - args.num_frames + config.num_samples_per_partition, + config.num_partitions, + config.partition_id, + args.num_frames, + args.num_workers, ) + num_img_embeddings_per_tile = get_num_image_embeddings( - args.img_h, args.img_w, args.patch_dim, - args.disable_vision_class_token, 1) - num_samples = len(sample_ids) - idx = 0 - while idx < num_samples: - imgs = torch.stack(images[idx]).cuda() - num_tiles = tile_counts[idx].cuda() - sample_id = sample_ids[idx] + args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1 + ) + + for idx, (imgs, num_tiles, sample_id, question, answers, metadata) in enumerate(dataloader): + imgs = imgs.to("cuda") + num_tiles = num_tiles.to("cuda") - prompt = get_prompt(args.task, questions, idx, args.prompt_format) + prompt = get_prompt(config.task, question, config.prompt_format) - forward_step = partial( - VLMForwardStep, num_img_embeddings_per_tile, imgs, num_tiles) + forward_step = partial(VLMForwardStep, num_img_embeddings_per_tile, imgs, num_tiles) - if torch.distributed.get_rank() == 0: + if rank == 0: resp_sentences, _, _, _ = generate_and_post_process( model, forward_step=forward_step, prompts=[prompt], - tokens_to_generate=args.out_seq_length, - 
top_k_sampling=args.top_k, - top_p_sampling=args.top_p, + tokens_to_generate=config.out_seq_length, + top_k_sampling=config.top_k, + top_p_sampling=config.top_p, add_BOS=False, - temperature=args.temperature, + temperature=config.temperature, random_seed=args.seed, detokenize_segments=False, ) for prompt, generation in zip([prompt], resp_sentences): + if isinstance(sample_id, torch.Tensor): + sample_id = sample_id.item() + output = {"sample_id": sample_id, "prompt": prompt} output_name = "" - if args.task == "captioning": + if config.task == "captioning": output_name = "caption" - elif args.task in ("TextVQA", "VQAv2", "ChartQA"): + elif config.task in ("TextVQA", "VQAv2", "ChartQA"): output_name = "answer" - elif args.task in ("MMMU"): + elif config.task in ("MMMU"): output_name = "text" - elif args.task == "VideoMME": + elif config.task == "VideoMME": output_name = "response" - output = questions[idx] + output = question - generated = get_generated(prompt, args.prompt_format, generation) - if args.task == "VideoMME": + generated = get_generated(generation, config.prompt_format) + if config.task == "VideoMME": output["questions"][0][output_name] = generated else: output[output_name] = generated - if args.task == "captioning": - output["ground_truth"] = answers[sample_id] - elif args.task in ("TextVQA", "VQAv2"): - output["gt_answer"] = [ans for ans in answers[idx]] - elif args.task == "ChartQA": - output["gt_answer"] = [answers[idx]] - elif args.task == "MMMU": - sample = samples[idx] - + if config.task == "captioning": + output["ground_truth"] = answers + elif config.task in ("TextVQA", "VQAv2"): + output["gt_answer"] = [ans for ans in answers] + elif config.task == "ChartQA": + output["gt_answer"] = [answers] + elif config.task == "MMMU": prediction = generated - if sample["question_type"] == "multiple-choice": + if metadata["question_type"] == "multiple-choice": prediction = parse_multi_choice_response( - generated, sample["all_choices"], sample["index2ans"] + 
generated, metadata["all_choices"], metadata["index2ans"] ) output["prediction"] = prediction @@ -429,27 +718,69 @@ def generate_samples(model): idx += 1 -def generate_and_write_samples(model): - """Generate text and write to an output file.""" +def get_evaluation_config(): + """Get evaluation config from a config file or command-line arguments.""" args = get_args() + if args.config_path: + with open(args.config_path, "r") as f: + config_dict = yaml.safe_load(f) - for output in generate_samples(model): - if torch.distributed.get_rank() == 0: - with open(args.output_path, 'a') as f: - f.write(json.dumps(output) + "\n") + config = EvaluationConfig(**config_dict) + else: + config = EvaluationConfig( + task=args.task, + temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + out_seq_length=args.out_seq_length, + output_path=args.output_path, + input_image_path=args.input_image_path, + gt_path=args.gt_path, + num_partitions=args.num_partitions, + partition_id=args.partition_id, + num_samples_per_partition=args.num_samples_per_partition, + prompt_format=args.prompt_format, + ) + + # Default output path if not defined... 
+ if not config.output_path: + os.makedirs("generated", exist_ok=True) + config.output_path = "generated/" + args.language_model_type + + return config + + +def generate_and_write_samples(model, config): + """Generate text and write to an output file.""" + rank = torch.distributed.get_rank() + + if rank == 0: + output_file = open(config.output_path, "w") + print(f"output path: {output_file.name}") + + for output in generate_samples(model, config): + if rank == 0: + output_file.write(json.dumps(output) + "\n") + output_file.flush() class VLMForwardStep(ForwardStep): """Inference forward step for a multimodal model.""" - def __init__(self, num_img_embeddings_per_tile, images, num_tiles, model, - max_batch_size, max_sequence_length): + def __init__( + self, + num_img_embeddings_per_tile, + images, + num_tiles, + model, + max_batch_size, + max_sequence_length, + ): """Create multimodal forward step.""" total_num_tiles = torch.sum(num_tiles).item() - num_img_embeddings = num_img_embeddings_per_tile * total_num_tiles + num_img_embeddings = num_img_embeddings_per_tile * total_num_tiles - super().__init__( - model, max_batch_size, max_sequence_length + num_img_embeddings) + super().__init__(model, max_batch_size, max_sequence_length + num_img_embeddings) self._images = images self._num_tiles = num_tiles @@ -461,6 +792,7 @@ def _forward(self, tokens, position_ids, attention_mask): attention_mask=None, inference_params=self.inference_params, num_image_tiles=self._num_tiles, + runtime_gather_output=True, ) def __call__(self, tokens, position_ids, attention_mask): @@ -468,101 +800,90 @@ def __call__(self, tokens, position_ids, attention_mask): # On the first inference iteration, we compute image tokens. # Update the sequence length offset by the number of image tokens. 
- num_images = (tokens == -200).sum().item() + num_image_tokens = (tokens == -200).sum().item() num_tokens = tokens.size(1) - if num_tokens > 1 and num_images > 0: + if num_tokens > 1 and num_image_tokens > 0: self.inference_params.sequence_len_offset += ( - self.inference_params.key_value_memory_dict["image_tokens_count"] - num_images + self.inference_params.key_value_memory_dict["image_tokens_count"] - num_image_tokens ) return logits -def get_prompt(task, questions, idx, prompt_format): +def get_prompt(task, question, prompt_format): """Get a prompt for the evaluation task.""" if task == "captioning": if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\nProvide a one-sentence caption for provided image.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" elif prompt_format == "mistral": - prompt = "Give a short and clear explanation of the subsequent image.\n" + prompt = ( + "[INST] Give a short and clear explanation of the subsequent image. [/INST]" + ) elif task == "TextVQA": - question = questions[idx] - if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format( question ) elif prompt_format == "mistral": - prompt = "\n{}\nAnswer the question using a single word or phrase.".format( + prompt = "[INST] \n{}\nAnswer the question using a single word or phrase. 
[/INST]".format( question ) elif task == "VQAv2": - question = questions[idx] - if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format( question ) elif prompt_format == "mistral": - prompt = "\n{}\nAnswer the question using a single word or phrase.".format( + prompt = "[INST] \n{}\nAnswer the question using a single word or phrase. [/INST]".format( question ) elif task == "ChartQA": - question = questions[idx] - if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format( - questions + question ) elif prompt_format == "mistral": - prompt = "\n{}\nAnswer the question using a single word or phrase.".format( + prompt = "[INST] \n{}\nAnswer the question using a single word or phrase. 
[/INST]".format( question ) elif task == "MMMU": - question = questions[idx] - if prompt_format == "llama3": - prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|>{}<|start_header_id|>user<|end_header_id|>\n\n\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - prompt = prompt.format("", question) + prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + prompt = prompt.format(question) elif prompt_format == "mistral": - prompt = "\n{}\nAnswer the question using a single word or phrase.".format( - question - ) + prompt = "[INST] {} [/INST]".format(question) elif task == "VideoMME": - question = ( + q = ( "Select the best answer to the following multiple-choice " "question based on the video. Respond with only the letter " - "(A, B, C, or D) of the correct option.\n") - question += (questions[idx]["questions"][0]["question"] + "\n") - question += (questions[idx]["questions"][0]["choices"][0] + "\n") - question += (questions[idx]["questions"][0]["choices"][1] + "\n") - question += (questions[idx]["questions"][0]["choices"][2] + "\n") - question += (questions[idx]["questions"][0]["choices"][3] + "\n") + "(A, B, C, or D) of the correct option.\n" + ) + q += question["questions"][0]["question"] + "\n" + q += question["questions"][0]["choices"][0] + "\n" + q += question["questions"][0]["choices"][1] + "\n" + q += question["questions"][0]["choices"][2] + "\n" + q += question["questions"][0]["choices"][3] + "\n" if prompt_format == "llama3": - prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|>{}<|start_header_id|>user<|end_header_id|>\n\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - prompt = prompt.format("", question) + prompt = 
"<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + prompt = prompt.format(q) elif prompt_format == "mistral": - prompt = "\n{}".format( - question - ) + prompt = "[INST] \n{} [/INST]".format(q) return prompt -def get_generated(prompt, prompt_format, prompt_and_generation): +def get_generated(prompt_and_generation, prompt_format): """Strip prompt and other unnecessary text from generation.""" - start = len(prompt.replace("", "")) if prompt_format == "llama3": - start += len("<|begin_of_text|>") - start += 1 + generated = prompt_and_generation.split( + "<|start_header_id|>assistant<|end_header_id|>\n\n" + )[-1] + generated = generated.split("<|eot_id|>")[0] elif prompt_format == "mistral": - start += len(" ") + generated = prompt_and_generation.split("[/INST]")[-1] + generated = generated.split("")[0] - generated = prompt_and_generation[start:] - generated = generated.replace(" ", "") - generated = generated.split("<|eot_id|>")[0] - generated = generated.split("")[0] generated = generated.strip() generated = generated.split("\n\n")[0] generated = generated.split("\n")[0] @@ -577,15 +898,16 @@ def _decorate_tokenize(f): # When tokenizing, replace with the image token index (-200) def wrapper(prompt): tokens = tokenizer_image_token(args, prompt, f) + return tokens return wrapper def _decorate_detokenize(f): - # When detokenizing, replace image token index (-200) with a dummy value. + # When detokenizing, skip image token index. 
def wrapper(tokens): tokens = np.array(tokens) - tokens[tokens == IMAGE_TOKEN_INDEX] = 0 + tokens = tokens[tokens != IMAGE_TOKEN_INDEX] tokens = tokens.tolist() return f(tokens) @@ -617,9 +939,12 @@ def wrapped_model_provider(pre_process, post_process): _ = load_checkpoint(model, None, None) model = model[0] + model.eval() - generate_and_write_samples(model) + config = get_evaluation_config() + + generate_and_write_samples(model, config) if __name__ == "__main__": diff --git a/examples/multimodal/sft_mistral_clip.sh b/examples/multimodal/sft_mistral_clip.sh index 93a0a91366..46fc996055 100755 --- a/examples/multimodal/sft_mistral_clip.sh +++ b/examples/multimodal/sft_mistral_clip.sh @@ -37,7 +37,6 @@ fi CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml" -DATA_VALID="${SOURCE}/examples/multimodal/sft_dataset.yaml" DEBUG=0 if [[ $DEBUG -eq 1 ]]; then @@ -101,7 +100,6 @@ OPTIONS=" \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ --data-path ${DATA_TRAIN} \ - --valid-path ${DATA_VALID} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ --save-interval 500 \ --save ${FINETUNE_DIR} \ diff --git a/examples/multimodal/text_generation_mistral_clip.sh b/examples/multimodal/text_generation_mistral_clip.sh index 30d1b06ab4..b78969ab59 100755 --- a/examples/multimodal/text_generation_mistral_clip.sh +++ b/examples/multimodal/text_generation_mistral_clip.sh @@ -4,7 +4,6 @@ export NCCL_IB_SL=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 export NVTE_APPLY_QK_LAYER_SCALING=0 -INPUT_METADATA_PATH="placeholder" GROUNDTRUTH_PATH="placeholder" NUM_FRAMES=1 @@ -15,11 +14,6 @@ while [[ $# -gt 0 ]]; do shift shift ;; - --input-metadata-path) - INPUT_METADATA_PATH="$2" - shift - shift - ;; --num-frames) NUM_FRAMES="$2" shift @@ -112,7 +106,6 @@ do --no-load-rng \ --no-load-optim \ --input-image-path ${INPUT_IMAGE_PATH} \ - --input-metadata-path ${INPUT_METADATA_PATH} \ 
--num-partitions ${NUM_PARTITIONS} \ --partition-id ${PARTITION_ID} \ --output-path ${OUTPUT_PATH}-${TASK}-${PARTITION_ID}.jsonl \ diff --git a/examples/multimodal/train.py b/examples/multimodal/train.py index e1cad7814e..386cdc03d0 100644 --- a/examples/multimodal/train.py +++ b/examples/multimodal/train.py @@ -1,131 +1,29 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. """Pretrain or SFT multimodal.""" -from copy import deepcopy -from functools import partial +import json import os import sys -import warnings +from functools import partial import torch +import yaml sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))) -from megatron.training import get_args, get_timers, get_tokenizer, print_rank_0 -from megatron.training.arguments import core_transformer_config_from_args +from config import EvaluationConfig +from dataloader_provider import train_valid_test_dataloaders_provider +from evaluate_textvqa import textvqa_eval +from model import model_provider +from multimodal_args import add_multimodal_extra_args +from run_text_generation import generate_samples, patch_tokenizer + from megatron.core import mpu, tensor_parallel from megatron.core.enums import ModelType -from megatron.core.parallel_state import get_tensor_model_parallel_rank -from config import get_language_model_config, get_vision_model_config, get_vision_projection_config -from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings from megatron.core.models.multimodal.llava_model import LLaVAModel -from layer_specs import get_layer_spec, get_mlp_module_spec, get_layer_spec_te -from megatron.training import pretrain -from dataloader_provider import train_valid_test_dataloaders_provider - -def model_provider( - pre_process=True, post_process=True, add_encoder=True, add_decoder=True, - parallel_output=True) -> LLaVAModel: - """Builds the model. 
- - Args: - pre_process (bool): Include the embedding layer in the gpt decoder (used with pipeline parallelism). Defaults to True. - post_process (bool): Include an output layer and a layernorm in the gpt decoder (used with pipeline parallelism). Defaults to True. - add_encoder (bool): Construct the encoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the encoder - will live on only a subset of the pipeline stages (specifically, only the first stage). - add_decoder (bool): Construct the decoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the decoder - will live on only a subset of the pipeline stages (specifically, every stage after the first one). - parallel_output (bool): Enable parallel model output. - - Returns: - model: A multimodal model. - """ - args = get_args() - - use_te = args.use_te - - print_rank_0('building a multimodal model ...') - - num_image_embeddings = get_num_image_embeddings(args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1) - old_seq_length = args.seq_length - args.seq_length = args.encoder_seq_length = num_image_embeddings - if torch.distributed.get_rank() == 0 and old_seq_length != args.seq_length: - warnings.warn(f"Changed seq_length and encoder_seq_length (vision model sequence length) from {old_seq_length} to num_image_tokens ({num_image_embeddings})") - - max_num_image_embeddings = (args.max_num_tiles + int(args.use_thumbnail)) * num_image_embeddings - - assert args.decoder_seq_length is not None, "Please provide --decoder-seq-length to set the language model sequence length" - assert args.decoder_seq_length > max_num_image_embeddings, "Language model sequence length must be greater than the maximum number of image embeddings" - if args.decoder_seq_length > args.max_position_embeddings: - args.max_position_embeddings = args.decoder_seq_length - warnings.warn(f"Expanded max_position_embeddings to {args.max_position_embeddings} to 
accommodate the maximum language model sequence length") - - base_config = core_transformer_config_from_args(get_args()) - base_config.language_model_type = args.language_model_type - base_config.vision_model_type = args.vision_model_type - base_config.calculate_per_token_loss = True - - language_config = deepcopy(base_config) - language_config = get_language_model_config(language_config) - - if use_te: - language_transformer_layer_spec = get_layer_spec_te(is_vit=False) # TENorm detects LayerNorm/RMS automatically. - else: - language_transformer_layer_spec = get_layer_spec(is_vit=False, normalization=language_config.normalization) - - vision_config = deepcopy(base_config) - vision_config = get_vision_model_config(vision_config, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling) - - vision_model_type = args.vision_model_type - if vision_model_type == "clip": - if use_te: - vision_transformer_layer_spec = get_layer_spec_te(is_vit=True) # TENorm detects LayerNorm/RMS automatically. - else: - vision_transformer_layer_spec = get_layer_spec(is_vit=True, normalization=vision_config.normalization) - else: - raise RuntimeError("unsupported vision model type", vision_model_type) - - vision_projection_config = deepcopy(base_config) - vision_projection_config = get_vision_projection_config(vision_projection_config, language_config.hidden_size) - - if args.encoder_pipeline_model_parallel_size > 0: - assert args.encoder_pipeline_model_parallel_size == 1, "vision model and projection can only live on 1 pipeline stage." 
- vision_config.pipeline_model_parallel_size = args.encoder_pipeline_model_parallel_size - vision_projection_config.pipeline_model_parallel_size = args.encoder_pipeline_model_parallel_size - if args.encoder_tensor_model_parallel_size > 0: - vision_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size - vision_projection_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size - - vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules - - model = LLaVAModel( - language_transformer_config=language_config, - language_transformer_layer_spec=language_transformer_layer_spec, - language_vocab_size=args.padded_vocab_size, - language_max_sequence_length=args.decoder_seq_length, - vision_transformer_config=vision_config, - vision_transformer_layer_spec=vision_transformer_layer_spec, - drop_vision_class_token=args.disable_vision_class_token, - vision_projection_config=vision_projection_config, - vision_projection_layer_spec=vision_projection_layer_spec, - vision_projection_type="mlp", - allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint, - parallel_output=parallel_output, - language_position_embedding_type=args.position_embedding_type, - language_rotary_percent=args.rotary_percent, - pre_process=pre_process, - post_process=post_process, - add_encoder=add_encoder, - add_decoder=add_decoder, - img_h=args.img_h, - img_w=args.img_w, - patch_dim=args.patch_dim, - language_rotary_base=args.rotary_base, - ) - - model.freeze(freeze_language_model=args.freeze_LM, freeze_vision_model=args.freeze_ViT, freeze_vision_projection=False) - - return model +from megatron.core.parallel_state import get_tensor_model_parallel_rank +from megatron.training import get_args, get_timers, get_tokenizer, pretrain +from megatron.training.utils import is_last_rank def get_batch(data_iterator): @@ -314,32 +212,6 @@ def forward_step(data_iterator, model: LLaVAModel): return output_tensor, 
partial(loss_func, loss_mask) -def add_multimodal_extra_args(parser): - """Extra arguments.""" - group = parser.add_argument_group(title='multimodal arguments') - group.add_argument('--valid-path', nargs='*', default=None, - help='Path to the training dataset. Accepted format:' - '1) a single data path, 2) multiple datasets in the' - 'form: dataset1-weight dataset1-path dataset2-weight ' - 'dataset2-path ...') - group.add_argument('--dataset-config', type=str, default=None) - group.add_argument("--prompt-path", type=str, default=None) - group.add_argument('--freeze-LM', action='store_true', default=False) - group.add_argument('--freeze-ViT', action='store_true', default=False) - group.add_argument('--language-model-type', type=str, required=True) - group.add_argument('--vision-model-type', type=str, default="clip") - group.add_argument("--disable-vision-class-token", action="store_true", default=False) - group.add_argument("--allow-missing-vision-projection-checkpoint", action="store_true", default=False) - group.add_argument("--use-te", action="store_true", default=False) - group.add_argument("--dataloader-save", type=str, default=None, help="Energon dataloader state save path") - group.add_argument("--use-tiling", action="store_true", default=False, help="Use input image tiling") - group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles") - group.add_argument("--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile") - group.add_argument("--dataloader-seq-length", type=int, help="Make dataloader to produce sequences of specific length.") - group.add_argument("--num-frames", type=int, default=1, help="Number of frames to regularly sample from the video as input to the model.") - - return parser - def llava_embedding_ranks(pp_ranks): """LLava's embedding ranks consist of the decoder's first and last ranks (ie, the ViT has no embeddings). 
@@ -375,6 +247,64 @@ def llava_position_embedding_ranks(pp_ranks): return [pp_ranks[epp]] + +def run_online_eval(model): + """Run an evaluation benchmark during training.""" + args = get_args() + + # Online evaluation config is not defined. Do nothing. + if not args.online_evaluation_config: + return [] + + with open(args.online_evaluation_config, "r") as f: + config_dict = yaml.safe_load(f) + + config = EvaluationConfig(**config_dict) + + patch_tokenizer(args) + + # The inference code assumes the first rank is the leader. + # Tensorboard writer is on the last rank. + # We must write to a storage space that all ranks see. + output_dir = os.path.join(args.save, "online_eval") + os.makedirs(output_dir, exist_ok=True) + config.output_path = os.path.join(output_dir, f"{config.task}.jsonl") + + if torch.distributed.get_rank() == 0: + output_file = open(config.output_path, "w") + + with torch.no_grad(): + for output in generate_samples(model[0].module, config): + if torch.distributed.get_rank() == 0: + output_file.write(json.dumps(output) + "\n") + + if torch.distributed.get_rank() == 0: + output_file.close() + + # Make sure the first rank is done writing so that the last rank can run eval. 
+ torch.distributed.barrier() + + if not is_last_rank(): + return [] + + if config.task.lower() == "textvqa": + avg_acc = textvqa_eval(config.output_path) + + return [{"textvqa accuracy": avg_acc}] + else: + raise NotImplementedError(f"online evaluation of {config.task} not implemented yet") + + +def write_online_eval_to_tensorboard(data, iteration, writer): + """Write online evaluation data to Tensorboard.""" + if not writer: + return + + for item in data: + for k, v in item.items(): + writer.add_scalar(k, v, iteration) + + if __name__ == "__main__": train_valid_test_dataloaders_provider.is_distributed = True @@ -385,6 +315,8 @@ def llava_position_embedding_ranks(pp_ranks): forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, extra_args_provider=add_multimodal_extra_args, + process_non_loss_data_func=write_online_eval_to_tensorboard, get_embedding_ranks=llava_embedding_ranks, get_position_embedding_ranks=llava_position_embedding_ranks, + non_loss_data_func=run_online_eval ) diff --git a/megatron/core/dist_checkpointing/strategies/torch.py b/megatron/core/dist_checkpointing/strategies/torch.py index 96f2c316c5..077d94eb77 100644 --- a/megatron/core/dist_checkpointing/strategies/torch.py +++ b/megatron/core/dist_checkpointing/strategies/torch.py @@ -10,7 +10,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast import torch -from pkg_resources import packaging +from packaging.version import Version as PkgVersion from torch.distributed import checkpoint from torch.distributed._shard.metadata import ShardMetadata from torch.distributed._shard.sharded_tensor import Shard, ShardedTensorMetadata, TensorProperties @@ -448,8 +448,9 @@ def __init__( nd_flattened_global_shapes: Optional[Dict[str, Tuple[int, ...]]] = None, **kwargs, ) -> None: - # `dedup_replicated_tensors` was deprecated in 2.3 - avoids tons of warnings during saving - if packaging.version.Version(torch.__version__) <= packaging.version.Version("2.2"): + # 
`dedup_replicated_tensors` was deprecated in 2.3; this check avoids warnings + # during saving. + if PkgVersion(torch.__version__) <= PkgVersion("2.2"): kwargs['dedup_replicated_tensors'] = dedup_replicated_tensors super().__init__(*args, **kwargs) self.nd_flattened_global_shapes = nd_flattened_global_shapes or {} diff --git a/megatron/core/export/__init__.py b/megatron/core/export/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/data_type.py b/megatron/core/export/data_type.py new file mode 100644 index 0000000000..38fbdea8f6 --- /dev/null +++ b/megatron/core/export/data_type.py @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from enum import Enum + +DataType = Enum('DataType', ["bfloat16", "float16", "float32"]) diff --git a/megatron/core/export/export_config.py b/megatron/core/export/export_config.py new file mode 100644 index 0000000000..2cc1e208be --- /dev/null +++ b/megatron/core/export/export_config.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass + + +@dataclass +class ExportConfig: + """Base configuration for Megatron Core Export + + These parameters control the export setting for trtllm + """ + + inference_tp_size: int = 1 + + inference_pp_size: int = 1 + + use_parallel_embedding: bool = False + + use_embedding_sharing: bool = False diff --git a/megatron/core/export/model_type.py b/megatron/core/export/model_type.py new file mode 100644 index 0000000000..6a33d6440e --- /dev/null +++ b/megatron/core/export/model_type.py @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from enum import Enum + +ModelType = Enum( + 'ModelType', ["gpt", "gptnext", "llama", "falcon", "starcoder", "mixtral", "gemma"] +) diff --git a/megatron/core/export/trtllm/__init__.py b/megatron/core/export/trtllm/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/trtllm/engine_builder/__init__.py b/megatron/core/export/trtllm/engine_builder/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/engine_builder/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py b/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py new file mode 100644 index 0000000000..e729fec410 --- /dev/null +++ b/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py @@ -0,0 +1,148 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
class TRTLLMEngineBuilder:
    """A utility class to build TRTLLM engine"""

    @staticmethod
    def build_and_save_engine(
        engine_dir: str,
        trtllm_model_weights: dict,
        trtllm_model_config,
        max_input_len: int = 1024,
        max_output_len: int = 1024,
        max_batch_size: int = 4,
        lora_ckpt_list=None,
        use_lora_plugin=None,
        max_lora_rank: int = 64,
        lora_target_modules=None,
        max_prompt_embedding_table_size: int = 0,
        paged_kv_cache: bool = True,
        remove_input_padding: bool = True,
        paged_context_fmha: bool = False,
        use_refit: bool = False,
        max_num_tokens: int = None,
        max_seq_len: int = None,
        opt_num_tokens: int = None,
        max_beam_width: int = 1,
        tokens_per_block: int = 128,
        multiple_profiles: bool = False,
        gpt_attention_plugin: str = "auto",
        gemm_plugin: str = "auto",
    ):
        """Method to build the TRTLLM Engine

        Builds a TRTLLM engine from the converted weights and config, then saves it
        to ``engine_dir``.

        Args:
            engine_dir (str): The file path to save the engine
            trtllm_model_weights (dict): The TRTLLM converted model weights dict
            trtllm_model_config : The TRTLLM Config
            max_input_len (int, optional): Max input length. Defaults to 1024.
            max_output_len (int, optional): Max output length. Defaults to 1024.
            max_batch_size (int, optional): Max batch size. Defaults to 4.
            lora_ckpt_list (optional): Lora checkpoint list. Defaults to None.
            use_lora_plugin (optional): Use lora plugin. A LoraConfig is attached to
                the build config only when this is not None. Defaults to None.
            max_lora_rank (int, optional): Max lora rank. Defaults to 64.
            lora_target_modules (optional): Lora target modules. Defaults to None.
            max_prompt_embedding_table_size (int, optional): Defaults to 0.
            paged_kv_cache (bool, optional): Use Paged KV cache. Defaults to True.
            remove_input_padding (bool, optional): Remove input padding. Defaults to True.
            paged_context_fmha (bool, optional): Paged context fmha. Defaults to False.
            use_refit (bool, optional): Use refit. Defaults to False.
            max_num_tokens (int, optional): Max num of tokens. Defaults to None.
            max_seq_len (int, optional): Max seq length. When None it is set to
                max_input_len + max_output_len. Defaults to None.
            opt_num_tokens (int, optional): Opt number of tokens. Defaults to None.
            max_beam_width (int, optional): Max beam width. Defaults to 1.
            tokens_per_block (int, optional): Number of tokens per block. Defaults to 128.
            multiple_profiles (bool, optional): Use multiple profiles. Defaults to False.
            gpt_attention_plugin (str, optional): Gpt attention plugin to use. Defaults to "auto".
            gemm_plugin (str, optional): GEMM plugin to use. Defaults to "auto".

        Raises:
            AttributeError: If tensorrt_llm.models has no class for the config's architecture.
        """
        # The config says "LlamaForCausalLM" but the class is looked up as "LLaMAForCausalLM".
        architecture = (
            "LLaMAForCausalLM"
            if trtllm_model_config.architecture == "LlamaForCausalLM"
            else trtllm_model_config.architecture
        )
        try:
            model_cls = getattr(tensorrt_llm.models, architecture)
        except AttributeError as err:
            # Only a missing attribute means "unknown architecture"; anything else
            # (e.g. KeyboardInterrupt) must propagate untouched.
            raise AttributeError(
                f"Could not find TRTLLM model for architecture: {architecture}!"
            ) from err

        logger.set_level("info")
        plugin_config = PluginConfig()
        plugin_config.gpt_attention_plugin = gpt_attention_plugin
        plugin_config.gemm_plugin = gemm_plugin
        if paged_kv_cache:
            plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block)
        else:
            plugin_config.paged_kv_cache = False
        plugin_config.remove_input_padding = remove_input_padding
        plugin_config.use_paged_context_fmha = paged_context_fmha
        plugin_config.multiple_profiles = multiple_profiles

        if max_seq_len is None:
            max_seq_len = max_input_len + max_output_len

        max_num_tokens, opt_num_tokens = check_max_num_tokens(
            max_num_tokens=max_num_tokens,
            opt_num_tokens=opt_num_tokens,
            max_seq_len=max_seq_len,
            max_batch_size=max_batch_size,
            max_input_len=max_input_len,
            max_beam_width=max_beam_width,
            remove_input_padding=remove_input_padding,
            enable_context_fmha=plugin_config.context_fmha,
            tokens_per_block=tokens_per_block,
            multiple_profiles=multiple_profiles,
        )

        build_dict = {
            'max_input_len': max_input_len,
            'max_output_len': max_output_len,
            'max_batch_size': max_batch_size,
            'max_beam_width': max_beam_width,
            'max_seq_len': max_seq_len,
            'max_num_tokens': max_num_tokens,
            'opt_num_tokens': opt_num_tokens,
            'max_prompt_embedding_table_size': max_prompt_embedding_table_size,
            'gather_context_logits': False,
            'gather_generation_logits': False,
            'strongly_typed': False,
            'builder_opt': None,
            'use_refit': use_refit,
            'multiple_profiles': multiple_profiles,
        }
        build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config)

        if use_lora_plugin is not None:
            # NOTE: use_lora_plugin only gates creation of the LoraConfig below; it is
            # not written to plugin_config here.
            lora_config = LoraConfig(
                lora_dir=lora_ckpt_list,
                lora_ckpt_source='nemo',  # TODO : NEED TO SEE HOW TO HANDLE THIS FOR MCORE
                max_lora_rank=max_lora_rank,
                lora_target_modules=lora_target_modules,
            )
            build_config.lora_config = lora_config

        model = model_cls.from_config(trtllm_model_config)
        model = optimize_model(
            model,
            use_parallel_embedding=trtllm_model_config.use_parallel_embedding,
            share_embedding_table=trtllm_model_config.share_embedding_table,
        )
        preprocess_weights(trtllm_model_weights, trtllm_model_config)
        model.load(trtllm_model_weights)
        engine = build_trtllm(model, build_config)
        engine.save(engine_dir)
+ +from megatron.core.export.model_type import ModelType +from megatron.core.export.trtllm.model_to_trllm_mapping.falcon_model import FALCON_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.gemma_model import GEMMA_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_model import GPT_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_next_model import GPT_NEXT_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.llama_model import LLAMA_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.starcoder_model import STARCODER_DICT + +DEFAULT_CONVERSION_DICT = { + ModelType.llama: LLAMA_DICT, + ModelType.falcon: FALCON_DICT, + ModelType.gemma: GEMMA_DICT, + ModelType.starcoder: STARCODER_DICT, + ModelType.gpt: GPT_DICT, + ModelType.gptnext: GPT_NEXT_DICT, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py new file mode 100644 index 0000000000..d1469d02ba --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +FALCON_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + 'embedding.position_embeddings.weight': TRTLLMLayers.position_embedding, + # ATTENTION + 'decoder.layers.input_layernorm.weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.input_layernorm.bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + # MLP + 'decoder.layers.pre_mlp_layernorm.weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.pre_mlp_layernorm.bias': TRTLLMLayers.post_layernorm_bias, + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py new file mode 100644 index 0000000000..47a0211706 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py @@ -0,0 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +GEMMA_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + # ATTENTION + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + # MLP + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py new file mode 100644 index 0000000000..eda27600c6 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +GPT_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + 'embedding.position_embeddings.weight': TRTLLMLayers.position_embedding, + # ATTENTION + 'decoder.layers.input_layernorm.weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.input_layernorm.bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.bias': TRTLLMLayers.attention_qkv_bias, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + 'decoder.layers.self_attention.linear_proj.bias': TRTLLMLayers.attention_dense_bias, + # MLP + 'decoder.layers.pre_mlp_layernorm.weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.pre_mlp_layernorm.bias': TRTLLMLayers.post_layernorm_bias, + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc1.bias': TRTLLMLayers.mlp_fc_bias, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc2.bias': TRTLLMLayers.mlp_projection_bias, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py new file mode 100644 index 0000000000..ac5f84ef1b --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +GPT_NEXT_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + # ATTENTION + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + # MLP + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_bias': TRTLLMLayers.post_layernorm_bias, + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py new file mode 100644 index 0000000000..5fd2067081 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +LLAMA_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + 'embedding.position_embeddings.weight': TRTLLMLayers.position_embedding, + # ATTENTION + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + # MLP + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py new file mode 100644 index 0000000000..dce61d26c5 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +STARCODER_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + # ATTENTION + 'decoder.layers.input_layernorm.weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.input_layernorm.bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.bias': TRTLLMLayers.attention_qkv_bias, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + 'decoder.layers.self_attention.linear_proj.bias': TRTLLMLayers.attention_dense_bias, + # MLP + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc1.bias': TRTLLMLayers.mlp_fc_bias, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc2.bias': TRTLLMLayers.mlp_projection_bias, + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_bias': TRTLLMLayers.post_layernorm_bias, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/trt_model_config.py b/megatron/core/export/trtllm/trt_model_config.py new file mode 100644 index 0000000000..2ed09398c2 --- /dev/null +++ b/megatron/core/export/trtllm/trt_model_config.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +import tensorrt_llm + +from megatron.core.export.model_type import ModelType + +TRT_MODEL_CONFIG = { + ModelType.gpt: tensorrt_llm.models.gpt.config.GPTConfig, + ModelType.gptnext: tensorrt_llm.models.gpt.config.GPTConfig, + ModelType.starcoder: tensorrt_llm.models.gpt.config.GPTConfig, + ModelType.mixtral: tensorrt_llm.models.llama.config.LLaMAConfig, + ModelType.llama: tensorrt_llm.models.llama.config.LLaMAConfig, + ModelType.gemma: tensorrt_llm.models.GemmaConfig, + ModelType.falcon: tensorrt_llm.models.falcon.config.FalconConfig, +} diff --git a/megatron/core/export/trtllm/trt_model_type.py b/megatron/core/export/trtllm/trt_model_type.py new file mode 100644 index 0000000000..f45ff1786e --- /dev/null +++ b/megatron/core/export/trtllm/trt_model_type.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.model_type import ModelType + +TRT_MODEL_TYPE_STRING = { + ModelType.gpt: 'GPTForCausalLM', + ModelType.gptnext: 'GPTForCausalLM', + ModelType.starcoder: 'GPTForCausalLM', + ModelType.mixtral: 'LlamaForCausalLM', + ModelType.llama: 'LlamaForCausalLM', + ModelType.gemma: 'GemmaForCausalLM', + ModelType.falcon: 'FalconForCausalLM', +} diff --git a/megatron/core/export/trtllm/trtllm_helper.py b/megatron/core/export/trtllm/trtllm_helper.py new file mode 100644 index 0000000000..d8bef18b33 --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_helper.py @@ -0,0 +1,461 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
def __init__(
    self,
    transformer_config: TransformerConfig,
    model_type: ModelType,
    trtllm_conversion_dict: dict = None,
    position_embedding_type: str = 'learned_absolute',
    max_position_embeddings: int = None,
    rotary_percentage: float = 1.0,
    rotary_base: int = 10000,
    moe_tp_mode: int = 2,
    multi_query_mode: bool = False,
    activation: str = "gelu",
    seq_len_interpolation_factor: float = None,
    moe_renorm_mode=None,
    share_embeddings_and_output_weights=False,
):
    """Constructor for the TRTLLMHelper

    There are two public API's supported by this helper.
    a) get_trtllm_pretrained_config_and_model_weights
    b) build_and_save_engine

    Args:
        transformer_config (TransformerConfig): The transformer config
        model_type (ModelType): The type of the input model. Enum (megatron.core.export.model_type.ModelType)
        trtllm_conversion_dict (dict, optional): A conversion dictionary that maps your model layer names to trtllm equivalent layer names. Its entries are applied on top of the default mapping for model_type. Sample dictionaries are given in megatron/core/export/trtllm/model_to_trllm_mapping. NOTE: Ignore layer numbers in the model layer names. (e.g) decoder.layers.0.attention_qkv.weight will be decoder.layers.attention_qkv.weight in the mapping dictionary. Defaults to None (no overrides).
        position_embedding_type (str, optional): The position embedding type, one of 'learned_absolute' or 'rope'. Defaults to 'learned_absolute'.
        max_position_embeddings (int, optional): Max position embeddings value. Defaults to None.
        rotary_percentage (float, optional): The rotary percentage if using rope embedding. Defaults to 1.0.
        rotary_base (int, optional): The rotary base (theta value) if using rope embeddings. Defaults to 10000.
        moe_tp_mode (int, optional): TRTLLM Config. Defaults to 2.
        multi_query_mode (bool, optional): Defaults to False.
        activation (str, optional): Defaults to "gelu".
        seq_len_interpolation_factor (float, optional): The sequence length interpolation factor if using rope embeddings. Defaults to None.
        moe_renorm_mode (optional) : Renormalization mode if using mixture of experts. Defaults to None.
        share_embeddings_and_output_weights (bool, optional): True if input and output layers share weights. Defaults to False.

    Raises:
        KeyError: If model_type has no default mapping and no trtllm_conversion_dict is given.
        AssertionError: If position_embedding_type is not 'learned_absolute' or 'rope'.
    """

    self.transformer_config = transformer_config
    self.model_type = model_type

    default_mapping = DEFAULT_CONVERSION_DICT.get(model_type)
    if default_mapping is None and not trtllm_conversion_dict:
        raise KeyError(
            f"No default conversion dict for model type {model_type}. "
            "Pass trtllm_conversion_dict explicitly."
        )
    # Copy the defaults: updating the module-level table in place would leak one
    # instance's overrides into every later TRTLLMHelper of the same model type.
    self.trtllm_conversion_dict = dict(default_mapping or {})
    if trtllm_conversion_dict:
        self.trtllm_conversion_dict.update(trtllm_conversion_dict)

    assert position_embedding_type in [
        'learned_absolute',
        'rope',
    ], f"Position embedding type should be one of learned_absolute, rope. You entered {position_embedding_type}"
    self.position_embedding_type = position_embedding_type
    self.max_position_embeddings = max_position_embeddings
    self.rotary_percentage = rotary_percentage
    self.rotary_base = rotary_base
    self.moe_tp_mode = moe_tp_mode
    self.multi_query_mode = multi_query_mode
    self.activation = activation
    self.seq_len_interpolation_factor = seq_len_interpolation_factor
    self.moe_renorm_mode = moe_renorm_mode
    self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
# pylint: disable=line-too-long
def get_trtllm_pretrained_config_and_model_weights(
    self,
    model_state_dict,
    dtype: DataType,
    export_config: ExportConfig = None,
    on_device_distributed_conversion: bool = False,
    vocab_size: int = None,
    gpus_per_node: int = None,
    state_dict_split_by_layer_numbers: bool = True,
):
    """Get TRTLLM Config and Converted Model Weights

    This function returns the trtllm model weights as a list.
    There are two modes for conversion. The default is to use a single device cpu/gpu for conversion.
    NOTE: For faster performance, if your entire model will fit in memory, pre transfer the model state dict to cuda device and then call this function.
    For on device conversion it returns weights which will be used on the device itself.
    Same thing happens with the pretrained config

    Args:
        model_state_dict (dict): The input model state dictionary. The entire model state for single device conversion, or the model state dict of each GPU in the case of on device conversion.
        dtype (DataType): The data type of model precision
        export_config (ExportConfig, optional): The export config used to define inference tp size, pp size etc. Required for single device conversion; must be None for on device conversion, where it is inferred from the parallel state. Defaults to None.
        on_device_distributed_conversion (bool, optional): Convert on gpus in distributed setting. This assumes that the model state dict is sharded according to required inference model parallelism and that each gpu gets its part of the model state dict . Defaults to False.
        vocab_size (int, optional): The vocabulary size. Required for on device conversion; must be None for single device conversion. Defaults to None.
        gpus_per_node (int, optional): The number of gpus per node. Required for on device conversion.
        state_dict_split_by_layer_numbers (bool, optional): Are the model layers split by layer numbers in state dict. For example : mlp.fc1.weight can be represented like mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim]} or it can be like mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight ... for all layers. If you use representation 2 set this to True. Defaults to True

    Returns:
        Two lists . First list of trtllm converted model weights(Either on device, or a list of weights for each gpu) and the trtllm_model_configs.

    Raises:
        AssertionError: If the arguments do not match the chosen conversion mode.
    """
    if on_device_distributed_conversion:
        # NOTE: `assert (cond, "msg")` asserts a non-empty tuple and can never fail,
        # so the message must stay outside the parentheses.
        assert vocab_size is not None, "Need to pass in vocab_size for on device"
        assert self.model_type in [
            ModelType.gpt,
            ModelType.gptnext,
            ModelType.llama,
        ], "On device conversion only supported for model types gpt, gptnext and llama"
        assert export_config is None, (
            "Export config is inferred based on the parallel state. "
            "If you want to set inference tp 2, then load the model with this TP2 setting "
            "and just pass in the model state dict. "
        )
        assert (
            gpus_per_node is not None
        ), "Need to pass in gpus_per_node for on device conversion"
        trtllm_model_weights_on_device, trtllm_model_config = (
            self._get_trtllm_pretrained_config_and_model_weights_in_distributed_setting(
                model_state_dict, dtype, vocab_size, gpus_per_node
            )
        )
        return [trtllm_model_weights_on_device], [trtllm_model_config]

    # Single device conversion reads tp/pp sizes and embedding options from
    # export_config, so fail early with a clear message instead of an AttributeError.
    assert export_config is not None, "Need to pass in export_config for single device conversion"
    assert not (
        self.share_embeddings_and_output_weights and not export_config.use_embedding_sharing
    ), "Found share_embeddings_and_output_weights is True in the model. So set export_config.use_embedding_sharing to True"
    assert (
        vocab_size is None
    ), "Vocab size is inferred from the input layer for cpu conversion. So leave it as None"
    trtllm_model_weights_list, trtllm_model_config_list = (
        self._get_trtllm_pretrained_config_and_model_weights_list_on_single_device(
            export_config,
            model_state_dict,
            dtype,
            gpus_per_node,
            state_dict_split_by_layer_numbers,
        )
    )

    return trtllm_model_weights_list, trtllm_model_config_list
def _get_trtllm_pretrained_config_and_model_weights_list_on_single_device(
    self,
    export_config: ExportConfig,
    model_state_dict: dict,
    dtype: DataType,
    gpus_per_node=None,
    state_dict_split_by_layer_numbers=True,
):
    """Convert a full state dict on one device into per-rank TRTLLM weights and configs

    The whole model state dict is converted once, then one config and one weights
    dict are produced for every rank in range(inference_tp_size * inference_pp_size).

    Args:
        export_config (ExportConfig): The export config to set inference tp, pp size etc.
        model_state_dict (dict): The model state dictionary (All collected on cpu)
        dtype (DataType): The data type or model precision
        gpus_per_node (int, optional): Number of gpus per node. Falls back to export_config.inference_tp_size when not set.
        state_dict_split_by_layer_numbers (bool, optional): Whether the layers are split by layer number in the state dict; forwarded to the weights converter. Defaults to True

    Returns:
        Two lists . List of trtllm converted model weights and trtllm model configs (One for each gpu).
    """
    converter = SingleDeviceTRTLLMModelWeightsConverter(
        export_config=export_config,
        transformer_config=self.transformer_config,
        dtype=dtype,
        activation=self.activation,
        multi_query_mode=self.multi_query_mode,
    )
    # Convert the input model state dict to trtllm model weights dictionary
    converter.convert(
        model_state_dict=model_state_dict,
        trtllm_conversion_dict=self.trtllm_conversion_dict,
        state_dict_split_by_layer_numbers=state_dict_split_by_layer_numbers,
    )
    padded_vocab = converter.get_padded_vocab_size()

    tp_size = export_config.inference_tp_size
    pp_size = export_config.inference_pp_size
    num_ranks = tp_size * pp_size
    if not gpus_per_node:
        gpus_per_node = tp_size

    weights_per_rank = []
    configs_per_rank = []
    for rank in range(num_ranks):
        rank_mapping = tensorrt_llm.Mapping(
            world_size=num_ranks, rank=rank, tp_size=tp_size, pp_size=pp_size
        )

        # A fresh config per rank is required so each list element carries its own
        # mapping (and therefore its own rank value).
        rank_config = self._get_trtllm_config(
            export_config=export_config,
            world_size=num_ranks,
            gpus_per_node=gpus_per_node,
            vocab_size_padded=padded_vocab,
            dtype=dtype,
        )
        rank_config.mapping = rank_mapping
        configs_per_rank.append(rank_config)

        # Slice out the weights that belong to this rank.
        weights_per_rank.append(
            converter.get_local_model_weights_per_gpu(rank_mapping, rank_config)
        )

    return weights_per_rank, configs_per_rank
1024, + max_output_len: int = 1024, + max_batch_size: int = 4, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank: int = 64, + lora_target_modules=None, + max_prompt_embedding_table_size: int = 0, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, + paged_context_fmha: bool = False, + use_refit: bool = False, + max_num_tokens: int = None, + max_seq_len: int = None, + opt_num_tokens: int = None, + max_beam_width: int = 1, + tokens_per_block: int = 128, + multiple_profiles: bool = False, + gpt_attention_plugin: str = "auto", + gemm_plugin: str = "auto", + ): + """Method to build the TRTLLM Engine + + This method uses the TRTLLMEngineBuilder to build and save the engine to engine dir + + Args: + engine_dir (str): The file path to save the engine + trtllm_model_weights (dict): The TRTLLM converted model weights dict + trtllm_model_config : The TRTLLM Config + max_input_len (int, optional): Max input length. Defaults to 1024. + max_output_len (int, optional): Max output length. Defaults to 1024. + max_batch_size (int, optional): Max batch size. Defaults to 4. + lora_ckpt_list (_type_, optional): Lora checkpoint list. Defaults to None. + use_lora_plugin (_type_, optional): Use lora plugin. Defaults to None. + max_lora_rank (int, optional): Max lora rank. Defaults to 64. + lora_target_modules (_type_, optional): Lora target modules. Defaults to None. + max_prompt_embedding_table_size (int, optional): Max size of prompt embedding table. Defaults to 0. + paged_kv_cache (bool, optional): Use Paged KV cache. Defaults to True. + remove_input_padding (bool, optional): Remove input padding. Defaults to True. + paged_context_fmha (bool, optional): Paged context fmha. Defaults to False. + use_refit (bool, optional): Use refit. Defaults to False. + max_num_tokens (int, optional): Max num of tokens. Defaults to None. + max_seq_len (int, optional): Max seq length. Defaults to None. + opt_num_tokens (int, optional): Opt number of tokens. Defaults to None. 
+ max_beam_width (int, optional): Max beam width. Defaults to 1. + tokens_per_block (int, optional): Nmber of tokens per block. Defaults to 128. + multiple_profiles (bool, optional): Use multiple profiles. Defaults to False. + gpt_attention_plugin (str, optional): Gpt attention plugin to use. Defaults to "auto". + gemm_plugin (str, optional): Gemma plugin to use. Defaults to "auto". + """ + + TRTLLMEngineBuilder.build_and_save_engine( + engine_dir, + trtllm_model_weights, + trtllm_model_config, + max_input_len, + max_output_len, + max_batch_size, + lora_ckpt_list, + use_lora_plugin, + max_lora_rank, + lora_target_modules, + max_prompt_embedding_table_size, + paged_kv_cache, + remove_input_padding, + paged_context_fmha, + use_refit, + max_num_tokens, + max_seq_len, + opt_num_tokens, + max_beam_width, + tokens_per_block, + multiple_profiles, + gpt_attention_plugin, + gemm_plugin, + ) diff --git a/megatron/core/export/trtllm/trtllm_layers.py b/megatron/core/export/trtllm/trtllm_layers.py new file mode 100644 index 0000000000..0cf805dcb6 --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_layers.py @@ -0,0 +1,157 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
import re
from enum import Enum
from typing import Optional, Tuple

# Matches the layer index that directly follows 'layers.' (the '2' in 'decoder.layers.2.mlp').
# Defined at module level because a plain attribute assigned inside an Enum body would
# itself become an enum member.
_LAYER_NUMBER_PATTERN = re.compile(r'(?<=layers\.)\d+(?=\.)')


class TRTLLMLayers(Enum):
    """TRTLLM Layer names

    This Enum is used to map input model layer names to TRTLLM layer names.
    Per-layer entries carry no layer index in their value; the index is
    re-inserted after 'layers.' when a state dict is renamed.
    """

    # ONE TIME LAYERS (NOT ASSOCIATED TO TRANSFORMER BLOCK)
    # Input layers
    position_embedding = 'transformer.position_embedding.weight'
    vocab_embedding = 'transformer.vocab_embedding.weight'
    lm_head = 'lm_head.weight'

    # Output layers
    final_layernorm_weight = 'transformer.ln_f.weight'
    final_layernorm_bias = 'transformer.ln_f.bias'

    # TRANSFORMER LAYERS
    # Attention block related layers
    input_layernorm_weight = 'transformer.layers.input_layernorm.weight'
    input_layernorm_bias = 'transformer.layers.input_layernorm.bias'
    attention_qkv_weight = 'transformer.layers.attention.qkv.weight'
    attention_qkv_bias = 'transformer.layers.attention.qkv.bias'
    attention_dense_weight = 'transformer.layers.attention.dense.weight'
    attention_dense_bias = 'transformer.layers.attention.dense.bias'

    # mlp layers
    mlp_fc_weight = 'transformer.layers.mlp.fc.weight'
    mlp_fc_bias = 'transformer.layers.mlp.fc.bias'
    post_layernorm_weight = 'transformer.layers.post_layernorm.weight'
    post_layernorm_bias = 'transformer.layers.post_layernorm.bias'
    mlp_projection_weight = 'transformer.layers.mlp.proj.weight'
    mlp_projection_bias = 'transformer.layers.mlp.proj.bias'

    # mixture of expert layers
    mlp_router_weight = 'transformer.layers.mlp.router.weight'
    mlp_fc_weight_mixture_of_experts = 'transformer.layers.mlp.fc.weight.expert'
    mlp_projection_weight_mixture_of_experts = 'transformer.layers.mlp.proj.weight.expert'

    @staticmethod
    def return_layer_name_and_number(layer_name: str) -> Tuple[str, Optional[int]]:
        """Helper function to return layer name and number

        Given an input layer e.g decoder.layers.2.self_attention.linear_qkv.weight,
        this function returns decoder.layers.self_attention.linear_qkv.weight and layer number 2.
        Only the index that directly follows 'layers.' is removed; any other numeric
        segment in the name (for instance an expert index equal to the layer number)
        is left untouched.

        Args:
            layer_name (str): The input layer name

        Returns:
            Tuple[str, Optional[int]]: The layer name without the layer number, and the
            layer number (None when the name has no 'layers.<number>.' segment)
        """
        match = _LAYER_NUMBER_PATTERN.search(layer_name)
        if match is None:
            # Return the original name if no number is found
            return layer_name, None

        number_start, number_end = match.span()
        # Cut out exactly the matched digits plus the dot that follows them. A global
        # substitution of '.<number>.' would also strip unrelated segments with the same value.
        layer_name_without_number = layer_name[:number_start] + layer_name[number_end + 1 :]
        return layer_name_without_number, int(match.group(0))

    # pylint: disable=line-too-long
    @staticmethod
    def rename_input_layer_names_to_trtllm_layer_names(
        model_state_dict: dict,
        trtllm_conversion_dict: dict,
        state_dict_split_by_layer_numbers: bool = True,
    ) -> dict:
        """Helper function to rename model layer names to TRTLLM Layer names

        We go through each layer (key) in the model state dict and map it to the
        equivalent TRTLLMLayers name. If a layer number is associated with the key,
        we extract it, map the number-free name through trtllm_conversion_dict and
        insert the layer number back after 'layers.' in the TRTLLM name.
        Keys containing "_extra_state" are deleted. The input dict is modified in
        place and also returned.

        Args:
            model_state_dict (dict): The original model state dict
            trtllm_conversion_dict (dict): The conversion dictionary mapping input model layer names (without layer numbers) to TRTLLMLayers members
            state_dict_split_by_layer_numbers (bool, optional): Whether per-layer weights are stored under separate numbered keys (e.g decoder.layers.2.mlp.linear_fc1.weight). When True, every key containing 'layers' must carry a layer number. Set it to False when the weights of all layers are stacked under one key without a number. Defaults to True

        Raises:
            ValueError: If a key has no entry in trtllm_conversion_dict
            AssertionError: If a layer number is required but missing, or if a mapped value is not a TRTLLMLayers member

        Returns:
            dict: The model state dict with the keys (i.e original model layer names) replaced by trtllm layer names
        """
        # Iterate over a snapshot of the keys because the dict is mutated inside the loop
        for original_model_layer_name in list(model_state_dict.keys()):
            if "_extra_state" in original_model_layer_name:
                del model_state_dict[original_model_layer_name]
                continue

            original_layer_name_without_number, layer_number = (
                TRTLLMLayers.return_layer_name_and_number(original_model_layer_name)
            )
            if 'layers' in original_layer_name_without_number and state_dict_split_by_layer_numbers:
                assert (
                    layer_number is not None
                ), f"Layer number is None for {original_model_layer_name} and state_dict_split_by_layer_numbers is set to True. Consider setting it False"

            if original_layer_name_without_number not in trtllm_conversion_dict:
                raise ValueError(
                    f'Unable to rename key {original_layer_name_without_number}. Provide an appropriate mapping in the trtllm_conversion_dict when you initialize TRTLLMHelper'
                )

            trtllm_layer = trtllm_conversion_dict[original_layer_name_without_number]
            assert isinstance(
                trtllm_layer, TRTLLMLayers
            ), f"{trtllm_layer} is not supported for conversion. Please use one of the TRTLLMLayerNames we provided in megatron/core/export/trtllm/trtllm_layer_names"

            value = model_state_dict.pop(original_model_layer_name)

            if layer_number is not None:
                # Zero-width match right after 'layers.' : insert '<layer_number>.' there
                trtllm_layer_name_with_number = re.sub(
                    r'(?<=layers\.)', f'{layer_number}.', trtllm_layer.value
                )
                model_state_dict[trtllm_layer_name_with_number] = value
            else:
                model_state_dict[trtllm_layer.value] = value

        return model_state_dict


# These layers are not associated with the transformer block.
# So they don't have a layer number (i.e. independent of the number of layers in the model)
NON_TRANSFORMER_LAYERS_NAMES = [
    TRTLLMLayers.vocab_embedding.value,
    TRTLLMLayers.position_embedding.value,
    TRTLLMLayers.lm_head.value,
    TRTLLMLayers.final_layernorm_weight.value,
    TRTLLMLayers.final_layernorm_bias.value,
]


def get_layer_name_without_prefix(layer: TRTLLMLayers) -> str:
    """Get TRTLayer name without prefix

    Given a layer e.g TRTLLMLayers.attention_qkv_weight it returns 'attention.qkv.weight'

    Args:
        layer (TRTLLMLayers): The TRTLLMLayer

    Returns:
        str: The TRTLLMLayers suffix (i.e. the layer name with 'transformer.layers.' removed)
    """
    layer_name_without_prefix = layer.value.replace("transformer.layers.", "")
    return layer_name_without_prefix


# Patch file boundary: megatron/core/export/trtllm/trtllm_weights_converter/__init__.py (new file) contains only:
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Patch file boundary: megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py (new file)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

import torch
from tqdm import tqdm

from megatron.core import parallel_state
from megatron.core.export.data_type import DataType
from megatron.core.export.trtllm.trtllm_layers import NON_TRANSFORMER_LAYERS_NAMES, TRTLLMLayers
from megatron.core.export.trtllm.trtllm_layers import get_layer_name_without_prefix as suffix
from megatron.core.tensor_parallel.utils import VocabUtility
from megatron.core.transformer.transformer_config import TransformerConfig

# Layer-name suffixes whose weights are stored as-is (apart from the optional layernorm +1 shift).
_DIRECT_COPY_LAYER_SUFFIXES = tuple(
    suffix(layer)
    for layer in (
        TRTLLMLayers.input_layernorm_weight,
        TRTLLMLayers.input_layernorm_bias,
        TRTLLMLayers.post_layernorm_weight,
        TRTLLMLayers.post_layernorm_bias,
        TRTLLMLayers.attention_dense_bias,
        TRTLLMLayers.mlp_projection_bias,
        TRTLLMLayers.mlp_router_weight,
        TRTLLMLayers.attention_dense_weight,
        TRTLLMLayers.mlp_projection_weight,
    )
)

# Layer-name suffixes of the first MLP linear layer (split in two for gated activations).
_MLP_FC_LAYER_SUFFIXES = (suffix(TRTLLMLayers.mlp_fc_weight), suffix(TRTLLMLayers.mlp_fc_bias))

# Activations for which the fc tensor is chunked into an fc half and a gate half.
_GATED_ACTIVATIONS = ("swiglu", "geglu", "fast-swiglu", "fast-geglu")


def str_dtype_to_torch(dtype: DataType):
    """Get torch datatype from input datatype"""
    from tensorrt_llm._utils import str_dtype_to_torch

    return str_dtype_to_torch(dtype.name)


# pylint: disable=line-too-long
class DistributedTRTLLMModelWeightsConverter:
    """The TRTLLM Converter class used for GPU (on device) conversion

    This class is used to convert models that are sharded and on gpus. It assumes that the model is already sharded the way you want to export it, i.e. if you want to export to tp2pp2, then load the model in a tp2pp2 setting and pass in the respective state dictionaries.
    """

    def __init__(
        self,
        transformer_config: TransformerConfig,
        dtype: DataType,
        multi_query_mode: bool = False,
        activation: str = "gelu",
    ):
        """Constructor for the DistributedTRTLLMModelWeightsConverter class

        Stores the conversion settings and reads the tensor/pipeline parallel sizes, ranks
        and the tensor parallel group from megatron.core.parallel_state.

        Args:
            transformer_config (TransformerConfig): The transformer config
            dtype (DataType): The data type or model precision
            multi_query_mode (bool, optional): Only consulted when num_query_groups is 0; selects a single kv head instead of one per attention head. Defaults to False.
            activation (str, optional): Activation name; gated variants split the mlp fc tensor into fc and gate. Defaults to "gelu".

        Raises:
            AssertionError: If the virtual pipeline parallel size is set to a value other than 1
        """
        self.transformer_config = transformer_config
        self.trtllm_model_weights = {}
        self.storage_type = str_dtype_to_torch(dtype)
        self.activation = activation
        num_kv_heads = self.transformer_config.num_query_groups
        if num_kv_heads == 0:
            if multi_query_mode:
                num_kv_heads = 1
            else:
                num_kv_heads = self.transformer_config.num_attention_heads
        self.num_kv_heads = num_kv_heads

        self.inference_pp_size = parallel_state.get_pipeline_model_parallel_world_size()
        self.inference_tp_size = parallel_state.get_tensor_model_parallel_world_size()
        self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
        self.pp_rank = parallel_state.get_pipeline_model_parallel_rank()
        self.tp_group = parallel_state.get_tensor_model_parallel_group()
        vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size()

        assert (
            vp_size is None or vp_size == 1
        ), "Virtual parallelism is not supported in GPU Converter. Gather the VP chunks and use PP config."

    def _add_to_trtllm_model_weights(self, val: torch.Tensor, layer_name: str):
        """Cast a weight to the storage type and store it under layer_name

        Tensors with two or more dimensions are flattened to 2D (first dim kept) and transposed.

        Args:
            val (torch.Tensor): The weight to store
            layer_name (str): The key used in trtllm_model_weights
        """
        assert torch.is_tensor(val), f"Expected a tensor for {layer_name} but got {type(val)}"
        val = val.to(self.storage_type)
        val = val.detach().contiguous()
        if val.ndim >= 2:
            val = torch.transpose(val.reshape(val.shape[0], -1), 0, 1)
        # The stored tensor is `val` itself, on whatever device it lives on. No pinned host
        # buffer is pre-allocated: a buffer that is rebound right away is never used.
        # NOTE(review): if a host-side copy is wanted, allocate the pinned buffer and copy_ into it.
        self.trtllm_model_weights[layer_name] = val

    def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor):
        """Convert Transformer layers to TRTLLM weights

        Transformer layers refers to layers within the transformer block. They have a layer number associated with them. Depending on the layer we either save it directly to trtllm_model_weights, or regroup it (gated mlp split, qkv reordering) and save the result.

        Args:
            layer_name (str): The TRTLLM layer name (including the layer number) of the weight
            val (torch.Tensor): The weight for that layer

        Raises:
            ValueError: If the layer name matches none of the handled layer types
        """
        if val.ndim == 2:
            val = val.T

        if layer_name.endswith(_DIRECT_COPY_LAYER_SUFFIXES):
            # Same as layernorm1p in NeMo
            if (
                self.transformer_config.layernorm_zero_centered_gamma
                and self.transformer_config.normalization == "LayerNorm"
                and 'layernorm.weight' in layer_name
            ):
                val = val + 1.0

            self._add_to_trtllm_model_weights(val=val, layer_name=layer_name)

        elif layer_name.endswith(_MLP_FC_LAYER_SUFFIXES):
            if self.activation in _GATED_ACTIVATIONS:
                # First half along the last dim stays under the fc name, second half becomes the gate
                val, gate = torch.chunk(val, 2, dim=-1)
                gate_layer_name = layer_name.replace("fc", "gate")
                self._add_to_trtllm_model_weights(val=gate, layer_name=gate_layer_name)

            self._add_to_trtllm_model_weights(val=val, layer_name=layer_name)

        elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_bias)):
            qkv_hidden_dim = val.shape[0]
            size_per_head = (
                qkv_hidden_dim
                // (self.transformer_config.num_attention_heads + 2 * self.num_kv_heads)
                * self.inference_tp_size
            )
            q_num = self.transformer_config.num_attention_heads // self.num_kv_heads

            # We first concat all sub weights per tp rank together.
            val = val.reshape(self.num_kv_heads // self.inference_tp_size, q_num + 2, size_per_head)
            qkv = torch.split(val, [q_num, 1, 1], dim=1)
            split_vals = torch.concatenate(
                [qkv[0].reshape(-1), qkv[1].reshape(-1), qkv[2].reshape(-1)], dim=0
            )
            self._add_to_trtllm_model_weights(val=split_vals, layer_name=layer_name)

        # TODO : Should add a atten layer dimension "qkvqkv, qqkkvv etc to see how to reshape here"
        elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_weight)):
            hidden_dim = val.shape[0]
            size_per_head = self.transformer_config.kv_channels
            if size_per_head is None:
                size_per_head = hidden_dim // self.transformer_config.num_attention_heads
            q_num = self.transformer_config.num_attention_heads // self.num_kv_heads

            val = val.reshape(
                hidden_dim, self.num_kv_heads // self.inference_tp_size, q_num + 2, size_per_head
            )
            qkv = torch.split(val, [q_num, 1, 1], dim=2)
            split_vals = torch.concatenate(
                [
                    qkv[0].reshape(hidden_dim, -1),
                    qkv[1].reshape(hidden_dim, -1),
                    qkv[2].reshape(hidden_dim, -1),
                ],
                dim=1,
            )
            self._add_to_trtllm_model_weights(val=split_vals, layer_name=layer_name)

        else:
            raise ValueError(f"{layer_name} cannot be handled by GPU converter")

    def _convert_non_transformer_layer(self, model_state_dict: dict, layer_name: str):
        """Convert Non Transformer layers to TRTLLM weights

        Non transformer layers refers to layers that occur only once in the model (e.g Embedding, final output layer etc.). They don't have any layer number associated with them. If the layer is present we remove it from the state dict and add it to trtllm_model_weights; otherwise nothing happens.

        Args:
            model_state_dict (dict): The input model state dictionary (keys already renamed to TRTLLM names)
            layer_name (str): The TRTLLM layer name that we want to convert
        """
        if layer_name in model_state_dict:
            val = model_state_dict.pop(layer_name)
            self._add_to_trtllm_model_weights(val=val, layer_name=layer_name)

    # ----------------Convert Embeddings----------------
    def _get_remove_vocab_padding(self, layer_name, model_state_dict, tokenizer_vocab_size):
        """Strip vocab padding from an embedding-like weight and re-shard it for this tp rank

        With tensor parallelism the local shards are combined into the full padded tensor
        through an all_reduce over the tensor parallel group, truncated to
        tokenizer_vocab_size rows, and then the row range of this tp rank is selected.

        Args:
            layer_name (str): The key of the weight in model_state_dict
            model_state_dict (dict): The model state dictionary
            tokenizer_vocab_size (int): The vocab size of the tokenizer

        Returns:
            The transposed unpadded weight, or None if layer_name is missing (or maps to None)
        """
        val = model_state_dict.get(layer_name, None)
        if val is None:
            return None

        if self.inference_tp_size > 1:  # Gather padded tensor chunks
            vocab_size_padded = val.shape[0] * self.inference_tp_size
            vocab_start_index, vocab_end_index = VocabUtility.vocab_range_from_global_vocab_size(
                vocab_size_padded, self.tp_rank, self.inference_tp_size
            )
            dim_size = list(val.size())
            dim_size[0] = vocab_size_padded
            # Every rank writes its own shard into a zero tensor, so summing over ranks
            # reconstructs the full padded weight.
            gathered_val = torch.zeros(
                dim_size, dtype=val.dtype, device=torch.cuda.current_device()
            )
            gathered_val[vocab_start_index:vocab_end_index] = val
            torch.distributed.all_reduce(gathered_val, group=self.tp_group)
            val = gathered_val
        unpadded = val[:tokenizer_vocab_size]
        if self.inference_tp_size > 1:  # Split gathered val for val parallel embedding
            vocab_start_index, vocab_end_index = VocabUtility.vocab_range_from_global_vocab_size(
                tokenizer_vocab_size, self.tp_rank, self.inference_tp_size
            )
            unpadded = unpadded[vocab_start_index:vocab_end_index]
        return unpadded.T  # TRTLLM expects (vocab_size, hidden_size) so need extra transpose

    @torch.no_grad()
    def convert(
        self, model_state_dict: dict, trtllm_conversion_dict: dict, tokenizer_vocab_size: int
    ):
        """Convert model weights to trtllm model weights

        This method renames the keys of the model state dict to TRTLLM layer names, converts the non transformer layers that are present, and then converts every remaining (transformer) layer. Results are stored in self.trtllm_model_weights. The input state dict is modified in place.

        Args:
            model_state_dict (dict): The model state dict of this rank
            trtllm_conversion_dict (dict): The conversion dictionary used to convert model layer names to trtllm layer names
            tokenizer_vocab_size (int): The vocab size of the tokenizer
        """

        # First step is to convert input model layer names to equivalent trtllm layer names
        model_state_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names(
            model_state_dict=model_state_dict, trtllm_conversion_dict=trtllm_conversion_dict
        )

        embedding_layer_names = (
            TRTLLMLayers.vocab_embedding.value,
            TRTLLMLayers.lm_head.value,
        )

        # Convert the non transformer layers
        for layer_name in NON_TRANSFORMER_LAYERS_NAMES:
            # Not every state dict holds every one-time layer (e.g. no lm_head entry, or no
            # position embedding). Skip absent layers instead of storing None / raising KeyError.
            if layer_name not in model_state_dict:
                continue

            if layer_name in embedding_layer_names:
                # For embedding layers alone we do some pre processing
                model_state_dict[layer_name] = self._get_remove_vocab_padding(
                    layer_name, model_state_dict, tokenizer_vocab_size
                )
            # TODO : Check if this handling of position embedding is right.
            elif layer_name == TRTLLMLayers.position_embedding.value:
                position_embedding = model_state_dict[layer_name]
                req_position_embedding = position_embedding.chunk(self.inference_tp_size)[
                    self.tp_rank
                ]
                model_state_dict[layer_name] = req_position_embedding.T

            self._convert_non_transformer_layer(
                model_state_dict=model_state_dict, layer_name=layer_name
            )

        # Everything left in the state dict is a transformer layer
        for layer_name, value in tqdm(
            model_state_dict.items(), desc="Converting to TRTLLM Weights"
        ):
            self._convert_transformer_layer(layer_name, value)


# Patch file boundary: megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py (new file)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

import re

import torch
from tqdm import tqdm

from megatron.core.export.data_type import DataType
from megatron.core.export.export_config import ExportConfig
from megatron.core.export.trtllm.trtllm_layers import NON_TRANSFORMER_LAYERS_NAMES, TRTLLMLayers
from megatron.core.export.trtllm.trtllm_layers import get_layer_name_without_prefix as suffix
from megatron.core.transformer.transformer_config import TransformerConfig


# pylint: disable=line-too-long
# TODO: Writing TRT imports this way so that it can be mocked in the test_trtllm_cpu_converter.py unit test
# TODO: Figure out how to patch it directly from the trtllm library
def pad_vocab_size(vocab_size: int, tp_size: int):
    """Pad vocab size based on inference size"""
    from tensorrt_llm._utils import pad_vocab_size

    return pad_vocab_size(vocab_size, tp_size)


def str_dtype_to_torch(dtype: DataType):
    """Get torch datatype from input datatype"""
    from tensorrt_llm._utils import str_dtype_to_torch

    return str_dtype_to_torch(dtype.name)
str_dtype_to_torch(dtype.name) + + +class SingleDeviceTRTLLMModelWeightsConverter: + """Class to convert Model weights to TRTLLM weights on CPU""" + + def __init__( + self, + export_config: ExportConfig, + transformer_config: TransformerConfig, + dtype: DataType, + multi_query_mode: bool = False, + activation: str = "gelu", + ): + """Constructor for the TRTLLMModelWeightsConverterCPU class + + This class is responsible to convert the model weights to TRTLLM equivalent weights and also split them for each GPU rank and return as a list. + + Args: + export_config (ExportConfig): The export config with inference tp size, pp size etc. + transformer_config (TransformerConfig): The transformer config + dtype (DataType): The data type or model precision + multi_query_mode (bool, optional): Defaults to False. + activation (str, optional): Defaults to "gelu". + """ + self.export_config = export_config + self.transformer_config = transformer_config + self.trtllm_model_weights = {} + self.storage_type = str_dtype_to_torch(dtype) + self.activation = activation + num_kv_heads = self.transformer_config.num_query_groups + if num_kv_heads == 0: + if multi_query_mode: + num_kv_heads = 1 + else: + num_kv_heads = self.transformer_config.num_attention_heads + self.num_kv_heads = num_kv_heads + + def _convert_non_transformer_layer(self, model_state_dict: dict, layer_name: str): + """Convert Non Transformer layers to TRTLLM weights + + Non transformer layers referes to layers that occur only once in the model (e.g Embedding , final output layer etc. ) They dont have any layer number associated with them. 
We remove this layer from the original state dict and cast it to storage type and convert to numpy and add it to trtllm_model_weights + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer_name (str): The TRTLLM Layer name that we want to convert + """ + if layer_name in model_state_dict: + val = model_state_dict.pop(layer_name) + val = val.to(self.storage_type).detach().contiguous() + self.trtllm_model_weights[layer_name] = val + + def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): + """Convert Transformer layers to TRTLLM weights + + Transformer layers referes to layers within the transformber block. They have a layer number associated with them. Depending on the layer we either directly save it to trtllm_model_weights, or split it across some dimension and save the splits + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer (TRTLLMLayerNames): The TRTLLM Layer that we want to change + """ + + def _add_to_trtllm_model_weights(val: torch.Tensor, layer_name: str, split_type=None): + """Add the input weight to trtllm_model_weights + + Depending on split (Expert split/Tensor split/None) we split the input data and add accordingly + + Args: + val (torch.Tensor): The model weight to be added + layer_name (str): The TRTLLMlayername as a string + split_type (str, optional): The split type. Defaults to None. 
+ """ + if split_type == 'expert_split': + for split_num, split_val in enumerate(val): + self.trtllm_model_weights[f'{layer_name}.{split_num}.bin'] = ( + split_val.to(self.storage_type).detach().contiguous() + ) + elif split_type == 'tensor_split': + for split_num, split_val in enumerate(val): + if split_val.ndim >= 2: + split_val = torch.transpose(split_val.reshape(split_val.shape[0], -1), 1, 0) + + self.trtllm_model_weights[f'{layer_name}.{split_num}.bin'] = ( + split_val.to(self.storage_type).detach().contiguous() + ) + else: + if val.ndim >= 2: + val = torch.transpose(val.reshape(val.shape[0], -1), 1, 0) + self.trtllm_model_weights[layer_name] = ( + val.to(self.storage_type).detach().contiguous() + ) + + if val.ndim == 2: + val = val.T + + if ( + layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_router_weight)) + ): + # Same as layernorm1p in NeMo + if ( + self.transformer_config.layernorm_zero_centered_gamma + and self.transformer_config.normalization == "LayerNorm" + and 'layernorm.weight' in layer_name + ): + val = val + 1.0 + + _add_to_trtllm_model_weights(val=val, layer_name=layer_name, split_type=None) + + elif layer_name.endswith( + suffix(TRTLLMLayers.attention_dense_weight) + ) or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_weight)): + split_vals = torch.chunk(val, self.export_config.inference_tp_size, axis=0) + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='tensor_split' + ) + + elif layer_name.endswith(suffix(TRTLLMLayers.mlp_fc_weight)) or 
layer_name.endswith( + suffix(TRTLLMLayers.mlp_fc_bias) + ): + split_gated_activation = self.activation in [ + "swiglu", + "geglu", + "fast-swiglu", + "fast-geglu", + ] + if split_gated_activation: + val, gate = torch.chunk(val, 2, axis=-1) + gate_layer_name = layer_name.replace("fc", "gate") + split_vals = torch.chunk(gate, self.export_config.inference_tp_size, axis=-1) + _add_to_trtllm_model_weights( + val=split_vals, layer_name=gate_layer_name, split_type='tensor_split' + ) + + split_vals = torch.chunk(val, self.export_config.inference_tp_size, axis=-1) + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='tensor_split' + ) + + elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_bias)): + qkv_hidden_dim = val.shape[0] + size_per_head = qkv_hidden_dim // ( + self.transformer_config.num_attention_heads + 2 * self.num_kv_heads + ) + q_num = self.transformer_config.num_attention_heads // self.num_kv_heads + + # We first concat all sub weights per tp rank together. 
+ val = val.reshape(self.num_kv_heads, q_num + 2, size_per_head) + + qkv = torch.split(val, [q_num, 1, 1], dim=1) + q_split = torch.chunk(qkv[0], self.export_config.inference_tp_size, axis=0) + k_split = torch.chunk(qkv[1], self.export_config.inference_tp_size, axis=0) + v_split = torch.chunk(qkv[2], self.export_config.inference_tp_size, axis=0) + + # Concatenate Q, K, and V together + split_vals = [ + torch.concatenate( + [q_split[i].reshape(-1), k_split[i].reshape(-1), v_split[i].reshape(-1)], dim=0 + ) + for i in range(self.export_config.inference_tp_size) + ] + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='tensor_split' + ) + + # TODO : Should add a atten layer dimension "qkvqkv, qqkkvv etc to see how to reshape here" + elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_weight)): + hidden_dim = val.shape[0] + size_per_head = self.transformer_config.kv_channels + if size_per_head is None: + size_per_head = hidden_dim // self.transformer_config.num_attention_heads + q_num = self.transformer_config.num_attention_heads // self.num_kv_heads + + # When the merge factor exceeds 1, the 'vals' list will have multiple entries. + # Depending on the format, 'vals' can look like either [QQQQ..KV, QQQQ..KV, ...](for GQA) or [QKV, QKV, ...](for MHA). + # We first concat all sub weights per tp rank together. + val = val.reshape(hidden_dim, self.num_kv_heads, q_num + 2, size_per_head) + + # Split the QKV to separate variables. + qkv = torch.split(val, [q_num, 1, 1], dim=2) + + query_groups_shape = qkv[0].shape + if len(query_groups_shape) > 1: + if (query_groups_shape[1] % self.export_config.inference_tp_size) != 0: + raise Exception( + "Number of query groups of the models is {0}. 
Please select tensor parallelism size " + "that can split the number of query groups to equal number of query matrices in the " + "each GPU.".format(query_groups_shape[1]) + ) + + q_split = torch.chunk(qkv[0], self.export_config.inference_tp_size, axis=1) + k_split = torch.chunk(qkv[1], self.export_config.inference_tp_size, axis=1) + v_split = torch.chunk(qkv[2], self.export_config.inference_tp_size, axis=1) + + # Concatenate Q, K, and V together + split_vals = [ + torch.concatenate( + [ + q_split[i].reshape(hidden_dim, -1), + k_split[i].reshape(hidden_dim, -1), + v_split[i].reshape(hidden_dim, -1), + ], + dim=1, + ) + for i in range(self.export_config.inference_tp_size) + ] + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='tensor_split' + ) + + elif layer_name.endswith(suffix(TRTLLMLayers.mlp_fc_weight_mixture_of_experts)): + w1, w3 = torch.chunk(val, 2, axis=1) + # w1 splits + split_w1s = torch.chunk(w1, self.export_config.inference_tp_size, axis=1) + # w3 splits + split_w3s = torch.chunk(w3, self.export_config.inference_tp_size, axis=1) + + split_vals = [torch.concatenate(item, dim=1) for item in zip(split_w3s, split_w1s)] + layer_name = layer_name.replace(".expert", "") # Remove suffix .expert from key + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='expert_split' + ) + + elif layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_weight_mixture_of_experts)): + split_vals = torch.chunk(val, self.export_config.inference_tp_size, axis=-1) + layer_name = layer_name.replace(".expert", "") # Remove suffix .expert from key + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='expert_split' + ) + else: + raise ValueError(f"{layer_name} cannot be handled by converter") + + @torch.no_grad() + def convert( + self, model_state_dict: dict, trtllm_conversion_dict, state_dict_split_by_layer_numbers=True + ): + """Convert model weights to trtllm model weights + + This method 
goes through each layer in the model state dict and converts to equivalent trtllm model weights. It also handles splitting across TP dimension , expert split etc. + + Args: + model_state_dict (dict): The full model state dict (all on CPU) + trtllm_conversion_dict (dict): The conversion dictionary used to convert model layer names to trtllm layer names + state_dict_split_by_layer_numbers (bool, optional): Are the model layers split by layer numbers in state dict. For example : mlp.fc1.weight can be represented like mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim]} or it can be like mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight ... for all layers. If you use represenation 2 set this to True. Defaults to True + """ + + # First step is to convert input model layer names to equivalent trtllm layer names + model_state_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=model_state_dict, + trtllm_conversion_dict=trtllm_conversion_dict, + state_dict_split_by_layer_numbers=state_dict_split_by_layer_numbers, + ) + + # Convert the non transformer layers + for layer_name in NON_TRANSFORMER_LAYERS_NAMES: + # For vocab embedding layer alone we pad the weights to be divisible by inference tp size + if ( + layer_name == TRTLLMLayers.vocab_embedding.value + and self.export_config.use_parallel_embedding + ): + val = model_state_dict[TRTLLMLayers.vocab_embedding.value] + vocab_size = val.shape[0] + if vocab_size % self.export_config.inference_tp_size != 0: + vocab_size_padded = pad_vocab_size( + vocab_size, self.export_config.inference_tp_size + ) + pad_width = vocab_size_padded - vocab_size + val = torch.nn.functional.pad(val, (0, 0, 0, pad_width), value=0) + model_state_dict[layer_name] = val + + self._convert_non_transformer_layer( + model_state_dict=model_state_dict, layer_name=layer_name + ) + + transformer_layers_dict = {} + # Convert the transformer layers + if 
state_dict_split_by_layer_numbers: + # Already model dict is split by layer numbers + transformer_layers_dict = model_state_dict + else: + # Here we split the model state dict into individual layers + for layer_name in list(model_state_dict.keys()): + value = model_state_dict.pop(layer_name) + for layer_number in range(self.transformer_config.num_layers): + # e.g transformer.layers.mlp.fc.bias => transformer.layers.2.mlp.fc.bias + layer_name_with_layer_number = re.sub( + r'(?<=layers\.)', f'{layer_number}.', layer_name + ) + transformer_layers_dict[layer_name_with_layer_number] = value[layer_number] + + for layer_name, value in tqdm( + transformer_layers_dict.items(), desc="Converting to TRTLLM Weights" + ): + self._convert_transformer_layer(layer_name, value) + + def get_padded_vocab_size(self) -> int: + """Return the paded vocab size + + We extract the lm head and vocab embedding and use that to determine padded_vocab_size + + Returns: + int: Padded vocab size + """ + lm_head_weight = self.trtllm_model_weights.get(TRTLLMLayers.lm_head.value, None) + vocab_size = self.trtllm_model_weights[TRTLLMLayers.vocab_embedding.value].shape[0] + vocab_size_padded = ( + vocab_size + if lm_head_weight is None + else pad_vocab_size(vocab_size, self.export_config.inference_tp_size) + ) + return vocab_size_padded + + def get_local_model_weights_per_gpu(self, mapping, trtllm_model_config: dict): + """Get the trtllm model weights split per gpu + + Given the trtllm mapping information (tp, pp rank etc) we split the model weights in a list, with each element of the list corresponding to the weights of each gpu rank + + Args: + mapping : The trtllm mapping information + trtllm_model_config (dict): The trtllm model config + """ + + def _split(torch_tensor, tp_size, idx, dim=0): + """Splits the np tensor v on dim and return the idx's slice.""" + if tp_size == 1: + return torch_tensor + if len(torch_tensor.shape) == 1: + return torch.chunk(torch_tensor, tp_size)[idx].contiguous() + else: 
+ return torch.chunk(torch_tensor, tp_size, axis=dim)[idx].contiguous() + + pp_layer_range = mapping.pp_layers(self.transformer_config.num_layers) + + trtllm_model_weights_per_gpu = {} + for layer_name, value in self.trtllm_model_weights.items(): + if layer_name in NON_TRANSFORMER_LAYERS_NAMES: + continue + + # Happens in the case of TP split or expert split + if layer_name.endswith(".bin"): + if layer_name.endswith(f"{mapping.tp_rank}.bin"): + layer_name = layer_name.replace(f".{mapping.tp_rank}.bin", "") + else: + continue + + layer_num = int(layer_name.split(".")[2]) + if layer_num in pp_layer_range: + layer_name = layer_name.replace( + f"layers.{layer_num}", f"layers.{layer_num - pp_layer_range[0]}" + ) + else: + continue + if ( + hasattr(trtllm_model_config, 'new_decoder_architecture') + and trtllm_model_config.new_decoder_architecture + and "post_layernorm" in layer_name + ): + layer_name = layer_name.replace("post_layernorm", "mlp_layernorm") + + trtllm_model_weights_per_gpu[layer_name] = value + + if mapping.is_first_pp_rank(): + embedding_weight = ( + _split( + self.trtllm_model_weights[TRTLLMLayers.vocab_embedding.value], + mapping.tp_size, + mapping.tp_rank, + ) + if self.export_config.use_parallel_embedding + else self.trtllm_model_weights[TRTLLMLayers.vocab_embedding.value] + ) + + trtllm_model_weights_per_gpu[TRTLLMLayers.vocab_embedding.value] = embedding_weight + + pos_embedding_weight = self.trtllm_model_weights.get( + TRTLLMLayers.position_embedding.value + ) + if pos_embedding_weight is not None: + if self.export_config.use_parallel_embedding: + pos_embedding_weight = _split( + pos_embedding_weight, mapping.tp_size, mapping.tp_rank + ) + + trtllm_model_weights_per_gpu[TRTLLMLayers.position_embedding.value] = ( + pos_embedding_weight + ) + + if mapping.is_last_pp_rank(): + lm_head_weight = self.trtllm_model_weights.get(TRTLLMLayers.lm_head.value, None) + if lm_head_weight is not None: + trtllm_model_weights_per_gpu[TRTLLMLayers.lm_head.value] = 
_split( + lm_head_weight, mapping.tp_size, mapping.tp_rank + ) + + trtllm_model_weights_per_gpu[TRTLLMLayers.final_layernorm_weight.value] = ( + self.trtllm_model_weights[TRTLLMLayers.final_layernorm_weight.value] + ) + + ln_f_bias = self.trtllm_model_weights.get(TRTLLMLayers.final_layernorm_bias.value) + if ln_f_bias is not None: + trtllm_model_weights_per_gpu[TRTLLMLayers.final_layernorm_bias.value] = ln_f_bias + + return trtllm_model_weights_per_gpu diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 751bcedb13..0dbd1a58f2 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -3,13 +3,13 @@ import dataclasses import os import warnings -from importlib.metadata import version from typing import Callable import torch import transformer_engine as te -from pkg_resources import packaging +from packaging.version import Version as PkgVersion from torch import Tensor +from torch.nn.parameter import Parameter from megatron.core import ModelParallelConfig, parallel_state from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding @@ -19,33 +19,25 @@ get_context_parallel_group, get_tensor_and_expert_parallel_world_size, get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, ) from megatron.core.tensor_parallel import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name +from megatron.core.tensor_parallel.layers import ( + _initialize_affine_weight_cpu, + set_tensor_model_parallel_attributes, +) from megatron.core.tensor_parallel.utils import divide from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint - - -def get_te_version(): - """Get TE version from __version__; if not available use pip's. 
Use caching.""" - - def get_te_version_str(): - if hasattr(te, '__version__'): - return str(te.__version__) - else: - return version("transformer-engine") - - return packaging.version.Version(get_te_version_str()) - - -_te_version = get_te_version() +from megatron.core.utils import get_te_version, is_te_min_version def _get_extra_te_kwargs(config: TransformerConfig): extra_transformer_engine_kwargs = {"params_dtype": config.params_dtype} - if _te_version >= packaging.version.Version("0.12.0"): + if is_te_min_version("0.12.0"): if config.use_cpu_initialization: extra_transformer_engine_kwargs["device"] = 'cpu' else: @@ -131,9 +123,9 @@ def __init__( extra_kwargs = _get_extra_te_kwargs(config) - if _te_version >= packaging.version.Version("0.8.0"): + if is_te_min_version("0.8.0"): if self.config.tp_comm_overlap: - if _te_version > packaging.version.Version("1.5.0"): + if is_te_min_version("1.5.0"): # Use old overlap flags if they were supplied instead extra_kwargs["ub_overlap_ag"] = ( self.config.tp_comm_overlap_ag @@ -160,7 +152,7 @@ def __init__( extra_kwargs["ub_atomic_gemm_ag"] = False extra_kwargs["ub_split_rs"] = False extra_kwargs["ub_atomic_gemm_rs"] = False - if _te_version > packaging.version.Version("1.0.0"): + if is_te_min_version("1.0.0", check_equality=False): assert ( tp_comm_buffer_name is not None ), "Buffer name should be set to configure communication overlap settings" @@ -171,7 +163,7 @@ def __init__( rng_tracker_name = get_expert_parallel_rng_tracker_name() else: rng_tracker_name = None - if _te_version >= packaging.version.Version("1.7.0"): + if is_te_min_version("1.7.0"): extra_kwargs["rng_tracker_name"] = rng_tracker_name # Disable communications in TE when using SP or EP by making TE agnostic of model parallel. 
@@ -268,25 +260,26 @@ def __init__( extra_kwargs = _get_extra_te_kwargs(config) # Only Transformer-Engine version >= 0.11.0 supports `RMSNorm` - if _te_version >= packaging.version.Version("0.11.0"): + if is_te_min_version("0.11.0"): extra_kwargs["normalization"] = self.config.normalization elif self.config.normalization != "LayerNorm": + te_version = get_te_version() raise ValueError( - f"Transformer Engine v{_te_version} does not support {self.config.normalization}." + f"Transformer Engine v{te_version} does not support {self.config.normalization}." ) - if _te_version >= packaging.version.Version("0.8.0"): + if is_te_min_version("0.8.0"): if self.config.tp_comm_overlap: extra_kwargs["ub_bulk_wgrad"] = self.config.tp_comm_bulk_wgrad extra_kwargs["ub_bulk_dgrad"] = self.config.tp_comm_bulk_dgrad - if _te_version > packaging.version.Version("1.5.0"): + if is_te_min_version("1.5.0", check_equality=False): # Use old overlap flags if they were supplied instead extra_kwargs["ub_overlap_ag"] = ( self.config.tp_comm_overlap_ag if hasattr(self.config, "tp_comm_overlap_ag") else self.config.tp_comm_split_ag or self.config.tp_comm_atomic_ag ) - if _te_version > packaging.version.Version("1.6.0.dev0"): + if is_te_min_version("1.6.0.dev0", check_equality=False): extra_kwargs["ub_overlap_rs_dgrad"] = ( self.config.tp_comm_overlap_rs_dgrad if hasattr(self.config, "tp_comm_overlap_rs_dgrad") @@ -302,7 +295,7 @@ def __init__( else: extra_kwargs["ub_atomic_gemm_ag"] = self.config.tp_comm_atomic_ag extra_kwargs["ub_split_ag"] = self.config.tp_comm_split_ag - if _te_version > packaging.version.Version("1.0.0"): + if is_te_min_version("1.0.0", check_equality=False): assert ( tp_comm_buffer_name is not None ), "Buffer name should be set to configure communication overlap settings" @@ -319,7 +312,11 @@ def __init__( get_rng_state_tracker=( get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None ), - init_method=condition_init_method(config, init_method), + 
init_method=( + condition_init_method(config, init_method) + if not config.use_cpu_initialization + else lambda w: None + ), bias=bias, return_bias=self.te_return_bias, parallel_mode="column", @@ -328,6 +325,33 @@ def __init__( **extra_kwargs, ) + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + + if config.use_cpu_initialization: + output_size_per_partition = divide(output_size, world_size) + _ = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + output_size_per_partition, + 0, + init_method, + stride=1, + return_master_weight=False, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + if bias: + self.bias = Parameter( + torch.empty(output_size_per_partition, dtype=config.params_dtype) + ) + set_tensor_model_parallel_attributes(self.bias, True, 0, 1) + with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', True) + def forward(self, x): """Forward.""" _is_first_microbatch = ( @@ -379,7 +403,11 @@ def __init__( output_size=output_size, parallel_mode="column", config=config, - init_method=condition_init_method(config, init_method), + init_method=( + condition_init_method(config, init_method) + if not config.use_cpu_initialization + else lambda w: None + ), bias=bias, skip_bias_add=skip_bias_add, is_expert=is_expert, @@ -387,6 +415,32 @@ def __init__( tp_comm_buffer_name=tp_comm_buffer_name, ) + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + if config.use_cpu_initialization: + output_size_per_partition = divide(output_size, world_size) + _ = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + output_size_per_partition, + 0, + init_method, + stride=1, + return_master_weight=False, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + if bias: + self.bias = Parameter( + torch.empty(output_size_per_partition, dtype=config.params_dtype) + 
) + set_tensor_model_parallel_attributes(self.bias, True, 0, 1) + with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', True) + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): """Sharding along axis 0, bias sharded""" state_dict = self.state_dict(prefix='', keep_vars=True) @@ -424,13 +478,42 @@ def __init__( output_size=output_size, parallel_mode="row", config=config, - init_method=condition_init_method(config, init_method), + init_method=( + condition_init_method(config, init_method) + if not config.use_cpu_initialization + else lambda w: None + ), bias=bias, skip_bias_add=skip_bias_add, skip_weight_param_allocation=False, # We don't currently use this for row parallel layers # pylint: disable=line-too-long is_expert=is_expert, tp_comm_buffer_name=tp_comm_buffer_name, ) + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + if config.use_cpu_initialization: + input_size_per_partition = divide(input_size, world_size) + self.master_weight = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + input_size_per_partition, + 1, + init_method, + stride=1, + return_master_weight=False, + params_dtype=config.params_dtype, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + if bias: + self.bias = Parameter(torch.empty(output_size, dtype=config.params_dtype)) + # Always initialize bias to zero. 
+ with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', True) + setattr(self.bias, 'sequence_parallel', config.sequence_parallel) def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): """Sharding along axis 1, bias not sharded""" @@ -459,6 +542,9 @@ def __init__( attn_mask_type: AttnMaskType, attention_type: str, attention_dropout: float = None, + softmax_scale: float = None, + k_channels: int = None, + v_channels: int = None, ): self.config = config self.te_forward_mask_type = False @@ -475,25 +561,25 @@ def __init__( ) extra_kwargs = {} - if _te_version >= packaging.version.Version("0.11.0"): + if is_te_min_version("0.11.0"): extra_kwargs["num_gqa_groups"] = self.config.num_query_groups elif self.config.num_query_groups != self.config.num_attention_heads: raise ValueError( - f"Transformer Engine v{_te_version} does not support Grouped Query Attention, " + f"Transformer Engine v{get_te_version()} does not support Grouped Query Attention, " f"use a newer version of Transformer Engine. 
" f"(num_query_groups ({self.config.num_query_groups}) != " f"num_attention_heads ({self.config.num_attention_heads}))" ) - if _te_version >= packaging.version.Version("0.10.0"): + if is_te_min_version("0.10.0"): extra_kwargs["attention_type"] = attention_type # older version don't need attention_type - if _te_version > packaging.version.Version("0.12.0"): + if is_te_min_version("0.12.0", check_equality=False): self.te_forward_mask_type = True # Only Transformer-Engine version >= 1.0.0 supports context parallelism - if _te_version >= packaging.version.Version("1.0.0"): + if is_te_min_version("1.0.0"): if getattr(TEDotProductAttention, "cp_stream") is None: TEDotProductAttention.cp_stream = torch.cuda.Stream() extra_kwargs["cp_group"] = get_context_parallel_group(check_initialized=False) @@ -516,15 +602,26 @@ def __init__( if config.window_size is not None: # Check version - assert _te_version >= packaging.version.Version("1.2.0"), ( - f"Transformer-Engine version ({str(_te_version)}) must be >= 1.2.0 to support" + assert is_te_min_version("1.2.0"), ( + f"Transformer-Engine v{get_te_version()} must be >= 1.2.0 to support" "sliding window attention." 
) extra_kwargs['window_size'] = config.window_size + if is_te_min_version("1.10.0"): + # TE 1.10.0 introduces the ability to set the different k and v channels + kv_channels = ( + (k_channels, v_channels) + if k_channels is not None and v_channels is not None + else self.config.kv_channels + ) + extra_kwargs['softmax_scale'] = softmax_scale + else: + kv_channels = self.config.kv_channels + super().__init__( num_attention_heads=self.config.num_attention_heads, - kv_channels=self.config.kv_channels, + kv_channels=kv_channels, attention_dropout=( self.config.attention_dropout if attention_dropout is None else attention_dropout ), @@ -554,18 +651,25 @@ def forward( ) # overwrite self.qkv_format depending on self.config.apply_rope_fusion, which can be set # after init - if self.config.apply_rope_fusion and _te_version > packaging.version.Version("0.13.0"): + if self.config.apply_rope_fusion and is_te_min_version("0.13.0", check_equality=False): self.qkv_format = 'bshd' qkv_format = packed_seq_kwargs.get('qkv_format', self.qkv_format) - if _te_version < packaging.version.Version("1.3.0"): + if get_te_version() < PkgVersion("1.3.0"): # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H # copies (#555) # These two arguments did not exist prior to 1.3.0 packed_seq_kwargs.pop("max_seqlen_q", None) packed_seq_kwargs.pop("max_seqlen_kv", None) + if get_te_version() < PkgVersion("1.8.0"): + # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted + # in each individual sequence in THD format dataset + # These two arguments did not exist prior to 1.8.0 + packed_seq_kwargs.pop("cu_seqlens_q_padded", None) + packed_seq_kwargs.pop("cu_seqlens_kv_padded", None) + if self.config.apply_rope_fusion and qkv_format == 'bshd': query, key, value = [x.transpose(0, 1).contiguous() for x in (query, key, value)] # In PyTorch, the following two tensors are in fact the same: @@ -578,7 +682,7 @@ def forward( value = 
value.as_strided(value.shape, key.stride()) if self.te_forward_mask_type: - if qkv_format == 'thd' and _te_version >= packaging.version.Version("1.7.0"): + if qkv_format == 'thd' and is_te_min_version("1.7.0"): # thd format uses flash attention with cuDNN kernel which requires is_padding=True, # so the only acceptable mask types are `padding_causal` and `padding`. These do not # necessarily indicate there are padded tokens in the sequence. @@ -603,7 +707,7 @@ def forward( return core_attn_out -if _te_version >= packaging.version.Version("1.9.0.dev0"): +if is_te_min_version("1.9.0.dev0"): class TEGroupedLinear(te.pytorch.GroupedLinear): """ @@ -865,10 +969,10 @@ def __init__( override_linear_precision: tuple = (False, False, False), ): extra_kwargs = _get_extra_te_kwargs(config) - if _te_version >= packaging.version.Version("1.6.0.dev0"): + if is_te_min_version("1.6.0.dev0"): extra_kwargs["fp8_dpa"] = config.fp8_dot_product_attention extra_kwargs["fp8_mha"] = config.fp8_multi_head_attention - if _te_version < packaging.version.Version("1.8.0"): + if get_te_version() < PkgVersion("1.8.0"): extra_kwargs["interval"] = config.fp8_interval elif config.fp8_interval != 1: warnings.warn("fp8_interval is deprecated and ignored from Transformer-Engine v1.8.0.") @@ -921,7 +1025,7 @@ def te_checkpoint( """Checkpointing with Transformer-Engine.""" from transformer_engine.pytorch.distributed import checkpoint - if _te_version >= packaging.version.Version("1.5.0"): + if is_te_min_version("1.5.0"): return checkpoint( forward_func, hidden_states, @@ -967,7 +1071,7 @@ def get_cpu_offload_context( enabled, num_layers, model_layers, activation_offloading, weight_offloading ): """Get CPU offload context and sync function.""" - if _te_version >= packaging.version.Version("1.10.0.dev0"): + if is_te_min_version("1.10.0.dev0"): context, sync_func = _get_cpu_offload_context( enabled, num_layers, model_layers, activation_offloading, weight_offloading ) diff --git 
a/megatron/core/inference/engines/mcore_engine.py b/megatron/core/inference/engines/mcore_engine.py index 496a288bae..fe8160228b 100644 --- a/megatron/core/inference/engines/mcore_engine.py +++ b/megatron/core/inference/engines/mcore_engine.py @@ -13,47 +13,66 @@ class MCoreEngine(AbstractEngine): + """The Megatron core backend constructor + + This is the backend that does a simple forward pass on the model. + Supports any model that is callable (Accepts the inputs and outputs the tensor) + + Args: + text_generation_controller (SimpleTextGenerationController): A text generation + controller that will be used to define how to preprocess prompts, generate + outputs and detokenizer the output tokens. + max_batch_size : The maxinum number of requests to process at once + random_seed (int, optional): Use a random seed if you want deterministic + results. Defaults to None. + """ + def __init__( self, text_generation_controller: SimpleTextGenerationController, max_batch_size, random_seed: int = None, ): - """The Megatron core backend constructor - - This is the backend that does a simple forward pass on the model. Supports any model that is callable (Accepts the inputs and outputs the tensor) - - Args: - text_generation_controller (SimpleTextGenerationController): A text generation controller that will be used to define how to preprocess prompts, generate outputs and detokenizer the output tokens. - max_batch_size : The maxinum number of requests to process at once - random_seed (int, optional): Use a random seed if you want deterministic results. Defaults to None. 
- """ - self.text_generation_controller = text_generation_controller self.random_seed = random_seed self.scheduler = Scheduler(max_batch_size=max_batch_size) - def generate(self, prompts: List[str], common_inference_params: CommonInferenceParams) -> dict: + def generate( + self, + prompts: List[str], + add_BOS: bool = False, + encoder_prompts: List[str] = None, + common_inference_params: CommonInferenceParams = None, + ) -> dict: """The megatron core inference backend generate function - This backend returns the output generations as a dictionary. It returns the prompt tokens along with the generated tokens, the prompt plus the generated string and the output log probabilities if requested + This backend returns the output generations as a dictionary. + It returns the prompt tokens along with the generated tokens, the prompt + plus the generated string and the output log probabilities if requested Args: prompts (List[str]): All the prompts as a list of strings + add_BOS (bool): Whether to add BOS token to beginning of prompts + encoder_prompts (List[dict]): All the encoder prompts as a list of strings common_inference_params (CommonInferenceParams): The inference parameters Returns: - List[InferenceRequest]: The output is list of inference requests containing the generated tokens, texts and log probs if required + List[InferenceRequest]: The output is list of inference requests containing the + generated tokens, texts and log probs if required """ # TODO :M core- get rng state tracker if self.random_seed: torch.random.manual_seed(self.random_seed) - for prompt in prompts: - prompt_tokens = self.text_generation_controller.tokenize_prompt(prompt) + for i in range(len(prompts)): + prompt = prompts[i] + encoder_prompt = encoder_prompts[i] if encoder_prompts is not None else None + prompt_tokens = self.text_generation_controller.tokenize_prompt(prompt, add_BOS) + self.scheduler.add_request( prompt=prompt, prompt_tokens=prompt_tokens, + encoder_prompt=encoder_prompt, 
inference_parameters=common_inference_params, ) @@ -68,7 +87,9 @@ def run_engine(self): Runs the engine until there are no requests in the queue. Args: - dynamic_generation (bool, optional): Set this to True, if you want to enable dynamic batching. Mainly used with an inference server. Defaults to False. + dynamic_generation (bool, optional): Set this to True, if you want + to enable dynamic batching. Mainly used with an inference server. + Defaults to False. """ while self.scheduler.have_requests_pending(): active_requests: Dict[int, InferenceRequest] = self.scheduler.active_request_pool.copy() diff --git a/megatron/core/inference/inference_request.py b/megatron/core/inference/inference_request.py index a03834c7e4..4825dfd366 100644 --- a/megatron/core/inference/inference_request.py +++ b/megatron/core/inference/inference_request.py @@ -10,6 +10,8 @@ # class syntax class Status(Enum): + """Enum for status""" + WAITING_IN_QUEUE = 1 ACTIVE_AND_GENERATING_TOKENS = 2 ACTIVE_BUT_NOT_GENERATING_TOKENS = 3 @@ -18,12 +20,19 @@ class Status(Enum): @dataclass class InferenceRequest: + """Class for one inference request + + Containing relevant data for an inference request + + """ + request_id: str prompt: str inference_parameters: CommonInferenceParams prompt_tokens: List[int] arrival_time: float status: Status + encoder_prompt: str = None generated_text: str = None generated_tokens: torch.Tensor = None generated_log_probs: torch.Tensor = None diff --git a/megatron/core/inference/model_inference_wrappers/t5/__init__.py b/megatron/core/inference/model_inference_wrappers/t5/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/t5/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
diff --git a/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py
new file mode 100644
index 0000000000..10e1da4812
--- /dev/null
+++ b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from argparse import Namespace
+from collections import deque
+from typing import Any, List, Tuple
+
+import numpy
+import torch
+
+from megatron.core import tensor_parallel
+from megatron.core.datasets.t5_dataset import T5MaskedWordPieceDataset
+from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import (
+    AbstractModelInferenceWrapper,
+)
+from megatron.core.models.T5 import T5Model
+
+
+class T5InferenceWrapper(AbstractModelInferenceWrapper):
+    """Inference wrapper for the T5 encoder-decoder model
+
+    The wrapper prepares the model for inference, provides the required input
+    data, and runs the forward pass
+
+    Args:
+        model (T5Model): The T5 model (MCore or legacy)
+        args (Namespace): The command line arguments that were passed
+    """
+
+    def __init__(self, model: T5Model, args: Namespace):
+        super().__init__(model, args)
+
+    def prep_model_for_inference(
+        self, prompts_tokens: torch.Tensor, encoder_prompts: List[str] = None, tokenizer: Any = None
+    ):
+        """A utility function for preparing model for inference
+
+        This function is called before the forward pass. It runs the base-class preparation,
+        tokenizes and pads the encoder prompts, and builds the encoder, decoder and
+        encoder-decoder attention masks so that required slices can be extracted during
+        the forward pass.
+
+        Args:
+            prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length]
+            encoder_prompts (List[str]): One encoder input prompt per row of prompts_tokens
+            tokenizer (_type_): Tokenizer used for tokenizing and detokenizing text
+
+        Raises:
+            ValueError: If encoder_prompts or tokenizer is missing, or if the number of
+                encoder prompts differs from the decoder batch size
+        """
+        if encoder_prompts is None or tokenizer is None:
+            raise ValueError("encoder_prompts and tokenizer are required for T5 inference")
+
+        super().prep_model_for_inference(prompts_tokens=prompts_tokens)
+
+        # NOTE(review): self.prompts_tokens is presumably set by the base-class call above
+        if len(encoder_prompts) != len(self.prompts_tokens):
+            raise ValueError(
+                f"Got {len(encoder_prompts)} encoder prompts for a decoder batch of "
+                f"{len(self.prompts_tokens)}"
+            )
+
+        encoder_prompts_tokens_list = [
+            self.tokenize_encoder_prompt(encoder_prompt, tokenizer)
+            for encoder_prompt in encoder_prompts
+        ]
+        self.batch_encoder_prompts_tokens = self.pad_encoder_prompts_tokens(
+            encoder_prompts_tokens_list, self.model.max_sequence_length, tokenizer
+        )
+
+        # create batch mask for encoder_prompt (self.batch_encoder_prompts_tokens) and
+        # decoder_input (self.prompts_tokens), similar to megatron/core/datasets/t5_dataset.py
+        make_attention_mask = T5MaskedWordPieceDataset._make_attention_mask
+        make_history_mask = T5MaskedWordPieceDataset._make_history_mask
+        decoder_prompts_tokens = self.prompts_tokens.cpu().numpy()
+        encoder_prompts_tokens = self.batch_encoder_prompts_tokens.cpu().numpy()
+        batch_mask_encoder = []
+        batch_mask_decoder = []
+        batch_mask_encoder_decoder = []
+        for decoder_row, encoder_row in zip(decoder_prompts_tokens, encoder_prompts_tokens):
+            batch_mask_encoder.append(make_attention_mask(encoder_row, encoder_row))
+            batch_mask_decoder.append(
+                make_attention_mask(decoder_row, decoder_row) * make_history_mask(decoder_row)
+            )
+            batch_mask_encoder_decoder.append(make_attention_mask(decoder_row, encoder_row))
+
+        # stack the per-sample masks on the GPU and turn them into boolean masks that are
+        # True wherever the numeric mask value is below 0.5
+        self.batch_mask_encoder = torch.tensor(numpy.array(batch_mask_encoder)).cuda() < 0.5
+        self.batch_mask_decoder = torch.tensor(numpy.array(batch_mask_decoder)).cuda() < 0.5
+        self.batch_mask_encoder_decoder = (
+            torch.tensor(numpy.array(batch_mask_encoder_decoder)).cuda() < 0.5
+        )
+
+    def tokenize_encoder_prompt(self, encoder_prompt: str, tokenizer) -> List[int]:
+        """Utility to tokenize the encoder_prompt
+
+        Every "<mask>" marker in the prompt is replaced by the next unused id from
+        tokenizer.additional_special_tokens_ids, similar to the processing step in
+        megatron/core/datasets/t5_dataset.py
+
+        Args:
+            encoder_prompt (str): The encoder_prompt
+            tokenizer (_type_): Tokenizer used for tokenizing and detokenizing string
+
+        Returns:
+            List[int]: The tokenized prompt, with one sentinel id per "<mask>" marker
+
+        Raises:
+            IndexError: If the prompt has more "<mask>" markers than the tokenizer has
+                additional special token ids
+        """
+        # NOTE(review): the marker literal was empty in the reviewed text, and
+        # str.split("") raises ValueError; "<mask>" is assumed here - confirm it matches
+        # the marker callers put in encoder prompts
+        segments = encoder_prompt.split("<mask>")
+        sentinels = deque(tokenizer.additional_special_tokens_ids)
+
+        encoder_prompt_tokens = []
+        for index, segment in enumerate(segments):
+            # a sentinel goes between consecutive segments only, never after the last one
+            if index > 0:
+                encoder_prompt_tokens.append(sentinels.popleft())
+            encoder_prompt_tokens.extend(tokenizer.tokenize(segment))
+
+        return encoder_prompt_tokens
+
+    def pad_encoder_prompts_tokens(
+        self, encoder_prompts_tokens_list: List[List[int]], max_sequence_length: int, tokenizer
+    ) -> torch.Tensor:
+        """Method to pad input prompts
+
+        Given a list of prompts, pad them all to uniform length. The input lists are
+        left unmodified.
+
+        Args:
+            encoder_prompts_tokens_list (List[List[int]]): A list containing the
+                encoder_input_tokens
+            max_sequence_length (int): Length that every encoder input is padded to
+            tokenizer (_type_): Tokenizer used for tokenizing and detokenizing text
+
+        Returns:
+            torch.Tensor: A torch tensor of shape [bs, max_sequence_length]
+
+        Raises:
+            ValueError: If a prompt is longer than max_sequence_length
+        """
+        padded_prompts_tokens = []
+        for encoder_prompt_tokens in encoder_prompts_tokens_list:
+            padding_size = max_sequence_length - len(encoder_prompt_tokens)
+            if padding_size < 0:
+                # a negative size would silently add no padding to an over-long prompt
+                raise ValueError(
+                    f"Encoder prompt has {len(encoder_prompt_tokens)} tokens, more than "
+                    f"max_sequence_length={max_sequence_length}"
+                )
+            padded_prompts_tokens.append(encoder_prompt_tokens + [tokenizer.pad] * padding_size)
+
+        return torch.tensor(padded_prompts_tokens).cuda()
+
+    def get_batch_for_context_window(
+        self, context_start_position: int, context_end_position: int
+    ) -> List:
+        """Returns the inference data given context window
+
+        This function gets called iteratively in a loop. Given the start and end context
+        positions, it extracts the appropriate data.
+
+        Args:
+            context_start_position (int): Start of the context window. Unused here, since
+                the decoder input always starts at position 0
+            context_end_position (int): End of the context window. During the
+                last inference step it will mostly be the max generated sequence length.
+
+        Returns:
+            List: A list of inputs that will be used by your model in the forward step
+        """
+
+        # rerun encoder every step
+        # T5 inference not yet support kv_cache
+        encoder_tokens2use = self.batch_encoder_prompts_tokens
+        decoder_tokens2use = self.prompts_tokens[:, :context_end_position]
+        encoder_mask2use = self.batch_mask_encoder
+        decoder_mask2use = self.batch_mask_decoder[:, :context_end_position, :context_end_position]
+        encoder_decoder_mask2use = self.batch_mask_encoder_decoder[:, :context_end_position, :]
+        data_at_step_idx = [
+            encoder_tokens2use,
+            decoder_tokens2use,
+            encoder_mask2use,
+            decoder_mask2use,
+            encoder_decoder_mask2use,
+        ]
+
+        return data_at_step_idx
+
+    def forward_pass_without_pipeline_parallel(self, inference_input: List) -> torch.Tensor:
+        """Utility to carry out simple forward pass for TP or no model parallel models
+
+        Runs a very simple forward pass for model. Used in the case of models without
+        any parallelism or only tensor parallelism.
+
+        Args:
+            inference_input (List): A list containing the inputs for the T5 model
+                [encoder tokens, decoder tokens, encoder mask, decoder mask,
+                encoder-decoder mask]
+
+        Returns:
+            torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]
+        """
+        [encoder_tokens, decoder_tokens, encoder_mask, decoder_mask, encoder_decoder_mask] = (
+            inference_input
+        )
+
+        # T5 inference not yet support kv_cache
+        logits = self.model(
+            encoder_tokens,
+            decoder_tokens,
+            encoder_mask,
+            decoder_mask,
+            encoder_decoder_mask,
+            inference_params=None,
+        )
+        logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits)
+
+        return logits
diff --git a/megatron/core/inference/scheduler.py b/megatron/core/inference/scheduler.py
index abcb325185..00ab81b4ab 100644
--- a/megatron/core/inference/scheduler.py
+++ b/megatron/core/inference/scheduler.py
@@ -2,7 +2,7 @@
 import time
 import typing
 from collections import OrderedDict
-from typing import Dict, List
+from typing import Dict
 
 import torch
 
@@ -12,14 +12,16 @@
 
 
 class Scheduler:
-    def __init__(self, max_batch_size: int):
-        """Scheduler for handling requests to inference engine
+    """Scheduler for handling requests to inference engine
 
-        This class is responsible for handing of all the incomign requests
+    This class is responsible for handing of all the incomign requests
 
-        Args:
-            max_batch_size (int): The max batch size that we can pass to the inference engine at a time.
-        """
+    Args:
+        max_batch_size (int): The max batch size that we can pass to the
+            inference engine at a time.
+ """ + + def __init__(self, max_batch_size: int): self.max_batch_size = max_batch_size self.active_request_pool: Dict[int, InferenceRequest] = OrderedDict() self.waiting_request_pool: Dict[int, InferenceRequest] = OrderedDict() @@ -30,16 +32,19 @@ def add_request( self, prompt: str, prompt_tokens: torch.Tensor, - inference_parameters: CommonInferenceParams, + encoder_prompt: str = None, + inference_parameters: CommonInferenceParams = None, arrival_time: float = None, ): """Add an incoming request - This method will add the request to either the active pool or the waiting pool depending on the batch size. + This method will add the request to either the active pool or the waiting pool + depending on the batch size. Args: prompt (str): Input prompt string prompt_tokens (torch.Tensor): A torch tensor having the input prompts tokenized + encoder_prompt (str): Encoder input string inference_parameters (CommonInferenceParams): The inference parameters arrival_time (float, optional): The incoming request time. Defaults to None. """ @@ -61,6 +66,7 @@ def add_request( arrival_time=arrival_time, prompt_tokens=prompt_tokens, status=status, + encoder_prompt=encoder_prompt, ) if status == status.ACTIVE_BUT_NOT_GENERATING_TOKENS: @@ -79,7 +85,8 @@ def have_requests_pending(self) -> bool: def add_earliest_waiting_request_to_active_pool(self): """Utility to add the waiting request to active pool - This method will add the earliest request (FIFO) that is in the waiting request pool to the active request pool. + This method will add the earliest request (FIFO) that is in the waiting request + pool to the active request pool. 
""" assert ( len(self.active_request_pool) < self.max_batch_size @@ -94,11 +101,15 @@ def add_earliest_waiting_request_to_active_pool(self): def update_requests_pools(self, result_dict: typing.OrderedDict[int, InferenceRequest] = None): """Update request pool status - This method will full up the active request pool, if it has less than max batch size elements from the waiting request pool. - If provided with a request dict, it will put the completed requests into the completed request pool and add waiting request into active pool. + This method will full up the active request pool, if it has less than max batch size + elements from the waiting request pool. + If provided with a request dict, it will put the completed requests into the completed + request pool and add waiting request into active pool. Args: - result (typing.OrderedDict[int, InferenceRequest], optional): The result returned by the engine. A dictionary with keys as the request ids, and values as the requests. Defaults to None + result (typing.OrderedDict[int, InferenceRequest], optional): The result returned + by the engine. A dictionary with keys as the request ids, and values as the + requests. Defaults to None """ for result_request_id in list(result_dict.keys()): active_request = self.active_request_pool[result_request_id] diff --git a/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py new file mode 100644 index 0000000000..61beff0211 --- /dev/null +++ b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+from typing import OrderedDict
+
+import torch
+
+from megatron.core.inference.inference_request import InferenceRequest
+from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import (
+    SimpleTextGenerationController,
+)
+
+
+class EncoderDecoderTextGenerationController(SimpleTextGenerationController):
+    """Text generation controller for encoder-decoder architectures
+
+    This class inherits from SimpleTextGenerationController and additionally passes
+    each request's encoder_prompt and the tokenizer to the inference wrapper
+    """
+
+    def prep_model_for_inference(
+        self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest]
+    ):
+        """Prepare the batch for inference via the wrapper's prep_model_for_inference method
+
+        Args:
+            prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length]
+            active_requests (OrderedDict[int, InferenceRequest]): The input active requests
+        """
+        # one encoder prompt per active request, in the pool's iteration order
+        encoder_prompts = [
+            request.encoder_prompt for request in active_requests.values()
+        ]
+
+        self.inference_wrapped_model.prep_model_for_inference(
+            prompts_tokens=prompts_tokens, encoder_prompts=encoder_prompts, tokenizer=self.tokenizer
+        )
diff --git a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py
index e4db83f6b3..0667af8373 100644
--- a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py
+++ b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py
@@ -14,15 +14,18 @@
 
 
 class SimpleTextGenerationController:
-    def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer):
-        """The basic text generation controller
+    """The basic text generation controller
 
-        This class is responsible for tokenizing the input , running the inference, sampling and also detokenizing the output
+    This class is
responsible for tokenizing the input , running the inference, sampling + and also detokenizing the output - Args: - inference_wrapped_model (AbstractModelInferenceWrapper): A model that is wrapped using the specs given in the abstract_model_inference_wrapper.py - tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts - """ + Args: + inference_wrapped_model (AbstractModelInferenceWrapper): A model that + is wrapped using the specs given in the abstract_model_inference_wrapper.py + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts + """ + + def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): self.inference_wrapped_model = inference_wrapped_model self.tokenizer = tokenizer @@ -31,7 +34,9 @@ def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, token parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() ) - def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: + def tokenize_prompt( + self, prompt: str, add_BOS: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor]: """Utility to tokenize the input prompts Args: @@ -40,13 +45,19 @@ def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: Returns: torch.Tensor: Returns the tokenized prompt """ - return self.tokenizer.tokenize(prompt) + prompt_tokens = self.tokenizer.tokenize(prompt) + + if add_BOS: + prompt_tokens = [self.tokenizer.bos] + prompt_tokens + + return prompt_tokens def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str: """Detokenize the output generations Args: - prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt tokens plus the generated tokens + prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt + tokens plus the generated tokens Returns: str: The detokenized output @@ -62,11 +73,15 @@ def sample_from_logits( ) -> torch.Tensor: """Samples the logits to 
generate outputs - Given the logits of the last token, this function samples it according to the parameters defined in common_inference_params and returns the samples + Given the logits of the last token, this function samples it + according to the parameters defined in common_inference_params + and returns the samples Args: - last_token_logits (torch.Tensor): The last token logits. A tensor of size [batch_size, vocab_size] - common_inference_params (CommonInferenceParams): The paramters to use for inference + last_token_logits (torch.Tensor): The last token logits. A tensor of + size [batch_size, vocab_size] + common_inference_params (CommonInferenceParams): The paramters to use + for inference vocab_size (int): Obtained from the tokenizer. Defaults to None Returns: @@ -141,23 +156,35 @@ def update_generation_status( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Checks which prompts have reached an end condition - We check which prompts have reached an end condition and set the corresponding flags of the is_generation_done_tensor to True. The generated sequence lengths increase as we keep generating, until that prompts hits an end condition. The generation_started tensor determines which prompts have started generating. + We check which prompts have reached an end condition and set the corresponding + flags of the is_generation_done_tensor to True. The generated sequence lengths + increase as we keep generating, until that prompts hits an end condition. The + generation_started tensor determines which prompts have started generating. Args: - updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest generated tokens. A tensor of shape [batch_size, max_seq_len] (i.e max_seq_len = max_prompt_len + tokens_to_generate) - generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True indicates the prompt at that index has started generating tokens. 
- current_context_end_position (int): An integer indicating which position to extract from the prompts tokens to get the latest generated tokens. - is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. True indicates the prompt at that index has reached end condition. - generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. Each value represents the generated sequence lengths for that prompt. + updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest + generated tokens. A tensor of shape [batch_size, max_seq_len] + (i.e max_seq_len = max_prompt_len + tokens_to_generate) + generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True + indicates the prompt at that index has started generating tokens. + current_context_end_position (int): An integer indicating which position to + extract from the prompts tokens to get the latest generated tokens. + is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. + True indicates the prompt at that index has reached end condition. + generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. + Each value represents the generated sequence lengths for that prompt. Returns: - Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean is_generation_done_tensor and the generated_sequence_lengths after updating it + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean + is_generation_done_tensor and the generated_sequence_lengths after updating it """ latest_samples = updated_prompts_tokens[:, current_context_end_position] - # Make sure we are checking eod criterion only for prompts that have started generating (i.e) We only look at the generated tokenns and not the input tokens. + # Make sure we are checking eod criterion only for prompts that have started generating + # (i.e) We only look at the generated tokenns and not the input tokens. 
reached_eod = (latest_samples == self.tokenizer.eod) & generation_started is_generation_done_tensor = is_generation_done_tensor | reached_eod - # We increment generated sequence lengths when that prompt has not hit the EOD and generation has started + # We increment generated sequence lengths when that prompt has not hit the + # EOD and generation has started generated_sequence_lengths += ~is_generation_done_tensor & generation_started return is_generation_done_tensor, generated_sequence_lengths @@ -178,7 +205,9 @@ def pad_input_prompt_tokens( num_tokens_togenerate (int): The number of tokens to generate for each prompt Returns: - torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, with extra indices for each tensor padded with mask id. + torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) + max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, + with extra indices for each tensor padded with mask id. """ max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate @@ -193,13 +222,16 @@ def generate_output_tokens_dynamic_batch( ) -> OrderedDict[int, InferenceRequest]: """Utility to generate the output tokens and probabilities for the prompts - This utility generates the output tokens for a dynamic batch. It will run one forward step at a time, and pass control back to the engine, which will update the request pool and call this method again. + This utility generates the output tokens for a dynamic batch. It will run one forward step + at a time, and pass control back to the engine, which will update the request pool and call + this method again. Args: active_requests (OrderedDict[int, InferenceRequest]): The input active requests. Returns: - OrderedDict[int, InferenceRequest]: The result for each of the incoming requests after running one forward step. 
+ OrderedDict[int, InferenceRequest]: The result for each of the incoming requests + after running one forward step. """ raise Exception("Not implemented yet") @@ -208,7 +240,9 @@ def generate_all_output_tokens_static_batch( ) -> OrderedDict[int, InferenceRequest]: """Utility to generate the all the output tokens and probabilities for the prompts . - This utility generates the output tokens for a static batch. It runs the forward steps till all prompts complete generation, updates the status of these requests to completed, adds the generated result and returns these requests + This utility generates the output tokens for a static batch. It runs the forward steps till + all prompts complete generation, updates the status of these requests to completed, adds + the generated result and returns these requests Args: active_requests (OrderedDict[int, InferenceRequest]): The input active requests. @@ -252,8 +286,9 @@ def generate_all_output_tokens_static_batch( generated_sequence_lengths = torch.zeros(batch_size).cuda() with torch.no_grad(): - self.inference_wrapped_model.prep_model_for_inference( - prompts_tokens=batch_prompt_tokens + + self.prep_model_for_inference( + prompts_tokens=batch_prompt_tokens, active_requests=active_requests ) context_start_position = 0 @@ -275,14 +310,17 @@ def generate_all_output_tokens_static_batch( tensor=logits, ) - # Indicates which of the input prompts have started generating tokens. A 1D boolean tensor with [batch_size] elements (i.e) The shortest prompts will start generating first and so on + # Indicates which of the input prompts have started generating tokens. 
+ # A 1D boolean tensor with [batch_size] elements (i.e) The shortest + # prompts will start generating first and so on generation_started = prompt_lengths_in_batch <= context_end_position last_token_logits = logits[:, -1, :] sampled_logits = self.sample_from_logits( last_token_logits, common_inference_params, self.tokenizer.vocab_size ) - # Substitute the sampled logits only for only the prompts that have started generating tokens + # Substitute the sampled logits only for only the prompts that + # have started generating tokens batch_prompt_tokens[generation_started, context_end_position] = sampled_logits[ generation_started ] @@ -302,7 +340,8 @@ def generate_all_output_tokens_static_batch( context_start_position = context_end_position - # Check end of generation status for each tensor and update generated sequence lengths + # Check end of generation status for each tensor + # and update generated sequence lengths (is_generation_done_tensor, generated_sequence_lengths) = ( self.update_generation_status( updated_prompts_tokens=batch_prompt_tokens, @@ -348,3 +387,14 @@ def generate_all_output_tokens_static_batch( request.generated_text = self.detokenize_generations(required_result_tokens) return active_requests + + def prep_model_for_inference( + self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest] + ): + """Preparing batch for inference, using respective wrapper's prep_model_for_inference method + + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] + active_requests (OrderedDict[int, InferenceRequest]): The input active requests + """ + self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=prompts_tokens) diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py index caae41cb4a..f2751673e4 100644 --- a/megatron/core/model_parallel_config.py +++ b/megatron/core/model_parallel_config.py @@ -182,8 +182,8 @@ class ModelParallelConfig: 
tp_comm_atomic_ag: bool = False """Deprecated from TransformerEngine v1.6.0. - If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather both - done atomically. Don't care if tp_comm_overlap is False. + If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather + both done atomically. Don't care if tp_comm_overlap is False. """ tp_comm_split_rs: bool = True @@ -213,6 +213,11 @@ class ModelParallelConfig: If true, the AllGather -> Gemm overlap for FC1 layer of MLP gets disabled """ + tp_comm_bootstrap_backend: str = 'nccl' + """ + Set the bootstrapping backend out of 'nccl', 'mpi', and 'gloo' + """ + ################### # Pipeline Parallel ################### @@ -257,7 +262,8 @@ class ModelParallelConfig: wgrad_deferral_limit: int = 0 """This value tunes the number of micro-batches for which the embedding weight gradient compute - needs to be deferred to pipeline flush, this argument is invalid if `defer_embedding_wgrad_compute` is False. + needs to be deferred to pipeline flush, this argument is invalid if + `defer_embedding_wgrad_compute` is False. Defaults to 0, which means all micro-batches are deferred. """ @@ -276,7 +282,9 @@ class ModelParallelConfig: """Tells the number of transformer layers for which activations has to be offloaded.""" _cpu_offloading_context: ContextManager = ( - None # Used for internal use only, not to be set by the user. TODO: Need to move to the 'right' place when possible. + None + # Used for internal use only, not to be set by a user. + # TODO: Need to move to the 'right' place when possible. ) """For internal use only, do not set.""" @@ -297,7 +305,8 @@ class ModelParallelConfig: def __post_init__(self): """Python dataclass method that is used to modify attributes after initialization. - See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. 
+ See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more + details. """ if self.sequence_parallel: if self.tensor_model_parallel_size <= 1: @@ -324,11 +333,12 @@ def __post_init__(self): if self.defer_embedding_wgrad_compute and self.wgrad_deferral_limit < 0: raise ValueError( - "Wgrad deferral limit should be greater than or equal to 0 when this optimization is enabled!" + "Wgrad deferral limit should be greater than or equal to 0 when it is enabled!" ) if self.expert_model_parallel_size > 1 and self.tensor_model_parallel_size > 1: if self.sequence_parallel is False: raise ValueError( - "When using expert parallelism and tensor parallelism, sequence parallelism must be used" + "When using expert parallelism and tensor parallelism, sequence parallelism " + "must be used" ) diff --git a/megatron/core/models/T5/t5_spec.py b/megatron/core/models/T5/t5_spec.py index 42da1889a9..ecdcdbc260 100644 --- a/megatron/core/models/T5/t5_spec.py +++ b/megatron/core/models/T5/t5_spec.py @@ -52,7 +52,7 @@ def encoder_model_with_transformer_engine_default_spec() -> ModuleSpec: submodules=TransformerLayerSubmodules( self_attention=ModuleSpec( module=SelfAttention, - params={"attn_mask_type": AttnMaskType.padding}, + params={"attn_mask_type": AttnMaskType.arbitrary}, submodules=SelfAttentionSubmodules( linear_qkv=TELayerNormColumnParallelLinear, core_attention=TEDotProductAttention, @@ -94,6 +94,7 @@ def decoder_model_with_transformer_engine_default_spec() -> ModuleSpec: pre_cross_attn_layernorm=TENorm, cross_attention=ModuleSpec( module=CrossAttention, + params={"attn_mask_type": AttnMaskType.arbitrary}, submodules=CrossAttentionSubmodules( linear_q=TEColumnParallelLinear, linear_kv=TEColumnParallelLinear, @@ -122,7 +123,7 @@ def encoder_model_with_local_spec() -> ModuleSpec: input_layernorm=LNImpl, self_attention=ModuleSpec( module=SelfAttention, - params={"attn_mask_type": AttnMaskType.padding}, + params={"attn_mask_type": AttnMaskType.arbitrary}, 
submodules=SelfAttentionSubmodules( linear_qkv=ColumnParallelLinear, core_attention=DotProductAttention, @@ -170,6 +171,7 @@ def decoder_model_with_local_spec() -> ModuleSpec: pre_cross_attn_layernorm=LNImpl, cross_attention=ModuleSpec( module=CrossAttention, + params={"attn_mask_type": AttnMaskType.arbitrary}, submodules=CrossAttentionSubmodules( linear_q=ColumnParallelLinear, linear_kv=ColumnParallelLinear, diff --git a/megatron/core/models/bert/bert_model.py b/megatron/core/models/bert/bert_model.py index d9d1be449c..eb08d4cfd6 100644 --- a/megatron/core/models/bert/bert_model.py +++ b/megatron/core/models/bert/bert_model.py @@ -1,15 +1,14 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. import os -from importlib.metadata import version +import warnings from typing import Literal, Optional import torch -from pkg_resources import packaging from torch import Tensor from megatron.core import parallel_state, tensor_parallel from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk -from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec +from megatron.core.models.bert.bert_layer_specs import bert_layer_local_spec from megatron.core.models.bert.bert_lm_head import BertLMHead from megatron.core.models.bert.pooler import Pooler from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding @@ -20,11 +19,14 @@ from megatron.core.transformer.transformer_block import TransformerBlock from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import get_linear_layer +from megatron.core.utils import get_te_version as _get_te_version +from megatron.core.utils import is_te_min_version def get_te_version(): - """Returns the installed version of transformer engine""" - return packaging.version.Version(version("transformer-engine")) + """Included for backwards compatibility.""" + 
warnings.warn("`get_te_version` will be deprecated in a future release") + return _get_te_version() class BertModel(LanguageModule): @@ -91,9 +93,7 @@ def __init__( # megatron core pipelining currently depends on model type self.model_type = ModelType.encoder_or_decoder - self.attn_mask_dimensions = self._santiy_check_attention_and_get_attn_mask_dimension( - transformer_layer_spec - ) + self.attn_mask_dimensions = self._sanity_check_attention_and_get_attn_mask_dimension() # Embeddings. if self.pre_process: @@ -152,44 +152,71 @@ def __init__( if self.pre_process or self.post_process: self.setup_embeddings_and_output_layer() - def _santiy_check_attention_and_get_attn_mask_dimension( - self, transformer_layer_spec: ModuleSpec - ) -> str: + # pylint: disable=line-too-long + def _sanity_check_attention_and_get_attn_mask_dimension(self) -> str: """We do some checks and return attention mask dimensions for self attention Transformer engine library underwent a lot of change. So we need to change dimensions of the attention mask depending on the TE version. We also santiy check some arguments. 1. If we use local version of attention dimension of the mask is [b,1,s,s] - 2. If we use transformer engine < 1.7 - (Flash and Fused attention not supported. We use unfused path). - Attn mask dimension is [b,1,s,s] - 2. If we use transformer engine >= 1.7 - (Flash and fused attention supported with attn mask dimension [b,1,1,s]). - Unfused path will use attn mask dimension [b,1,s,s] with attn mask type arbitrary. - Default if you dont set any NVTE_ATTN flag will just use unfused path. + 2. If we use transformer engine > 1.10 we support all 3 backends with padding mask and [b,1,s,s] + 3. If we use transformer engine >= 1.7 but less than 1.10 + a ) Flash and Fused attention uses padding mask with [b,1,1,s] + b ) Unfused attention works with arbitrary mask with [b,1,s,s] + 4. If we use transformer engine < 1.7 + Flash and fused attention is not supported. 
Unfused attention will work with padding mask [b,1,s,s] + + Default if you dont set any NVTE_ATTN flag will it will just use the fused path for transformer engine version >= 1.7 and unfused path for other Args: - transformer_layer_spec (ModuleSpec): _description_ + transformer_layer_spec (ModuleSpec): The transformer layer spec Returns: - str: _description_ + str: A string showing the format of the attn mask dimensions """ - attn_mask_dimensions = "b1ss" - if transformer_layer_spec == bert_layer_with_transformer_engine_spec: - if get_te_version() >= packaging.version.Version("1.7.0"): - # pylint: disable=line-too-long - if os.getenv('NVTE_FLASH_ATTN') == '0' and os.getenv('NVTE_FUSED_ATTN') == '0': - assert ( - transformer_layer_spec.submodules.self_attention.params['attn_mask_type'] - == AttnMaskType.arbitrary - ), "Both NVTE_FLASH_ATTN and NVTE_FUSED_ATTN env flag set to 0. Either unset both of them or set one of them to 1 to use a more optimized attention kernal. Currently using unfused attention path. If you want to proceed with this path set AttnMaskType in module spec to be arbitrary" - else: + attn_mask_dimensions = None + # For local layer spec we just use b1ss + if self.transformer_layer_spec == bert_layer_local_spec: + attn_mask_dimensions = "b1ss" + else: + attn_mask_type = self.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] + flash_attention_enabled = os.getenv('NVTE_FLASH_ATTN') == '1' + fused_attention_enabled = os.getenv('NVTE_FUSED_ATTN') == '1' + # For TE >= 1.10 (We always use padding mask and use b11s) + if is_te_min_version("1.10.0"): + attn_mask_dimensions = "b11s" + if attn_mask_type != AttnMaskType.padding: + warnings.warn( + f'For TE versions >= 1.10 , flash/fused/unfused support padding mask. 
Setting attention mask from {attn_mask_type} to padding' + ) + self.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] = AttnMaskType.padding + # For 1.7 >= TE < 1.10 flash and fused path use padding mask with b11s and unfused path uses arbitrary mask with b1ss + elif is_te_min_version("1.7.0"): + if flash_attention_enabled or fused_attention_enabled: attn_mask_dimensions = "b11s" + else: + if attn_mask_type != AttnMaskType.arbitrary: + warnings.warn( + f'For TE versions >= 1.7 but < 1.10 , unfused path supports only arbitrary mask. Setting attention mask from {attn_mask_type} to arbitray' + ) + self.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] = AttnMaskType.arbitrary + attn_mask_dimensions = "b1ss" + # For TE < 1.7 we only support unfused attention with b1ss and padding mask else: - assert ( - os.getenv('NVTE_FLASH_ATTN') == '0' and os.getenv('NVTE_FUSED_ATTN') == '0' - ), "Flash and fused attention is not supported with transformer engine version < 1.7. Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer engine >= 1.7" + attn_mask_dimensions = "b1ss" + assert not flash_attention_enabled and not fused_attention_enabled, ( + "Flash and fused attention is not supported with transformer engine version " + "< 1.7. Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer " + "engine >= 1.7" + ) + return attn_mask_dimensions def bert_extended_attention_mask(self, attention_mask: Tensor) -> Tensor: diff --git a/megatron/core/models/common/embeddings/__init__.py b/megatron/core/models/common/embeddings/__init__.py index e69de29bb2..865f96da5d 100644 --- a/megatron/core/models/common/embeddings/__init__.py +++ b/megatron/core/models/common/embeddings/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from .rope_utils import apply_rotary_pos_emb +from .rotary_pos_embedding import RotaryEmbedding +from .yarn_rotary_pos_embedding import YarnRotaryEmbedding, _yarn_get_mscale diff --git a/megatron/core/models/common/embeddings/rope_utils.py b/megatron/core/models/common/embeddings/rope_utils.py new file mode 100644 index 0000000000..accb251961 --- /dev/null +++ b/megatron/core/models/common/embeddings/rope_utils.py @@ -0,0 +1,191 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from megatron.core.transformer.transformer_config import TransformerConfig + +import logging + +import torch +from torch import Tensor + +from megatron.core import parallel_state + +logger = logging.getLogger(__name__) + +try: + from apex.transformer.functional import ( + fused_apply_rotary_pos_emb, + fused_apply_rotary_pos_emb_thd, + ) + + HAVE_APPLY_ROPE_FUSION = True +except ImportError: + HAVE_APPLY_ROPE_FUSION = False + + +def get_pos_emb_on_this_cp_rank(pos_emb: Tensor, seq_dim: int) -> Tensor: + """Get the position embedding on the current context parallel rank. 
+ + Args: + pos_emb (Tensor): Positional embedding tensor + seq_dim (int): Sequence dimension + """ + cp_size = parallel_state.get_context_parallel_world_size() + cp_rank = parallel_state.get_context_parallel_rank() + cp_idx = torch.tensor( + [cp_rank, (2 * cp_size - cp_rank - 1)], device="cpu", pin_memory=True + ).cuda(non_blocking=True) + pos_emb = pos_emb.view( + *pos_emb.shape[:seq_dim], 2 * cp_size, -1, *pos_emb.shape[(seq_dim + 1) :] + ) + pos_emb = pos_emb.index_select(seq_dim, cp_idx) + pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :]) + return pos_emb + + +def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor: + """Change sign so the last dimension becomes [-odd, +even] + + Args: + x (Tensor): Input tensor + + Returns: + Tensor: Tensor rotated half + """ + if not rotary_interleaved: + x1, x2 = torch.chunk(x, 2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + else: + x1 = x[:, :, :, ::2] + x2 = x[:, :, :, 1::2] + x_new = torch.stack((-x2, x1), dim=-1) + return x_new.view(x_new.shape[0], x_new.shape[1], x_new.shape[2], -1) + + +def _apply_rotary_pos_emb_bshd( + t: Tensor, + freqs: Tensor, + rotary_interleaved: bool = False, + multi_latent_attention: bool = False, + mscale: float = 1.0, +) -> Tensor: + """Apply rotary positional embedding to input tensor T. + + check https://kexue.fm/archives/8265 for detailed formulas + + Args: + t (Tensor): Input tensor T is of shape [seq_length, ... 
, dim] + freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim] + + Returns: + Tensor: The input tensor after applying RoPE + """ + rot_dim = freqs.shape[-1] + + # ideally t_pass is empty so rotary pos embedding is applied to all tensor t + t, t_pass = t[..., :rot_dim], t[..., rot_dim:] + + if multi_latent_attention: + x1 = t[..., 0::2] + x2 = t[..., 1::2] + t = torch.cat((x1, x2), dim=-1) + + # first part is cosine component + # second part is sine component, need to change signs with _rotate_half method + cos_ = (torch.cos(freqs) * mscale).to(t.dtype) + sin_ = (torch.sin(freqs) * mscale).to(t.dtype) + + t = (t * cos_) + (_rotate_half(t, rotary_interleaved) * sin_) + return torch.cat((t, t_pass), dim=-1) + + +def _apply_rotary_pos_emb_thd( + t: Tensor, + cu_seqlens: Tensor, + freqs: Tensor, + rotary_interleaved: bool = False, + multi_latent_attention: bool = False, + mscale: float = 1.0, +) -> Tensor: + """A baseline implementation of applying RoPE for `thd` format. + + Args: + t (Tensor): Input tensor T is of shape [t, h, d] + cu_seqlens(Tensor): Cumulative sum of sequence lengths in a batch for `t`, + with shape [b + 1] and dtype torch.int32. + freqs (Tensor): Rotary Positional embedding tensor freq is of shape [max_s, 1, 1, d] + + Returns: + Tensor: Shape [t, h, d]. The input tensor after applying RoPE. 
+ """ + + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return torch.cat( + [ + _apply_rotary_pos_emb_bshd( + x.unsqueeze(1), + freqs[: x.size(0)], + rotary_interleaved=rotary_interleaved, + multi_latent_attention=multi_latent_attention, + mscale=mscale, + ) + for x in torch.split(t, seqlens) + ] + ).squeeze(1) + + +def apply_rotary_pos_emb( + t: Tensor, + freqs: Tensor, + config: TransformerConfig, + cu_seqlens: Optional[Tensor] = None, + mscale: float = 1.0, +): + """ + Reroute to the appropriate apply_rotary_pos_emb function depending on + fused/unfused kernels, or bshd (conventional) / thd (packed seq) format + """ + if config.apply_rope_fusion and not HAVE_APPLY_ROPE_FUSION: + # setting apply_rope_fusion in config to False + # so that subsequent queries to this config also return False + config.apply_rope_fusion = False + if not getattr(apply_rotary_pos_emb, "printed_fused_warning", False): + logger.warning( + "Setting apply_rope_fusion to false because its implementation" + " is not included in Apex. 
Try upgrading to the latest version" + ) + apply_rotary_pos_emb.printed_fused_warning = True + + if getattr(config, "multi_latent_attention", False) and config.rotary_interleaved: + logger.warning( + "rotary_interleaved is not supported with multi_latent_attention, setting it to False" + ) + config.rotary_interleaved = False + + if config.apply_rope_fusion: + if cu_seqlens is None: + return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) + else: + return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) + else: + if cu_seqlens is None: + return _apply_rotary_pos_emb_bshd( + t, + freqs, + rotary_interleaved=config.rotary_interleaved, + multi_latent_attention=config.multi_latent_attention, + mscale=mscale, + ) + else: + return _apply_rotary_pos_emb_thd( + t, + cu_seqlens, + freqs, + rotary_interleaved=config.rotary_interleaved, + multi_latent_attention=config.multi_latent_attention, + mscale=mscale, + ) diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py index 0a4e5bf6de..5232faec60 100644 --- a/megatron/core/models/common/embeddings/rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/rotary_pos_embedding.py @@ -2,58 +2,50 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING if TYPE_CHECKING: from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_block import TransformerBlock import logging +import math import torch from torch import Tensor, nn from megatron.core import parallel_state +from megatron.core.models.common.embeddings.rope_utils import ( # for backward compatibility; pylint: disable=unused-import + _apply_rotary_pos_emb_bshd, + _apply_rotary_pos_emb_thd, + _rotate_half, + apply_rotary_pos_emb, + get_pos_emb_on_this_cp_rank, +) logger = logging.getLogger(__name__) -try: - from apex.transformer.functional 
import ( - fused_apply_rotary_pos_emb, - fused_apply_rotary_pos_emb_thd, - ) - HAVE_APPLY_ROPE_FUSION = True -except ImportError: - HAVE_APPLY_ROPE_FUSION = False - - -__all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb'] - - -def get_pos_emb_on_this_cp_rank(pos_emb, seq_dim): - cp_size = parallel_state.get_context_parallel_world_size() - cp_rank = parallel_state.get_context_parallel_rank() - cp_idx = torch.tensor( - [cp_rank, (2 * cp_size - cp_rank - 1)], device="cpu", pin_memory=True - ).cuda(non_blocking=True) - pos_emb = pos_emb.view( - *pos_emb.shape[:seq_dim], 2 * cp_size, -1, *pos_emb.shape[(seq_dim + 1) :] - ) - pos_emb = pos_emb.index_select(seq_dim, cp_idx) - pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :]) - return pos_emb +__all__ = ['RotaryEmbedding'] class RotaryEmbedding(nn.Module): """Rotary Embedding for language model. Args: - kv_channels (int): Projection weights dimension in multi-head attention. Obtained from transformer config - rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. - seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE for longer sequences. The value must be a float larger than 1.0. Defaults to None - rotary_base (int, optional): Base period for rotary position embeddings. Defaults to 10000. - use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly on the GPU. Defaults to False + kv_channels (int): Projection weights dimension in multi-head attention. Obtained + from transformer config + rotary_percent (float): Percent of rotary dimension to use for rotary position + embeddings. + rotary_interleaved (bool, optional): If True, interleaved rotary position embeddings. + Defaults to False. + seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE + for longer sequences. The value must be a float larger than 1.0. 
Defaults to None + rotary_base (int, optional): Base period for rotary position embeddings. Defaults to + 10000. + rope_scaling (bool, optional): Apply rope scaling as used in llama 3.1 + use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly + on the GPU. Defaults to False """ def __init__( @@ -63,6 +55,7 @@ def __init__( rotary_interleaved: bool = False, seq_len_interpolation_factor: float = None, rotary_base: int = 10000, + rope_scaling: bool = False, use_cpu_initialization: bool = False, ) -> None: super().__init__() @@ -78,6 +71,44 @@ def __init__( rotary_base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim) ) + if rope_scaling: + self.inv_freq = self._apply_scaling(self.inv_freq) + + def _apply_scaling( + self, + freqs, + factor=8, + low_freq_factor=1, + high_freq_factor=4, + original_max_position_embeddings=8192, + ): + # This implementation is adapted from: + # https://github.com/huggingface/transformers/blob/2a5a6ad18aa22e98429bb5ecb880660328030ea0/src/transformers/modeling_rope_utils.py#L303-L343 + + factor = factor # `8` in the original implementation + low_freq_factor = low_freq_factor # `1` in the original implementation + high_freq_factor = high_freq_factor # `4` in the original implementation + old_context_len = original_max_position_embeddings # `8192` in the original implementation + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + wavelen = 2 * math.pi / freqs + # wavelen < high_freq_wavelen: do nothing + # wavelen > low_freq_wavelen: divide by factor + inv_freq_llama = torch.where(wavelen > low_freq_wavelen, freqs / factor, freqs) + # otherwise: interpolate between the two, using a smooth factor + smooth_factor = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor + ) + smoothed_inv_freq = ( + 1 - smooth_factor + ) * inv_freq_llama / factor + smooth_factor * inv_freq_llama + is_medium_freq = 
~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen) + inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama) + + return inv_freq_llama + def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: """Forward pass of RoPE embedding. @@ -111,7 +142,8 @@ def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: # emb [seq_length, .., dim] emb = emb[:, None, None, :] if parallel_state.get_context_parallel_world_size() > 1: - # slice rotary_pos_emb along sequence dimension and select the parition of the current CP rank + # slice rotary_pos_emb along sequence dimension and select the parition of the current + # CP rank emb = get_pos_emb_on_this_cp_rank(emb, 0) return emb @@ -130,8 +162,9 @@ def get_rotary_seq_len( Args: inference_params : Used during Inference time - transformer (TransformerBlock): The transformer block (decoder/encoder) used by the model - transformer_input (Tensor): _description_ + transformer (TransformerBlock): The transformer block (decoder/encoder) used + by the model + transformer_input (Tensor): Input tensor to the transformer transformer_config (TransformerConfig): Transformer config used by the model Returns: @@ -151,102 +184,3 @@ def get_rotary_seq_len( rotary_seq_len *= transformer_config.context_parallel_size return rotary_seq_len - - -def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor: - """Change sign so the last dimension becomes [-odd, +even] - - Args: - x (Tensor): Input tensor - - Returns: - Tensor: Tensor rotated half - """ - if not rotary_interleaved: - x1, x2 = torch.chunk(x, 2, dim=-1) - return torch.cat((-x2, x1), dim=-1) - else: - x1 = x[:, :, :, ::2] - x2 = x[:, :, :, 1::2] - x_new = torch.stack((-x2, x1), dim=-1) - return x_new.view(x_new.shape[0], x_new.shape[1], x_new.shape[2], -1) - - -def apply_rotary_pos_emb_bshd(t: Tensor, freqs: Tensor, rotary_interleaved: bool = False) -> Tensor: - """Apply rotary positional embedding to input tensor T. 
- - check https://kexue.fm/archives/8265 for detailed formulas - - Args: - t (Tensor): Input tensor T is of shape [seq_length, ... , dim] - freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim] - - Returns: - Tensor: The input tensor after applying RoPE - """ - rot_dim = freqs.shape[-1] - - # ideally t_pass is empty so rotary pos embedding is applied to all tensor t - t, t_pass = t[..., :rot_dim], t[..., rot_dim:] - - # first part is cosine component - # second part is sine component, need to change signs with _rotate_half method - cos_ = torch.cos(freqs).to(t.dtype) - sin_ = torch.sin(freqs).to(t.dtype) - - t = (t * cos_) + (_rotate_half(t, rotary_interleaved) * sin_) - return torch.cat((t, t_pass), dim=-1) - - -def apply_rotary_pos_emb_thd( - t: Tensor, cu_seqlens: Tensor, freqs: Tensor, rotary_interleaved: bool = False -) -> Tensor: - """A baseline implementation of applying RoPE for `thd` format. - - Args: - t (Tensor): Input tensor T is of shape [t, h, d] - cu_seqlens(Tensor): Cumulative sum of sequence lengths in a batch for `t`, - with shape [b + 1] and dtype torch.int32. - freqs (Tensor): Rotary Positional embedding tensor freq is of shape [max_s, 1, 1, d] - - Returns: - Tensor: Shape [t, h, d]. The input tensor after applying RoPE. 
- """ - - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() - return torch.cat( - [ - apply_rotary_pos_emb_bshd(x.unsqueeze(1), freqs[: x.size(0)]) - for x in torch.split(t, seqlens) - ] - ).squeeze(1) - - -def apply_rotary_pos_emb( - t: Tensor, freqs: Tensor, config: TransformerConfig, cu_seqlens: Optional[Tensor] = None -): - """ - Reroute to the appropriate apply_rotary_pos_emb function depending on - fused/unfused kernels, or bshd (conventional) / thd (packed seq) format - """ - if config.apply_rope_fusion and not HAVE_APPLY_ROPE_FUSION: - # setting apply_rope_fusion in config to False so that subsequent queries to this config also return False - config.apply_rope_fusion = False - if not getattr(apply_rotary_pos_emb, "printed_fused_warning", False): - logger.warning( - "Setting apply_rope_fusion to false because its implementation" - " is not included in Apex. Try upgrading to the latest version" - ) - apply_rotary_pos_emb.printed_fused_warning = True - if config.apply_rope_fusion: - if cu_seqlens is None: - return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) - else: - return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) - else: - if cu_seqlens is None: - return apply_rotary_pos_emb_bshd(t, freqs, rotary_interleaved=config.rotary_interleaved) - else: - return apply_rotary_pos_emb_thd( - t, cu_seqlens, freqs, rotary_interleaved=config.rotary_interleaved - ) diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py new file mode 100644 index 0000000000..14d147ea34 --- /dev/null +++ b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py @@ -0,0 +1,169 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from __future__ import annotations + +import logging +import math + +import torch +from torch import Tensor + +from megatron.core import parallel_state +from megatron.core.models.common.embeddings.rope_utils import get_pos_emb_on_this_cp_rank +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding + +logger = logging.getLogger(__name__) + + +class YarnRotaryEmbedding(RotaryEmbedding): + """Yarn Rotary Embedding for language model. + + Args: + kv_channels (int): Projection weights dimension in multi-head attention. Obtained from + transformer config + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + rotary_interleaved (bool, optional): If True, interleaved rotary position embeddings. + Defaults to False. + seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE for + longer sequences. The value must be a float larger than 1.0. Defaults to None + rotary_base (float, optional): Base period for rotary position embeddings. Defaults to + 10000. + use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly on + the GPU. Defaults to False + scaling_factor (float, optional): Scaling factor for Yarn RoPE. Defaults to 1.0. + original_max_position_embeddings (int, optional): Original maximum position embeddings + length. Defaults to 4096. + beta_fast (float, optional): Fast beta value for Yarn RoPE. Defaults to 32. + beta_slow (float, optional): Slow beta value for Yarn RoPE. Defaults to 1. + mscale (float, optional): Mscale value for Yarn RoPE. Defaults to 1. + mscale_all_dim (float, optional): Mscale all dim value for Yarn RoPE. Defaults to 0. 
+ """ + + def __init__( + self, + kv_channels: int, + rotary_percent: float = 1.0, + rotary_interleaved: bool = False, + seq_len_interpolation_factor: float = None, + rotary_base: float = 10000.0, + use_cpu_initialization: bool = False, + scaling_factor: float = 1.0, + original_max_position_embeddings: int = 4096, + beta_fast: float = 32.0, + beta_slow: float = 1.0, + mscale: float = 1.0, + mscale_all_dim: float = 0.0, + ): + self.dim = kv_channels + self.rotary_base = rotary_base + self.scaling_factor = scaling_factor + self.original_max_position_embeddings = original_max_position_embeddings + self.beta_fast = beta_fast + self.beta_slow = beta_slow + self.mscale = mscale + self.mscale_all_dim = mscale_all_dim + + device = 'cpu' if use_cpu_initialization else torch.cuda.current_device() + self.inv_freq_extra = 1.0 / ( + self.rotary_base + ** (torch.arange(0, self.dim, 2, dtype=torch.float32, device=device) / self.dim) + ) + self.inv_freq_inter = 1.0 / ( + self.scaling_factor + * self.rotary_base + ** (torch.arange(0, self.dim, 2, dtype=torch.float32, device=device) / self.dim) + ) + super().__init__( + kv_channels, + rotary_percent, + rotary_interleaved, + seq_len_interpolation_factor, + rotary_base, + use_cpu_initialization, + ) + + def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: + + assert ( + not self.rotary_interleaved + ), "Yarn RoPE does not support interleaved rotary embeddings" + + if self.inv_freq_extra.device.type == 'cpu': + # move `inv_freq_extra` to GPU once at the first micro-batch forward pass + self.inv_freq_extra = self.inv_freq_extra.to(device=torch.cuda.current_device()) + + if self.inv_freq_inter.device.type == 'cpu': + # move `inv_freq_inter` to GPU once at the first micro-batch forward pass + self.inv_freq_inter = self.inv_freq_inter.to(device=torch.cuda.current_device()) + + low, high = _yarn_find_correction_range( + self.beta_fast, + self.beta_slow, + self.dim, + self.rotary_base, + self.original_max_position_embeddings, + ) 
+ inv_freq_mask = 1.0 - _yarn_linear_ramp_mask(low, high, self.dim // 2).to( + device=self.inv_freq_extra.device, dtype=torch.float32 + ) + inv_freq = self.inv_freq_inter * (1 - inv_freq_mask) + self.inv_freq_extra * inv_freq_mask + + seq = ( + torch.arange( + max_seq_len, device=self.inv_freq_extra.device, dtype=self.inv_freq_extra.dtype + ) + + offset + ) + + freqs = torch.outer(seq, inv_freq) + + _mscale = float( + _yarn_get_mscale(self.scaling_factor, self.mscale) + / _yarn_get_mscale(self.scaling_factor, self.mscale_all_dim) + ) + + emb = torch.cat((freqs, freqs), dim=-1) + # emb [seq_length, .., dim] + emb = emb[:, None, None, :] + if parallel_state.get_context_parallel_world_size() > 1: + # slice rotary_pos_emb along sequence dimension + # and select the parition of the current CP rank + emb = get_pos_emb_on_this_cp_rank(emb, 0) + return emb, _mscale + + +# Inverse dim formula to find dim based on number of rotations +def _yarn_find_correction_dim( + num_rotations: float, dim: int, rotary_base: float = 10000, max_position_embeddings: int = 2048 +) -> float: + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / ( + 2 * math.log(rotary_base) + ) + + +# Find dim range bounds based on rotations +def _yarn_find_correction_range( + low_rot: float, + high_rot: float, + dim: int, + rotary_base: float = 10000, + max_position_embeddings: int = 2048, +) -> tuple[int, int]: + low = math.floor(_yarn_find_correction_dim(low_rot, dim, rotary_base, max_position_embeddings)) + high = math.ceil(_yarn_find_correction_dim(high_rot, dim, rotary_base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def _yarn_linear_ramp_mask(min: float, max: float, dim: int) -> Tensor: + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +def _yarn_get_mscale(scale: 
float = 1, mscale: float = 1) -> float: + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py index d469f5e4ce..1db68dc886 100755 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -11,6 +11,10 @@ from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules from megatron.core.transformer.moe.shared_experts import SharedExpertMLP +from megatron.core.transformer.multi_latent_attention import ( + MLASelfAttention, + MLASelfAttentionSubmodules, +) from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules @@ -49,6 +53,7 @@ def get_gpt_layer_with_transformer_engine_spec( num_experts: Optional[int] = None, moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, + multi_latent_attention: Optional[bool] = False, fp8: Optional[str] = None, ) -> ModuleSpec: """Use this spec to use lower-level Transformer Engine modules (required for fp8 training). @@ -66,34 +71,63 @@ def get_gpt_layer_with_transformer_engine_spec( mlp = _get_mlp_module_spec( use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=fp8 ) - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=TELayerNormColumnParallelLinear, - core_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, - # TENorm significantly harms convergence when used - # for QKLayerNorm; we instead use the Apex implementation. 
- q_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, - k_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, + + if multi_latent_attention: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=MLASelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=MLASelfAttentionSubmodules( + linear_q_proj=TEColumnParallelLinear, + linear_q_down_proj=TEColumnParallelLinear, + linear_q_up_proj=TEColumnParallelLinear, + linear_kv_down_proj=TEColumnParallelLinear, + linear_kv_up_proj=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=TENorm if qk_layernorm else IdentityOp, + kv_layernorm=TENorm if qk_layernorm else IdentityOp, + ), ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm if num_experts else IdentityOp, + input_layernorm=TENorm if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=TENorm if num_experts else IdentityOp, - mlp=mlp, - mlp_bda=get_bias_dropout_add, - ), - ) + ) + else: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + # TENorm significantly harms convergence when used + # for QKLayerNorm; we instead use the Apex implementation. 
+ q_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, + k_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) def get_gpt_layer_local_spec( num_experts: Optional[int] = None, moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, + multi_latent_attention: Optional[bool] = False, ) -> ModuleSpec: """Use this spec for an implementation using only modules in Megatron-Core. @@ -109,31 +143,58 @@ def get_gpt_layer_local_spec( mlp = _get_mlp_module_spec( use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm ) - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - input_layernorm=LNImpl, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=ColumnParallelLinear, - core_attention=DotProductAttention, - linear_proj=RowParallelLinear, - q_layernorm=LNImpl if qk_layernorm else IdentityOp, - k_layernorm=LNImpl if qk_layernorm else IdentityOp, + if multi_latent_attention: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=MLASelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=MLASelfAttentionSubmodules( + linear_q_proj=ColumnParallelLinear, + linear_q_down_proj=ColumnParallelLinear, + linear_q_up_proj=ColumnParallelLinear, + linear_kv_down_proj=ColumnParallelLinear, + linear_kv_up_proj=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=LNImpl if qk_layernorm else IdentityOp, + kv_layernorm=LNImpl if qk_layernorm else IdentityOp, + ), ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl if num_experts else IdentityOp, + input_layernorm=LNImpl if num_experts else 
IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=LNImpl, - mlp=mlp, - mlp_bda=get_bias_dropout_add, - sharded_state_dict_keys_map={ - 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', - 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', - }, - ), - ) + ) + else: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=LNImpl if qk_layernorm else IdentityOp, + k_layernorm=LNImpl if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ) def _get_mlp_module_spec( diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 20f83976c4..bd52f89680 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -69,6 +69,7 @@ def __init__( position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute', rotary_percent: float = 1.0, rotary_base: int = 10000, + rope_scaling: bool = False, seq_len_interpolation_factor: Optional[float] = None, ) -> None: super().__init__(config=config) @@ -90,9 +91,11 @@ def __init__( # TODO: remove this dependency ? self.model_type = ModelType.encoder_or_decoder - # These 2 attributes are needed for TensorRT-LLM export. + # These 4 attributes are needed for TensorRT-LLM export. 
self.max_position_embeddings = max_sequence_length self.rotary_percent = rotary_percent + self.rotary_base = rotary_base + self.rotary_scaling = rope_scaling if self.pre_process: self.embedding = LanguageModelEmbedding( @@ -102,13 +105,14 @@ def __init__( position_embedding_type=position_embedding_type, ) - if self.position_embedding_type == 'rope': + if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention: self.rotary_pos_emb = RotaryEmbedding( kv_channels=self.config.kv_channels, rotary_percent=rotary_percent, rotary_interleaved=self.config.rotary_interleaved, seq_len_interpolation_factor=seq_len_interpolation_factor, rotary_base=rotary_base, + rope_scaling=rope_scaling, use_cpu_initialization=self.config.use_cpu_initialization, ) @@ -185,12 +189,17 @@ def forward( inference_params: InferenceParams = None, packed_seq_params: PackedSeqParams = None, extra_block_kwargs: dict = None, + runtime_gather_output: Optional[bool] = None, ) -> Tensor: """Forward function of the GPT Model This function passes the input tensors through the embedding layer, and then the decoeder and finally into the post processing layer (optional). It either returns the Loss values if labels are given or the final hidden units + + Args: + runtime_gather_output (bool): Gather output at runtime. Default None means + `parallel_output` arg in the constructor will be used. """ # If decoder_input is provided (not None), then input_ids and position_ids are ignored. # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. 
@@ -207,7 +216,7 @@ def forward( # Rotary positional embeddings (embedding is None for PP intermediate devices) rotary_pos_emb = None - if self.position_embedding_type == 'rope': + if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention: rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( inference_params, self.decoder, decoder_input, self.config ) @@ -230,7 +239,9 @@ def forward( output_weight = None if self.share_embeddings_and_output_weights: output_weight = self.shared_embedding_or_output_weight() - logits, _ = self.output_layer(hidden_states, weight=output_weight) + logits, _ = self.output_layer( + hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output + ) if has_config_logger_enabled(self.config): payload = OrderedDict( diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py index a8ddc94ced..074cfaae93 100644 --- a/megatron/core/models/multimodal/llava_model.py +++ b/megatron/core/models/multimodal/llava_model.py @@ -76,6 +76,7 @@ def __init__( img_w: int = 336, patch_dim: int = 14, language_rotary_base: int = 10000, + language_rope_scaling: bool = False, ) -> None: super().__init__(config=language_transformer_config) @@ -112,6 +113,7 @@ def __init__( pre_process=self.pre_process, post_process=self.post_process, rotary_base=language_rotary_base, + rope_scaling=language_rope_scaling, ) self.share_embeddings_and_output_weights = ( self.language_model.share_embeddings_and_output_weights @@ -123,6 +125,16 @@ def __init__( class_token_len = 1 if self.add_encoder: + self._drop_vision_class_token = drop_vision_class_token + add_class_token = True + if vision_transformer_config.vision_model_type == "siglip": + class_token_len = 0 + add_class_token = False + error_msg = ( + "Siglip does not support vision class token, " + "set disable-vision-class-token to False." 
+ ) + assert not self._drop_vision_class_token, error_msg self.vision_model = CLIPViTModel( vision_transformer_config, vision_transformer_layer_spec, @@ -130,8 +142,9 @@ def __init__( img_w=img_w, class_token_len=class_token_len, patch_dim=patch_dim, + model_subtype=vision_transformer_config.vision_model_type, + add_class_token=add_class_token, ) - self._drop_vision_class_token = drop_vision_class_token # Map (intermediate) vision model outputs to the language model input dimension. self.vision_projection = MultimodalProjector( vision_projection_config, @@ -153,7 +166,12 @@ def __init__( ) self._img_seq_len = get_num_image_embeddings( - img_h, img_w, patch_dim, drop_vision_class_token, class_token_len + img_h, + img_w, + patch_dim, + vision_transformer_config.vision_model_type, + drop_vision_class_token, + class_token_len, ) def shared_embedding_or_output_weight(self): @@ -351,7 +369,9 @@ def _preprocess_data( ] # Put image embeddings to image positions. - final_embedding[images_mask] = image_embeddings.reshape(-1, embed_dim).contiguous() + final_embedding[images_mask] = ( + image_embeddings.permute(1, 0, 2).reshape(-1, embed_dim).contiguous() + ) # Create the final labels and loss mask (if this is the last language model stage). final_labels, final_loss_mask = None, None @@ -429,6 +449,7 @@ def forward( inference_params: Optional[InferenceParams] = None, num_image_tiles: Optional[List[int]] = None, image_token_index: Optional[int] = IMAGE_TOKEN_INDEX, + runtime_gather_output: Optional[bool] = None, ) -> torch.Tensor: """Forward function of the LLaVA model. @@ -445,6 +466,8 @@ def forward( inference_params (InferenceParams): Inference-time parameters including KV cache. num_image_tiles (list of int): Number of tiles per image. Default 1 tile per image. image_token_index (int): ID for input images. + runtime_gather_output (bool): Gather output at runtime. Default None means + `parallel_output` arg in the constructor will be used. 
Returns: output (torch.Tensor): Loss of shape [b, s] if labels are provided, @@ -463,7 +486,9 @@ def forward( image_embeddings = None elif self.add_encoder and not has_images: # If no images provided, use an empty image embeddings tensor. - image_embeddings = torch.tensor([], dtype=images.dtype, device=images.device) + image_embeddings = torch.tensor([], dtype=images.dtype, device=images.device).reshape( + 0, 0, 0 + ) elif self.add_encoder and has_images: image_embeddings = self.vision_model(images) # [num_tiles, img_seq_len, h_vision] if self._drop_vision_class_token: @@ -528,6 +553,7 @@ def forward( decoder_input=combined_embeddings, labels=new_labels, inference_params=inference_params, + runtime_gather_output=runtime_gather_output, ) if labels is None or loss_mask is None: diff --git a/megatron/core/models/retro/config.py b/megatron/core/models/retro/config.py index f9ed05f470..d4b5c9684b 100644 --- a/megatron/core/models/retro/config.py +++ b/megatron/core/models/retro/config.py @@ -4,11 +4,9 @@ import os from dataclasses import dataclass -from importlib.metadata import version - -from pkg_resources import packaging from megatron.core.transformer import TransformerConfig +from megatron.core.utils import is_te_min_version @dataclass @@ -65,8 +63,7 @@ def __post_init__(self) -> None: super().__post_init__() # Validate Transformer Engine version. 
- te_version = packaging.version.Version(version("transformer-engine")) - if te_version >= packaging.version.Version("1.3"): + if is_te_min_version("1.3"): try: assert os.getenv("NVTE_FLASH_ATTN") == "0" assert os.getenv("NVTE_FUSED_ATTN") == "0" diff --git a/megatron/core/models/vision/clip_vit_model.py b/megatron/core/models/vision/clip_vit_model.py index 110a8687f7..53c3feddee 100644 --- a/megatron/core/models/vision/clip_vit_model.py +++ b/megatron/core/models/vision/clip_vit_model.py @@ -33,12 +33,22 @@ def __init__( transformer_config: TransformerConfig, transformer_layer_spec: ModuleSpec, ln_pre_impl: Union[ModuleSpec, type] = TENorm, + ln_post_impl: Union[ModuleSpec, type] = TENorm, add_class_token: bool = True, class_token_len: int = 1, patch_dim: int = 14, img_h: int = 336, img_w: int = 336, + model_subtype: str = "clip", ) -> None: + + error_msg = f"CLIPViTModel model subtype {model_subtype} is not supported." + assert model_subtype in ["clip", "siglip"], error_msg + + if model_subtype == "siglip": + assert class_token_len == 0, "SigLIP does not support class tokens." + assert not add_class_token, "SigLIP does not support class tokens." 
+ super().__init__(config=transformer_config) if has_config_logger_enabled(transformer_config): @@ -61,12 +71,34 @@ def __init__( self.seq_length = self.num_patches + (self.class_token_len if self.add_class_token else 0) + self.ln_pre = None + self.ln_post = None + if model_subtype == "clip": + self.ln_pre = build_module( + ln_pre_impl, + config=transformer_config, + hidden_size=self.visual_hidden_size, + eps=transformer_config.layernorm_epsilon, + ) + conv_bias = False + padding = 0 + if model_subtype == "siglip": + self.ln_post = build_module( + ln_post_impl, + config=transformer_config, + hidden_size=self.visual_hidden_size, + eps=transformer_config.layernorm_epsilon, + ) + conv_bias = True + padding = "valid" + self.conv1 = torch.nn.Conv2d( in_channels=3, out_channels=self.visual_hidden_size, kernel_size=self.patch_dim, stride=self.patch_dim, - bias=False, + bias=conv_bias, + padding=padding, ) self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda() @@ -79,13 +111,6 @@ def __init__( torch.randn(1, self.class_token_len, self.visual_hidden_size) ) - self.ln_pre = build_module( - ln_pre_impl, - config=transformer_config, - hidden_size=self.visual_hidden_size, - eps=transformer_config.layernorm_epsilon, - ) - self.model_type = ModelType.encoder_or_decoder # Transformer layers. @@ -134,7 +159,8 @@ def forward( assert x.shape[1] == self.seq_length, f"{x.shape[1]} != {self.seq_length}" x = x + self.position_embeddings(self.position_ids) - x = self.ln_pre(x) + if self.ln_pre: + x = self.ln_pre(x) x = x.permute(1, 0, 2) # [b, s, h] -> [s, b, h] # `permute` can make the tensor non-contiguous, breaking pipelining. 
x = x.contiguous() @@ -142,17 +168,23 @@ def forward( x = self.decoder(x, attention_mask) x = x.permute(1, 0, 2) # [s, b, h] -> [b, s, h] x = x.contiguous() - + if self.ln_post: + x = self.ln_post(x) return x -def get_num_image_embeddings(img_h, img_w, patch_dim, disable_vision_class_token, class_token_len): +def get_num_image_embeddings( + img_h, img_w, patch_dim, vision_model_type, disable_vision_class_token, class_token_len +): """Get the number of image embeddings per image tile.""" - add_class_token = not disable_vision_class_token + if vision_model_type == "siglip": + keep_class_token = False + elif vision_model_type == "clip": + keep_class_token = not disable_vision_class_token num_patches_per_dim_h = img_h // patch_dim num_patches_per_dim_w = img_w // patch_dim num_patches = num_patches_per_dim_h * num_patches_per_dim_w - num_image_embeddings_per_tile = num_patches + (class_token_len if add_class_token else 0) + num_image_embeddings_per_tile = num_patches + (class_token_len if keep_class_token else 0) return num_image_embeddings_per_tile diff --git a/megatron/core/num_microbatches_calculator.py b/megatron/core/num_microbatches_calculator.py index 16bd95a7b4..5850e512ca 100644 --- a/megatron/core/num_microbatches_calculator.py +++ b/megatron/core/num_microbatches_calculator.py @@ -320,6 +320,8 @@ def __init__( if rank == 0: logger.info( f'decreasing batch size from {global_batch_size} to {running_global_batch_size}' + f'to keep divisiblity by micro_batch_size={micro_batch_size} * ' + f'data_parallel_size={data_parallel_size}' ) self.num_micro_batches = ( running_global_batch_size // micro_batch_times_data_parallel_size @@ -424,7 +426,7 @@ def __init__( self.rampup_samples_per_increment = self.ramup_samples / num_increments # Initialize number of microbatches. 
- self.update(0, False) + self.update(0, consistency_check=False, verbose=True) def update(self, consumed_samples: int, consistency_check: bool, verbose: bool = False) -> None: """Update number of microbatches. @@ -450,10 +452,13 @@ def update(self, consumed_samples: int, consistency_check: bool, verbose: bool = if old_current_global_batch_size != self.current_global_batch_size: global_batch_size_changed = True if self.rank == 0 and global_batch_size_changed and verbose: - logger.info( - f'ramping up batch size from {old_current_global_batch_size} to ' - f'{self.current_global_batch_size}' - ) + if old_current_global_batch_size is None: + logger.info(f'setting initial batch size to {self.current_global_batch_size}') + else: + logger.info( + f'ramping up batch size from {old_current_global_batch_size} to ' + f'{self.current_global_batch_size}' + ) # Check consistency of the current global batch size. if consistency_check and not self.decrease_batch_size_if_needed: @@ -477,7 +482,9 @@ def update(self, consumed_samples: int, consistency_check: bool, verbose: bool = if self.rank == 0 and global_batch_size_changed and verbose: logger.info( f'decreasing batch size from {self.current_global_batch_size} to ' - f'{self.current_running_global_batch_size}' + f'{self.current_running_global_batch_size} to keep divisiblity by ' + f'micro_batch_size={self.micro_batch_size} * ' + f'data_parallel_size={self.data_parallel_size}' ) assert ( self.current_running_global_batch_size % self.micro_batch_times_data_parallel_size diff --git a/megatron/core/package_info.py b/megatron/core/package_info.py index bc385ad268..6135dc52c8 100644 --- a/megatron/core/package_info.py +++ b/megatron/core/package_info.py @@ -2,7 +2,7 @@ MAJOR = 0 -MINOR = 9 +MINOR = 10 PATCH = 0 PRE_RELEASE = 'rc0' diff --git a/megatron/core/packed_seq_params.py b/megatron/core/packed_seq_params.py index fe63e13e99..dff0cc5992 100644 --- a/megatron/core/packed_seq_params.py +++ b/megatron/core/packed_seq_params.py @@ 
-6,9 +6,15 @@ @dataclass class PackedSeqParams: - # parameters to TEDotProductAttention and fused rope kernels for the `thd` (packed) sequence format, + ''' + parameters to TEDotProductAttention and fused rope kernels for the + `thd` (packed) sequence format + ''' + qkv_format: str = None cu_seqlens_q: Tensor = None cu_seqlens_kv: Tensor = None + cu_seqlens_q_padded: Tensor = None + cu_seqlens_kv_padded: Tensor = None max_seqlen_q: Tensor = None max_seqlen_kv: Tensor = None diff --git a/megatron/core/requirements.txt b/megatron/core/requirements.txt index 08ed5eeb4b..a03ef133e7 100644 --- a/megatron/core/requirements.txt +++ b/megatron/core/requirements.txt @@ -1 +1,2 @@ -torch \ No newline at end of file +torch +packaging diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index ff0be00bb8..903b4ed873 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -69,6 +69,8 @@ def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride): def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor): + """Set default model parallel attributes if not set explicitly already.""" + def maybe_set(attribute, value): if not hasattr(tensor, attribute): setattr(tensor, attribute, value) @@ -78,6 +80,8 @@ def maybe_set(attribute, value): def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor): + """Copy model parallel attributes from one tensor to another.""" + def maybe_copy(attribute): if hasattr(source_tensor, attribute): setattr(destination_tensor, attribute, getattr(source_tensor, attribute)) @@ -116,21 +120,22 @@ def _initialize_affine_weight_cpu( params_dtype=torch.float32, rank=None, world_size=None, + skip_set_tensor_parallel_attributes=False, ): """Initialize affine weight for model parallel. 
Build the master weight on all processes and scatter the relevant chunk.""" - set_tensor_model_parallel_attributes( - tensor=weight, is_parallel=True, dim=partition_dim, stride=stride - ) + if not skip_set_tensor_parallel_attributes: + set_tensor_model_parallel_attributes( + tensor=weight, is_parallel=True, dim=partition_dim, stride=stride + ) # Initialize master weight master_weight = torch.empty(output_size, input_size, dtype=torch.float, requires_grad=False) init_method(master_weight) master_weight = master_weight.to(dtype=params_dtype) - # Split and copy per_partition_per_stride_size = divide(per_partition_size, stride) weight_list = torch.split(master_weight, per_partition_per_stride_size, dim=partition_dim) @@ -219,6 +224,11 @@ def __init__( _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1) def forward(self, input_): + """Forward. + + Args: + input_ (torch.Tensor): Input tensor. + """ if self.tensor_model_parallel_size > 1: # Build the mask. input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index) @@ -278,6 +288,7 @@ class LinearWithFrozenWeight(torch.autograd.Function): @staticmethod @custom_fwd def forward(ctx, input, weight, bias, allreduce_dgrad): + """Forward with frozen weight.""" ctx.save_for_backward(weight) ctx.allreduce_dgrad = allreduce_dgrad output = torch.matmul(input, weight.t()) @@ -288,6 +299,7 @@ def forward(ctx, input, weight, bias, allreduce_dgrad): @staticmethod @custom_bwd def backward(ctx, grad_output): + """Backward with frozen weight.""" (weight,) = ctx.saved_tensors grad_input = grad_output.matmul(weight) @@ -389,6 +401,7 @@ def forward( grad_output_buffer, wgrad_deferral_limit, ): + """Forward.""" ctx.save_for_backward(input, weight) ctx.use_bias = bias is not None ctx.gradient_accumulation_fusion = gradient_accumulation_fusion @@ -418,6 +431,7 @@ def forward( @staticmethod @custom_bwd def backward(ctx, grad_output): + """Backward.""" input, weight = ctx.saved_tensors use_bias 
= ctx.use_bias grad_output_buffer = ctx.grad_output_buffer @@ -847,7 +861,12 @@ def __init__( ) ) - def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): + def forward( + self, + input_: torch.Tensor, + weight: Optional[torch.Tensor] = None, + runtime_gather_output: Optional[bool] = None, + ): """Forward of ColumnParallelLinear Args: @@ -855,6 +874,8 @@ def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): 3D tensor whose order of dimension is [sequence, batch, hidden] weight (optional): weight tensor to use, compulsory when skip_weight_param_allocation is True. + runtime_gather_output (bool): Gather output at runtime. Default None means + `gather_output` arg in the constructor will be used. Returns: - output @@ -927,7 +948,13 @@ def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): ), allreduce_dgrad=allreduce_dgrad, ) - if self.gather_output: + + gather_output = self.gather_output + # Use the runtime gather output if it's set explicitly. + if runtime_gather_output is not None: + gather_output = runtime_gather_output + + if gather_output: # All-gather across the partitions. 
assert not self.sequence_parallel output = gather_from_tensor_model_parallel_region(output_parallel) diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py index 3724f81648..4b144d4163 100644 --- a/megatron/core/tensor_parallel/random.py +++ b/megatron/core/tensor_parallel/random.py @@ -16,7 +16,7 @@ get_expert_model_parallel_rank, get_tensor_model_parallel_rank, ) -from megatron.core.utils import safely_set_viewless_tensor_data +from megatron.core.utils import is_te_min_version, safely_set_viewless_tensor_data from .utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks @@ -175,6 +175,8 @@ def initialize_rng_tracker(use_te_rng_tracker: bool = False): return if use_te_rng_tracker: + if not is_te_min_version("1.5.0"): + raise RuntimeError("use_te_rng_tracker requires TransformerEngine version >= 1.5") from megatron.core.extensions.transformer_engine import TECudaRNGStatesTracker _CUDA_RNG_STATE_TRACKER = TECudaRNGStatesTracker() diff --git a/megatron/core/transformer/__init__.py b/megatron/core/transformer/__init__.py index 7cc10776b7..0e3cdcfa57 100644 --- a/megatron/core/transformer/__init__.py +++ b/megatron/core/transformer/__init__.py @@ -2,5 +2,5 @@ from .module import MegatronModule from .spec_utils import ModuleSpec, build_module -from .transformer_config import TransformerConfig +from .transformer_config import MLATransformerConfig, TransformerConfig from .transformer_layer import TransformerLayer, TransformerLayerSubmodules diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 6f81787f67..850dec88e1 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -6,7 +6,7 @@ import torch from megatron.core import parallel_state, tensor_parallel -from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.models.common.embeddings import apply_rotary_pos_emb from 
megatron.core.parallel_state import ( get_data_parallel_group, get_data_parallel_rank, @@ -146,14 +146,14 @@ def custom_forward(*inputs): return hidden_states - def _allocate_memory(self, inference_max_sequence_length, batch_size, dtype): + def _allocate_memory(self, inference_max_sequence_length, batch_size, dim, dtype): """Allocate memory to store kv cache during inference.""" return torch.empty( inference_max_sequence_length, batch_size, self.num_query_groups_per_partition, - self.hidden_size_per_attention_head, + dim, dtype=dtype, device=torch.cuda.current_device(), ) @@ -178,10 +178,10 @@ def _adjust_key_value_for_inference(self, inference_params, key, value, rotary_p inf_max_seq_length = inference_params.max_sequence_length inf_max_batch_size = inference_params.max_batch_size inference_key_memory = self._allocate_memory( - inf_max_seq_length, inf_max_batch_size, key.dtype + inf_max_seq_length, inf_max_batch_size, key.shape[-1], key.dtype ) inference_value_memory = self._allocate_memory( - inf_max_seq_length, inf_max_batch_size, value.dtype + inf_max_seq_length, inf_max_batch_size, value.shape[-1], value.dtype ) inference_params.key_value_memory_dict[self.layer_number] = ( inference_key_memory, diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index a60a22c0f3..2588980b5b 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -6,6 +6,8 @@ import torch +from megatron.core.transformer.module import MegatronModule + try: from transformer_engine.pytorch import make_graphed_callables from transformer_engine.pytorch.fp8 import FP8GlobalStateManager @@ -233,11 +235,16 @@ def __call__(self, megatron_module, args, kwargs): break if runner is None: - runner = self.create_cudagraph_module(megatron_module, args, kwargs) - self.cudagraph_runners.append(runner) - logging.getLogger(__name__).info( - f"Creating cudagraph; now have {len(self.cudagraph_runners)}" - ) + if 
self.training and torch.is_grad_enabled(): + runner = self.create_cudagraph_module(megatron_module, args, kwargs) + self.cudagraph_runners.append(runner) + logging.getLogger(__name__).info( + f"Creating cudagraph; now have {len(self.cudagraph_runners)}" + ) + else: + # No cudagraphs were found in inference mode, so fallback to eager since + # tensor.requires_grad is needed to correctly trace the backward graph. + return super(MegatronModule, megatron_module).__call__(*args, **kwargs) tensor_args, tensor_kwargs = self.get_tensor_args(args, kwargs) out = runner(tensor_args, tensor_kwargs, is_first_microbatch=self.is_first_microbatch) diff --git a/megatron/core/transformer/dot_product_attention.py b/megatron/core/transformer/dot_product_attention.py index bbac3fa4a2..d5c014cabf 100644 --- a/megatron/core/transformer/dot_product_attention.py +++ b/megatron/core/transformer/dot_product_attention.py @@ -40,6 +40,7 @@ def __init__( attn_mask_type: AttnMaskType, attention_type: str, attention_dropout: float = None, + softmax_scale: float = None, ): super().__init__(config=config) @@ -67,10 +68,14 @@ def __init__( self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size) coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if softmax_scale is None: + self.softmax_scale = 1.0 / math.sqrt(self.hidden_size_per_attention_head) + else: + self.softmax_scale = softmax_scale + if self.config.apply_query_key_layer_scaling: coeff = self.layer_number - self.norm_factor *= coeff + self.softmax_scale /= coeff self.scale_mask_softmax = FusedScaleMaskSoftmax( input_in_fp16=self.config.fp16, @@ -143,7 +148,7 @@ def forward( query.transpose(0, 1), # [b * np, sq, hn] key.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] beta=0.0, - alpha=(1.0 / self.norm_factor), + alpha=self.softmax_scale, ) # change view to [b, np, sq, sk] diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index 
ee4bb690b7..02a2cccca5 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -327,6 +327,7 @@ def topk_softmax_with_capacity( pad_to_capacity: bool = False, drop_policy: str = "probs", use_pre_softmax: bool = False, + deterministic_mode: bool = False, ): """Apply capacity and padding to the top-k selection. Args: @@ -366,7 +367,10 @@ def topk_softmax_with_capacity( if capacity_factor is None: # TopK without capacity - tokens_per_expert = torch.bincount(top_indices.view(-1), minlength=num_experts) + if deterministic_mode: + tokens_per_expert = torch.bincount(top_indices.view(-1), minlength=num_experts) + else: + tokens_per_expert = torch.histc(top_indices, bins=num_experts, min=0, max=num_experts) return probs, top_indices, tokens_per_expert else: # TopK with capacity diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index 8894dc1df3..3e85ec53c5 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -74,7 +74,8 @@ def routing(self, logits: torch.Tensor): logits (torch.Tensor): Logits tensor. Returns: - Tuple[torch.Tensor, torch.Tensor]: Tuple of tensors representing max probs and the indices. + Tuple[torch.Tensor, torch.Tensor]: + Tuple of tensors representing max probs and the indices. """ raise NotImplementedError("Routing function not implemented.") @@ -155,6 +156,7 @@ def aux_loss_load_balancing(self, logits: torch.Tensor): pad_to_capacity=self.config.moe_pad_expert_input_to_capacity, drop_policy=self.config.moe_token_drop_policy, use_pre_softmax=self.config.moe_router_pre_softmax, + deterministic_mode=self.config.deterministic_mode, ) if self.training: @@ -172,8 +174,10 @@ def apply_load_balancing_loss( """Applies auxiliary loss to the MoE layer. Args: - probs (torch.Tensor): The probs output by the router for each token. 
[num_tokens, num_experts] - num_local_tokens_per_expert (torch.Tensor): The number of tokens per expert. [num_experts] + probs (torch.Tensor): + The probs output by the router for each token. [num_tokens, num_experts] + num_local_tokens_per_expert (torch.Tensor): + The number of tokens per expert. [num_experts] activation (torch.Tensor): The activation tensor to attach the gradient function to. Returns: @@ -279,6 +283,7 @@ def routing(self, logits: torch.Tensor): pad_to_capacity=self.config.moe_pad_expert_input_to_capacity, drop_policy=self.config.moe_token_drop_policy, use_pre_softmax=self.config.moe_router_pre_softmax, + deterministic_mode=self.config.deterministic_mode, ) else: raise ValueError(f"Unsupported MoE routing type: {self.routing_type}") diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index e23ea4ea0f..db1b1920fa 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -184,13 +184,23 @@ def token_permutation( self.global_local_map = None with torch.no_grad(): - tokens_per_expert = torch.bincount( - local_indices.view(-1), minlength=self.config.num_moe_experts - ) - if self.num_local_experts < self.config.num_moe_experts: - tokens_per_expert = tokens_per_expert[ - self.local_expert_indices[0] : self.local_expert_indices[-1] + 1 - ] + # The indices of local_indices that give its sorted order along dim 0. 
+ self.indices = torch.argsort(local_indices, dim=0) + if self.config.deterministic_mode: + tokens_per_expert = torch.bincount( + local_indices.view(-1), minlength=self.config.num_moe_experts + ) + if self.num_local_experts < self.config.num_moe_experts: + tokens_per_expert = tokens_per_expert[ + self.local_expert_indices[0] : self.local_expert_indices[-1] + 1 + ] + else: + tokens_per_expert = torch.histc( + local_indices, + bins=self.num_local_experts, + min=self.local_expert_indices[0], + max=self.local_expert_indices[-1], + ) tokens_per_expert = tokens_per_expert.cpu().to(torch.long) # Stage2: permute the tokens locally so that they are grouped by their expert assignment @@ -382,7 +392,14 @@ def preprocess(self, indices: torch.Tensor) -> torch.Tensor: Returns: torch.Tensor: Tensor containing the number of tokens assigned to local expert. """ - num_local_tokens_per_expert = torch.bincount(indices.view(-1), minlength=self.num_experts) + if self.config.deterministic_mode: + num_local_tokens_per_expert = torch.bincount( + indices.view(-1), minlength=self.num_experts + ) + else: + num_local_tokens_per_expert = torch.histc( + indices, bins=self.num_experts, min=0, max=self.num_experts + ) # num_local_tokens_per_expert: [num_experts] tp_rank = parallel_state.get_tensor_model_parallel_rank() diff --git a/megatron/core/transformer/moe/upcycling_utils.py b/megatron/core/transformer/moe/upcycling_utils.py index 66fe86aee5..b905fc99be 100644 --- a/megatron/core/transformer/moe/upcycling_utils.py +++ b/megatron/core/transformer/moe/upcycling_utils.py @@ -56,7 +56,40 @@ def _covert_to_moe_state_dict(state_dict, moe_model): router_key = mlp_weight_key.replace('mlp.linear_fc1.weight', 'mlp.router.weight') new_state_dict[router_key] = moe_state_dict[router_key].data.data.clone() - if mlp.config.moe_grouped_gemm: + use_te_grouped_gemm = 'decoder.layers.0.mlp.experts.linear_fc1.weight0' in moe_state_dict + + if mlp.config.moe_grouped_gemm and use_te_grouped_gemm: + for 
mlp_weight_key in mlp_fc1_weight_keys: + weight_tensor = new_state_dict.pop(mlp_weight_key) + for expert_i in range(mlp.num_local_experts): + new_key = mlp_weight_key.replace( + 'mlp.linear_fc1.weight', f'mlp.experts.linear_fc1.weight{expert_i}' + ) + new_state_dict[new_key] = weight_tensor.clone() + + for mlp_weight_key in mlp_fc2_weight_keys: + weight_tensor = new_state_dict.pop(mlp_weight_key) + for expert_i in range(mlp.num_local_experts): + new_key = mlp_weight_key.replace( + 'mlp.linear_fc2.weight', f'mlp.experts.linear_fc2.weight{expert_i}' + ) + new_state_dict[new_key] = weight_tensor.clone() + + for extra_state_key in mlp_fc1_extra_state_keys: + new_state_dict.pop(extra_state_key) + new_key = extra_state_key.replace( + 'mlp.linear_fc1._extra_state', 'mlp.experts.linear_fc1._extra_state' + ) + new_state_dict[new_key] = None + + for extra_state_key in mlp_fc2_extra_state_keys: + new_state_dict.pop(extra_state_key) + new_key = extra_state_key.replace( + 'mlp.linear_fc2._extra_state', 'mlp.experts.linear_fc2._extra_state' + ) + new_state_dict[new_key] = None + + elif mlp.config.moe_grouped_gemm: for mlp_weight_key in mlp_fc1_weight_keys: weight_tensor = new_state_dict.pop(mlp_weight_key) shape = weight_tensor.shape @@ -76,6 +109,7 @@ def _covert_to_moe_state_dict(state_dict, moe_model): ) new_key = mlp_weight_key.replace('mlp.linear_fc2.weight', 'mlp.experts.weight2') new_state_dict[new_key] = weight_tensor + else: def covert_to_experts(keys): diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py new file mode 100644 index 0000000000..d637e2b448 --- /dev/null +++ b/megatron/core/transformer/multi_latent_attention.py @@ -0,0 +1,375 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ + +import math +from dataclasses import dataclass +from typing import Union + +import torch + +from megatron.core import parallel_state +from megatron.core.models.common.embeddings import ( + YarnRotaryEmbedding, + _yarn_get_mscale, + apply_rotary_pos_emb, +) +from megatron.core.transformer.attention import Attention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import MLATransformerConfig + + +@dataclass +class MLASelfAttentionSubmodules: + """Submodules for the MLA self-attention layer.""" + + linear_q_proj: Union[ModuleSpec, type] = None + linear_q_down_proj: Union[ModuleSpec, type] = None + linear_q_up_proj: Union[ModuleSpec, type] = None + linear_kv_down_proj: Union[ModuleSpec, type] = None + linear_kv_up_proj: Union[ModuleSpec, type] = None + core_attention: Union[ModuleSpec, type] = None + linear_proj: Union[ModuleSpec, type] = None + q_layernorm: Union[ModuleSpec, type] = None + kv_layernorm: Union[ModuleSpec, type] = None + + +class MultiLatentAttention(Attention): + """Multi-Latent Attention layer abstract class. + + This layer only contains common modules required for the "self attn" and + "cross attn" specializations. + """ + + def __init__( + self, + config: MLATransformerConfig, + submodules: Union[MLASelfAttentionSubmodules], + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + ) -> None: + world_size = parallel_state.get_tensor_model_parallel_world_size() + assert ( + world_size == 1 + ), "MLA is not supported with Tensor Parallelism yet, \ + use Expert Parallelism and Pipeline Parallelism for better performance." 
+ + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attention_type=attention_type, + attn_mask_type=attn_mask_type, + ) + + self.query_projection_size = self.config.v_head_dim * self.config.num_attention_heads + + self.q_head_dim = self.config.qk_head_dim + self.config.qk_pos_emb_head_dim + + mscale = _yarn_get_mscale(self.config.rotary_scaling_factor, self.config.mscale) + self.softmax_scale = mscale * mscale / math.sqrt(self.q_head_dim) + + self.rotary_pos_emb = YarnRotaryEmbedding( + self.config.qk_pos_emb_head_dim, + rotary_base=self.config.rotary_base, + scaling_factor=self.config.rotary_scaling_factor, + original_max_position_embeddings=self.config.max_position_embeddings, + beta_fast=self.config.beta_fast, + beta_slow=self.config.beta_slow, + mscale=self.config.mscale, + mscale_all_dim=self.config.mscale_all_dim, + ) + + self.core_attention = build_module( + submodules.core_attention, + config=self.config, + layer_number=self.layer_number, + attn_mask_type=self.attn_mask_type, + attention_type=self.attention_type, + softmax_scale=self.softmax_scale, + k_channels=self.q_head_dim, + v_channels=self.config.v_head_dim, + ) + + # Output. + self.linear_proj = build_module( + submodules.linear_proj, + self.query_projection_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='proj', + ) + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + position_ids=None, + ): + assert rotary_pos_emb is None, "Rotary position embeddings should not be passed into MLA." 
+ + # hidden_states: [sq, b, h] + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. + # query: [96, 1, 16, 128], key:[96, 1, 16, 128], value:[96, 1, 16, 128] + query, key, value = self.get_query_key_value_tensors( + hidden_states, + key_value_states, + position_ids, + packed_seq_params, + inference_params=inference_params, + ) + + # =================================================== + # Adjust key, value for inference + # =================================================== + # rotary_pos_emb = None + key, value, _, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb=None + ) + + # ================================== + # core attention computation + # ================================== + # Need corresponding TE change + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, key, value, attention_mask, packed_seq_params=packed_seq_params + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + packed_seq_params=packed_seq_params, + attn_mask_type=attn_mask_type, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + # ================= + # Output. [sq, b, h] + # ================= + output, bias = self.linear_proj(core_attn_out) + + return output, bias + + +class MLASelfAttention(MultiLatentAttention): + """MLA Self-attention layer class + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. 
+ """ + + def __init__( + self, + config: MLATransformerConfig, + submodules: MLASelfAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + attention_type="self", + ) + + if self.config.q_lora_rank is None: + # q_lora_rank unset: project hidden states to the query directly, without the down/up pair + self.linear_q_proj = build_module( + submodules.linear_q_proj, + self.config.hidden_size, + self.config.num_attention_heads * self.q_head_dim, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + else: + + self.linear_q_down_proj = build_module( + submodules.linear_q_down_proj, + self.config.hidden_size, + self.config.q_lora_rank, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_q_up_proj = build_module( + submodules.linear_q_up_proj, + self.config.q_lora_rank, + self.config.num_attention_heads * self.q_head_dim, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_kv_down_proj = build_module( + submodules.linear_kv_down_proj, + self.config.hidden_size, + self.config.kv_lora_rank + self.config.qk_pos_emb_head_dim, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_kv_up_proj = build_module( + submodules.linear_kv_up_proj, + self.config.kv_lora_rank, + self.config.num_attention_heads * (self.config.qk_head_dim + self.config.v_head_dim), + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + if self.config.q_lora_rank is not None: + self.q_layernorm = build_module( + 
submodules.q_layernorm, + hidden_size=self.config.q_lora_rank, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + + self.kv_layernorm = build_module( + submodules.kv_layernorm, + hidden_size=self.config.kv_lora_rank, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + + def get_query_key_value_tensors( + self, + hidden_states, + key_value_states=None, + position_ids=None, + packed_seq_params=None, + inference_params=None, + ): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # s = sequence length, b = batch size, h = hidden size, n = num attention heads + # Attention heads [s, b, n*h] + assert ( + hidden_states.ndim == 3 + ), f"hidden_states should be 3D, [s, b, n*h], got {hidden_states.ndim}D" + q_len, bsz, _ = hidden_states.size() + + if self.config.q_lora_rank is not None: + q_compressed, _ = self.linear_q_down_proj(hidden_states) + q_compressed = self.q_layernorm(q_compressed) + q, _ = self.linear_q_up_proj(q_compressed) + else: + # hidden_states:[s, b, 2048], q: [s, b, n * 192] + q, _ = self.linear_q_proj(hidden_states) + + # q: [s, b, n, 192] + q = q.view(q_len, bsz, self.num_attention_heads_per_partition, self.q_head_dim) + + # q: [s, b, n, 128], q_pos_emb: [s, b, n, 64] + q_no_pe, q_pos_emb = torch.split( + q, [self.config.qk_head_dim, self.config.qk_pos_emb_head_dim], dim=-1 + ) + + # kv_combined: [s, b, 576] + kv_combined, _ = self.linear_kv_down_proj(hidden_states) + + # kv_compressed:[s, b, 512], k_pos_emb: [s, b, 64] + kv_compressed, k_pos_emb = torch.split( + kv_combined, [self.config.kv_lora_rank, self.config.qk_pos_emb_head_dim], dim=-1 + ) + + # kv: [s, b, 2048] + kv, _ = self.linear_kv_up_proj(self.kv_layernorm(kv_compressed)) + + # kv: [s, b, n, 256] + kv = kv.view( + q_len, + bsz, + self.num_attention_heads_per_partition, + self.config.qk_head_dim + self.config.v_head_dim, + ) + + # k_no_pe: [s, b, n, 128], value: [s, b, n, 128] + k_no_pe, value = torch.split(kv, 
[self.config.qk_head_dim, self.config.v_head_dim], dim=-1) + + # rotary_pos_emb:[s, b, 1, 64] + rotary_pos_emb = self.rotary_pos_emb(max_seq_len=self.config.max_position_embeddings) + + # The embedding module may hand back an (embedding, mscale) pair. Detect the pair by type + # rather than len(): len() of a bare tensor is its first dimension, which can also be 2. + # Default to 1.0 so `mscale` is always bound when it is passed on below + # (assumed to mean "no extra scaling" - TODO confirm against apply_rotary_pos_emb). + mscale = 1.0 + if isinstance(rotary_pos_emb, (tuple, list)): + rotary_pos_emb, mscale = rotary_pos_emb + + if inference_params is not None: + # add offset to the sequence start for inference + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + q_len + rotary_pos_emb = rotary_pos_emb[sequence_start:sequence_end] + + # [s, b, 64] -> [s, b, 1, 64] + k_pos_emb = torch.unsqueeze(k_pos_emb, 2) + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + + # q_pos_emb: [s, b, n, 64], k_pos_emb:[s, b, 1, 64] + q_pos_emb = apply_rotary_pos_emb( + q_pos_emb, rotary_pos_emb, config=self.config, cu_seqlens=cu_seqlens_q, mscale=mscale + ) + k_pos_emb = apply_rotary_pos_emb( + k_pos_emb, rotary_pos_emb, config=self.config, cu_seqlens=cu_seqlens_kv, mscale=mscale + ) + + # query: [s, b, n, 192] + query = torch.cat([q_no_pe, q_pos_emb], dim=-1) + + # k_pos_emb is shared by all heads ([s, b, 1, 64]) while k_no_pe is [s, b, n, 128]. + # torch.cat does not broadcast, so expand the head dimension before concatenating. + k_pos_emb = k_pos_emb.expand(-1, -1, self.num_attention_heads_per_partition, -1) + + # key: [s, b, n, 192] + key = torch.cat([k_no_pe, k_pos_emb], dim=-1) + + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + + return query, key, value diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 46f6796909..3a88f1ab22 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -2,10 +2,8 @@ from contextlib import nullcontext from dataclasses import dataclass -from importlib.metadata import version from typing import List, Optional, Union -import packaging import torch from torch import Tensor @@ -19,7 +17,7 @@ from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import 
BaseTransformerLayer from megatron.core.transformer.utils import sharded_state_dict_default -from megatron.core.utils import make_viewless_tensor +from megatron.core.utils import is_te_min_version, make_viewless_tensor try: from megatron.core.extensions.transformer_engine import ( @@ -375,10 +373,9 @@ def get_cuda_graph_optional_args( optional_inputs = {} optional_inputs['is_first_microbatch'] = self.current_microbatch == 0 try: - import transformer_engine.pytorch as te + import transformer_engine.pytorch as te # pylint: disable=unused-import - _te_version = packaging.version.Version(version("transformer-engine")) - if _te_version < packaging.version.Version("1.10.0"): + # Keep the meaning of the removed check: assert only when TE is older than 1.10.0. + if not is_te_min_version("1.10.0"): assert not any( [attention_mask, context, context_mask, rotary_pos_emb] ), "Keyword Arguments not supported with CUDA graph." diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index f16a0117a3..a63171686a 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -1,14 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
from dataclasses import dataclass -from importlib.metadata import version from typing import Callable, Optional, Tuple import torch.nn.functional as F -from pkg_resources import packaging from ..model_parallel_config import ModelParallelConfig -from ..utils import init_method_normal, scaled_init_method_normal +from ..utils import get_te_version, init_method_normal, is_te_min_version, scaled_init_method_normal @dataclass @@ -112,6 +110,9 @@ class TransformerConfig(ModelParallelConfig): """Whether cross entropy loss is calculated over the actual number of non-padded tokens in the global batch, versus the default behavior of assuming all tokens are non-padded.""" + multi_latent_attention: bool = False + """Whether to use multi-latent attention.""" + #################### # initialization #################### @@ -262,7 +263,6 @@ class TransformerConfig(ModelParallelConfig): """When there are multiple experts per rank, compress multiple local (potentially small) gemms in a single kernel launch to improve the utilization and performance by leveraging the Grouped GEMM feature introduced since CUTLASS 2.8 (https://github.com/fanshiqing/grouped_gemm). - """ moe_aux_loss_coeff: float = 0 # 1e-2 would be a good start value for load balance loss. @@ -282,6 +282,7 @@ class TransformerConfig(ModelParallelConfig): moe_token_dispatcher_type: str = "allgather" """The type of token dispatcher to use. The default is 'allgather'. 
Options are 'allgather' and 'alltoall'.""" + moe_per_layer_logging: bool = False """Enable per-layer logging for MoE, currently supports auxiliary loss and z loss.""" @@ -504,12 +505,62 @@ def __post_init__(self): if self.num_moe_experts and self.fp8: # TE version below 1.7.0 will raise Error when handle zeros tokens for expert - te_version = packaging.version.Version(version("transformer-engine")) - if te_version < packaging.version.Version("1.7.0.dev0"): + if not is_te_min_version("1.7.0.dev0"): raise ValueError( "Only transformer-engine>=1.7.0 supports MoE FP8 training, " - f"but your version is {te_version}." + f"but your version is {get_te_version()}." ) if self.moe_grouped_gemm: raise ValueError("Grouped GEMM of MoE not support fp8 for now.") + + +@dataclass +class MLATransformerConfig(TransformerConfig): + """Configuration object for megatron-core Multi-Latent Attention (MLA) transformers. + + The initialization function has an argument for each parameter, including those in + ModelParallelConfig. Included YaRN RoPE parameters that is fused in MLA. + """ + + multi_latent_attention: bool = True + """Whether to use Multi-Latent Attention.""" + + q_lora_rank: int = 512 + """Rank of Query tensor's low rank representation.""" + + kv_lora_rank: int = 512 + """Rank of Key and Value tensors' low rank representation.""" + + qk_head_dim: int = 128 + """Dimension of the head in the QK projection. 
q_head_dim = qk_head_dim + qk_pos_emb_head_dim""" + + qk_pos_emb_head_dim: int = 64 + """Dimension of the position embedding in the QK projection.""" + + v_head_dim: int = 128 + """Dimension of the head in the V projection.""" + + rotary_base: float = 10000 + """Rotary base for the rotary embeddings.""" + + rotary_scaling_factor: float = 40 + """Rotary scaling factor for the rotary embeddings.""" + + normalization: str = "RMSNorm" + """Default normalization layer for MLA models is RMSNorm.""" + + max_position_embeddings: int = 163840 + """Maximum position embeddings for the original model.""" + + beta_fast: float = 32 + """Beta fast for YaRN RoPE.""" + + beta_slow: float = 1 + """Beta slow for YaRN RoPE.""" + + mscale: float = 0.707 + """Mscale for YaRN RoPE in Multi-Latent Attention.""" + + mscale_all_dim: float = 0.707 + """Mscale all dimensions for YaRN RoPE in Multi-Latent Attention.""" diff --git a/megatron/core/utils.py b/megatron/core/utils.py index b0de950ef6..f3910926ab 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -15,10 +15,12 @@ from dataclasses import dataclass from datetime import datetime from functools import reduce +from importlib.metadata import version from types import TracebackType from typing import Any, Dict, List, Optional, Tuple, Type, Union import torch +from packaging.version import Version as PkgVersion from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedTensor @@ -26,6 +28,33 @@ logger = logging.getLogger(__name__) +_te_version = None + + +def get_te_version(): + """Get TE version from __version__; if not available use pip's. 
Use caching.""" + + def get_te_version_str(): + import transformer_engine as te + + if hasattr(te, '__version__'): + return str(te.__version__) + else: + return version("transformer-engine") + + global _te_version + if _te_version is None: + _te_version = PkgVersion(get_te_version_str()) + return _te_version + + +def is_te_min_version(version, check_equality=True): + """Check if minimum version of `transformer-engine` is installed.""" + if check_equality: + return get_te_version() >= PkgVersion(version) + return get_te_version() > PkgVersion(version) + + def ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator) diff --git a/megatron/inference/gpt/model_provider.py b/megatron/inference/gpt/model_provider.py index 2e92a96e9e..0df0168fa5 100644 --- a/megatron/inference/gpt/model_provider.py +++ b/megatron/inference/gpt/model_provider.py @@ -64,6 +64,7 @@ def model_provider(pre_process=True, post_process=True, parallel_output=True) -> "position_embedding_type": args.position_embedding_type, "rotary_percent": args.rotary_percent, "rotary_base": args.rotary_base, + "rope_scaling": args.use_rope_scaling, } model = model_type(**model_kwargs) diff --git a/megatron/legacy/model/rms_norm.py b/megatron/legacy/model/rms_norm.py index 7e4424c7b0..21ba00c600 100644 --- a/megatron/legacy/model/rms_norm.py +++ b/megatron/legacy/model/rms_norm.py @@ -8,7 +8,8 @@ class RMSNorm(torch.nn.Module): def __init__(self, dim: int, eps: float = 1e-6, - sequence_parallel: bool = False): + sequence_parallel: bool = False, + config: dict = None): """RMS Normaliation module Args: diff --git a/megatron/legacy/model/transformer.py b/megatron/legacy/model/transformer.py index 7d723df024..dda550551a 100644 --- a/megatron/legacy/model/transformer.py +++ b/megatron/legacy/model/transformer.py @@ -13,11 +13,11 @@ from megatron import core from megatron.core 
import mpu, tensor_parallel from megatron.core.enums import ModelType +from megatron.legacy.model.enums import AttnMaskType, LayerType, AttnType +from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.legacy.model.fused_bias_gelu import bias_gelu_impl +from megatron.core.models.common.embeddings import apply_rotary_pos_emb from megatron.core.jit import jit_fuser -from megatron.core.models.common.embeddings.rotary_pos_embedding import ( - RotaryEmbedding, - apply_rotary_pos_emb, -) from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.parallel_state import ( get_tensor_and_expert_parallel_group, @@ -1406,21 +1406,15 @@ def __init__(self, config, self.transformer_engine_v_0_8 = False if self.transformer_impl == 'transformer_engine': global transformer_engine - from importlib.metadata import version - import transformer_engine - from pkg_resources import packaging - te_version = packaging.version.Version(version("transformer-engine")) - if te_version >= packaging.version.Version("0.8.0"): + if core.utils.is_te_min_version("0.8.0"): self.transformer_engine_v_0_8 = True - if te_version >= packaging.version.Version("0.10.0"): + if core.utils.is_te_min_version("0.10.0"): self.transformer_engine_v_0_10 = True - if te_version >= packaging.version.Version("0.11.0"): + if core.utils.is_te_min_version("0.11.0"): self.transformer_engine_v_0_11 = True - del version, packaging - assert not args.squared_relu, ("TransformerEngine does not support squared " "relu activation.") diff --git a/megatron/training/activations.py b/megatron/training/activations.py index fee84bddd0..c6ce9f1de1 100644 --- a/megatron/training/activations.py +++ b/megatron/training/activations.py @@ -16,3 +16,7 @@ def squared_relu(x: torch.Tensor) -> torch.Tensor: @jit_fuser def quick_gelu(x: torch.Tensor) -> torch.Tensor: return x * torch.sigmoid(1.702 * x) + +@jit_fuser +def fast_gelu(x: torch.Tensor) -> torch.Tensor: + return 0.5 * x * 
(1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 3dcfe4f2b2..e3d876a5f2 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -17,7 +17,7 @@ get_config_path as get_retro_config_path, get_gpt_data_dir as get_retro_data_dir, ) -from megatron.core.transformer import TransformerConfig +from megatron.core.transformer import TransformerConfig, MLATransformerConfig from megatron.training.activations import squared_relu from megatron.training.utils import update_use_dist_ckpt @@ -42,6 +42,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False): parser = _add_biencoder_args(parser) parser = _add_vision_args(parser) parser = _add_moe_args(parser) + parser = _add_mla_args(parser) parser = _add_logging_args(parser) parser = _add_straggler_detector_args(parser) parser = _add_inference_args(parser) @@ -288,7 +289,8 @@ def validate_args(args, defaults={}): # Overlap P2P communication is disabled if not using the interleaved schedule. args.overlap_p2p_comm = False args.align_param_gather = False - if args.rank == 0: + # Only print warning if PP size > 1. + if args.rank == 0 and args.pipeline_model_parallel_size > 1: print('WARNING: Setting args.overlap_p2p_comm and args.align_param_gather to False ' 'since non-interleaved schedule does not support overlapping p2p communication ' 'and aligned param AG') @@ -654,10 +656,13 @@ def _check_arg_is_not_none(args, arg): def core_transformer_config_from_args(args, config_class=None): - + # Config class. 
config_class = config_class or TransformerConfig + if args.multi_latent_attention: + config_class = MLATransformerConfig + # Translate args to core transformer configuration kw_args = {} for f in dataclasses.fields(config_class): @@ -842,6 +847,8 @@ def _add_network_size_args(parser): help='Use interleaved rotary embedding.') group.add_argument('--rotary-seq-len-interpolation-factor', type=int, default=None, help='Sequence length interpolation factor for rotary embeddings.') + group.add_argument('--use-rope-scaling', action='store_true', + help='Apply rope scaling as used in llama3.1') group.add_argument('--no-position-embedding', action='store_false', help='Disable position embedding. Deprecated: use --position-embedding-type', @@ -876,7 +883,9 @@ def _add_network_size_args(parser): help='Disable BERT binary head.', dest='bert_binary_head') group.add_argument('--untie-embeddings-and-output-weights', action='store_true', - help='Untie embeddings and output weights.'), + help='Untie embeddings and output weights.') + group.add_argument('--multi-latent-attention', action='store_true', + help='Use multi-latent attention for model.') return parser @@ -1151,6 +1160,9 @@ def _add_training_args(parser): group.add_argument('--disable-tp-comm-bulk-wgrad', action='store_false', help='Disables the Reduce-Scatter overlap with bprop weight gradient GEMM.', dest='tp_comm_bulk_wgrad') + group.add_argument('--tp-comm-bootstrap-backend', default='nccl', type=str, + choices=['nccl', 'mpi', 'gloo'], + help='Set the bootstrapping backend of Tensor parallel communications.') group.add_argument('--use-cpu-initialization', action='store_true', default=None, help='If set, initialize weights on the CPU. 
This eliminates init differences based on tensor parallelism.') @@ -1910,6 +1922,23 @@ def _add_moe_args(parser): return parser +def _add_mla_args(parser): + group = parser.add_argument_group(title="mla") + group.add_argument('--q-lora-rank', type=int, default=None, + help="Rank of Query tensor's low rank representation.") + group.add_argument('--kv-lora-rank', type=int, default=32, + help="Rank of Key and Value tensors' low rank representation.") + group.add_argument('--qk-head-dim', type=int, default=128, + help="Dimension of the head in the QK projection. q_head_dim = qk_head_dim + qk_pos_emb_head_dim") + group.add_argument('--qk-pos-emb-head-dim', type=int, default=64, + help="Dimension of the position embedding in the QK projection.") + group.add_argument('--v-head-dim', type=int, default=128, + help="Dimension of the head in the V projection.") + group.add_argument('--rotary-scaling-factor', type=float, default=1.0, + help="Rotary scaling factor for the rotary embeddings.") + + return parser + def _add_experimental_args(parser): group = parser.add_argument_group(title='experimental') diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index cb4b7ace4d..3de49f6c57 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -1131,7 +1131,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri 'consumed_train_samples', 0) args.skipped_train_samples = getattr(checkpoint_args, 'skipped_train_samples', 0) - update_num_microbatches(consumed_samples=args.consumed_train_samples) + update_num_microbatches(consumed_samples=args.consumed_train_samples, verbose=True) args.consumed_valid_samples = getattr(checkpoint_args, 'consumed_valid_samples', 0) else: diff --git a/megatron/training/initialize.py b/megatron/training/initialize.py index b2ef8a8f45..ad68ce8cb7 100644 --- a/megatron/training/initialize.py +++ b/megatron/training/initialize.py @@ -4,8 +4,6 @@ import logging 
import random import os -import packaging -import packaging.version import time import numpy as np @@ -24,6 +22,7 @@ from megatron.core.fusions.fused_bias_dropout import bias_dropout_add_fused_train from megatron.core.fusions.fused_bias_gelu import bias_gelu from megatron.core.fusions.fused_bias_swiglu import bias_swiglu +from megatron.core.utils import get_te_version, is_te_min_version logger = logging.getLogger(__name__) @@ -213,12 +212,21 @@ def _initialize_tp_communicators(): input_shape = [(args.seq_length * args.micro_batch_size) // args.context_parallel_size , args.hidden_size] - #We create a MPI process group, which is needed to bootstrap the pipelined - #tensor-model-parallel communication overlap - torch.distributed.new_group(backend='mpi') - - te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size, - use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs,) + if is_te_min_version("1.9.0"): + # The process group with the target bootstrap backend is created in Transformer Engine. + te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size, + use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs, + bootstrap_backend = args.tp_comm_bootstrap_backend) + else: + if args.tp_comm_bootstrap_backend != 'mpi': + warnings.warn( + f"Transformer Engine v{get_te_version()} supports only MPI bootstrap backend." + ) + # Create a MPI process group to help with TP communication overlap bootstrap. 
+ torch.distributed.new_group(backend='mpi') + + te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size, + use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs) def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks): """Initialize torch.distributed and core model parallel.""" diff --git a/megatron/training/tokenizer/tokenizer.py b/megatron/training/tokenizer/tokenizer.py index 226ae1e799..af0d493f87 100644 --- a/megatron/training/tokenizer/tokenizer.py +++ b/megatron/training/tokenizer/tokenizer.py @@ -2,15 +2,14 @@ """Megatron tokenizers.""" -import math -from abc import ABC, abstractmethod import base64 import json +import math +import types +from abc import ABC, abstractmethod from pathlib import Path from typing import Dict, List, Optional -import types - from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer from .bert_tokenization import FullTokenizer as FullBertTokenizer @@ -20,27 +19,28 @@ def build_tokenizer(args, **kwargs): """Initialize tokenizer.""" if args.rank == 0: - print('> building {} tokenizer ...'.format(args.tokenizer_type), - flush=True) + print('> building {} tokenizer ...'.format(args.tokenizer_type), flush=True) # Select and instantiate the tokenizer. 
if args.tokenizer_type == 'BertWordPieceLowerCase': assert args.vocab_file is not None - tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, - lower_case=True, - vocab_extra_ids=args.vocab_extra_ids) + tokenizer = _BertWordPieceTokenizer( + vocab_file=args.vocab_file, lower_case=True, vocab_extra_ids=args.vocab_extra_ids + ) elif args.tokenizer_type == 'BertWordPieceCase': assert args.vocab_file is not None - tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, - lower_case=False, - vocab_extra_ids=args.vocab_extra_ids) + tokenizer = _BertWordPieceTokenizer( + vocab_file=args.vocab_file, lower_case=False, vocab_extra_ids=args.vocab_extra_ids + ) elif args.tokenizer_type == 'GPT2BPETokenizer': assert args.vocab_file is not None assert args.merge_file is not None tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) elif args.tokenizer_type == 'SentencePieceTokenizer': assert args.tokenizer_model is not None - tokenizer = _SentencePieceTokenizer(args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids) + tokenizer = _SentencePieceTokenizer( + args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids + ) elif args.tokenizer_type == 'GPTSentencePieceTokenizer': assert args.tokenizer_model is not None tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) @@ -65,13 +65,11 @@ def build_tokenizer(args, **kwargs): assert args.vocab_size is not None tokenizer = _NullTokenizer(args.vocab_size) else: - raise NotImplementedError('{} tokenizer is not ' - 'implemented.'.format(args.tokenizer_type)) + raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type)) # Add vocab size (if not already set from a checkpoint). 
if getattr(args, "padded_vocab_size", None) is None: - args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, - args) + args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args) return tokenizer @@ -81,13 +79,14 @@ def _vocab_size_with_padding(orig_vocab_size, args, logging_enabled=True): still having GPU friendly size.""" after = orig_vocab_size - multiple = args.make_vocab_size_divisible_by * \ - args.tensor_model_parallel_size + multiple = args.make_vocab_size_divisible_by * args.tensor_model_parallel_size after = int(math.ceil(after / multiple) * multiple) if args.rank == 0 and logging_enabled: - print(' > padded vocab (size: {}) with {} dummy tokens ' - '(new size: {})'.format( - orig_vocab_size, after - orig_vocab_size, after), flush=True) + print( + ' > padded vocab (size: {}) with {} dummy tokens ' + '(new size: {})'.format(orig_vocab_size, after - orig_vocab_size, after), + flush=True, + ) return after @@ -97,10 +96,14 @@ def __init__(self, pretrained_model_name_or_path, **kwargs): try: import transformers except ImportError: - raise EnvironmentError(f"The transformers library must be installed to use huggingface_tokenizer_provider") + raise EnvironmentError( + f"The transformers library must be installed to use huggingface_tokenizer_provider" + ) # TODO(bnorick): download tokenizer once to lustre and use force offline to make sure all tasks read it from there - self._tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) + self._tokenizer = transformers.AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs + ) self._vocab = self._tokenizer.get_vocab() self._inv_vocab = {token_id: token for token, token_id in self._vocab.items()} @@ -146,8 +149,7 @@ def __init__(self, vocab_file, lower_case=True, vocab_extra_ids=0): self._additional_special_tokens = [] # (dsachan) Add BOS and EOS tokens - 
SPECIAL_TOKENS = {'eos_token': '[EOS]', - 'bos_token': '[BOS]'} + SPECIAL_TOKENS = {'eos_token': '[EOS]', 'bos_token': '[BOS]'} self._bos_token = '[BOS]' self.add_token(self._bos_token) self._bos_token_id = self.vocab.get(self._bos_token) @@ -160,7 +162,8 @@ def __init__(self, vocab_file, lower_case=True, vocab_extra_ids=0): # These can be used as sentinel tokens in T5 model inputs additional_special_tokens = [] additional_special_tokens.extend( - ["<extra_id_{}>".format(i) for i in range(vocab_extra_ids)]) + ["<extra_id_{}>".format(i) for i in range(vocab_extra_ids)] + ) self.add_additional_special_tokens(additional_special_tokens) def add_token(self, token): @@ -195,6 +198,10 @@ def decode(self, ids): tokens = self.tokenizer.convert_ids_to_tokens(ids) return self.tokenizer.convert_tokens_to_string(tokens) + def detokenize(self, token_ids): + """Copy of decode() method for inference pipeline compatibility""" + return self.decode(token_ids) + def decode_token_ids(self, token_ids): tokens = self.tokenizer.convert_ids_to_tokens(token_ids) exclude_list = ['[PAD]', '[CLS]'] @@ -227,32 +234,37 @@ def mask(self): @property def bos(self): - """ Id of the beginning of sentence token in the vocabulary.""" + """Id of the beginning of sentence token in the vocabulary.""" return self._bos_token_id @property def eos(self): - """ Id of the end of sentence token in the vocabulary.""" + """Id of the end of sentence token in the vocabulary.""" return self._eos_token_id + @property + def eod(self): + """Alias of eos, exposed as eod for inference pipeline compatibility""" + return self.eos + @property def bos_token(self): - """ Beginning of sentence token id """ + """Beginning of sentence token id""" return self._bos_token @property def eos_token(self): - """ End of sentence token id """ + """End of sentence token id""" return self._eos_token @property def additional_special_tokens(self): - """ All the additional special tokens you may want to use (list of strings).""" + """All the additional special tokens you may 
want to use (list of strings).""" return self._additional_special_tokens @property def additional_special_tokens_ids(self): - """ Ids of all the additional special tokens in the vocabulary (list of integers).""" + """Ids of all the additional special tokens in the vocabulary (list of integers).""" return [self.vocab.get(token) for token in self._additional_special_tokens] @additional_special_tokens.setter @@ -266,8 +278,9 @@ class _GPT2BPETokenizer(MegatronTokenizer): def __init__(self, vocab_file, merge_file): super().__init__(vocab_file, merge_file) - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - special_tokens=[], max_len=None) + self.tokenizer = GPT2Tokenizer( + vocab_file, merge_file, errors='replace', special_tokens=[], max_len=None + ) self.eod_id = self.tokenizer.encoder['<|endoftext|>'] @property @@ -300,6 +313,7 @@ def __init__(self, model_file, vocab_extra_ids=0): super().__init__(model_file, vocab_extra_ids=vocab_extra_ids) import sentencepiece + self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_file) self._initalize(vocab_extra_ids) @@ -462,7 +476,7 @@ def additional_special_tokens_ids(self): class _GPTSentencePieceTokenizer(_SentencePieceTokenizer): """SentencePieceTokenizer-Megatron wrapper""" - def __init__(self, model_file,): + def __init__(self, model_file): super().__init__(model_file, vocab_extra_ids=0) def _initalize(self, vocab_extra_ids): @@ -502,7 +516,7 @@ def additional_special_tokens_ids(self): class _Llama2Tokenizer(_SentencePieceTokenizer): """SentencePieceTokenizer-Megatron wrapper""" - def __init__(self, model_file,): + def __init__(self, model_file): super().__init__(model_file, vocab_extra_ids=0) def _initalize(self, vocab_extra_ids): @@ -549,10 +563,7 @@ def additional_special_tokens_ids(self): return None -def reload_mergeable_ranks( - path: str, - max_vocab: Optional[int] = None, -) -> Dict[bytes, int]: +def reload_mergeable_ranks(path: str, max_vocab: Optional[int] = None) -> 
Dict[bytes, int]: """ Reload our tokenizer JSON file and convert it to Tiktoken format. """ @@ -585,9 +596,12 @@ def reload_mergeable_ranks( return ranks -PATTERN_TIKTOKEN = r"[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+" +PATTERN_TIKTOKEN = ( + r"[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+" +) PATTERN_TIKTOKEN_V2 = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + class CustomTikTokenizer(MegatronTokenizer): def __init__( self, @@ -602,9 +616,10 @@ def __init__( pattern=pattern, vocab_size=vocab_size, num_special_tokens=num_special_tokens, - special_tokens=special_tokens + special_tokens=special_tokens, ) import tiktoken + from .. import print_rank_0 # To prevent circular import. if vocab_size is None: @@ -614,20 +629,30 @@ def __init__( SPECIAL_TOKENS = ["", "", ""] if special_tokens is None: special_tokens = SPECIAL_TOKENS.copy() - assert len(special_tokens) == len(set(special_tokens)), f"Special tokens should be unique: {special_tokens}" + assert len(special_tokens) == len( + set(special_tokens) + ), f"Special tokens should be unique: {special_tokens}" assert len(special_tokens) <= num_special_tokens < self._vocab_size - assert set(SPECIAL_TOKENS) <= set(special_tokens), f"Custom special tokens should include {SPECIAL_TOKENS}" + assert set(SPECIAL_TOKENS) <= set( + special_tokens + ), f"Custom special tokens should include {SPECIAL_TOKENS}" - special_filler = ["".format(id=i) for i in range(len(special_tokens), num_special_tokens)] + special_filler = [ + "".format(id=i) for i in range(len(special_tokens), num_special_tokens) + ] if special_filler: print_rank_0(f"Adding special tokens {special_filler[0]}, ..., {special_filler[-1]}") special_tokens = special_tokens + special_filler 
assert len(set(special_tokens)) == len(special_tokens) == num_special_tokens, special_tokens inner_vocab_size = self._vocab_size - num_special_tokens - token_to_id_without_special_tokens = reload_mergeable_ranks(path, max_vocab=inner_vocab_size) + token_to_id_without_special_tokens = reload_mergeable_ranks( + path, max_vocab=inner_vocab_size + ) # Create space for special tokens. - token_to_id_without_special_tokens = {t: i + num_special_tokens for t, i in token_to_id_without_special_tokens.items()} + token_to_id_without_special_tokens = { + t: i + num_special_tokens for t, i in token_to_id_without_special_tokens.items() + } special_tokens = {t: i for i, t in enumerate(special_tokens)} self._unk_id = special_tokens[""] @@ -650,7 +675,6 @@ def __init__( self._id_to_token = {v: k for k, v in self._token_to_id.items()} assert set(range(self._vocab_size)) == set(self._id_to_token.keys()) - @property def bos(self) -> int: return self._bos_id diff --git a/megatron/training/training.py b/megatron/training/training.py index 7d60f41f5c..d5ee16be5f 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -28,6 +28,7 @@ ) from megatron.training.checkpointing import load_checkpoint from megatron.training.checkpointing import save_checkpoint +from megatron.training.checkpointing import checkpoint_exists from megatron.legacy.model import Float16Module from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.distributed import DistributedDataParallel as DDP @@ -205,6 +206,7 @@ def pretrain( args_defaults={}, get_embedding_ranks=None, get_position_embedding_ranks=None, + non_loss_data_func=None, ): """Main training program. @@ -233,6 +235,10 @@ def pretrain( to it. It is used for programs to add their own arguments. args_defaults: a dictionary from argument-name to argument-value. It to set already parse arguments. 
+ get_embedding_ranks (TODO): + get_position_embedding_ranks (TODO): + non_loss_data_func (callable): A custom function to call during evaluation. + It can run e.g. benchmarks. """ # Initalize and get arguments, timers, and Tensorboard writer. @@ -356,7 +362,8 @@ def pretrain( forward_step_func, model, optimizer, opt_param_scheduler, train_data_iterator, valid_data_iterator, - process_non_loss_data_func, config, checkpointing_context) + process_non_loss_data_func, config, checkpointing_context, + non_loss_data_func) print_datetime('after training is done') @@ -381,14 +388,16 @@ def pretrain( evaluate_and_print_results(prefix, forward_step_func, valid_data_iterator, model, iteration, process_non_loss_data_func, config, - verbose=True, write_to_tensorboard=not args.skip_train) + verbose=True, write_to_tensorboard=not args.skip_train, + non_loss_data_func=non_loss_data_func) if args.do_test: prefix = f'iteration {iteration} on test set' evaluate_and_print_results(prefix, forward_step_func, test_data_iterator, model, iteration, process_non_loss_data_func, config, - verbose=True, write_to_tensorboard=not args.skip_train) + verbose=True, write_to_tensorboard=not args.skip_train, + non_loss_data_func=non_loss_data_func) wandb_writer = get_wandb_writer() if wandb_writer: @@ -634,7 +643,8 @@ def setup_model_and_optimizer(model_provider_func, opt_param_scheduler = get_optimizer_param_scheduler(optimizer) if args.moe_use_upcycling: - assert not os.path.exists( + torch.distributed.barrier() + assert not checkpoint_exists( args.save ), ("The upcycling destination directory already exists. " "Please check if --moe-use-upcycling is mistakenly enabled. " @@ -642,15 +652,18 @@ def setup_model_and_optimizer(model_provider_func, "All subsequent runs should remove this flag. 
") num_experts = args.num_experts args.num_experts = None + expert_model_parallel_size = args.expert_model_parallel_size + args.expert_model_parallel_size = 1 dense_model_for_upcycling = get_model(model_provider_func, model_type) args.num_experts = num_experts + args.expert_model_parallel_size = expert_model_parallel_size _, args.num_floating_point_operations_so_far = upcycling_utils.load_and_upcycle_model( load_checkpoint, unwrapped_model, dense_model_for_upcycling, load_kwargs = {'model': dense_model_for_upcycling, 'optimizer': None, 'opt_param_scheduler': None} ) - args.iteration = 0 + args.iteration = 1 save_checkpoint(args.iteration, model, None, None, args.num_floating_point_operations_so_far) torch.distributed.barrier() del dense_model_for_upcycling @@ -1095,7 +1108,7 @@ def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler, def train(forward_step_func, model, optimizer, opt_param_scheduler, train_data_iterator, valid_data_iterator, - process_non_loss_data_func, config, checkpointing_context): + process_non_loss_data_func, config, checkpointing_context, non_loss_data_func): """Train the model function.""" args = get_args() timers = get_timers() @@ -1331,7 +1344,8 @@ def get_e2e_base_metrics(): evaluate_and_print_results(prefix, forward_step_func, valid_data_iterator, model, iteration, process_non_loss_data_func, - config, False) + config, verbose=False, write_to_tensorboard=True, + non_loss_data_func=non_loss_data_func) eval_duration += timers('eval-time').elapsed() eval_iterations += args.eval_iters timers('eval-time').stop() @@ -1456,7 +1470,8 @@ def evaluate(forward_step_func, model, process_non_loss_data_func, config, - verbose=False): + verbose=False, + non_loss_data_func=None): """Evaluation.""" args = get_args() timers = get_timers() @@ -1534,7 +1549,9 @@ def evaluate(forward_step_func, return None, None, True collected_non_loss_data = None - if process_non_loss_data_func is not None and is_last_rank(): + if non_loss_data_func 
is not None: + collected_non_loss_data = non_loss_data_func(model) + elif process_non_loss_data_func is not None and is_last_rank(): collected_non_loss_data = forward_backward_func( forward_step_func=forward_step_func, data_iterator=data_iterator, @@ -1562,7 +1579,7 @@ def evaluate(forward_step_func, def evaluate_and_print_results(prefix, forward_step_func, data_iterator, model, iteration, process_non_loss_data_func, config, - verbose=False, write_to_tensorboard=True): + verbose=False, write_to_tensorboard=True, non_loss_data_func=None): """Helper function to evaluate and dump results on screen.""" args = get_args() if write_to_tensorboard: @@ -1574,7 +1591,7 @@ def evaluate_and_print_results(prefix, forward_step_func, total_loss_dict, collected_non_loss_data, timelimit = evaluate( forward_step_func, data_iterator, model, - process_non_loss_data_func, config, verbose) + process_non_loss_data_func, config, verbose, non_loss_data_func) # Timelimit hit during evaluation if timelimit: return diff --git a/megatron/training/yaml_arguments.py b/megatron/training/yaml_arguments.py index f81d4dee5d..3c6c39b07f 100644 --- a/megatron/training/yaml_arguments.py +++ b/megatron/training/yaml_arguments.py @@ -16,7 +16,7 @@ import torch.nn.functional as F -from megatron.core.transformer import TransformerConfig +from megatron.core.transformer import TransformerConfig, MLATransformerConfig # Taken from https://stackoverflow.com/questions/65414773/parse-environment-variable-from-yaml-with-pyyaml # Allows for yaml to use environment variables @@ -442,7 +442,10 @@ def squared_relu(x): kw_args['scaled_init_method'] = torch.nn.init.xavier_uniform_ # Return Transformer config. 
- return TransformerConfig(**kw_args) + if getattr(args, "multi_latent_attention", False): + return MLATransformerConfig(**kw_args) + else: + return TransformerConfig(**kw_args) def load_yaml(yaml_path): print(f"warning using experimental yaml arguments feature, argparse arguments will be ignored") diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 0bd85b76e1..3b7f8db012 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -73,9 +73,9 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat transformer_layer_spec = import_module(args.spec) else: if use_te: - transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm, args.fp8) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention, args.fp8) else: - transformer_layer_spec = get_gpt_layer_local_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm) + transformer_layer_spec = get_gpt_layer_local_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention) build_model_context = nullcontext build_model_context_args = {} @@ -105,7 +105,8 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, position_embedding_type=args.position_embedding_type, rotary_percent=args.rotary_percent, - rotary_base=args.rotary_base + rotary_base=args.rotary_base, + rope_scaling=args.use_rope_scaling ) return model diff --git a/pretrain_vlm.py b/pretrain_vlm.py index 7777603e53..6b1848e96c 100644 --- a/pretrain_vlm.py +++ b/pretrain_vlm.py @@ -46,10 +46,12 @@ def model_provider( model (megatron.core.models.multimodal.llava_model.LLaVAModel): A multimodal model """ args = get_args() + vision_model_type = "clip" num_image_embeddings = get_num_image_embeddings( - args.img_h, args.img_w, 
args.patch_dim, args.disable_vision_class_token, 1 + args.img_h, args.img_w, args.patch_dim, vision_model_type, args.disable_vision_class_token, 1 ) + old_seq_length = args.seq_length # decoder_seq_length denotes the language model sequence length. args.decoder_seq_length = args.seq_length + num_image_embeddings @@ -87,6 +89,7 @@ def model_provider( vision_transformer_config.num_layers = args.encoder_num_layers vision_transformer_config.first_pipeline_num_layers = None vision_transformer_config.last_pipeline_num_layers = None + vision_transformer_config.vision_model_type = vision_model_type vision_projection_type = "mlp" vision_projection_config = deepcopy(language_transformer_config) @@ -128,6 +131,7 @@ def model_provider( parallel_output=parallel_output, language_position_embedding_type=args.position_embedding_type, language_rotary_percent=args.rotary_percent, + language_rope_scaling=args.use_rope_scaling, pre_process=pre_process, post_process=post_process, add_encoder=add_encoder, @@ -137,6 +141,12 @@ def model_provider( patch_dim=args.patch_dim, ) + model.freeze( + freeze_language_model=args.freeze_LM, + freeze_vision_model=args.freeze_ViT, + freeze_vision_projection=False, + ) + return model @@ -270,7 +280,18 @@ def forward_step(data_iterator, model: LLaVAModel): def add_vlm_extra_args(parser): """Extra arguments.""" group = parser.add_argument_group(title='vision language model specific arguments') - group.add_argument("--disable-vision-class-token", action="store_true", default=False) + group.add_argument( + '--freeze-LM', action='store_true', default=False, help="Freeze language model weights" + ) + group.add_argument( + '--freeze-ViT', action='store_true', default=False, help="Freeze vision model (ViT) weights" + ) + group.add_argument( + "--disable-vision-class-token", + action="store_true", + default=False, + help="Drop vision model class token", + ) return parser diff --git a/tests/functional_tests/jet_recipes/_build-mcore.yaml 
b/tests/functional_tests/jet_recipes/_build-mcore.yaml new file mode 100644 index 0000000000..81b38b69ce --- /dev/null +++ b/tests/functional_tests/jet_recipes/_build-mcore.yaml @@ -0,0 +1,11 @@ +type: build +format_version: 1 +maintainers: [maanug] +spec: + name: mcore-pyt + platforms: [linux/amd64] + source: + # The image tag will be added via `jet-tests.yaml` + # Tags are one of {buildcache, $CI_PIPELINE_ID} + image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci + \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/_build-nemo.yaml b/tests/functional_tests/jet_recipes/_build-nemo.yaml new file mode 100644 index 0000000000..eb2b318ab5 --- /dev/null +++ b/tests/functional_tests/jet_recipes/_build-nemo.yaml @@ -0,0 +1,10 @@ +type: build +format_version: 1 +maintainers: [maanug] +spec: + name: mcore-nemo + platforms: [linux/amd64] + source: + # The image tag will be added via `jet-tests.yaml` + # Tags are one of {buildcache, $CI_PIPELINE_ID} + image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/_build-pyt.yaml b/tests/functional_tests/jet_recipes/_build-pyt.yaml deleted file mode 100644 index d24836e44c..0000000000 --- a/tests/functional_tests/jet_recipes/_build-pyt.yaml +++ /dev/null @@ -1,23 +0,0 @@ -type: build -format_version: 1 -maintainers: [maanug] -spec: - name: mcore-pyt - platforms: [linux/amd64] - source: - # The image tag will be added via `jet-tests.yaml` - # Tags are one of {buildcache, $CI_PIPELINE_ID} - image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci - - ---- -type: build -format_version: 1 -maintainers: [maanug] -spec: - name: mcore-nemo - platforms: [linux/amd64] - source: - # The image tag will be added via `jet-tests.yaml` - # Tags are one of {buildcache, $CI_PIPELINE_ID} - image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/bert.yaml 
b/tests/functional_tests/jet_recipes/bert.yaml index 75aac2faab..088436e8ea 100644 --- a/tests/functional_tests/jet_recipes/bert.yaml +++ b/tests/functional_tests/jet_recipes/bert.yaml @@ -13,7 +13,7 @@ spec: /workspace/data/bert_data: text/the_pile/bert_shard00 script: |- ls - cd /workspace/megatron-lm + cd /opt/megatron-lm ARGUMENTS=( "DATA_PATH=/workspace/data/bert_data" @@ -32,7 +32,7 @@ products: time_limit: [12000] test_case: - bert_mr_mcore_tp2_pp2_dgx_a100_1N8G - - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G + # - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G - bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G - bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G - bert_mr_tp1_pp4_vp2_dgx_a100_1N8G diff --git a/tests/functional_tests/jet_recipes/gpt-nemo.yaml b/tests/functional_tests/jet_recipes/gpt-nemo.yaml index 87a6fb2c23..f14d2f0afa 100644 --- a/tests/functional_tests/jet_recipes/gpt-nemo.yaml +++ b/tests/functional_tests/jet_recipes/gpt-nemo.yaml @@ -16,8 +16,8 @@ spec: cd /opt/NeMo ARGUMENTS=( - "DATA_PATH=''" - "DATA_CACHE_PATH=''" + "DATA_PATH='-'" + "DATA_CACHE_PATH='-'" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/tensorboard" "CHECKPOINT_PATH=/workspace/checkpoints" diff --git a/tests/functional_tests/jet_recipes/gpt.yaml b/tests/functional_tests/jet_recipes/gpt.yaml index abaef86b81..8c09d0bd13 100644 --- a/tests/functional_tests/jet_recipes/gpt.yaml +++ b/tests/functional_tests/jet_recipes/gpt.yaml @@ -12,7 +12,7 @@ spec: /workspace/data/gpt3_data: text/the_pile/shard00 script: |- ls - cd /workspace/megatron-lm + cd /opt/megatron-lm ARGUMENTS=( "DATA_PATH=/workspace/data/gpt3_data" diff --git a/tests/functional_tests/jet_recipes/multimodal-llava.yaml b/tests/functional_tests/jet_recipes/multimodal-llava.yaml index 7a20b1145a..3149f5664f 100644 --- a/tests/functional_tests/jet_recipes/multimodal-llava.yaml +++ b/tests/functional_tests/jet_recipes/multimodal-llava.yaml @@ -13,11 +13,11 @@ spec: scope: null 
script: |- ls - cd /workspace/megatron-lm + cd /opt/megatron-lm ARGUMENTS=( - "DATA_PATH=''" - "DATA_CACHE_PATH=''" + "DATA_PATH='-'" + "DATA_CACHE_PATH='-'" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/tensorboard" "CHECKPOINT_PATH=/workspace/checkpoints" diff --git a/tests/functional_tests/jet_recipes/t5.yaml b/tests/functional_tests/jet_recipes/t5.yaml index 947023b0eb..dbbbc508d2 100644 --- a/tests/functional_tests/jet_recipes/t5.yaml +++ b/tests/functional_tests/jet_recipes/t5.yaml @@ -13,7 +13,7 @@ spec: /workspace/data/t5_data: text/the_pile/t5_shard00 script: |- ls - cd /workspace/megatron-lm + cd /opt/megatron-lm ARGUMENTS=( "DATA_PATH=/workspace/data/t5_data" @@ -31,6 +31,12 @@ products: - scope: [mr] time_limit: [12000] test_case: + - t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G + - t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G + - t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G + - t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G + - t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G + - t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G - t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G - t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G - scope: [weekly] diff --git a/tests/functional_tests/python_test_utils/jet/common.py b/tests/functional_tests/python_test_utils/jet/common.py new file mode 100644 index 0000000000..5ee31bc232 --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/common.py @@ -0,0 +1,140 @@ +import copy +import itertools +import pathlib +from typing import List, Optional + +import jetclient +import yaml + +BASE_PATH = pathlib.Path(__file__).parent.resolve() + + +def flatten_products( + workload_manifest: jetclient.JETWorkloadManifest, +) -> jetclient.JETWorkloadManifest: + """Flattens a nested dict of products""" + workload_manifest.products = [ + dict(zip(inp.keys(), values)) + for inp in workload_manifest.products + for values in itertools.product(*inp.values()) + ] + + return workload_manifest + + +def 
flatten_workload( + workload_manifest: jetclient.JETWorkloadManifest, +) -> List[jetclient.JETWorkloadManifest]: + """Flattens a workload with products into a list of workloads that don't have products.""" + workload_manifest = dict(workload_manifest) + products = workload_manifest.pop("products") + workload_manifests = [] + for product in products: + workload = copy.deepcopy(workload_manifest) + workload['spec'] = {k: v for k, v in workload['spec'] if k not in product.keys()} + workload['spec'] = dict(**dict(workload['spec']), **product) + workload_manifests.append(jetclient.JETWorkloadManifest(**workload)) + return workload_manifests + + +def load_config(config_path: str) -> jetclient.JETWorkloadManifest: + """Loads and parses a yaml file into a JETWorkloadManifest""" + with open(config_path) as stream: + try: + return jetclient.JETWorkloadManifest(**yaml.safe_load(stream)) + except yaml.YAMLError as exc: + raise exc + + +def load_and_flatten(config_path: str) -> List[jetclient.JETWorkloadManifest]: + """Wrapper function for doing all the fun at once.""" + return flatten_workload(flatten_products(load_config(config_path=config_path))) + + +def filter_by_test_case( + workload_manifests: List[jetclient.JETWorkloadManifest], test_case: str +) -> jetclient.JETWorkloadManifest: + """Returns a workload with matching name. 
Raises an error if there no or more than a single workload.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + if workload_manifest.spec.test_case == test_case + ) + + if len(workload_manifests) > 1: + raise ValueError("Duplicate test_case found!") + + if len(workload_manifests) == 0: + raise ValueError("No test_case found!") + + return workload_manifests[0] + + +def filter_by_scope( + workload_manifests: List[jetclient.JETWorkloadManifest], scope: str +) -> List[jetclient.JETWorkloadManifest]: + """Returns all workload with matching scope.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + if workload_manifest.spec.scope == scope + ) + + if len(workload_manifests) == 0: + raise ValueError("No test_case found!") + + return workload_manifests + + +def filter_by_model( + workload_manifests: List[jetclient.JETWorkloadManifest], model: str +) -> List[jetclient.JETWorkloadManifest]: + """Returns all workload with matching model.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + if workload_manifest.spec.model == model + ) + + if len(workload_manifests) == 0: + raise ValueError("No test_case found!") + + return workload_manifests + + +def load_workloads( + container_tag: str, + scope: Optional[str] = None, + model: Optional[str] = None, + test_case: Optional[str] = None, + container_image: Optional[str] = None, +) -> List[jetclient.JETWorkloadManifest]: + """Return all workloads from disk that match scope and platform.""" + recipes_dir = BASE_PATH / ".." / ".." / "jet_recipes" + local_dir = BASE_PATH / ".." / ".." 
/ "local_recipes" + + workloads: List[jetclient.JETWorkloadManifest] = [] + build_workloads: List[jetclient.JETClient] = [] + for file in list(recipes_dir.glob("*.yaml")) + list(local_dir.glob("*.yaml")): + workloads += load_and_flatten(config_path=file) + if file.stem.startswith("_build"): + build_workloads.append(load_config(config_path=file)) + + if scope: + workloads = filter_by_scope(workload_manifests=workloads, scope=scope) + + if model: + workloads = filter_by_model(workload_manifests=workloads, model=model) + + if test_case: + workloads = [filter_by_test_case(workload_manifests=workloads, test_case=test_case)] + + for workload in list(workloads): + for build_workload in build_workloads: + if ( + workload.spec.build == build_workload.spec.name + ) and build_workload not in workloads: + container_image = container_image or build_workload.spec.source.image + build_workload.spec.source.image = f"{container_image}:{container_tag}" + workloads.append(build_workload) + return workloads diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py new file mode 100644 index 0000000000..c7338d3181 --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py @@ -0,0 +1,113 @@ +import pathlib +from typing import Optional + +import click +import yaml + +from tests.functional_tests.python_test_utils.jet import common + +BASE_PATH = pathlib.Path(__file__).parent.resolve() + + +@click.command() +@click.option("--scope", required=True, type=str, help="Test scope") +@click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on") +@click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on") +@click.option("--output-path", required=True, type=str, help="Path to write GitLab job to") +@click.option("--container-image", required=True, type=str, help="LTS Container tag to use") 
+@click.option("--container-image-dev", required=True, type=str, help="Dev Container tag to use") +@click.option("--container-tag", required=True, type=str, help="Container tag to use") +@click.option( + "--run-name", required=False, type=str, help="Run name (only relevant for release tests)" +) +@click.option( + "--wandb-experiment", + required=False, + type=str, + help="Wandb experiment (only relevant for release tests)", +) +def main( + scope: str, + a100_cluster: str, + h100_cluster: str, + output_path: str, + container_image: str, + container_image_dev: str, + container_tag: str, + run_name: Optional[str] = None, + wandb_experiment: Optional[str] = None, +): + test_cases = [ + test_case + for test_case in common.load_workloads(scope=scope, container_tag=container_tag) + if test_case.type != "build" + ] + + gitlab_pipeline = { + "stages": list(set([test_case.spec.model for test_case in test_cases])), + "default": {"interruptible": True}, + } + + for test_case in test_cases: + if test_case.spec.platforms == "dgx_a100": + cluster = a100_cluster + elif test_case.spec.platforms == "dgx_h100": + cluster = h100_cluster + else: + raise ValueError(f"Platform {test_case.spec.platforms} unknown") + + script = [ + "export PYTHONPATH=$(pwd); " + "python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py", + f"--model {test_case.spec.model}", + f"--test-case {test_case.spec.test_case}", + f"--container-tag {container_tag}", + f"--cluster {cluster}", + ] + + with open( + pathlib.Path( + BASE_PATH + / ".." + / ".." 
+ / "test_cases" + / test_case.spec.model + / test_case.spec.test_case + / "model_config.yaml" + ) + ) as stream: + try: + test_case_dict = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + + if 'EXPERIMENTAL' in test_case_dict and test_case_dict['EXPERIMENTAL']: + script.append(f"--container-image {container_image_dev}") + + if run_name is not None and wandb_experiment is not None: + script.append(f"--run-name {run_name}") + test_case.spec.model + script.append( + f"--wandb-experiment {wandb_experiment}-{test_case.spec.model}-{test_case.spec.test_case}" + ) + + gitlab_pipeline[test_case.spec.test_case] = { + "stage": f"{test_case.spec.model}", + "image": f"{container_image}:{container_tag}", + "tags": ["mcore-docker-node-jet"], + "rules": [ + {"if": '$CI_PIPELINE_SOURCE == "parent_pipeline"'}, + {"if": '$CI_MERGE_REQUEST_ID'}, + ], + "timeout": "7 days", + "needs": [{"pipeline": '$PARENT_PIPELINE_ID', "job": "jet-generate"}], + "script": [" ".join(script)], + "artifacts": {"paths": ["results/"]}, + } + + with open(output_path, 'w') as outfile: + yaml.dump(gitlab_pipeline, outfile, default_flow_style=False) + + +if __name__ == "__main__": + main() diff --git a/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py b/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py new file mode 100644 index 0000000000..bc9ad22302 --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py @@ -0,0 +1,62 @@ +"""Generate launch scripts for local execution. + +This script allows to generate pre-filled launch scripts that allow for local execution of Megatron-LM functional tests inside containerized enviroments (i.e. Slurm enroot or Docker). + +This script will generate scripts into `$(pwd)/test_cases`. 
+""" + +import pathlib +from typing import Optional + +import click +import jetclient +import yaml + +from tests.functional_tests.python_test_utils.jet import common + + +def load_script(config_path: str) -> str: + with open(config_path) as stream: + try: + jetclient.JETWorkloadManifest(**yaml.safe_load(stream)).spec.script + except yaml.YAMLError as exc: + raise exc + + +@click.command() +@click.option("--model", required=False, type=str, help="Filters all tests by matching model") +@click.option("--scope", required=False, type=str, help="Filters all tests by matching scope") +@click.option( + "--test-case", required=False, type=str, help="Returns a single test-case with matching name." +) +@click.option( + "--output-path", + required=True, + type=str, + help="Directory where the functional test will write its artifacts to (Tensorboard logs)", + default="/opt/megatron-lm", +) +def main(model: Optional[str], scope: Optional[str], test_case: Optional[str], output_path: str): + workloads = common.load_workloads( + container_image='none', scope=scope, model=model, test_case=test_case, container_tag='none' + ) + + for workload in workloads: + if workload.type == "build": + continue + magic_values = dict(workload.spec) + magic_values["assets_dir"] = output_path + + file_path = ( + pathlib.Path.cwd() + / "test_cases" + / workload.spec.model + / f"{workload.spec.test_case}.sh" + ) + file_path.parent.mkdir(parents=True, exist_ok=True) + with open(file_path, "w", encoding="utf-8") as fh: + fh.write(workload.spec.script.format(**magic_values)) + + +if __name__ == "__main__": + main() diff --git a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py new file mode 100644 index 0000000000..3e243c542a --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py @@ -0,0 +1,216 @@ +import os +import pathlib +import re +import signal +import sys +import tempfile +from 
typing import List, Optional, Tuple + +import click +import jetclient +import yaml +from jetclient.services.dtos.pipeline import PipelineStatus + +from tests.functional_tests.python_test_utils.jet import common + +BASE_PATH = pathlib.Path(__file__).parent.resolve() + + +def resolve_cluster_config(cluster: str) -> str: + if cluster == "dgxh100_eos": + return "mcore/eos" + if cluster == "dgxa100_dracooci": + return "mcore/draco-oci" + if cluster == "dgxa100_dracooci-ord": + return "mcore/draco-oci-ord" + if cluster == "dgxh100_coreweave": + return "mcore/coreweave" + raise ValueError(f"Unknown cluster {cluster} provided.") + + +def register_pipeline_terminator(pipeline: jetclient.JETPipeline): + def sigterm_handler(_signo, _stack_frame): + print(f"Trying to terminate pipeline {pipeline.jet_id}") + pipeline.cancel() + print(f"Pipeline {pipeline.jet_id} terminated") + sys.exit(0) + + signal.signal(signal.SIGINT, sigterm_handler) + signal.signal(signal.SIGTERM, sigterm_handler) + + +def launch_and_wait_for_completion( + test_case: str, + container_image: str, + container_tag: str, + cluster: str, + account: str, + run_name: Optional[str], + wandb_experiment: Optional[str], +) -> jetclient.JETPipeline: + pipeline = jetclient.JETClient( + customer='mcore', gitlab_ci_token=os.getenv("RO_API_TOKEN"), env="prod" + ).workloads.submit( + workloads=common.load_workloads( + test_case=test_case, container_image=container_image, container_tag=container_tag + ), + config_id=resolve_cluster_config(cluster), + custom_config={ + "launchers": {cluster: {"account": account}}, + "executors": { + "jet-ci": { + "environments": { + cluster: { + "variables": { + "RUN_NAME": run_name or "", + "WANDB_API_KEY": os.getenv("WANDB_API_KEY") or "", + "WANDB_EXPERIMENT": wandb_experiment or "", + } + } + } + } + }, + }, + wait_for_validation=True, + ) + + register_pipeline_terminator(pipeline=pipeline) + + print( + f"Pipeline triggered; inspect it here: 
https://gitlab-master.nvidia.com/dl/jet/ci/-/pipelines/{pipeline.jet_id}", + flush=True, + ) + + pipeline.wait(max_wait_time=60 * 60 * 24 * 7) + print(f"Pipeline terminated; status: {pipeline.get_status()}") + return pipeline + + +def download_job_assets(job: jetclient.JETJob, iteration: int = 0) -> List[str]: + logs = job.get_logs() + if not logs: + return [""] + + assets_base_path = BASE_PATH / ".." / ".." / ".." / ".." / "results" / f"iteration={iteration}" + + for restart_idx, log in enumerate(logs): + assets = log.get_assets() + assets_path = assets_base_path / f"restart={restart_idx}" + assets_path.mkdir(parents=True, exist_ok=True) + for log_filename in assets.keys(): + with open(assets_path / log_filename, "w") as fh: + assets[log_filename].download(pathlib.Path(fh.name)) + + +def download_job_logs(job: jetclient.JETJob) -> List[str]: + logs = job.get_logs() + if not logs: + return [""] + + assets = logs[0].get_assets() + log_filename = [key for key in assets.keys() if key.endswith(".log")][0] + + with tempfile.NamedTemporaryFile() as tmp_file: + assets[log_filename].download(pathlib.Path(tmp_file.name)) + with open(pathlib.Path(tmp_file.name), "r") as fh: + return fh.readlines() + + +def parse_iterations_from_logs(logs: List[str]) -> Optional[Tuple[int, int]]: + for log_row in logs[::-1]: + match = re.search(r"iteration\s+(\d+)\s*/\s*(\d+)", log_row) + if match is not None: + return int(match.group(1)), int(match.group(2)) + + +@click.command() +@click.option("--model", required=True, type=str, help="Model") +@click.option("--test-case", required=True, type=str, help="Test case") +@click.option( + "--account", + required=False, + type=str, + help="Slurm account to use", + default="coreai_dlalgo_mcore", +) +@click.option("--cluster", required=True, type=str, help="Cluster to run on") +@click.option("--container-tag", required=True, type=str, help="Base image of Mcore image") +@click.option("--container-image", required=False, type=str, help="Base image of 
Mcore image") +@click.option( + "--run-name", required=False, type=str, help="Run name (only relevant for release tests)" +) +@click.option( + "--wandb-experiment", + required=False, + type=str, + help="Wandb experiment (only relevant for release tests)", +) +def main( + model: str, + test_case: str, + account: str, + cluster: str, + container_tag: str, + container_image: Optional[str] = None, + run_name: Optional[str] = None, + wandb_experiment: Optional[str] = None, +): + + with open( + pathlib.Path( + BASE_PATH / ".." / ".." / "test_cases" / model / test_case / "model_config.yaml" + ) + ) as stream: + try: + test_case_dict = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + + test_type = test_case_dict['TEST_TYPE'] + + if test_type == "release" and (run_name is None or wandb_experiment is None): + print(f"Not all arguments provided ({run_name=}, {wandb_experiment=})") + sys.exit(1) + + n_attempts = 0 + n_iteration = 0 + while True and n_attempts < 3: + pipeline = launch_and_wait_for_completion( + test_case=test_case, + container_image=container_image, + container_tag=container_tag, + cluster=cluster, + account=account, + run_name=run_name, + wandb_experiment=wandb_experiment, + ) + + main_job = [job for job in pipeline.get_jobs() if job.name.startswith("basic")][0] + + logs = download_job_logs(job=main_job) + concat_logs = "\n".join(logs) + print(f"Logs:\n{concat_logs}") + + download_job_assets(job=main_job, iteration=n_iteration) + + if test_type != "release": + success = pipeline.get_status() == PipelineStatus.SUCCESS + sys.exit(int(not success)) # invert for exit 0 + + parsed_result = parse_iterations_from_logs(logs=logs) + if not parsed_result: + print("Weird log, no iterations found") + n_attempts += 1 + continue + + current_iteration, total_iterations = parsed_result + if current_iteration == total_iterations: + + success = pipeline.get_status() == PipelineStatus.SUCCESS + sys.exit(int(not success)) # invert for exit 0 + n_iteration += 
1 + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tests/functional_tests/shell_test_utils/_run_training.sh b/tests/functional_tests/shell_test_utils/_run_training.sh index d43a3af77f..12dd359c65 100644 --- a/tests/functional_tests/shell_test_utils/_run_training.sh +++ b/tests/functional_tests/shell_test_utils/_run_training.sh @@ -41,15 +41,9 @@ done cat $TRAINING_PARAMS_PATH | envsubst >$TRAINING_PARAMS_PATH.tmp mv $TRAINING_PARAMS_PATH.tmp $TRAINING_PARAMS_PATH -# Run before script -SCRIPT=$(cat $TRAINING_PARAMS_PATH | yq '.BEFORE_SCRIPT') -if [[ "$SCRIPT" != null ]]; then - eval "$SCRIPT" -fi; - # Pull env vars to export ENV_VARS=$(yq '... comments="" | .ENV_VARS | to_entries | .[] | [.key + "=" + .value] | join(" ")' $TRAINING_PARAMS_PATH) -for ARGUMENT in $ENV_VARS; do +while IFS= read -r ARGUMENT; do KEY=$(echo $ARGUMENT | cut -f1 -d=) KEY_LENGTH=${#KEY} @@ -57,7 +51,13 @@ for ARGUMENT in $ENV_VARS; do export "$KEY"="$VALUE" echo "$KEY=$VALUE" -done +done <<< "$ENV_VARS" + +# Run before script +SCRIPT=$(cat $TRAINING_PARAMS_PATH | yq '.BEFORE_SCRIPT') +if [[ "$SCRIPT" != null ]]; then + eval "$SCRIPT" +fi; # Exit earlier to leave time for properly saving checkpoint if [[ $(echo "$TRAINING_SCRIPT_PATH" | tr '[:upper:]' '[:lower:]') == *nemo* ]]; then diff --git a/tests/functional_tests/shell_test_utils/notify.sh b/tests/functional_tests/shell_test_utils/notify.sh index 277d46add1..1bb2ea5c3c 100644 --- a/tests/functional_tests/shell_test_utils/notify.sh +++ b/tests/functional_tests/shell_test_utils/notify.sh @@ -1,6 +1,6 @@ set -euxo pipefail -collect_jet_jobs () { +collect_jobs () { PAGE=1 PER_PAGE=100 RESULTS="[]" @@ -11,7 +11,7 @@ collect_jet_jobs () { -s \ --globoff \ --header "PRIVATE-TOKEN: $RO_API_TOKEN" \ - "https://${GITLAB_ENDPOINT}/api/v4/projects/70847/pipelines/${JET_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE" + 
"https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE" ) # Combine the results RESULTS=$(jq -s '.[0] + .[1]' <<< "$RESULTS $RESPONSE") @@ -85,31 +85,16 @@ if [[ $DOWNSTREAM_PIPELINE_ID == null ]]; then else set +x - JET_PIPELINE_JSON=$(curl \ - --fail \ - --silent \ - --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ - "https://${GITLAB_ENDPOINT}/api/v4/projects/70847/pipelines/${DOWNSTREAM_PIPELINE_ID}/bridges?per_page=100" - ) + JOBS=$(echo "$(collect_jobs)" | jq '[.[] | {id, name, status}]') + echo $JOBS set -x - JET_PIPELINE_ID=$(jq '.[0].downstream_pipeline.id' <<< "$JET_PIPELINE_JSON") - set +x - JET_LOGS=$(echo "$(collect_jet_jobs)" \ - | jq '[ - .[] - | select(.name | startswith("build/") | not) - | select(.name | contains("3 logs_after") | not) - | select(.name | contains("1 logs_before") | not) - ]' - ) - - FAILED_JET_LOGS=$(echo "$JET_LOGS" \ + FAILED_JOBS=$(echo "$JOBS" \ | jq --arg GITLAB_ENDPOINT "$GITLAB_ENDPOINT" '[ .[] | select(.status != "success") | { - "name": (.name[6:] | split(" ")[0]), + name, id, "url": ("https://" + $GITLAB_ENDPOINT + "/dl/jet/ci/-/jobs/" + (.id | tostring)), } @@ -117,29 +102,34 @@ else ) set -x - for row in $(echo "${FAILED_JET_LOGS}" | jq -r '.[] | @base64'); do + for row in $(echo "${FAILED_JOBS}" | jq -r '.[] | @base64'); do _jq() { echo ${row} | base64 --decode | jq -r ${1} } JOB_ID=$(_jq '.id') - SLURM_FAILURE=$(jet \ - -c -df json -th logs query --raw \ - -c "obj_status.s_message" \ - --eq obj_ci.l_job_id "$JOB_ID" \ - | jq '.[0].obj_status.s_message' \ - | tr -d '"' - ) - FAILED_JET_LOGS=$(echo "$FAILED_JET_LOGS" \ - | jq \ - --argjson JOB_ID "$JOB_ID" \ - --arg SLURM_FAILURE "$SLURM_FAILURE" ' - .[] |= ((select(.id==$JOB_ID) += { - "slurm_failure_reason": $SLURM_FAILURE})) - ') + FULL_LOG=$(curl \ + --location \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/jobs/${JOB_ID}/trace") + 
+ if [[ "$FULL_LOG" == *exception* ]]; then + LAST_EXCEPTION_POS=$(echo "$FULL_LOG" | grep -o -b 'exception' | tail -1 | cut -d: -f1) + SHORT_LOG=${FULL_LOG:$LAST_EXCEPTION_POS-500:499} + else + SHORT_LOG=${FULL_LOG: -1000} + fi + + FAILED_JOBS=$(echo "$FAILED_JOBS" \ + | jq \ + --argjson JOB_ID "$JOB_ID" \ + --arg SLURM_FAILURE "$SHORT_LOG" ' + .[] |= ((select(.id==$JOB_ID) += { + "slurm_failure_reason": $SLURM_FAILURE})) + ') done - NUM_FAILED=$(echo "$FAILED_JET_LOGS" | jq 'length') - NUM_TOTAL=$(echo "$JET_LOGS" | jq 'length') + NUM_FAILED=$(echo "$FAILED_JOBS" | jq 'length') + NUM_TOTAL=$(echo "$JOBS" | jq 'length') if [[ $NUM_FAILED -eq 0 ]]; then BLOCKS='[ @@ -152,7 +142,7 @@ else } ]' else - BLOCKS=$(echo -e "$FAILED_JET_LOGS" \ + BLOCKS=$(echo "$FAILED_JOBS" \ | jq --arg DATE "$DATE" --arg CONTEXT "$CONTEXT" --arg URL "$PIPELINE_URL" --arg NUM_FAILED "$NUM_FAILED" --arg NUM_TOTAL "$NUM_TOTAL" ' [ { @@ -170,7 +160,7 @@ else "type": "mrkdwn", "text": ( "• Job: <" +.url + "|" + .name + ">" - + "\n SLURM failure reason: \n```" + .slurm_failure_reason[-2000:] + "```" + + "\n SLURM failure reason: \n```" + .slurm_failure_reason + "```" ) } diff --git a/tests/functional_tests/shell_test_utils/notify_unit_tests.sh b/tests/functional_tests/shell_test_utils/notify_unit_tests.sh new file mode 100644 index 0000000000..46be8b078e --- /dev/null +++ b/tests/functional_tests/shell_test_utils/notify_unit_tests.sh @@ -0,0 +1,186 @@ +set -euxo pipefail + +collect_jobs () { + PAGE=1 + PER_PAGE=100 + RESULTS="[]" + + while true; do + # Fetch the paginated results + RESPONSE=$(curl \ + -s \ + --globoff \ + --header "PRIVATE-TOKEN: $RO_API_TOKEN" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE" + ) + # Combine the results + RESULTS=$(jq -s '.[0] + .[1]' <<< "$RESULTS $RESPONSE") + + # Check if there are more pages + if [[ $(jq 'length' <<< "$RESPONSE") -lt $PER_PAGE ]]; then + break + fi + + 
# Increment the page number + PAGE=$((PAGE + 1)) + done + + echo "$RESULTS" +} + +CI_PIPELINE_ID=${1:-16595865} +CI_PROJECT_ID=${CI_PROJECT_ID:-19378} +PIPELINE_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/pipelines/$CI_PIPELINE_ID +JOB_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/jobs/ +CONTEXT="unit-tests-extended" + +# Fetch Elastic logs +set +x +PIPELINE_JSON=$(curl \ + --fail \ + --silent \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs" + ) || ret_code=$? +set -x +if [[ ${ret_code:-0} -ne 0 ]]; then + echo CI_PIPELINE_ID=$CI_PIPELINE_ID does not exist + exit 1 +fi + +UNIT_TESTS_JOBS=$(echo -E $PIPELINE_JSON | jq '[.[] | select(.name | startswith("unit_tests"))]') + +if [[ $UNIT_TESTS_JOBS == null ]]; then + FAILED_JOBS=$(curl \ + --fail \ + --silent \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs?per_page=100" \ + | jq --arg JOB_URL "$JOB_URL" '[.[] | select(.status == "failed") | ("<" + $JOB_URL + (.id | tostring) + "|" + .name + ">")] | join("\n• Job: ")' | tr -d '"') + curl \ + -X POST \ + -H "Content-type: application/json" \ + --data ' + { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "<'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>:\n" + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "\n• Job: '"$FAILED_JOBS"'" + } + }, + ] + + }' \ + $WEBHOOK_URL + +else + FAILED_JOBS=$(echo -E "$UNIT_TESTS_JOBS" \ + | jq --arg GITLAB_ENDPOINT "$GITLAB_ENDPOINT" --arg JOB_URL "$JOB_URL" '[ + .[] + | select(.status != "success") + | { + name, + id, + "url": ($JOB_URL + (.id | tostring)), + } + ]' + ) + set -x + + for row in $(echo "${FAILED_JOBS}" | jq -r '.[] | @base64'); do + _jq() { + echo ${row} | base64 --decode | jq -r ${1} + } + JOB_ID=$(_jq '.id') + FULL_LOG=$(curl \ + --location \ + --header 
"PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/jobs/${JOB_ID}/trace") + + if [[ "$FULL_LOG" == *exception* ]]; then + LAST_EXCEPTION_POS=$(echo "$FULL_LOG" | grep -o -b 'exception' | tail -1 | cut -d: -f1) + SHORT_LOG=${FULL_LOG:$LAST_EXCEPTION_POS-500:499} + else + SHORT_LOG=${FULL_LOG: -1000} + fi + + FAILED_JOBS=$(echo "$FAILED_JOBS" \ + | jq \ + --argjson JOB_ID "$JOB_ID" \ + --arg SLURM_FAILURE "$SHORT_LOG" ' + .[] |= ((select(.id==$JOB_ID) += { + "slurm_failure_reason": $SLURM_FAILURE})) + ') + done + + NUM_FAILED=$(echo "$FAILED_JOBS" | jq 'length') + NUM_TOTAL=$(echo "$UNIT_TESTS_JOBS" | jq 'length') + + if [[ $NUM_FAILED -eq 0 ]]; then + BLOCKS='[ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":doge3d: <'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>: All '$NUM_TOTAL' passed" + } + } + ]' + else + BLOCKS=$(echo "$FAILED_JOBS" \ + | jq --arg DATE "$DATE" --arg CONTEXT "$CONTEXT" --arg URL "$PIPELINE_URL" --arg NUM_FAILED "$NUM_FAILED" --arg NUM_TOTAL "$NUM_TOTAL" ' + [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": (":doctorge: <" + $URL + "|Report of " + $DATE + " (" + $CONTEXT + ")>: " + $NUM_FAILED + " of " + $NUM_TOTAL + " failed") + } + } + ] + [ + .[] + | { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ( + "• Job: <" +.url + "|" + .name + ">" + + "\n SLURM failure reason: \n```" + .slurm_failure_reason + "```" + + ) + } + } + ] + [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ("===============================================") + } + } + ]' + ) + fi + + for row in $(echo "${BLOCKS}" | jq -r '.[] | @base64'); do + _jq() { + echo ${row} | base64 --decode + } + + curl \ + -X POST \ + -H "Content-type: application/json" \ + --data '{"blocks": '["$(_jq)"]'}' \ + $WEBHOOK_URL + done + +fi \ No newline at end of file diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh 
b/tests/functional_tests/shell_test_utils/run_ci_test.sh index 7578d25c2d..c9c16b43c6 100644 --- a/tests/functional_tests/shell_test_utils/run_ci_test.sh +++ b/tests/functional_tests/shell_test_utils/run_ci_test.sh @@ -4,11 +4,11 @@ set -exo pipefail echo "------ARGUMENTS LIST --------" for ARGUMENT in "$@"; do + echo $ARGUMENT KEY=$(echo $ARGUMENT | cut -f1 -d=) KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - + VALUE=$(eval echo ${ARGUMENT:$KEY_LENGTH+1}) export "$KEY"="$VALUE" echo "$KEY=$VALUE" done diff --git a/tests/functional_tests/shell_test_utils/run_ci_test_locally.sh b/tests/functional_tests/shell_test_utils/run_ci_test_locally.sh deleted file mode 100644 index 3ee776ce9b..0000000000 --- a/tests/functional_tests/shell_test_utils/run_ci_test_locally.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/bin/bash - -####################################################################################### -# -# Script for capturing a reference model. -# -# It will train a model until a target iteration was hit. -# -# -######################################################################################## - -set -exo pipefail - -echo "------ARGUMENTS LIST --------" -for ARGUMENT in "$@"; do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" - echo "$KEY=$VALUE" -done -echo "---------------------------------" - -# Check that mandatory vars are set -MANDATORY_VARS=( - "MODEL" - "VARIANT" - "TRAINING_SCRIPT_PATH" - "OUTPUT_PATH" - "IMAGE_TAG" - "NODES" - "PPP" - "PARTITION" - "ITERATIONS" - "WANDB_API_KEY" - "CLUSTER" - "DATASET" - "WANDB_EXPERIMENT" - "GPUS_PER_NODE" -) -for mandatory_var in "${MANDATORY_VARS[@]}"; do - if [[ -z "${!mandatory_var}" ]]; then - echo 'Providing $'$mandatory_var' is mandatory.' 
- exit 1 - fi -done - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -ROOT_DIR=$(realpath $SCRIPT_DIR/../../../) - -# Fetch dataset base path via JET and refresh DATA_BELDN -DATA_PATH=$(jet -c -tf plain -th artifacts registry list -c storages.$CLUSTER.identifier -f "key == '$DATASET'") -DATA_BLEND=$(eval echo "$DATA_BLEND") - -######################################################################################## -# Dont change below -######################################################################################## - -SLURM_LOGS=$OUTPUT_PATH/slurm_logs/ -mkdir -p $SLURM_LOGS - -# Container settings -ARGUMENTS=( - "TRAINING_SCRIPT_PATH=${TRAINING_SCRIPT_PATH}" - "TEST_CASE_PATH=./tests/functional_tests/test_cases/$MODEL/$VARIANT" - "OUTPUT_PATH=${OUTPUT_PATH}" - "TENSORBOARD_PATH=${OUTPUT_PATH}/tensorboard" - "CHECKPOINT_PATH=${OUTPUT_PATH}/checkpoints" - "DATA_PATH=${DATA_PATH}" - "DATA_CACHE_PATH=${OUTPUT_PATH}/data-cache" - "WANDB_API_KEY=${WANDB_API_KEY}" - "WANDB_EXPERIMENT=${WANDB_EXPERIMENT}" - "DATA_BLEND=\"${DATA_BLEND}\"" -) - -if [[ -n $LOAD_PATH ]]; then - ARGUMENTS+=("LOAD_PATH=${LOAD_PATH}") -fi - -echo ${ARGUMENTS[@]} - -while : -do - -if [[ $(cat "${OUTPUT_PATH}/checkpoints/latest_checkpointed_iteration.txt" || echo 0) -ge $ITERATIONS ]]; then - break -fi - -# Fire of sbatch -echo '#!/bin/bash' > sbatch.sh - -if [[ $GPUS_PER_NODE != null ]]; then - echo '#SBATCH --gres=gpu:8' >> sbatch.sh -fi -echo "#SBATCH --nodes=$NODES -#SBATCH --account $PPP -#SBATCH --partition $PARTITION -#SBATCH --ntasks-per-node=1 -#SBATCH --time "04:00:00" -#SBATCH --job-name=$PPP:mcore:release:$MODEL -#SBATCH --dependency=singleton -#SBATCH --output=/dev/null -#SBATCH --error=/dev/null -#SBATCH --exclusive - -# Prepare SLURM job -echo "SLURM_JOB_ID=\$SLURM_JOB_ID" > "$SLURM_LOGS/\${SLURM_JOB_ID}.log" - -srun \ - --ntasks-per-node=1 \ - --container-image='gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci:$IMAGE_TAG' \ - 
--container-mounts='${DATA_PATH}:${DATA_PATH},${OUTPUT_PATH}:${OUTPUT_PATH}' \ - --container-workdir=/workspace/megatron-lm \ - bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${ARGUMENTS[@]}>>'$SLURM_LOGS/\${SLURM_JOB_ID}.log' 2>&1" >> sbatch.sh - -set +e -sbatch -W sbatch.sh -set -e -done - -# Write golden values into repo if this run should become a reference -cp $OUTPUT_PATH/golden_values.json > ./golden_values.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml index 941e8b7bdb..bf88792152 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml @@ -25,14 +25,14 @@ MODEL_ARGS: --micro-batch-size: 4 --rampup-batch-size: "384 384 97656250" --global-batch-size: 1152 - --train-samples: 4882812 + --train-samples: 19531250 --manual-gc: true # Transformer Engine args --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml index 941e8b7bdb..9453db100c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml @@ -32,7 +32,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND diff --git 
a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml index ee149b884e..9516076dc6 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml @@ -30,7 +30,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: Llama2Tokenizer --tokenizer-model: ${DATA_PATH}/tokenizer.model --data-path: ${DATA_BLEND} @@ -88,6 +88,7 @@ MODEL_ARGS: --auto-detect-ckpt-format: true --load: ${LOAD_PATH} --save: ${OUTPUT_PATH}/checkpoints + --no-ckpt-fully-parallel-save: true --save-interval: 500 # Add initialization args diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml index 1fe7611a81..585d9bb2c7 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml @@ -33,7 +33,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml index d80246eecd..22607416a3 100644 --- 
a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml @@ -33,7 +33,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml index b2f6983a62..39421a887e 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml @@ -31,7 +31,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: Llama2Tokenizer --tokenizer-model: ${DATA_PATH}/tokenizer.model --data-path: ${DATA_BLEND} @@ -89,6 +89,7 @@ MODEL_ARGS: --auto-detect-ckpt-format: true --load: ${LOAD_PATH} --save: ${OUTPUT_PATH}/checkpoints + --no-ckpt-fully-parallel-save: true --save-interval: 500 # Add initialization args diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json index bd193a724d..f4b39082a6 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json @@ -1 +1 @@ -{"lm loss": 
{"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13442, 9.13256, 9.12852, 9.11273, 9.05533, 9.04358, 8.98427, 8.93519, 8.89295, 8.79396]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3478477.0, 3585145.0, 3475635.0, 3384010.0, 3700478.0, 3480110.0, 3398548.0, 3454436.0, 3425849.0, 3585758.0]},"iteration_timing_avg": 0.2253964705882353} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13495, 9.13325, 9.12905, 9.11323, 9.05401, 9.04233, 8.98255, 8.93258, 8.88937, 8.78788]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3477473.0, 3584371.0, 3475194.0, 3382773.0, 3699802.0, 3478715.0, 3397967.0, 3453615.0, 3424973.0, 3585127.0]},"iteration_timing_avg": 0.2253964705882353} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values.json index de82457c30..03e0dd0e9b 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values.json +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values.json @@ -1 +1 @@ -{"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3558381.0, 3664861.0, 3555505.0, 3463866.0, 3780904.0, 3560200.0, 3478189.0, 3534510.0, 3506002.0, 3665772.0]},"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.16219, 9.16263, 9.15739, 9.1412, 9.09523, 9.07236, 9.01592, 8.96749, 8.92204, 8.8314]}} \ No newline at end of file +{"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3557301.0, 3663955.0, 3555196.0, 3462888.0, 3780083.0, 3559007.0, 3477262.0, 3533752.0, 3505033.0, 3665096.0]},"lm loss": 
{"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.16173, 9.16211, 9.15686, 9.14022, 9.09396, 9.07146, 9.01401, 8.9651, 8.91881, 8.82578]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values.json index 0ce1048997..96f345a702 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values.json +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.19789, 9.20022, 9.19547, 9.17248, 9.11862, 9.10315, 9.0418, 8.98727, 8.9443, 8.84512]},"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3718539.0, 3825032.0, 3715374.0, 3623934.0, 3940675.0, 3720162.0, 3638165.0, 3695121.0, 3666164.0, 3825842.0]}, "iteration_timing_avg": 0.5847132352941178} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.19864, 9.20112, 9.19598, 9.17297, 9.1171, 9.10232, 9.04013, 8.98432, 8.94016, 8.83862]},"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3717564.0, 3824205.0, 3714643.0, 3622971.0, 3939727.0, 3718836.0, 3637293.0, 3694227.0, 3665382.0, 3825257.0]}, "iteration_timing_avg": 0.5847132352941178} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values.json new file mode 100644 index 0000000000..bcff777664 --- /dev/null +++ 
b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [19.39068, 0.66038, 0.65673, 0.66493, 0.65894, 0.6473, 0.65746, 0.64942, 0.66259, 0.65247, 0.65165, 0.64944, 0.81313, 0.65069, 0.64982, 0.65247, 0.65149, 0.65284, 0.64913, 0.6496]}, "forward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3.63253, 0.27412, 0.26777, 0.27338, 0.26922, 0.26445, 0.27043, 0.26308, 0.27178, 0.26246, 0.26565, 0.26691, 0.42095, 0.26741, 0.26653, 0.26546, 0.26547, 0.26403, 0.26266, 0.26606]}, "backward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.0264, 0.24005, 0.23751, 0.24162, 0.24102, 0.23888, 0.24027, 0.23829, 0.24182, 0.24308, 0.24109, 0.23964, 0.23841, 0.24005, 0.23898, 0.23896, 0.24052, 0.23894, 0.24242, 0.23863]}, "forward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [8.32911, 0.07441, 0.07755, 0.07578, 0.07557, 0.07223, 0.0737, 0.07404, 0.07108, 0.07174, 0.07137, 0.07162, 0.07437, 0.07185, 0.07129, 0.07247, 0.0719, 0.07573, 0.07292, 0.07122]}, "forward-send-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.47287, 0.00053, 0.00063, 0.00048, 0.00045, 0.00047, 0.00046, 0.00045, 0.00046, 0.00063, 0.00044, 0.00046, 0.00047, 0.00045, 0.00056, 0.00046, 0.00045, 0.00046, 0.00045, 0.00044]}, "backward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.1444, 0.13179, 0.12767, 0.13592, 0.1279, 0.12912, 0.13033, 0.1328, 0.13106, 0.13249, 0.12957, 0.12877, 0.13334, 0.12829, 0.12815, 0.13128, 0.12985, 0.13117, 0.12901, 0.1277]}, "backward-send-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00065, 0.00056, 0.00066, 0.00067, 0.0006, 0.00059, 0.00064, 0.00067, 0.00068, 0.0006, 0.00056, 0.00058, 0.00059, 0.00056, 0.00064, 0.00058, 0.00049, 0.00079, 0.00081, 
0.0006]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [12.49425, 0.23291, 0.228, 0.22475, 0.22786, 0.22525, 0.22534, 0.22597, 0.23004, 0.22656, 0.22342, 0.22577, 0.38374, 0.22857, 0.22673, 0.22371, 0.22908, 0.23017, 0.23145, 0.23191]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5.02478, 0.00608, 0.00441, 0.00414, 0.0093, 0.00347, 0.00363, 0.00527, 0.0093, 0.00705, 0.00369, 0.00633, 0.00834, 0.00352, 0.0034, 0.00565, 0.00346, 0.00354, 0.00341, 0.0035]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 2e-05, 2e-05, 3e-05, 3e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.47745, 0.00052, 0.00064, 0.00053, 0.00052, 0.0006, 0.00052, 0.00062, 0.00052, 0.00056, 0.00065, 0.00056, 0.00054, 0.00053, 0.00058, 0.00052, 0.00052, 0.00052, 0.00055, 0.00053]}, "all-grads-sync-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.43086, 0.00036, 0.00041, 0.00037, 0.00032, 0.00037, 0.00048, 0.00044, 0.00043, 0.00045, 0.00034, 0.00044, 0.00037, 0.00043, 0.00044, 0.00032, 0.00032, 0.00045, 0.00045, 0.00045]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00053, 0.00034, 0.00032, 0.00033, 0.00034, 0.00031, 0.00033, 0.00035, 0.00032, 0.00033, 0.00036, 0.00035, 0.00033, 0.00033, 0.00034, 0.00035, 0.00033, 0.00034, 0.00032, 0.00035]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.26638, 0.00127, 0.00123, 0.00144, 0.00125, 0.00123, 0.00128, 0.00162, 0.00128, 0.00131, 0.00138, 0.00133, 0.00142, 0.0013, 0.00136, 0.00137, 0.00133, 0.00135, 0.00129, 0.00136]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 100, 
"step_interval": 5, "values": [0.01282, 0.00738, 0.00728, 0.00736, 0.00738, 0.00733, 0.00738, 0.00735, 0.00731, 0.00727, 0.00897, 0.00755, 0.0073, 0.00721, 0.00734, 0.00746, 0.00736, 0.00734, 0.00737, 0.00726]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00984, 0.00108, 0.00105, 0.00108, 0.00105, 0.00105, 0.00107, 0.00104, 0.00105, 0.00106, 0.00106, 0.00105, 0.0012, 0.00106, 0.00105, 0.00105, 0.00105, 0.00106, 0.00104, 0.00106]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0011, 0.00101, 0.00102, 0.00102, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.0015, 0.00102, 0.00101, 0.00101, 0.00102, 0.00268, 0.00101, 0.00101]}, "optimizer-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.29197, 0.01172, 0.01152, 0.01191, 0.01165, 0.01156, 0.0117, 0.01199, 0.01159, 0.01161, 0.0134, 0.01194, 0.01269, 0.01155, 0.01172, 0.01186, 0.01173, 0.01343, 0.01172, 0.01165]}, "learning-rate": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "batch-size": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": 
[10.41489, 9.20451, 8.62156, 8.34435, 8.08472, 7.96931, 7.68116, 7.39495, 7.26108, 7.19145, 7.31028, 7.16653, 7.05979, 6.99436, 6.85568, 6.93225, 6.95525, 7.02522, 6.66561, 6.93924]}, "lm loss vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.41489, 9.20451, 8.62156, 8.34435, 8.08472, 7.96931, 7.68116, 7.39495, 7.26108, 7.19145, 7.31028, 7.16653, 7.05979, 6.99436, 6.85568, 6.93225, 6.95525, 7.02522, 6.66561, 6.93924]}, "loss-scale": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [31.51239, 2.98952, 3.27663, 2.61225, 2.39588, 1.99758, 1.81287, 1.93167, 1.62175, 1.51416, 1.16291, 1.32388, 1.20328, 1.10814, 1.5007, 2.15295, 1.65903, 1.42013, 2.08526, 1.2754]}, "grad-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [31.51239, 2.98952, 3.27663, 2.61225, 2.39588, 1.99758, 1.81287, 1.93167, 1.62175, 1.51416, 1.16291, 1.32388, 1.20328, 1.10814, 1.5007, 2.15295, 1.65903, 1.42013, 2.08526, 1.2754]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [115745.0, 111070.0, 117081.0, 112381.0, 118700.0, 116957.0, 111399.0, 114013.0, 118460.0, 116959.0, 111499.0, 115613.0, 108489.0, 119947.0, 115772.0, 116922.0, 119841.0, 120380.0, 121396.0, 118455.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [115745.0, 111070.0, 117081.0, 112381.0, 118700.0, 116957.0, 111399.0, 114013.0, 118460.0, 116959.0, 111499.0, 115613.0, 108489.0, 119947.0, 115772.0, 116922.0, 119841.0, 120380.0, 121396.0, 118455.0]}, "params-norm": {"start_step": 0, "end_step": 100, 
"step_interval": 5, "values": [309.46707, 309.48447, 309.52603, 309.57944, 309.64523, 309.72018, 309.80231, 309.8884, 309.97391, 310.05591, 310.13483, 310.20755, 310.27094, 310.32535, 310.37161, 310.40887, 310.43597, 310.45648, 310.47238, 310.48444]}, "params-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [309.46707, 309.48447, 309.52603, 309.57944, 309.64523, 309.72018, 309.80231, 309.8884, 309.97391, 310.05591, 310.13483, 310.20755, 310.27094, 310.32535, 310.37161, 310.40887, 310.43597, 310.45648, 310.47238, 310.48444]}, "iteration-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.7057, 0.68569, 0.68236, 0.69077, 0.68415, 0.67238, 0.68288, 0.67481, 0.6874, 0.67748, 0.6785, 0.67478, 0.83941, 0.6755, 0.67503, 0.67787, 0.67668, 0.67904, 0.67443, 0.67541]}, "lm loss validation": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.86582]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.86582]}, "lm loss validation ppl": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [958.93542]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [958.93542]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..076389c3d6 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + 
--encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 2 + --deterministic-mode: true + --ckpt-format: torch +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..b0d00b8f83 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 
512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 2 + --deterministic-mode: true + --ckpt-format: torch +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values.json new file mode 100644 index 0000000000..c59b98b90a --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [11.55278, 0.77358, 0.76856, 0.77172, 0.75887, 0.76061, 0.75836, 0.76125, 0.76192, 0.76187, 0.76171, 0.76045, 0.7599, 0.76535, 0.76121, 0.76796, 0.76998, 0.76511, 0.76167, 0.75816]}, "forward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [6.97639, 0.39525, 0.3898, 0.39437, 
0.37749, 0.38195, 0.37908, 0.37821, 0.38433, 0.38023, 0.38359, 0.37973, 0.37768, 0.37754, 0.38336, 0.38173, 0.39026, 0.38845, 0.38337, 0.37691]}, "backward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3.32964, 0.37495, 0.37481, 0.37567, 0.37884, 0.37558, 0.37486, 0.37929, 0.37612, 0.37965, 0.37608, 0.37503, 0.37843, 0.38541, 0.37552, 0.38094, 0.37923, 0.37628, 0.37437, 0.37757]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05]}, "all-grads-sync-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.89543, 0.00188, 0.00211, 0.00164, 0.00165, 0.00162, 0.00162, 0.00162, 0.00184, 0.00165, 0.00164, 0.00208, 0.00162, 0.00167, 0.0016, 0.00168, 0.00165, 0.00163, 0.00164, 0.00161]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00146, 0.00105, 0.00105, 0.00102, 0.00107, 0.00107, 0.00107, 0.00109, 0.00105, 0.00106, 0.00107, 0.00106, 0.00106, 0.00106, 0.00108, 0.00108, 0.00107, 0.00104, 0.00103, 0.0011]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.50022, 0.00376, 0.00381, 0.00329, 0.00321, 0.00354, 0.00371, 0.00375, 0.00366, 0.00301, 0.00349, 0.00372, 0.00349, 0.00369, 0.00297, 0.00283, 0.00369, 0.00377, 0.00388, 0.00369]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.04986, 0.02302, 0.02299, 0.02588, 0.02338, 0.0231, 0.02293, 0.0231, 0.02309, 0.02329, 0.02328, 0.02332, 0.02304, 0.02327, 0.02287, 0.02321, 0.02315, 0.0234, 
0.02312, 0.02327]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0158, 0.00219, 0.00221, 0.00411, 0.0022, 0.0022, 0.00216, 0.0022, 0.00217, 0.00218, 0.00218, 0.00225, 0.00233, 0.00219, 0.00223, 0.00222, 0.00212, 0.0022, 0.00222, 0.00225]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00301, 0.00302, 0.00302, 0.00339, 0.003, 0.00302, 0.00302, 0.00301, 0.00301, 0.00301, 0.003, 0.00301, 0.00302, 0.00304, 0.003, 0.00301, 0.00299, 0.00304, 0.00303, 0.00303]}, "optimizer-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.57167, 0.03386, 0.03382, 0.03847, 0.03353, 0.03358, 0.03363, 0.03394, 0.03377, 0.03326, 0.03368, 0.03412, 0.03363, 0.03407, 0.03281, 0.03316, 0.03373, 0.03419, 0.03396, 0.034]}, "learning-rate": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "batch-size": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.32677, 9.4141, 8.86401, 8.56564, 8.28782, 8.1035, 7.83676, 7.53769, 7.39294, 7.29345, 7.37746, 7.22535, 7.11277, 7.06759, 6.91832, 6.96664, 6.97845, 7.04885, 6.7213, 6.98241]}, "lm loss vs 
samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.32677, 9.4141, 8.86401, 8.56564, 8.28782, 8.1035, 7.83676, 7.53769, 7.39294, 7.29345, 7.37746, 7.22535, 7.11277, 7.06759, 6.91832, 6.96664, 6.97845, 7.04885, 6.7213, 6.98241]}, "loss-scale": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.26434, 2.17404, 2.50103, 2.08973, 1.92522, 1.69977, 1.63605, 1.57256, 1.48469, 1.29632, 1.00932, 1.0148, 0.95539, 1.04571, 0.94482, 0.77816, 1.07456, 1.17593, 1.12335, 0.8491]}, "grad-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.26434, 2.17404, 2.50103, 2.08973, 1.92522, 1.69977, 1.63605, 1.57256, 1.48469, 1.29632, 1.00932, 1.0148, 0.95539, 1.04571, 0.94482, 0.77816, 1.07456, 1.17593, 1.12335, 0.8491]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [43306.0, 40955.0, 43967.0, 41614.0, 44764.0, 43923.0, 41108.0, 42464.0, 44664.0, 43899.0, 41152.0, 43230.0, 39719.0, 45367.0, 43334.0, 43903.0, 45349.0, 45688.0, 46166.0, 44691.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [43306.0, 40955.0, 43967.0, 41614.0, 44764.0, 43923.0, 41108.0, 42464.0, 44664.0, 43899.0, 41152.0, 43230.0, 39719.0, 45367.0, 43334.0, 43903.0, 45349.0, 45688.0, 46166.0, 44691.0]}, "params-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [283.80362, 283.8273, 283.86472, 283.9053, 283.95062, 284.00027, 284.05212, 284.1051, 284.15643, 284.20459, 284.25775, 284.30682, 284.34848, 284.38312, 284.41144, 284.43539, 284.45441, 284.46988, 284.48172, 284.49054]}, 
"params-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [283.80362, 283.8273, 283.86472, 283.9053, 283.95062, 284.00027, 284.05212, 284.1051, 284.15643, 284.20459, 284.25775, 284.30682, 284.34848, 284.38312, 284.41144, 284.43539, 284.45441, 284.46988, 284.48172, 284.49054]}, "iteration-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [13.15856, 0.82951, 0.82427, 0.83168, 0.8147, 0.81581, 0.81386, 0.8171, 0.8176, 0.81664, 0.81719, 0.81685, 0.81547, 0.82136, 0.81551, 0.82315, 0.82591, 0.82132, 0.81777, 0.81414]}, "lm loss validation": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.9202]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.9202]}, "lm loss validation ppl": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [1012.5238]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [1012.5238]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..d1b9e8429e --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + 
--lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..540d4c1b73 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + 
--lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values.json new file mode 100644 index 0000000000..d932464f76 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values.json @@ -0,0 +1,763 @@ +{ + "forward-backward-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 14.18678, + 0.67885, + 0.68278, + 0.68333, + 0.67855, + 0.68179, + 0.68809, + 0.67808, + 0.67889, + 0.69586, + 0.69577, + 0.67938, + 0.68076, + 0.68551, + 0.69108, + 0.67821, + 0.68422, + 0.68947, + 0.67891, + 0.68614 + ] + }, + "forward-compute-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 8.91183, + 0.31386, + 0.31455, + 0.31529, + 0.31399, + 0.31376, + 0.3168, + 0.31219, + 0.31205, + 0.32539, + 0.32943, + 0.31424, + 0.31569, + 0.32161, + 0.32188, + 0.31166, + 0.31627, + 0.31935, + 0.31029, + 
0.32078 + ] + }, + "backward-compute-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 4.25414, + 0.3682, + 0.37658, + 0.37755, + 0.37333, + 0.37381, + 0.37727, + 0.37278, + 0.37206, + 0.37541, + 0.37183, + 0.37214, + 0.37101, + 0.37247, + 0.37485, + 0.36955, + 0.37359, + 0.3825, + 0.37545, + 0.37777 + ] + }, + "layernorm-grads-all-reduce-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.00004, + 0.00003, + 0.00003, + 0.00002, + 0.00002, + 0.00002, + 0.00002, + 0.00003, + 0.00002, + 0.00003, + 0.00002, + 0.00003, + 0.00002, + 0.00002, + 0.00004, + 0.00003, + 0.00002, + 0.00002, + 0.00002, + 0.00002 + ] + }, + "embedding-grads-all-reduce-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.00005, + 0.00004, + 0.00004, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00004, + 0.00004, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00003, + 0.00003, + 0.00003 + ] + }, + "all-grads-sync-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.9061, + 0.00163, + 0.00202, + 0.00163, + 0.00157, + 0.00156, + 0.00183, + 0.0016, + 0.00183, + 0.00157, + 0.00157, + 0.00158, + 0.00168, + 0.00158, + 0.00169, + 0.00156, + 0.00157, + 0.00157, + 0.00156, + 0.00185 + ] + }, + "optimizer-copy-to-main-grad-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.0011, + 0.00104, + 0.00102, + 0.00101, + 0.00097, + 0.00098, + 0.001, + 0.00096, + 0.00096, + 0.00099, + 0.00095, + 0.00097, + 0.00096, + 0.00098, + 0.00097, + 0.00098, + 0.00095, + 0.00099, + 0.00098, + 0.00099 + ] + }, + "optimizer-clip-main-grad-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1.59317, + 0.00265, + 0.00282, + 0.00284, + 0.00289, + 0.00298, + 0.00282, + 0.00294, + 0.00302, + 0.00301, + 0.00304, + 0.00294, + 0.00253, + 0.00296, + 0.00251, + 0.00227, + 0.00282, + 
0.00287, + 0.00308, + 0.00276 + ] + }, + "optimizer-count-zeros-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.04375, + 0.02396, + 0.02387, + 0.02381, + 0.02385, + 0.02393, + 0.0241, + 0.02406, + 0.02393, + 0.024, + 0.02396, + 0.024, + 0.0241, + 0.02397, + 0.024, + 0.02378, + 0.0238, + 0.02393, + 0.02395, + 0.02405 + ] + }, + "optimizer-inner-step-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.01715, + 0.00212, + 0.0021, + 0.00212, + 0.00212, + 0.00211, + 0.00218, + 0.00213, + 0.00212, + 0.00214, + 0.00211, + 0.00226, + 0.00211, + 0.00209, + 0.00211, + 0.00218, + 0.00207, + 0.00211, + 0.00213, + 0.00218 + ] + }, + "optimizer-copy-main-to-model-params-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.00281, + 0.00282, + 0.00281, + 0.00283, + 0.00281, + 0.00283, + 0.00289, + 0.00286, + 0.00281, + 0.00284, + 0.00282, + 0.00431, + 0.00295, + 0.00284, + 0.00283, + 0.00283, + 0.18259, + 0.00284, + 0.00283, + 0.00295 + ] + }, + "optimizer-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1.65881, + 0.03322, + 0.03326, + 0.03323, + 0.03329, + 0.03345, + 0.03361, + 0.03357, + 0.03352, + 0.03364, + 0.03349, + 0.03532, + 0.03332, + 0.03347, + 0.03313, + 0.03267, + 0.21285, + 0.03336, + 0.03358, + 0.03357 + ] + }, + "learning-rate": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.0001, + 0.0001, + 0.00009, + 0.00009, + 0.00008, + 0.00008, + 0.00007, + 0.00007, + 0.00006, + 0.00006, + 0.00005, + 0.00005, + 0.00005, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00002, + 0.00002, + 0.00001 + ] + }, + "learning-rate vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.0001, + 0.0001, + 0.00009, + 0.00009, + 0.00008, + 0.00008, + 0.00007, + 0.00007, + 0.00006, + 0.00006, + 0.00005, + 0.00005, + 0.00005, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00002, + 
0.00002, + 0.00001 + ] + }, + "batch-size": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32 + ] + }, + "batch-size vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32 + ] + }, + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.3267, + 9.41409, + 8.86422, + 8.56557, + 8.28779, + 8.10356, + 7.83669, + 7.53761, + 7.39304, + 7.29344, + 7.37755, + 7.22522, + 7.11288, + 7.06761, + 6.91847, + 6.96686, + 6.97827, + 7.04883, + 6.72143, + 6.98255 + ] + }, + "lm loss vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.3267, + 9.41409, + 8.86422, + 8.56557, + 8.28779, + 8.10356, + 7.83669, + 7.53761, + 7.39304, + 7.29344, + 7.37755, + 7.22522, + 7.11288, + 7.06761, + 6.91847, + 6.96686, + 6.97827, + 7.04883, + 6.72143, + 6.98255 + ] + }, + "loss-scale": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ] + }, + "loss-scale vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ] + }, + "grad-norm": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 21.2635, + 2.17416, + 2.50475, + 2.08972, + 1.9252, + 1.69975, + 1.63606, + 1.57261, + 1.48503, + 1.29641, + 1.00944, + 1.01609, + 0.95592, + 1.04635, + 0.94502, + 0.7775, + 1.07117, + 1.16813, + 1.12672, + 0.85024 + ] + }, + "grad-norm vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 21.2635, + 2.17416, + 2.50475, + 2.08972, + 
1.9252, + 1.69975, + 1.63606, + 1.57261, + 1.48503, + 1.29641, + 1.00944, + 1.01609, + 0.95592, + 1.04635, + 0.94502, + 0.7775, + 1.07117, + 1.16813, + 1.12672, + 0.85024 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43318, + 40956, + 43957, + 41617, + 44756, + 43946, + 41064, + 42479, + 44668, + 43904, + 41151, + 43235, + 39712, + 45373, + 43360, + 43896, + 45353, + 45682, + 46166, + 44693 + ] + }, + "num-zeros vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43318, + 40956, + 43957, + 41617, + 44756, + 43946, + 41064, + 42479, + 44668, + 43904, + 41151, + 43235, + 39712, + 45373, + 43360, + 43896, + 45353, + 45682, + 46166, + 44693 + ] + }, + "params-norm": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 283.80362, + 283.8273, + 283.86469, + 283.90527, + 283.95059, + 284.00024, + 284.05206, + 284.10507, + 284.15643, + 284.20459, + 284.25775, + 284.30685, + 284.34851, + 284.38309, + 284.41144, + 284.43536, + 284.45441, + 284.46985, + 284.48169, + 284.49057 + ] + }, + "params-norm vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 283.80362, + 283.8273, + 283.86469, + 283.90527, + 283.95059, + 284.00024, + 284.05206, + 284.10507, + 284.15643, + 284.20459, + 284.25775, + 284.30685, + 284.34851, + 284.38309, + 284.41144, + 284.43536, + 284.45441, + 284.46985, + 284.48169, + 284.49057 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 15.87098, + 0.73261, + 0.73669, + 0.73696, + 0.73228, + 0.73561, + 0.74191, + 0.73193, + 0.73279, + 0.75004, + 0.74974, + 0.73772, + 0.73447, + 0.73951, + 0.74553, + 0.73119, + 0.9162, + 0.74318, + 0.73275, + 0.74014 + ] + }, + "lm loss validation": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 6.92026 + ] + }, + "lm loss validation vs samples": { + "start_step": 0, + "end_step": 2, + 
"step_interval": 5, + "values": [ + 6.92026 + ] + }, + "lm loss validation ppl": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 1012.58173 + ] + }, + "lm loss validation ppl vs samples": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 1012.58173 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..6aae44ca71 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: local + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 
1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..6e9731d4ce --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: local + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + 
--eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml index c5dbbb35ea..64784c36a6 100644 --- a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml @@ -46,7 +46,7 @@ MODEL_ARGS: --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --split: 99982,9,9 - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --vocab-extra-ids: 100 # EVAL_AND_LOGGING_ARGS diff --git a/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py b/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py index d42b73b8af..5a31d9d3d4 100644 --- a/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py +++ b/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py @@ -1,10 +1,7 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
-from importlib.metadata import version - import pytest import torch -from pkg_resources import packaging from megatron.core import parallel_state from megatron.core.dist_checkpointing import load, load_plain_tensors, save @@ -21,11 +18,10 @@ from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.moe.experts import SequentialMLP, TEGroupedMLP from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version from tests.unit_tests.dist_checkpointing import TempNamedDir from tests.unit_tests.test_utilities import Utils -_te_version = packaging.version.Version(version("transformer-engine")) - def initialize_expert_layer(seed, glu=True, moe_grouped_gemm=False, **config_kwargs): torch.manual_seed(seed) @@ -69,7 +65,7 @@ def get_pp_offsets(): moe_grouped_gemm_options = [False] -if _te_version >= packaging.version.Version("1.9.0.dev0"): +if is_te_min_version("1.9.0.dev0"): moe_grouped_gemm_options.append(True) @@ -155,7 +151,7 @@ def test_parallel_reconfiguration_e2e( assert not any(map(bool, diffs)), diffs @pytest.mark.skipif( - _te_version < packaging.version.Version("1.9.0.dev0"), + not is_te_min_version("1.9.0.dev0"), reason="TE Grouped MLP is only supported in TE 1.9.0.dev0 and later.", ) @pytest.mark.parametrize( diff --git a/tests/unit_tests/dist_checkpointing/test_fp8.py b/tests/unit_tests/dist_checkpointing/test_fp8.py index a93f263d50..1238d09f76 100644 --- a/tests/unit_tests/dist_checkpointing/test_fp8.py +++ b/tests/unit_tests/dist_checkpointing/test_fp8.py @@ -51,6 +51,7 @@ def get_ten(dtype: str = 'fp8'): (False, (2, 4), (2, 4), None), ], ) + @pytest.mark.skip(reason="Flaky test") def test_fp8_save_load( self, tmp_path_dist_ckpt, use_fpsl, src_tp_pp, dest_tp_pp, load_exchange_algo ): diff --git a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py index e1f3eb75f4..d5d5cdce8f 
100644 --- a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py +++ b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py @@ -29,6 +29,7 @@ def teardown_method(self, method): Utils.destroy_model_parallel() @pytest.mark.parametrize(('tp,pp'), [(2, 4)]) + @pytest.mark.skip(reason="Flaky test") def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp): Utils.initialize_model_parallel(tp, pp) num_floating_point_operations_so_far = 0 diff --git a/tests/unit_tests/export/trtllm/__init__.py b/tests/unit_tests/export/trtllm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py b/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py new file mode 100644 index 0000000000..5a0aa0e9c5 --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py @@ -0,0 +1,100 @@ +import pytest +import torch +from pytest_mock import mocker + +from megatron.core.export.data_type import DataType +from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_model import GPT_DICT +from megatron.core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter import ( + DistributedTRTLLMModelWeightsConverter, +) +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + +_SEQUENCE_LENGTH = 64 +_VOCAB_SIZE = 256 + + +class TestTRTLLMDistributedGPUConverter: + + def setup_method(self, method): + Utils.initialize_model_parallel(2, 1) + model_parallel_cuda_manual_seed(123) + + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, + num_attention_heads=2, + use_cpu_initialization=True, + pipeline_dtype=torch.float32, + 
add_qkv_bias=False, + add_bias_linear=False, + ) + self.gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=_VOCAB_SIZE, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_get_model_weights_converter(self, mocker): + device = torch.device("cuda") + self.gpt_model.to(device) + + transformer_config = self.gpt_model.config + + mocker.patch( + "megatron.core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.str_dtype_to_torch", + return_value=torch.float32, + ) + + dtype = DataType.bfloat16 + distributed_converter = DistributedTRTLLMModelWeightsConverter( + transformer_config, dtype, activation="gelu" + ) + + model_state_dict = {} + for key, val in self.gpt_model.state_dict().items(): + # val is None for _extra_state layers. We filter it out + if val is not None: + model_state_dict[key] = val + + distributed_converter.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=GPT_DICT, + tokenizer_vocab_size=_VOCAB_SIZE, + ) + + expected_result = { + 'transformer.vocab_embedding.weight': torch.Size([128, 64]), + 'transformer.position_embedding.weight': torch.Size([32, 64]), + 'lm_head.weight': torch.Size([128, 64]), + 'transformer.ln_f.weight': torch.Size([64]), + 'transformer.ln_f.bias': torch.Size([64]), + 'transformer.layers.0.input_layernorm.weight': torch.Size([64]), + 'transformer.layers.0.input_layernorm.bias': torch.Size([64]), + 'transformer.layers.0.attention.dense.weight': torch.Size([64, 32]), + 'transformer.layers.0.attention.qkv.weight': torch.Size([96, 64]), + 'transformer.layers.0.post_layernorm.weight': torch.Size([64]), + 'transformer.layers.0.post_layernorm.bias': torch.Size([64]), + 'transformer.layers.0.mlp.fc.weight': torch.Size([128, 64]), + 'transformer.layers.0.mlp.proj.weight': torch.Size([64, 128]), + 'transformer.layers.1.input_layernorm.weight': 
torch.Size([64]), + 'transformer.layers.1.input_layernorm.bias': torch.Size([64]), + 'transformer.layers.1.attention.dense.weight': torch.Size([64, 32]), + 'transformer.layers.1.attention.qkv.weight': torch.Size([96, 64]), + 'transformer.layers.1.post_layernorm.weight': torch.Size([64]), + 'transformer.layers.1.post_layernorm.bias': torch.Size([64]), + 'transformer.layers.1.mlp.fc.weight': torch.Size([128, 64]), + 'transformer.layers.1.mlp.proj.weight': torch.Size([64, 128]), + } + + for key, value in distributed_converter.trtllm_model_weights.items(): + assert ( + expected_result[key] == value.shape + ), f"Shape mismatch for {key}. Expected {expected_result[key]} but got {value.shape}" diff --git a/tests/unit_tests/export/trtllm/test_trtllm_helper.py b/tests/unit_tests/export/trtllm/test_trtllm_helper.py new file mode 100644 index 0000000000..53c0a5ffea --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_helper.py @@ -0,0 +1,73 @@ +import pytest + +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.model_type import ModelType + + +# TODO : Remove importorskip and handle with mocker +class TestTRTLLMHelper: + + def test_exceptions(self, mocker): + pytest.importorskip('tensorrt_llm') + + from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper + + trtllm_helper = TRTLLMHelper( + transformer_config=None, + model_type=ModelType.gpt, + share_embeddings_and_output_weights=True, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + on_device_distributed_conversion=True, + vocab_size=None, + gpus_per_node=2, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + on_device_distributed_conversion=True, + ModelType=ModelType.falcon, + vocab_size=100, + gpus_per_node=2, + ) + + with pytest.raises(AssertionError): + 
trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + export_config=ExportConfig(), + on_device_distributed_conversion=True, + vocab_size=100, + gpus_per_node=2, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + vocab_size=100, + on_device_distributed_conversion=True, + gpus_per_node=None, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + export_config=ExportConfig(use_embedding_sharing=False), + on_device_distributed_conversion=False, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + export_config=ExportConfig(use_embedding_sharing=True), + vocab_size=100, + ) diff --git a/tests/unit_tests/export/trtllm/test_trtllm_layers.py b/tests/unit_tests/export/trtllm/test_trtllm_layers.py new file mode 100644 index 0000000000..b2e88852e5 --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_layers.py @@ -0,0 +1,111 @@ +import pytest + +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers, get_layer_name_without_prefix + + +class TestTRTLLMLayers: + + def test_rename_input_layer_names_to_trtllm_layer_names_without_layer_numbers(self): + + conversion_dict = { + "transformer.layers.attn.dense.bias": TRTLLMLayers.attention_dense_bias, + "transformer.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + + converted_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + assert ( + converted_dict[TRTLLMLayers.attention_dense_bias.value] == 0 + ), "Something wrong with 
conversion dict" + assert ( + converted_dict[TRTLLMLayers.mlp_fc_weight.value] == 1 + ), "Something wrong with conversion dict" + + def test_rename_input_layer_names_to_trtllm_layer_names_exception(self): + + with pytest.raises(AssertionError): + conversion_dict = { + "transformer.layers.attn.dense.bias": "randomValue", + "transformer.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + with pytest.raises(Exception): + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + del conversion_dict["transformer.layers.attn.dense.bias"] + TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + with pytest.raises(Exception): + conversion_dict = { + "transformer.layers.attn.dense.bias": TRTLLMLayers.attention_dense_bias, + "transformer.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + + TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=True, + ) + + def test_rename_input_layer_names_to_trtllm_layer_names_with_layer_numbers(self): + + conversion_dict = { + "decoder.lm_head.weight": TRTLLMLayers.lm_head, + "decoder.layers.attn.dense.bias": TRTLLMLayers.attention_dense_bias, + "deocder.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "decoder.lm_head.weight": 2, + "decoder.layers.0.attn.dense.bias": 0, + "deocder.layers.43.mlp.fc1.weight": 1, + } + + converted_dict = 
TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=True, + ) + + assert ( + converted_dict['transformer.layers.0.attention.dense.bias'] == 0 + ), "Something wrong with conversion of layer names" + assert ( + converted_dict['transformer.layers.43.mlp.fc.weight'] == 1 + ), "Something wrong with conversion of layer names" + assert ( + converted_dict['lm_head.weight'] == 2 + ), "Something wrong with conversion of layer names" + + def test_get_layer_name_without_prefix(self): + layer_name_without_prefix = get_layer_name_without_prefix( + TRTLLMLayers.attention_dense_weight + ) + assert ( + layer_name_without_prefix == "attention.dense.weight" + ), f"get_layer_name_without_prefix returned {layer_name_without_prefix}, expected attention.dense.weight" diff --git a/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py b/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py new file mode 100644 index 0000000000..e431326f0b --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py @@ -0,0 +1,169 @@ +import torch +from pytest_mock import mocker + +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers +from megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter import ( + SingleDeviceTRTLLMModelWeightsConverter, +) +from megatron.core.transformer.transformer_config import TransformerConfig + + +class TestTRTLLMSingleDeviceConverter: + def test_get_model_weights_converter(self, mocker): + + export_config = ExportConfig(inference_tp_size=2) + + vocab_size = 10 + hidden_dim = 4 + seq_len = 8 + num_layers = 2 + num_attn_heads = 2 + + model_config = TransformerConfig( + num_layers=num_layers, + num_attention_heads=num_attn_heads, + 
num_query_groups=0, + hidden_size=hidden_dim, + ffn_hidden_size=hidden_dim * 4, + ) + + dtype = DataType.bfloat16 + + model_state_dict = { + "decoder.position_embedding.weight": torch.randn(seq_len, hidden_dim), + "decoder.word_embedding.weight": torch.randn(vocab_size, hidden_dim), + "decoder.lm_head.weight": torch.randn(vocab_size, hidden_dim), + "decoder.final_layernorm.weight": torch.randn(hidden_dim), + "decoder.layers.input_layernorm.weight": torch.randn(num_layers, hidden_dim), + "decoder.layers.attention.qkv.weight": torch.randn( + num_layers, hidden_dim * 3, hidden_dim + ), + "decoder.layers.attention.qkv.bias": torch.randn(num_layers, hidden_dim * 3), + "decoder.layers.attention.dense.weight": torch.randn( + num_layers, hidden_dim, hidden_dim + ), + "deocder.layers.mlp.fc.weight": torch.randn(num_layers, 4 * hidden_dim, hidden_dim), + "decoder.layers.mlp.fc.expert": torch.randn(num_layers, hidden_dim, hidden_dim * 4), + "decoder.layers.mlp.proj.expert": torch.randn(num_layers, hidden_dim * 4, hidden_dim), + } + + trtllm_conversion_dict = { + "decoder.position_embedding.weight": TRTLLMLayers.position_embedding, + "decoder.word_embedding.weight": TRTLLMLayers.vocab_embedding, + "decoder.final_layernorm.weight": TRTLLMLayers.final_layernorm_weight, + "decoder.lm_head.weight": TRTLLMLayers.lm_head, + "decoder.layers.input_layernorm.weight": TRTLLMLayers.input_layernorm_weight, + "decoder.layers.attention.qkv.weight": TRTLLMLayers.attention_qkv_weight, + "decoder.layers.attention.qkv.bias": TRTLLMLayers.attention_qkv_bias, + "decoder.layers.attention.dense.weight": TRTLLMLayers.attention_dense_weight, + "deocder.layers.mlp.fc.weight": TRTLLMLayers.mlp_fc_weight, + "decoder.layers.mlp.fc.expert": TRTLLMLayers.mlp_fc_weight_mixture_of_experts, + "decoder.layers.mlp.proj.expert": TRTLLMLayers.mlp_projection_weight_mixture_of_experts, + } + + mocker.patch( + 
"megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.str_dtype_to_torch", + return_value=torch.float32, + ) + + trtllm_model_weights_converter_cpu = SingleDeviceTRTLLMModelWeightsConverter( + export_config, model_config, dtype, activation="swiglu" + ) + + mocker.patch( + "megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.pad_vocab_size", + return_value=10, + ) + + trtllm_model_weights_converter_cpu.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=trtllm_conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + expected_shapes = { + 'transformer.vocab_embedding.weight': (10, 4), + 'transformer.position_embedding.weight': (8, 4), + 'lm_head.weight': (10, 4), + 'transformer.ln_f.weight': (4,), + 'transformer.layers.0.input_layernorm.weight': (4,), + 'transformer.layers.1.input_layernorm.weight': (4,), + 'transformer.layers.0.attention.qkv.weight.0.bin': (6, 4), + 'transformer.layers.0.attention.qkv.weight.1.bin': (6, 4), + 'transformer.layers.1.attention.qkv.weight.0.bin': (6, 4), + 'transformer.layers.1.attention.qkv.weight.1.bin': (6, 4), + 'transformer.layers.0.attention.qkv.bias.0.bin': (6,), + 'transformer.layers.0.attention.qkv.bias.1.bin': (6,), + 'transformer.layers.1.attention.qkv.bias.0.bin': (6,), + 'transformer.layers.1.attention.qkv.bias.1.bin': (6,), + 'transformer.layers.0.attention.dense.weight.0.bin': (4, 2), + 'transformer.layers.0.attention.dense.weight.1.bin': (4, 2), + 'transformer.layers.1.attention.dense.weight.0.bin': (4, 2), + 'transformer.layers.1.attention.dense.weight.1.bin': (4, 2), + 'transformer.layers.0.mlp.gate.weight.0.bin': (4, 4), + 'transformer.layers.0.mlp.gate.weight.1.bin': (4, 4), + 'transformer.layers.0.mlp.fc.weight.0.bin': (16, 2), + 'transformer.layers.0.mlp.fc.weight.1.bin': (16, 2), + 'transformer.layers.1.mlp.gate.weight.0.bin': (4, 4), + 'transformer.layers.1.mlp.gate.weight.1.bin': (4, 4), + 
'transformer.layers.1.mlp.fc.weight.0.bin': (16, 2), + 'transformer.layers.1.mlp.fc.weight.1.bin': (16, 2), + 'transformer.layers.0.mlp.proj.weight.0.bin': (4, 8), + 'transformer.layers.0.mlp.proj.weight.1.bin': (4, 8), + 'transformer.layers.1.mlp.proj.weight.0.bin': (4, 8), + 'transformer.layers.1.mlp.proj.weight.1.bin': (4, 8), + } + + for key, value in trtllm_model_weights_converter_cpu.trtllm_model_weights.items(): + assert ( + expected_shapes[key] == value.shape + ), f"Shape mismatch for {key}. Expected {expected_shapes[key]} but got {value.shape}" + + class SampleMapping: + + def __init__(self): + self.tp_size = 2 + self.tp_rank = 1 + + def pp_layers(self, num_layers): + return [0, 1] + + def is_first_pp_rank(self): + return True + + def is_last_pp_rank(self): + return True + + trtllm_model_weights_per_gpu = ( + trtllm_model_weights_converter_cpu.get_local_model_weights_per_gpu( + mapping=SampleMapping(), trtllm_model_config=None + ) + ) + + expected_result_per_gpu = { + 'transformer.layers.0.input_layernorm.weight': (4,), + 'transformer.layers.1.input_layernorm.weight': (4,), + 'transformer.layers.0.attention.qkv.weight': (6, 4), + 'transformer.layers.1.attention.qkv.weight': (6, 4), + 'transformer.layers.0.attention.qkv.bias': (6,), + 'transformer.layers.1.attention.qkv.bias': (6,), + 'transformer.layers.0.attention.dense.weight': (4, 2), + 'transformer.layers.1.attention.dense.weight': (4, 2), + 'transformer.layers.0.mlp.gate.weight': (4, 4), + 'transformer.layers.0.mlp.fc.weight': (16, 2), + 'transformer.layers.1.mlp.gate.weight': (4, 4), + 'transformer.layers.1.mlp.fc.weight': (16, 2), + 'transformer.layers.0.mlp.proj.weight': (4, 8), + 'transformer.layers.1.mlp.proj.weight': (4, 8), + 'transformer.vocab_embedding.weight': (10, 4), + 'transformer.position_embedding.weight': (8, 4), + 'lm_head.weight': (5, 4), + 'transformer.ln_f.weight': (4,), + } + + for key, value in trtllm_model_weights_per_gpu.items(): + assert ( + expected_result_per_gpu[key] == 
value.shape + ), f"Shape mismatch for {key}. Expected {expected_result_per_gpu[key]} but got {value.shape}" diff --git a/tests/unit_tests/inference/engines/test_mcore_engine.py b/tests/unit_tests/inference/engines/test_mcore_engine.py index 161284ceeb..835aeed22d 100644 --- a/tests/unit_tests/inference/engines/test_mcore_engine.py +++ b/tests/unit_tests/inference/engines/test_mcore_engine.py @@ -93,3 +93,30 @@ def test_generate(self): ), f"Status should be completed but its {result.status}" assert result.generated_length > 0, f"Generated length should be greater than zero" assert result.generated_text is not None, f'Generated text should not be None' + + def test_generate_empty_prompt(self): + self.mock_tokenizer.vocab_size = self.vocab_size + self.mock_tokenizer.eod = self.vocab_size - 1 + self.mock_tokenizer.bos = self.vocab_size - 2 + # Generating random length integer prompts + self.mock_tokenizer.tokenize.return_value = [ + random.randint(0, self.vocab_size - 1) for _ in range(random.randint(5, 10)) + ] + # Generates some random string + self.mock_tokenizer.detokenize.return_value = ''.join( + random.choices(string.ascii_letters, k=random.randint(4, 10)) + ) + + prompts = ["" for i in range(self.batch_size)] + results: List[InferenceRequest] = self.mcore_engine.generate( + prompts, + add_BOS=True, + common_inference_params=CommonInferenceParams(num_tokens_to_generate=10), + ) + + for result in results: + assert ( + result.status == Status.COMPLETED + ), f"Status should be completed but its {result.status}" + assert result.generated_length > 0, f"Generated length should be greater than zero" + assert result.generated_text is not None, f'Generated text should not be None' diff --git a/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py new file mode 100644 index 0000000000..b9ece5c395 --- /dev/null +++ 
b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py @@ -0,0 +1,124 @@ +from argparse import Namespace +from copy import deepcopy +from unittest import mock + +import numpy as np +import torch + +from megatron.core import parallel_state +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( + T5InferenceWrapper, +) +from megatron.core.models.T5.t5_model import T5Model +from megatron.core.models.T5.t5_spec import ( + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_transformer_engine_block_spec, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestT5InferenceWrapper: + + def setup_model(self, tensor_parallel_size, pipeline_parallel_size): + Utils.initialize_model_parallel( + tensor_model_parallel_size=tensor_parallel_size, + pipeline_model_parallel_size=pipeline_parallel_size, + ) + model_parallel_cuda_manual_seed(123) + self.vocab_size = 100 + self.batch_size = 8 + self.encoder_sequence_length = 32 + self.decoder_sequence_length = 16 + hidden_size = 768 + + transformer_config = TransformerConfig( + num_layers=12, + hidden_size=hidden_size, + num_attention_heads=12, + tensor_model_parallel_size=tensor_parallel_size, + pipeline_model_parallel_size=pipeline_parallel_size, + ) + + encoder_config = deepcopy(transformer_config) + encoder_config.num_layers = transformer_config.num_layers + + encoder_layers_per_pipeline = ( + encoder_config.num_layers // encoder_config.pipeline_model_parallel_size + ) + decoder_layers_per_pipeline = ( + transformer_config.num_layers // transformer_config.pipeline_model_parallel_size + ) + en_block_spec = get_t5_encoder_with_transformer_engine_block_spec( + 
encoder_layers_per_pipeline + ) + de_block_spec = get_t5_decoder_with_transformer_engine_block_spec( + decoder_layers_per_pipeline + ) + + t5_model = T5Model( + config=transformer_config, + encoder_config=encoder_config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + vocab_size=self.vocab_size, + max_sequence_length=self.encoder_sequence_length, + parallel_output=True, + pre_process=True, + post_process=True, + add_encoder=True, + add_decoder=True, + ).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=hidden_size, + inference_batch_times_seqlen_threshold=20, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) + + self.inference_wrapped_model = T5InferenceWrapper(t5_model, inference_wrapper_config) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_inference_only_tensor_parallel(self): + self.setup_model(tensor_parallel_size=4, pipeline_parallel_size=1) + + batch_prompt_tokens = ( + torch.randint( + low=0, high=self.vocab_size, size=(self.batch_size, self.decoder_sequence_length) + ) + .int() + .cuda() + ) + batch_encoder_prompts = ["sample prompt encoders"] * self.batch_size + mock_tokenizer = mock.Mock() + mock_tokenizer.pad = self.vocab_size - 1 + mock_tokenizer.additional_special_tokens_ids = list(range(100)) + mock_tokenizer.tokenize.return_value = np.random.randint( + self.vocab_size, size=self.encoder_sequence_length + ).tolist() + + self.inference_wrapped_model.prep_model_for_inference( + prompts_tokens=batch_prompt_tokens, + encoder_prompts=batch_encoder_prompts, + tokenizer=mock_tokenizer, + ) + + inference_input = self.inference_wrapped_model.get_batch_for_context_window( + 0, self.decoder_sequence_length + ) + + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + + assert logits.shape == ( + self.batch_size, + self.decoder_sequence_length, + self.vocab_size, + ), 
f"Shape mismatch . Expected {(self.batch_size, self.decoder_sequence_length, self.vocab_size)}, but got {logits.shape}" diff --git a/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py new file mode 100644 index 0000000000..14c9a88852 --- /dev/null +++ b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py @@ -0,0 +1,143 @@ +import random +import string +import time +from collections import OrderedDict +from copy import deepcopy +from typing import Dict +from unittest import mock + +import numpy as np +import pytest +import torch + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( + T5InferenceWrapper, +) +from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import ( + EncoderDecoderTextGenerationController, +) +from megatron.core.models.T5.t5_model import T5Model +from megatron.core.models.T5.t5_spec import ( + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_transformer_engine_block_spec, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestEncoderDecoderTextGenerationController: + + def setup_method(self, method): + Utils.initialize_model_parallel( + tensor_model_parallel_size=4, pipeline_model_parallel_size=1 + ) + model_parallel_cuda_manual_seed(123) + self.vocab_size = 100 + self.batch_size = 8 
+ self.encoder_sequence_length = 32 + self.decoder_sequence_length = 16 + hidden_size = 768 + + transformer_config = TransformerConfig( + num_layers=12, + hidden_size=hidden_size, + num_attention_heads=12, + tensor_model_parallel_size=4, + pipeline_model_parallel_size=1, + ) + + encoder_config = deepcopy(transformer_config) + encoder_config.num_layers = transformer_config.num_layers + + encoder_layers_per_pipeline = ( + encoder_config.num_layers // encoder_config.pipeline_model_parallel_size + ) + decoder_layers_per_pipeline = ( + transformer_config.num_layers // transformer_config.pipeline_model_parallel_size + ) + en_block_spec = get_t5_encoder_with_transformer_engine_block_spec( + encoder_layers_per_pipeline + ) + de_block_spec = get_t5_decoder_with_transformer_engine_block_spec( + decoder_layers_per_pipeline + ) + + t5_model = T5Model( + config=transformer_config, + encoder_config=encoder_config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + vocab_size=self.vocab_size, + max_sequence_length=self.encoder_sequence_length, + parallel_output=True, + pre_process=True, + post_process=True, + add_encoder=True, + add_decoder=True, + ).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=hidden_size, + inference_batch_times_seqlen_threshold=20, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) + + inference_wrapped_model = T5InferenceWrapper(t5_model, inference_wrapper_config) + + self.mock_tokenizer = mock.Mock() + + self.text_generation_controller = EncoderDecoderTextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=self.mock_tokenizer + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_generate_all_output_tokens_static_batch(self): + self.mock_tokenizer.vocab_size = self.vocab_size + self.mock_tokenizer.eod = self.vocab_size - 1 + self.mock_tokenizer.pad = self.vocab_size 
- 2 + self.mock_tokenizer.additional_special_tokens_ids = list(range(100)) + self.mock_tokenizer.detokenize.return_value = ''.join( + random.choices(string.ascii_letters, k=random.randint(4, 10)) + ) + self.mock_tokenizer.tokenize.return_value = np.random.randint( + self.vocab_size, size=(self.encoder_sequence_length - 5) + ).tolist() + + active_requests: Dict[int, InferenceRequest] = OrderedDict() + for i in range(self.batch_size): + prompt = "decoder_sample" + prompt_tokens = np.random.randint( + self.vocab_size, size=self.decoder_sequence_length + ).tolist() + encoder_prompt = "encoder_sample" + inference_request = InferenceRequest( + request_id=i, + prompt=prompt, + encoder_prompt=encoder_prompt, + inference_parameters=CommonInferenceParams(num_tokens_to_generate=10), + arrival_time=time.time(), + prompt_tokens=prompt_tokens, + status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS, + ) + active_requests[i] = inference_request + + requests = self.text_generation_controller.generate_all_output_tokens_static_batch( + active_requests + ) + + for request_id, request in requests.items(): + assert ( + request.status == Status.COMPLETED + ), f"Status should be completed but its {request.status}" + assert request.generated_length > 0, f"Generated length should be greater than zero" + assert request.generated_text is not None, "Generated text should not be None" diff --git a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py index a9f15faf80..df7109e021 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py @@ -26,7 +26,7 @@ from tests.unit_tests.test_utilities import Utils -class TestTextGenerationController: +class TestSimpleTextGenerationController: def setup_method(self, 
method): Utils.initialize_model_parallel( diff --git a/tests/unit_tests/models/test_bert_model.py b/tests/unit_tests/models/test_bert_model.py index 30d4aec024..186ce5c34e 100644 --- a/tests/unit_tests/models/test_bert_model.py +++ b/tests/unit_tests/models/test_bert_model.py @@ -5,17 +5,20 @@ import pytest import torch -from pkg_resources import packaging +from packaging.version import Version as PkgVersion from pytest_mock import mocker -from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec +from megatron.core.models.bert.bert_layer_specs import ( + bert_layer_local_spec, + bert_layer_with_transformer_engine_spec, +) from megatron.core.models.bert.bert_model import BertModel from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version from tests.unit_tests.test_utilities import Utils -_te_version = packaging.version.Version(version("transformer-engine")) - class TestBertModel: @@ -91,110 +94,131 @@ def test_post_process_forward(self): assert logits[0].shape[2] == self.bert_model.vocab_size -class TestBertModelAssertions: +class TestBertModelAttentionDimensions: - @pytest.mark.internal - def test_te_assertions_te_less_than_1_7(self, mocker): - os.environ.pop('NVTE_FLASH_ATTN', None) + def teardown_method(self, method): + Utils.destroy_model_parallel() os.environ.pop('NVTE_FUSED_ATTN', None) - tp = 1 - pp = 1 - Utils.initialize_model_parallel(tp, pp) + os.environ.pop('NVTE_FLASH_ATTN', None) + os.environ.pop('NVTE_UNFUSED_ATTN', None) + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( + self.transformer_config = TransformerConfig( num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True, - 
perform_initialization=True, - tensor_model_parallel_size=tp, - pipeline_model_parallel_size=pp, pipeline_dtype=torch.bfloat16, ) + # This should convert arbitrary mask to padding mask + self.bert_model = BertModel( + config=self.transformer_config, + num_tokentypes=0, + transformer_layer_spec=bert_layer_with_transformer_engine_spec, + vocab_size=100, + max_sequence_length=4, + ) + + @pytest.mark.internal + def test_local_spec(self, mocker): + self.bert_model.transformer_layer_spec = bert_layer_local_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + assert ( + attn_mask_dimensions == "b1ss" + ), f"Expected b1ss for attn_mask_dimensions but got {attn_mask_dimensions}" + + @pytest.mark.internal + def test_transformer_engine_version_1_10(self, mocker): + bert_layer_with_transformer_engine_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] = AttnMaskType.arbitrary + + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.10")) + self.bert_model.transformer_layer_spec = bert_layer_with_transformer_engine_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + attn_mask_type = self.bert_model.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] + assert ( + attn_mask_type == AttnMaskType.padding + ), f"Exepcted attn mask type to be padding, but got {attn_mask_type}" + assert ( + attn_mask_dimensions == "b11s" + ), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}" + + @pytest.mark.internal + def test_transformer_engine_version_1_7_to_1_10_flash_attn(self, mocker): + os.environ['NVTE_FLASH_ATTN'] = '1' + + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8")) + self.bert_model.transformer_layer_spec = bert_layer_with_transformer_engine_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + assert ( + 
attn_mask_dimensions == "b11s" + ), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}" + + @pytest.mark.internal + def test_transformer_engine_version_1_7_to_1_10_rng_error(self, mocker): + os.environ['NVTE_FLASH_ATTN'] = '0' + os.environ['NVTE_FUSED_ATTN'] = '0' + bert_layer_with_transformer_engine_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] == AttnMaskType.padding + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8")) with pytest.raises(Exception) as exc_info: - mocker.patch( - "megatron.core.models.bert.bert_model.get_te_version", - return_value=packaging.version.Version("1.4"), - ) self.bert_model = BertModel( - config=transformer_config, + config=self.transformer_config, num_tokentypes=0, transformer_layer_spec=bert_layer_with_transformer_engine_spec, vocab_size=100, max_sequence_length=4, ) - - assert ( - str(exc_info.value) - == "Flash and fused attention is not supported with transformer engine version < 1.7. Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer engine >= 1.7" + assert str(exc_info.value) == ( + "Linear.__init__() got an unexpected keyword argument 'rng_tracker_name' when " + "instantiating TERowParallelLinear when instantiating SelfAttention when " + "instantiating TransformerLayer" ) @pytest.mark.internal - def test_te_assertions_te_equal_to_1_7_exception(self, mocker): + def test_transformer_engine_version_1_7_to_1_10_unfused_attention(self, mocker): os.environ['NVTE_FLASH_ATTN'] = '0' os.environ['NVTE_FUSED_ATTN'] = '0' - tp = 1 - pp = 1 - Utils.initialize_model_parallel(tp, pp) - model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( - num_layers=2, - hidden_size=12, - num_attention_heads=4, - use_cpu_initialization=True, - perform_initialization=True, - tensor_model_parallel_size=tp, - pipeline_model_parallel_size=pp, - pipeline_dtype=torch.bfloat16, - ) + bert_layer_with_transformer_engine_spec.submodules.self_attention.params[ 
+ 'attn_mask_type' + ] == AttnMaskType.padding + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8")) + self.bert_model.transformer_layer_spec = bert_layer_with_transformer_engine_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + attn_mask_type = self.bert_model.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] + assert ( + attn_mask_type == AttnMaskType.arbitrary + ), f"Exepcted attn mask type to be arbitrary, but got {attn_mask_type}" + assert ( + attn_mask_dimensions == "b1ss" + ), f"Expected b1ss for attn_mask_dimensions but got {attn_mask_dimensions}" + Utils.destroy_model_parallel() + + @pytest.mark.internal + def test_transformer_engine_version_less_than_1_7(self, mocker): + os.environ['NVTE_FLASH_ATTN'] = '1' with pytest.raises(Exception) as exc_info: - mocker.patch( - "megatron.core.models.bert.bert_model.get_te_version", - return_value=packaging.version.Version("1.7"), - ) + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.5")) self.bert_model = BertModel( - config=transformer_config, + config=self.transformer_config, num_tokentypes=0, transformer_layer_spec=bert_layer_with_transformer_engine_spec, vocab_size=100, max_sequence_length=4, ) - assert ( - str(exc_info.value) - == "Both NVTE_FLASH_ATTN and NVTE_FUSED_ATTN env flag set to 0. Either unset both of them or set one of them to 1 to use a more optimized attention kernal. Currently using unfused attention path. 
If you want to proceed with this path set AttnMaskType in module spec to be arbitrary" - ) - - @pytest.mark.internal - def test_te_assertions_te_equal_to_1_7_no_exception(self, mocker): - os.environ.pop('NVTE_FLASH_ATTN', None) - os.environ.pop('NVTE_FUSED_ATTN', None) - tp = 1 - pp = 1 - Utils.initialize_model_parallel(tp, pp) - model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( - num_layers=2, - hidden_size=12, - num_attention_heads=4, - use_cpu_initialization=True, - perform_initialization=True, - tensor_model_parallel_size=tp, - pipeline_model_parallel_size=pp, - pipeline_dtype=torch.bfloat16, - ) - - mocker.patch( - "megatron.core.models.bert.bert_model.get_te_version", - return_value=packaging.version.Version("1.7"), + assert str(exc_info.value) == ( + "Flash and fused attention is not supported with transformer engine version " + "< 1.7. Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer " + "engine >= 1.7" ) - self.bert_model = BertModel( - config=transformer_config, - num_tokentypes=0, - transformer_layer_spec=bert_layer_with_transformer_engine_spec, - vocab_size=100, - max_sequence_length=4, - ) - Utils.destroy_model_parallel() diff --git a/tests/unit_tests/models/test_llava_model.py b/tests/unit_tests/models/test_llava_model.py index e246ef466a..b3142fb807 100644 --- a/tests/unit_tests/models/test_llava_model.py +++ b/tests/unit_tests/models/test_llava_model.py @@ -18,16 +18,22 @@ def setup_method(self, method): Utils.initialize_model_parallel(1, 1) model_parallel_cuda_manual_seed(123) + self.language_hidden_size = 64 + self.language_num_attention_heads = 4 + language_config = TransformerConfig( - num_layers=3, hidden_size=128, num_attention_heads=8, use_cpu_initialization=False + num_layers=3, + hidden_size=self.language_hidden_size, + num_attention_heads=self.language_num_attention_heads, + use_cpu_initialization=False, ) vision_config = TransformerConfig( - num_layers=2, hidden_size=64, 
num_attention_heads=4, use_cpu_initialization=False + num_layers=2, hidden_size=16, num_attention_heads=2, use_cpu_initialization=False ) vision_projection_config = TransformerConfig( num_layers=2, - hidden_size=128, - ffn_hidden_size=72, + hidden_size=self.language_hidden_size, + ffn_hidden_size=32, num_attention_heads=1, use_cpu_initialization=False, ) @@ -36,10 +42,11 @@ def setup_method(self, method): vision_layer_spec = deepcopy(language_layer_spec) vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules) + vision_config.vision_model_type = "clip" self.model = LLaVAModel( language_transformer_config=language_config, language_transformer_layer_spec=language_layer_spec, - language_vocab_size=2048, + language_vocab_size=8192, language_max_sequence_length=4096, vision_transformer_config=vision_config, vision_transformer_layer_spec=vision_layer_spec, @@ -60,7 +67,7 @@ def test_constructor(self): assert isinstance(self.model, LLaVAModel) num_weights = sum([p.numel() for p in self.model.parameters()]) - assert num_weights == 1832520 + assert num_weights == 1488736 @pytest.mark.internal def test_set_input_tensor(self): @@ -73,12 +80,18 @@ def test_set_input_tensor(self): def test_preprocess_data(self): self.model.cuda() - image_embedding_value = torch.tensor(123.0) + hidden_size = 72 + # 3 images with 1 tile and 2 image with 2 tiles = 7 tiles. 
- image_embeddings = image_embedding_value * torch.ones((577, 7, 128)).cuda() + image_embeddings = ( + 1e-5 + * torch.arange(577 * 7 * hidden_size, dtype=torch.float) + .reshape(577, 7, hidden_size) + .cuda() + ) image_token_index = -200 - input_ids = torch.arange(0, 1024, dtype=torch.int).expand(5, 1024).cuda() + input_ids = torch.arange(1024).expand(5, 1024).cuda() input_ids[0, 0] = image_token_index # image before text input_ids[1, 100] = image_token_index # image in between input_ids[2, -1] = image_token_index # image at the end @@ -86,8 +99,14 @@ def test_preprocess_data(self): input_ids[4, 50] = image_token_index # two images in between input_ids[4, 150] = image_token_index - language_embedding_value = torch.tensor(999.0) - language_embeddings = language_embedding_value * torch.ones((5, 1024, 128)).cuda() + # Offset by 1000 to distinguish from image embeddings. + language_embeddings = ( + 1000.0 + + 1e-5 + * torch.arange(5 * 1024 * hidden_size, dtype=torch.float) + .reshape(5, 1024, hidden_size) + .cuda() + ) # Labels are input_ids shifted to left by one. labels = torch.arange(1, 1025, dtype=torch.int).expand(5, 1024).cuda() @@ -121,14 +140,14 @@ def test_preprocess_data(self): # The fifth sample has 2 images with 3 tiles and 1024 text tokens. max_seq_len = 3 * img_seq_len - 2 + 1024 - assert embeddings.shape == torch.Size((max_seq_len, 5, 128)) + assert embeddings.shape == torch.Size((max_seq_len, 5, hidden_size)) assert labels.shape == torch.Size((5, max_seq_len)) assert loss_mask.shape == labels.shape # First sample where image is before text (index 0). 
- expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:577] = image_embedding_value - expected_embeddings[577:1600] = language_embedding_value + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:577] = image_embeddings[:, 0] + expected_embeddings[577:1600] = language_embeddings[0, 1:] expected_embeddings[1600:] = 0 # padding expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() @@ -144,15 +163,16 @@ def test_preprocess_data(self): expected_loss_mask[696:1600] = 1 expected_loss_mask[1600:] = 0 - assert torch.allclose(embeddings[:, 0], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 0], expected_embeddings) assert torch.allclose(labels[0], expected_labels) assert torch.allclose(loss_mask[0], expected_loss_mask) # Second sample where image is in between (index 100). The image has 2 tiles. - expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:100] = language_embedding_value - expected_embeddings[100:1254] = image_embedding_value - expected_embeddings[1254:2177] = language_embedding_value + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:100] = language_embeddings[1, :100] + expected_embeddings[100:677] = image_embeddings[:, 1] + expected_embeddings[677:1254] = image_embeddings[:, 2] + expected_embeddings[1254:2177] = language_embeddings[1, 101:] expected_embeddings[2177:] = 0 # padding expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() @@ -172,14 +192,14 @@ def test_preprocess_data(self): expected_loss_mask[1273:2177] = 1 expected_loss_mask[2177:] = 0 # padding - assert torch.allclose(embeddings[:, 1], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 1], expected_embeddings) assert torch.allclose(labels[1], expected_labels) assert torch.allclose(loss_mask[1], expected_loss_mask) # Third sample where image is at the end. 
- expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:1023] = language_embedding_value - expected_embeddings[1023:1600] = image_embedding_value + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:1023] = language_embeddings[2, :1023] + expected_embeddings[1023:1600] = image_embeddings[:, 3] expected_embeddings[1600:] = 0 # padding expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() @@ -195,13 +215,13 @@ def test_preprocess_data(self): expected_loss_mask[1023:1600] = 0 expected_loss_mask[1600:] = 0 # padding - assert torch.allclose(embeddings[:, 2], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 2], expected_embeddings) assert torch.allclose(labels[2], expected_labels) assert torch.allclose(loss_mask[2], expected_loss_mask) # Fourth sample where there is no image. - expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:1024] = language_embedding_value + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:1024] = language_embeddings[3] expected_embeddings[1024:] = 0 # padding expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() @@ -212,17 +232,18 @@ def test_preprocess_data(self): expected_loss_mask[:1024] = 1 expected_loss_mask[1024:] = 0 # padding - assert torch.allclose(embeddings[:, 3], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 3], expected_embeddings) assert torch.allclose(labels[3], expected_labels) assert torch.allclose(loss_mask[3], expected_loss_mask) - # Fifth sample has two images in between. The first image has two tiles. 
- expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:50] = language_embedding_value - expected_embeddings[50:1204] = image_embedding_value # two tiles - expected_embeddings[1204:1303] = language_embedding_value - expected_embeddings[1303:1880] = image_embedding_value - expected_embeddings[1880:] = language_embedding_value + # Fifth sample has two images in between (indices 50 and 150). The first image has two tiles. + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:50] = language_embeddings[4, :50] + expected_embeddings[50:627] = image_embeddings[:, 4] # two tiles + expected_embeddings[627:1204] = image_embeddings[:, 5] + expected_embeddings[1204:1303] = language_embeddings[4, 51:150] + expected_embeddings[1303:1880] = image_embeddings[:, 6] + expected_embeddings[1880:] = language_embeddings[4, 151:] expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() expected_labels[:49] = torch.arange(1, 50) @@ -238,7 +259,7 @@ def test_preprocess_data(self): expected_loss_mask[1302:1880] = 0 expected_loss_mask[1880:] = 1 - assert torch.allclose(embeddings[:, 4], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 4], expected_embeddings) assert torch.allclose(labels[4], expected_labels) assert torch.allclose(loss_mask[4], expected_loss_mask) @@ -309,7 +330,7 @@ def test_forward(self): loss_mask=None, num_image_tiles=num_image_tiles, ) - assert logits.shape == torch.Size((5, max_seq_len, 2048)) + assert logits.shape == torch.Size((5, max_seq_len, 8192)) # Try without labels and with inference params. inference_params = InferenceParams(5, max_seq_len) @@ -323,7 +344,7 @@ def test_forward(self): num_image_tiles=num_image_tiles, inference_params=inference_params, ) - assert logits.shape == torch.Size((5, max_seq_len, 2048)) + assert logits.shape == torch.Size((5, max_seq_len, 8192)) # Check KV cache got populated correctly. 
kv_dict = inference_params.key_value_memory_dict @@ -332,7 +353,11 @@ def test_forward(self): for layer_no in range(1, 4): # 3 layers in the model. layer_kv = kv_dict[layer_no] # Expected shape is [sequence_len, batch_size, num_heads, hidden_size_per_head] - assert layer_kv[0].shape == layer_kv[1].shape == torch.Size((max_seq_len, 5, 8, 16)) + assert ( + layer_kv[0].shape + == layer_kv[1].shape + == torch.Size((max_seq_len, 5, self.language_num_attention_heads, 16)) + ) @pytest.mark.internal def test_save_load(self, tmp_path): @@ -353,3 +378,62 @@ def test_freeze(self): for param in self.model.vision_projection.parameters(): assert param.requires_grad + + +class TestLLaVAModelSigLIP: + @pytest.mark.internal # The model is under active development and its methods may change. + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + + language_config = TransformerConfig( + num_layers=3, hidden_size=128, num_attention_heads=8, use_cpu_initialization=False + ) + vision_config = TransformerConfig( + num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=False + ) + vision_projection_config = TransformerConfig( + num_layers=2, + hidden_size=128, + ffn_hidden_size=72, + num_attention_heads=1, + use_cpu_initialization=False, + ) + + language_layer_spec = get_gpt_layer_with_transformer_engine_spec() + vision_layer_spec = deepcopy(language_layer_spec) + vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules) + + vision_config.vision_model_type = "siglip" + self.model = LLaVAModel( + language_transformer_config=language_config, + language_transformer_layer_spec=language_layer_spec, + language_vocab_size=2048, + language_max_sequence_length=4096, + vision_transformer_config=vision_config, + vision_transformer_layer_spec=vision_layer_spec, + drop_vision_class_token=False, + vision_projection_config=vision_projection_config, + 
vision_projection_layer_spec=vision_projection_spec, + img_h=336, + img_w=336, + patch_dim=14, + ) + + @pytest.mark.internal + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.internal + def test_constructor(self): + assert isinstance(self.model, LLaVAModel) + + num_weights = sum([p.numel() for p in self.model.parameters()]) + assert num_weights == 1832456 + + @pytest.mark.internal + def test_set_input_tensor(self): + expected_shape = (1, 2, 3, 4) + input_tensor = torch.zeros(expected_shape) + self.model.set_input_tensor(input_tensor) + assert self.model.vision_model.decoder.input_tensor.shape == expected_shape diff --git a/tests/unit_tests/tensor_parallel/test_initialization.py b/tests/unit_tests/tensor_parallel/test_initialization.py index 9fcc38c259..039ad071a7 100644 --- a/tests/unit_tests/tensor_parallel/test_initialization.py +++ b/tests/unit_tests/tensor_parallel/test_initialization.py @@ -4,13 +4,16 @@ import torch import megatron.core.parallel_state as ps -from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec from megatron.core.tensor_parallel.layers import ( ColumnParallelLinear, RowParallelLinear, VocabParallelEmbedding, ) from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TERowParallelLinear, +) from megatron.core.transformer.transformer_config import TransformerConfig from tests.unit_tests.test_utilities import Utils @@ -21,6 +24,9 @@ class Test: num_layers=1, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True ) + def teardown_method(self, method): + Utils.destroy_model_parallel() + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_embedding_init(self): @@ -117,3 +123,79 @@ def test_col_init(self): rank = ps.get_tensor_model_parallel_rank() assert tp4.shape[0] * 4 == tp1.shape[0] assert torch.equal(tp1[rank * 
4 : (rank + 1) * 4], tp4) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(100) + def test_te_col_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + tp1 = TEColumnParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + config=self.transformer_config, + skip_bias_add=False, + gather_output=False, + is_expert=False, + ).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. + tp4 = TEColumnParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + config=self.transformer_config, + skip_bias_add=False, + gather_output=False, + is_expert=False, + ).weight + + if torch.distributed.get_rank() == 0: + assert tp4.shape[0] * 4 == tp1.shape[0] + assert torch.allclose(tp1[:4], tp4) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(100) + def test_te_row_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + tp1 = TERowParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + input_is_parallel=True, + config=self.transformer_config, + skip_bias_add=False, + is_expert=False, + ).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. 
+ tp4 = TERowParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + input_is_parallel=True, + config=self.transformer_config, + skip_bias_add=False, + is_expert=False, + ).weight + + if torch.distributed.get_rank() == 0: + assert tp4.shape[1] * 4 == tp1.shape[1] + assert torch.allclose(tp1[:, :4], tp4) diff --git a/tests/unit_tests/transformer/moe/test_grouped_mlp.py b/tests/unit_tests/transformer/moe/test_grouped_mlp.py index a78921ad10..043bdc8c58 100644 --- a/tests/unit_tests/transformer/moe/test_grouped_mlp.py +++ b/tests/unit_tests/transformer/moe/test_grouped_mlp.py @@ -1,17 +1,15 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -from importlib.metadata import version - import pytest import torch import torch.nn.functional as F -from pkg_resources import packaging from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.transformer.moe import grouped_gemm_util as gg from megatron.core.transformer.moe.experts import TEGroupedMLP from megatron.core.transformer.moe.moe_layer import MoELayer from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version from megatron.legacy.model import Float16Module from megatron.training.arguments import parse_args from megatron.training.initialize import _set_random_seed @@ -21,8 +19,6 @@ if torch.cuda.is_available(): DEVICE_CAPABILITY = torch.cuda.get_device_capability() -_te_version = packaging.version.Version(version("transformer-engine")) - class TestParallelGroupedMLP: @@ -218,7 +214,7 @@ def test_gradient_with_no_tokens_allocated(self): @pytest.mark.skipif( - _te_version < packaging.version.Version("1.9.0.dev0"), + not is_te_min_version("1.9.0.dev0"), reason="TE Grouped MLP is only supported in TE 1.9.0.dev0 and later.", ) class TestTEGroupedMLP: diff --git a/tests/unit_tests/transformer/moe/test_sequential_mlp.py 
b/tests/unit_tests/transformer/moe/test_sequential_mlp.py index 40a0caf31a..514e098bfd 100644 --- a/tests/unit_tests/transformer/moe/test_sequential_mlp.py +++ b/tests/unit_tests/transformer/moe/test_sequential_mlp.py @@ -1,7 +1,6 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. from importlib.metadata import version -import packaging import pytest import torch @@ -16,10 +15,9 @@ from megatron.core.transformer.moe.experts import SequentialMLP from megatron.core.transformer.moe.moe_layer import MoELayer from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version from tests.unit_tests.test_utilities import Utils -te_version = packaging.version.Version(version("transformer-engine")) - class TestParallelSequentialMLP: @@ -117,7 +115,7 @@ def setup_method(self, method): ) @pytest.mark.skipif( - te_version < packaging.version.Version("1.7.0"), + not is_te_min_version("1.7.0"), reason="Transformer Engine under v1.7.0 doesn't support MoE training.", ) @pytest.mark.internal @@ -133,7 +131,7 @@ def test_constructor(self): ) @pytest.mark.skipif( - te_version < packaging.version.Version("1.7.0"), + not is_te_min_version("1.7.0"), reason="Transformer Engine under v1.7.0 doesn't support MoE training.", ) @pytest.mark.internal @@ -155,7 +153,7 @@ def test_gpu_forward(self): assert torch.equal(output_local, output_te) @pytest.mark.skipif( - te_version < packaging.version.Version("1.7.0"), + not is_te_min_version("1.7.0"), reason="Transformer Engine under v1.7.0 doesn't support MoE training.", ) @pytest.mark.internal @@ -179,7 +177,7 @@ def test_gpu_forward_with_one_local_expert(self): assert torch.equal(output_local, output_te) @pytest.mark.skipif( - te_version < packaging.version.Version("1.7.0"), + not is_te_min_version("1.7.0"), reason="Transformer Engine under v1.7.0 doesn't support MoE training.", ) @pytest.mark.internal diff --git a/tests/unit_tests/transformer/test_multi_latent_attention.py 
b/tests/unit_tests/transformer/test_multi_latent_attention.py new file mode 100644 index 0000000000..4188d7b069 --- /dev/null +++ b/tests/unit_tests/transformer/test_multi_latent_attention.py @@ -0,0 +1,160 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import os +from importlib.metadata import version + +import pytest +import torch +import transformer_engine as te + +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.multi_latent_attention import MLASelfAttention +from megatron.core.transformer.transformer_config import MLATransformerConfig +from megatron.core.utils import is_te_min_version +from tests.unit_tests.test_utilities import Utils + + +class TestParallelMLAAttention: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + self.transformer_config = MLATransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + use_cpu_initialization=True, + q_lora_rank=32, + kv_lora_rank=32, + qk_head_dim=128, + v_head_dim=128, + qk_pos_emb_head_dim=64, + rotary_base=10000, + ) + self.parallel_attention = MLASelfAttention( + self.transformer_config, + get_gpt_layer_with_transformer_engine_spec( + multi_latent_attention=True + ).submodules.self_attention.submodules, + layer_number=1, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.parallel_attention, MLASelfAttention) + assert self.parallel_attention.layer_number == 1 + + num_weights = sum([p.numel() for p in self.parallel_attention.parameters()]) + assert num_weights == 65036 + + def test_cpu_forward(self): + # we can't currently do this because the global memory buffer is on GPU + pass + + def test_gpu_forward(self): + if is_te_min_version("1.10.0"): + + # use flash attention for 
hopper, future may support fused attention for ampere + os.environ['NVTE_FUSED_ATTN'] = "0" + os.environ['NVTE_FLASH_ATTN'] = "1" + + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 2 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + output, bias = self.parallel_attention(hidden_states, attention_mask) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + + def test_fused_rope_gpu_forward(self): + if is_te_min_version("1.10.0"): + # use flash attention for hopper, future may support fused attention for ampere + os.environ['NVTE_FUSED_ATTN'] = "0" + os.environ['NVTE_FLASH_ATTN'] = "1" + + self.parallel_attention.config.apply_rope_fusion = True + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 2 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + rotary_pos_emb = torch.ones( + sequence_length, 1, 1, self.parallel_attention.config.kv_channels + ).cuda() + output, bias = self.parallel_attention( + hidden_states, attention_mask, rotary_pos_emb=rotary_pos_emb + ) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == 
config.hidden_size + self.parallel_attention.config.apply_rope_fusion = False + + def test_checkpointed_gpu_forward(self): + if is_te_min_version("1.10.0"): + # use flash attention for hopper, future may support fused attention for ampere + os.environ['NVTE_FUSED_ATTN'] = "0" + os.environ['NVTE_FLASH_ATTN'] = "1" + + transformer_config = self.transformer_config + transformer_config.recompute_granularity = 'selective' + checkpointed_parallel_attention = MLASelfAttention( + transformer_config, + get_gpt_layer_with_transformer_engine_spec( + multi_latent_attention=True + ).submodules.self_attention.submodules, + layer_number=1, + ) + config = checkpointed_parallel_attention.config + + sequence_length = 32 + micro_batch_size = 2 + + checkpointed_parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + ( + sequence_length, + micro_batch_size, + checkpointed_parallel_attention.config.hidden_size, + ) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + output, bias = checkpointed_parallel_attention(hidden_states, attention_mask) + + assert config.recompute_granularity == 'selective' + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size diff --git a/tests/unit_tests/transformer/test_spec_customization.py b/tests/unit_tests/transformer/test_spec_customization.py index 80c3bf7577..a9a245b861 100755 --- a/tests/unit_tests/transformer/test_spec_customization.py +++ b/tests/unit_tests/transformer/test_spec_customization.py @@ -2,12 +2,10 @@ import sys from dataclasses import dataclass, fields -from importlib.metadata import version import pytest import torch import transformer_engine as te -from pkg_resources import packaging from megatron.core.extensions.transformer_engine import ( TEDotProductAttention, @@ -26,6 
+24,7 @@ from megatron.core.transformer.transformer_block import TransformerBlock, TransformerBlockSubmodules from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules +from megatron.core.utils import is_te_min_version from tests.unit_tests.test_utilities import Utils @@ -134,8 +133,7 @@ def test_build_module(self): assert id(bda_op) == id(get_bias_dropout_add) def test_sliding_window_attention(self): - te_version = packaging.version.Version(version("transformer-engine")) - if te_version < packaging.version.Version("1.2.0"): + if not is_te_min_version("1.2.0"): print("SWA not tested because TE version is not >= 1.2.0", file=sys.stderr) return diff --git a/tools/checkpoint/loader_llama_mistral.py b/tools/checkpoint/loader_llama_mistral.py index 1b5fec9afd..ea803c5543 100644 --- a/tools/checkpoint/loader_llama_mistral.py +++ b/tools/checkpoint/loader_llama_mistral.py @@ -385,15 +385,10 @@ def load_checkpoint_to_model(args): '''Set model params.''' from pretrain_gpt import model_provider - if "llama" in args.model_size or "yi" in args.model_size: - from transformers import LlamaForCausalLM as ModelForCausalLM - elif "mistral" in args.model_size: - from transformers import MistralForCausalLM as ModelForCausalLM - else: - raise AttributeError(f"args.model_size={args.model_size} not supported") + from transformers import AutoModelForCausalLM # Load Huggingface model. - hf_model = ModelForCausalLM.from_pretrained(args.load, torch_dtype=args.params_dtype, low_cpu_mem_usage=True, device_map="cpu") + hf_model = AutoModelForCausalLM.from_pretrained(args.load, torch_dtype=args.params_dtype, low_cpu_mem_usage=True, device_map="cpu") # Init Megatron model. 
model = model_provider(True, True).to(args.params_dtype) diff --git a/tools/checkpoint/saver_mcore.py b/tools/checkpoint/saver_mcore.py index aea481abed..6aec90e41b 100644 --- a/tools/checkpoint/saver_mcore.py +++ b/tools/checkpoint/saver_mcore.py @@ -3,11 +3,10 @@ import os import sys import torch -from importlib.metadata import version -from pkg_resources import packaging from setter import ModelSetter from utils import get_mcore_transformer_block_key, print_memory_usage +from megatron.core.utils import get_te_version, is_te_min_version class MCoreSetter(ModelSetter): @@ -288,9 +287,8 @@ def add_arguments(parser): def save_checkpoint(queue, args): # Transformer engine >= 0.12.0, for CPU initialization. - te_version = packaging.version.Version(version("transformer-engine")) - assert te_version >= packaging.version.Version("0.12.0"), \ - "transformer engine version: %s (>=0.12.0 required)." % te_version + assert is_te_min_version("0.12.0"), \ + "transformer engine version: %s (>=0.12.0 required)." % get_te_version() # Search in directory above this sys.path.append(os.path.abspath( diff --git a/tools/run_text_generation_server.py b/tools/run_text_generation_server.py index 861d8d6d73..e5b3f08a58 100644 --- a/tools/run_text_generation_server.py +++ b/tools/run_text_generation_server.py @@ -83,7 +83,8 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, position_embedding_type=args.position_embedding_type, rotary_percent=args.rotary_percent, - rotary_base=args.rotary_base + rotary_base=args.rotary_base, + rope_scaling=args.use_rope_scaling ) return model @@ -122,6 +123,8 @@ def add_text_generate_args(parser): assert len(model) == 1, "Above condition should have caught this" model = model[0] + model.eval() + if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: server = MegatronServer(model) server.run("0.0.0.0",port=args.port)