
Commit

Merge branch 'ko3n1g/tests/refactor-testcases-multimodal-llava' into 'main'

refactor: model=multimodal-llava - scope=mr

See merge request ADLR/megatron-lm!1895
ko3n1g committed Aug 8, 2024
2 parents 4a67bdc + a98216a commit 0363328
Showing 10 changed files with 253 additions and 256 deletions.
58 changes: 0 additions & 58 deletions tests/functional_tests/jet_recipes/MR-multimodal.yaml

This file was deleted.

37 changes: 37 additions & 0 deletions tests/functional_tests/jet_recipes/multimodal-llava.yaml
@@ -0,0 +1,37 @@
type: basic
format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: "{test_case}"
model: multimodal-llava
build: mcore-pyt
nodes: 1
gpus: 8
platforms: dgx_a100
time_limit: 1200
scope: null
script: |-
ls
cd /workspace/megatron-lm
ARGUMENTS=(
"DATA_PATH=''"
"DATA_CACHE_PATH=''"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"TRAINING_SCRIPT_PATH=pretrain_vlm.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values.json"
)
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- scope: [mr]
test_case:
- multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G
- multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G
- multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G
- multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G
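
For context, a minimal sketch of what the templated script block resolves to for the first product, assuming the jet runner substitutes {test_case}, {model}, and {assets_dir} before execution; the doubled braces in ${{ARGUMENTS[@]}} escape the templating so that plain ${ARGUMENTS[@]} reaches bash. {assets_dir} is runner-provided and shown as a placeholder here:

    cd /workspace/megatron-lm
    ARGUMENTS=(
        "DATA_PATH=''"
        "DATA_CACHE_PATH=''"
        "OUTPUT_PATH=/path/to/assets"                  # placeholder for {assets_dir}
        "TENSORBOARD_PATH=/path/to/assets/tensorboard"
        "CHECKPOINT_PATH=/workspace/checkpoints"
        "TRAINING_SCRIPT_PATH=pretrain_vlm.py"
        "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml"
        "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json"
    )
    bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${ARGUMENTS[@]}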
52 changes: 52 additions & 0 deletions tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml
@@ -0,0 +1,52 @@
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
MODEL_ARGS:
--num-layers: 12
--hidden-size: 624
--attention-dropout: 0.0
--hidden-dropout: 0.0
--num-attention-heads: 12
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 32
--seq-length: 1024
--max-position-embeddings: 1024
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--split: 949,50,1
--tokenizer-type: NullTokenizer
--vocab-size: 8192
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
--min-lr: 1.0e-5
--weight-decay: 1e-2
--clip-grad: 1.0
--lr-warmup-fraction: .01
--log-interval: 1
--save-interval: 10000
--eval-interval: 1000
--eval-iters: 10
--transformer-impl: transformer_engine
--tensor-model-parallel-size: 1
--pipeline-model-parallel-size: 1
--deterministic-mode: true
--attention-softmax-in-fp32: true
--ckpt-format: torch
--no-gradient-accumulation-fusion: true
--bf16: true
--img-h: 336
--img-w: 336
--patch-dim: 14
--mock-data: true
TEST_TYPE: regular
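
A rough sketch of how a harness might flatten the MODEL_ARGS mapping above into a launch command; hypothetical, since run_ci_test.sh's flattening logic is not part of this diff. Keys with value true are assumed to become bare flags, and the torchrun launch itself is an assumption:

    export CUDA_DEVICE_MAX_CONNECTIONS=1
    export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
    export NCCL_ALGO=Tree
    export CUBLAS_WORKSPACE_CONFIG=:4096:8
    # Remaining MODEL_ARGS entries map the same way and are elided for brevity.
    torchrun --nproc_per_node 8 pretrain_vlm.py \
        --num-layers 12 --hidden-size 624 --num-attention-heads 12 \
        --micro-batch-size 4 --global-batch-size 32 \
        --seq-length 1024 --max-position-embeddings 1024 \
        --tokenizer-type NullTokenizer --vocab-size 8192 \
        --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 \
        --img-h 336 --img-w 336 --patch-dim 14 \
        --deterministic-mode --bf16 --mock-data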
53 changes: 53 additions & 0 deletions tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml
@@ -0,0 +1,53 @@
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
MODEL_ARGS:
--num-layers: 12
--hidden-size: 624
--attention-dropout: 0.0
--hidden-dropout: 0.0
--num-attention-heads: 12
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 32
--seq-length: 1024
--max-position-embeddings: 1024
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--split: 949,50,1
--tokenizer-type: NullTokenizer
--vocab-size: 8192
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
--min-lr: 1.0e-5
--weight-decay: 1e-2
--clip-grad: 1.0
--lr-warmup-fraction: .01
--log-interval: 1
--save-interval: 10000
--eval-interval: 1000
--eval-iters: 10
--transformer-impl: transformer_engine
--tensor-model-parallel-size: 2
--pipeline-model-parallel-size: 3
--encoder-pipeline-model-parallel-size: 1
--deterministic-mode: true
--attention-softmax-in-fp32: true
--ckpt-format: torch
--no-gradient-accumulation-fusion: true
--bf16: true
--img-h: 336
--img-w: 336
--patch-dim: 14
--mock-data: true
TEST_TYPE: regular
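
This tp2_pp3 case still fits the full 8-GPU node. Assuming the vision encoder's pipeline stage inherits the decoder's tensor-parallel width when --encoder-tensor-model-parallel-size is not set, the world size works out as below; a back-of-the-envelope check, not a statement of Megatron's exact rank-placement rules:

    # decoder: TP(2) x PP(3)          = 6 ranks
    # encoder: TP(2) x encoder-PP(1)  = 2 ranks
    echo $(( 2 * 3 + 2 * 1 ))         # 8 ranks -> 1 node x 8 GPUs (1N8G)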
55 changes: 55 additions & 0 deletions tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml
@@ -0,0 +1,55 @@
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
GPUS_PER_NODE: 7
MODEL_ARGS:
--num-layers: 12
--hidden-size: 624
--attention-dropout: 0.0
--hidden-dropout: 0.0
--num-attention-heads: 12
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 32
--seq-length: 1024
--max-position-embeddings: 1024
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--split: 949,50,1
--tokenizer-type: NullTokenizer
--vocab-size: 8192
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
--min-lr: 1.0e-5
--weight-decay: 1e-2
--clip-grad: 1.0
--lr-warmup-fraction: .01
--log-interval: 1
--save-interval: 10000
--eval-interval: 1000
--eval-iters: 10
--transformer-impl: transformer_engine
--tensor-model-parallel-size: 4
--pipeline-model-parallel-size: 1
--encoder-pipeline-model-parallel-size: 1
--encoder-tensor-model-parallel-size: 3
--deterministic-mode: true
--attention-softmax-in-fp32: true
--ckpt-format: torch
--no-gradient-accumulation-fusion: true
--bf16: true
--img-h: 336
--img-w: 336
--patch-dim: 14
--mock-data: true
TEST_TYPE: regular
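
Here the encoder's tensor-parallel width (3) is set explicitly and differs from the decoder's (4), which is what drives the odd GPUS_PER_NODE: 7 and the 1N7G suffix; the same assumed rank accounting as above:

    # decoder: TP(4) x PP(1)                  = 4 ranks
    # encoder: encoder-TP(3) x encoder-PP(1)  = 3 ranks
    echo $(( 4 * 1 + 3 * 1 ))                 # 7 ranks -> 7 GPUs on one node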
56 changes: 56 additions & 0 deletions tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml
@@ -0,0 +1,56 @@
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
GPUS_PER_NODE: 7
MODEL_ARGS:
--num-layers: 12
--hidden-size: 624
--attention-dropout: 0.0
--hidden-dropout: 0.0
--num-attention-heads: 12
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 32
--seq-length: 1024
--max-position-embeddings: 1024
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--split: 949,50,1
--tokenizer-type: NullTokenizer
--vocab-size: 8192
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
--min-lr: 1.0e-5
--weight-decay: 1e-2
--clip-grad: 1.0
--lr-warmup-fraction: .01
--log-interval: 1
--save-interval: 50
--eval-interval: 1000
--eval-iters: 10
--transformer-impl: transformer_engine
--tensor-model-parallel-size: 4
--pipeline-model-parallel-size: 1
--encoder-pipeline-model-parallel-size: 1
--encoder-tensor-model-parallel-size: 3
--deterministic-mode: true
--attention-softmax-in-fp32: true
--use-checkpoint-opt_param-scheduler: true
--ckpt-format: torch
--no-gradient-accumulation-fusion: true
--bf16: true
--img-h: 336
--img-w: 336
--patch-dim: 14
--mock-data: true
TEST_TYPE: ckpt-resume
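
Relative to the regular tp4_pp1_etp3 case, this resume variant changes three settings: --train-iters 100, --save-interval 50, and --use-checkpoint-opt_param-scheduler (so the LR scheduler state is restored from the checkpoint). A plausible shape for the test, sketched as an assumption since run_ci_test.sh's ckpt-resume branching is outside this diff; run_training is an illustrative stand-in for the real launch:

    run_training --train-iters 100 --save "$CHECKPOINT_PATH"  # run 1: 100 iters, checkpoint written at iter 50
    run_training --train-iters 100 --load "$CHECKPOINT_PATH"  # run 2: resumes at iter 50, continues to 100
    # With the deterministic-mode settings above, run 2's metrics for iters
    # 51-100 should match run 1's, and both are checked against golden values.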