
Commit

Merge branch 'ko3n1g/tests/refactor-testcases-multimodal-llava' into 'main'

refactor: model=multimodal-llava - scope=mr

See merge request ADLR/megatron-lm!1895
ko3n1g committed Aug 8, 2024
2 parents 4a67bdc + a98216a commit 0363328
Showing 10 changed files with 253 additions and 256 deletions.
58 changes: 0 additions & 58 deletions tests/functional_tests/jet_recipes/MR-multimodal.yaml

This file was deleted.

37 changes: 37 additions & 0 deletions tests/functional_tests/jet_recipes/multimodal-llava.yaml
@@ -0,0 +1,37 @@
type: basic
format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: "{test_case}"
model: multimodal-llava
build: mcore-pyt
nodes: 1
gpus: 8
platforms: dgx_a100
time_limit: 1200
scope: null
script: |-
ls
cd /workspace/megatron-lm
ARGUMENTS=(
"DATA_PATH=''"
"DATA_CACHE_PATH=''"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"TRAINING_SCRIPT_PATH=pretrain_vlm.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values.json"
)
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- scope: [mr]
test_case:
- multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G
- multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G
- multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G
- multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G
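
For context, a minimal sketch of what the templated script block resolves to for the first product, assuming the jet runner substitutes {test_case}, {model}, and {assets_dir} before execution; the doubled braces in ${{ARGUMENTS[@]}} escape the templating so that plain ${ARGUMENTS[@]} reaches bash. {assets_dir} is runner-provided and shown as a placeholder here:

    cd /workspace/megatron-lm
    ARGUMENTS=(
        "DATA_PATH=''"
        "DATA_CACHE_PATH=''"
        "OUTPUT_PATH=/path/to/assets"                  # placeholder for {assets_dir}
        "TENSORBOARD_PATH=/path/to/assets/tensorboard"
        "CHECKPOINT_PATH=/workspace/checkpoints"
        "TRAINING_SCRIPT_PATH=pretrain_vlm.py"
        "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml"
        "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json"
    )
    bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${ARGUMENTS[@]}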
52 changes: 52 additions & 0 deletions tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml
@@ -0,0 +1,52 @@
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
MODEL_ARGS:
--num-layers: 12
--hidden-size: 624
--attention-dropout: 0.0
--hidden-dropout: 0.0
--num-attention-heads: 12
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 32
--seq-length: 1024
--max-position-embeddings: 1024
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--split: 949,50,1
--tokenizer-type: NullTokenizer
--vocab-size: 8192
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
--min-lr: 1.0e-5
--weight-decay: 1e-2
--clip-grad: 1.0
--lr-warmup-fraction: .01
--log-interval: 1
--save-interval: 10000
--eval-interval: 1000
--eval-iters: 10
--transformer-impl: transformer_engine
--tensor-model-parallel-size: 1
--pipeline-model-parallel-size: 1
--deterministic-mode: true
--attention-softmax-in-fp32: true
--ckpt-format: torch
--no-gradient-accumulation-fusion: true
--bf16: true
--img-h: 336
--img-w: 336
--patch-dim: 14
--mock-data: true
TEST_TYPE: regular
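
A rough sketch of how a harness might flatten the MODEL_ARGS mapping above into a launch command; hypothetical, since run_ci_test.sh's flattening logic is not part of this diff. Keys with value true are assumed to become bare flags, and the torchrun launch itself is an assumption:

    export CUDA_DEVICE_MAX_CONNECTIONS=1
    export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
    export NCCL_ALGO=Tree
    export CUBLAS_WORKSPACE_CONFIG=:4096:8
    # Remaining MODEL_ARGS entries map the same way and are elided for brevity.
    torchrun --nproc_per_node 8 pretrain_vlm.py \
        --num-layers 12 --hidden-size 624 --num-attention-heads 12 \
        --micro-batch-size 4 --global-batch-size 32 \
        --seq-length 1024 --max-position-embeddings 1024 \
        --tokenizer-type NullTokenizer --vocab-size 8192 \
        --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 \
        --img-h 336 --img-w 336 --patch-dim 14 \
        --deterministic-mode --bf16 --mock-data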
53 changes: 53 additions & 0 deletions tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml
@@ -0,0 +1,53 @@
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
MODEL_ARGS:
--num-layers: 12
--hidden-size: 624
--attention-dropout: 0.0
--hidden-dropout: 0.0
--num-attention-heads: 12
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 32
--seq-length: 1024
--max-position-embeddings: 1024
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--split: 949,50,1
--tokenizer-type: NullTokenizer
--vocab-size: 8192
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
--min-lr: 1.0e-5
--weight-decay: 1e-2
--clip-grad: 1.0
--lr-warmup-fraction: .01
--log-interval: 1
--save-interval: 10000
--eval-interval: 1000
--eval-iters: 10
--transformer-impl: transformer_engine
--tensor-model-parallel-size: 2
--pipeline-model-parallel-size: 3
--encoder-pipeline-model-parallel-size: 1
--deterministic-mode: true
--attention-softmax-in-fp32: true
--ckpt-format: torch
--no-gradient-accumulation-fusion: true
--bf16: true
--img-h: 336
--img-w: 336
--patch-dim: 14
--mock-data: true
TEST_TYPE: regular
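
This tp2_pp3 case still fits the full 8-GPU node. Assuming the vision encoder's pipeline stage inherits the decoder's tensor-parallel width when --encoder-tensor-model-parallel-size is not set, the world size works out as below; a back-of-the-envelope check, not a statement of Megatron's exact rank-placement rules:

    # decoder: TP(2) x PP(3)          = 6 ranks
    # encoder: TP(2) x encoder-PP(1)  = 2 ranks
    echo $(( 2 * 3 + 2 * 1 ))         # 8 ranks -> 1 node x 8 GPUs (1N8G)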
55 changes: 55 additions & 0 deletions tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml
@@ -0,0 +1,55 @@
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
GPUS_PER_NODE: 7
MODEL_ARGS:
--num-layers: 12
--hidden-size: 624
--attention-dropout: 0.0
--hidden-dropout: 0.0
--num-attention-heads: 12
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 32
--seq-length: 1024
--max-position-embeddings: 1024
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--split: 949,50,1
--tokenizer-type: NullTokenizer
--vocab-size: 8192
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
--min-lr: 1.0e-5
--weight-decay: 1e-2
--clip-grad: 1.0
--lr-warmup-fraction: .01
--log-interval: 1
--save-interval: 10000
--eval-interval: 1000
--eval-iters: 10
--transformer-impl: transformer_engine
--tensor-model-parallel-size: 4
--pipeline-model-parallel-size: 1
--encoder-pipeline-model-parallel-size: 1
--encoder-tensor-model-parallel-size: 3
--deterministic-mode: true
--attention-softmax-in-fp32: true
--ckpt-format: torch
--no-gradient-accumulation-fusion: true
--bf16: true
--img-h: 336
--img-w: 336
--patch-dim: 14
--mock-data: true
TEST_TYPE: regular
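
Here the encoder's tensor-parallel width (3) is set explicitly and differs from the decoder's (4), which is what drives the odd GPUS_PER_NODE: 7 and the 1N7G suffix; the same assumed rank accounting as above:

    # decoder: TP(4) x PP(1)                  = 4 ranks
    # encoder: encoder-TP(3) x encoder-PP(1)  = 3 ranks
    echo $(( 4 * 1 + 3 * 1 ))                 # 7 ranks -> 7 GPUs on one node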
56 changes: 56 additions & 0 deletions tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml
@@ -0,0 +1,56 @@
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
GPUS_PER_NODE: 7
MODEL_ARGS:
--num-layers: 12
--hidden-size: 624
--attention-dropout: 0.0
--hidden-dropout: 0.0
--num-attention-heads: 12
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 32
--seq-length: 1024
--max-position-embeddings: 1024
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--split: 949,50,1
--tokenizer-type: NullTokenizer
--vocab-size: 8192
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
--min-lr: 1.0e-5
--weight-decay: 1e-2
--clip-grad: 1.0
--lr-warmup-fraction: .01
--log-interval: 1
--save-interval: 50
--eval-interval: 1000
--eval-iters: 10
--transformer-impl: transformer_engine
--tensor-model-parallel-size: 4
--pipeline-model-parallel-size: 1
--encoder-pipeline-model-parallel-size: 1
--encoder-tensor-model-parallel-size: 3
--deterministic-mode: true
--attention-softmax-in-fp32: true
--use-checkpoint-opt_param-scheduler: true
--ckpt-format: torch
--no-gradient-accumulation-fusion: true
--bf16: true
--img-h: 336
--img-w: 336
--patch-dim: 14
--mock-data: true
TEST_TYPE: ckpt-resume
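
Relative to the regular tp4_pp1_etp3 case, this resume variant changes three settings: --train-iters 100, --save-interval 50, and --use-checkpoint-opt_param-scheduler (so the LR scheduler state is restored from the checkpoint). A plausible shape for the test, sketched as an assumption since run_ci_test.sh's ckpt-resume branching is outside this diff; run_training is an illustrative stand-in for the real launch:

    run_training --train-iters 100 --save "$CHECKPOINT_PATH"  # run 1: 100 iters, checkpoint written at iter 50
    run_training --train-iters 100 --load "$CHECKPOINT_PATH"  # run 2: resumes at iter 50, continues to 100
    # With the deterministic-mode settings above, run 2's metrics for iters
    # 51-100 should match run 1's, and both are checked against golden values.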