@@ -1455,16 +1455,17 @@ jobs:
1455
1455
AFTER_SCRIPT : |
1456
1456
rm -rf nemo_experiments
1457
1457
1458
- L2_VLM_HF_Transformer_SFT_FSDP2 :
1458
+ Optional_L2_VLM_HF_Transformer_SFT_FSDP2 :
1459
1459
needs : [pre-flight, cicd-test-container-build]
1460
1460
uses : ./.github/workflows/_test_template.yml
1461
- if : contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_VLM_HF_Transformer_SFT_FSDP2 ')
1461
+ if : contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'Optional_L2_VLM_HF_Transformer_SFT_FSDP2 ')
1462
1462
with :
1463
- RUNNER : self-hosted-azure-gpus-1
1463
+ RUNNER : self-hosted-azure
1464
1464
SCRIPT : |
1465
1465
TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/sft_fsdp2.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3
1466
1466
AFTER_SCRIPT : |
1467
1467
rm -rf nemo_experiments
1468
+ IS_OPTIONAL : true
1468
1469
1469
1470
L2_HF_Transformer_PEFT_notebook :
1470
1471
needs : [pre-flight, cicd-test-container-build]
@@ -1603,16 +1604,17 @@ jobs:
1603
1604
AFTER_SCRIPT : |
1604
1605
rm -rf nemo_experiments
1605
1606
1606
- L2_HF_Transformer_SFT_FSDP2_2gpu :
1607
+ Optional_L2_HF_Transformer_SFT_FSDP2_2gpu :
1607
1608
needs : [pre-flight, cicd-test-container-build]
1608
1609
uses : ./.github/workflows/_test_template.yml
1609
- if : contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_FSDP2_2gpu ')
1610
+ if : contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'Optional_L2_HF_Transformer_SFT_FSDP2_2gpu ')
1610
1611
with :
1611
1612
RUNNER : self-hosted-azure
1612
1613
SCRIPT : |
1613
1614
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/sft_fsdp2.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2
1614
1615
AFTER_SCRIPT : |
1615
1616
rm -rf nemo_experiments
1617
+ IS_OPTIONAL : true
1616
1618
1617
1619
L2_HF_Transformer_PT_2gpu :
1618
1620
needs : [pre-flight, cicd-test-container-build]
@@ -1696,16 +1698,17 @@ jobs:
1696
1698
AFTER_SCRIPT : |
1697
1699
rm -rf nemo_experiments
1698
1700
1699
- L2_HF_Transformer_SFT_TE_Acceleration :
1701
+ Optional_L2_HF_Transformer_SFT_TE_Acceleration :
1700
1702
needs : [pre-flight, cicd-test-container-build]
1701
1703
uses : ./.github/workflows/_test_template.yml
1702
- if : contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_TE_Acceleration ')
1704
+ if : contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'Optional_L2_HF_Transformer_SFT_TE_Acceleration ')
1703
1705
with :
1704
1706
RUNNER : self-hosted-azure-gpus-1
1705
1707
SCRIPT : |
1706
1708
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/sft.py --model /home/TestData/akoumparouli/hf_mixtral_2l/ --model-accelerator te --max-steps 3
1707
1709
AFTER_SCRIPT : |
1708
1710
rm -rf nemo_experiments
1711
+ IS_OPTIONAL : true
1709
1712
1710
1713
L2_HF_Transformer_PT_TE_Acceleration :
1711
1714
needs : [pre-flight, cicd-test-container-build]
@@ -2115,7 +2118,8 @@ jobs:
2115
2118
--devices 1 \
2116
2119
--max-steps 10 \
2117
2120
--experiment-dir /tmp/nlp_megatron_mamba_nemo-ux-mamba_cicd_test_sft/${{ github.run_id }} \
2118
- --model-path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt
2121
+ --model-path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt \
2122
+ --ckpt_load_strictness log_all
2119
2123
2120
2124
L2_NeMo_2_HF_MODEL_IMPORT :
2121
2125
needs : [pre-flight, cicd-test-container-build]
@@ -2253,7 +2257,7 @@ jobs:
2253
2257
SCRIPT : |
2254
2258
2255
2259
python tests/collections/llm/gpt_finetuning.py \
2256
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2260
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2257
2261
--devices 2 \
2258
2262
--max_steps 3 \
2259
2263
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2263,7 +2267,7 @@ jobs:
2263
2267
--mbs 1
2264
2268
2265
2269
python tests/collections/llm/gpt_finetuning.py \
2266
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2270
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2267
2271
--devices 2 \
2268
2272
--max_steps 6 \
2269
2273
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2281,7 +2285,7 @@ jobs:
2281
2285
SCRIPT : |
2282
2286
2283
2287
python tests/collections/llm/gpt_finetuning.py \
2284
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2288
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2285
2289
--devices 2 \
2286
2290
--max_steps 3 \
2287
2291
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2291,7 +2295,7 @@ jobs:
2291
2295
--mbs 2
2292
2296
2293
2297
python tests/collections/llm/gpt_finetuning.py \
2294
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2298
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2295
2299
--devices 2 \
2296
2300
--max_steps 6 \
2297
2301
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2309,7 +2313,7 @@ jobs:
2309
2313
SCRIPT : |
2310
2314
2311
2315
python tests/collections/llm/gpt_finetuning.py \
2312
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2316
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2313
2317
--devices 2 \
2314
2318
--max_steps 3 \
2315
2319
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2319,7 +2323,7 @@ jobs:
2319
2323
--mbs 2
2320
2324
2321
2325
python tests/collections/llm/gpt_finetuning.py \
2322
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2326
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2323
2327
--devices 2 \
2324
2328
--max_steps 6 \
2325
2329
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2337,7 +2341,7 @@ jobs:
2337
2341
SCRIPT : |
2338
2342
2339
2343
python tests/collections/llm/gpt_finetuning.py \
2340
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2344
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2341
2345
--devices 2 \
2342
2346
--max_steps 3 \
2343
2347
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2347,7 +2351,7 @@ jobs:
2347
2351
--mbs 2
2348
2352
2349
2353
python tests/collections/llm/gpt_finetuning.py \
2350
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2354
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2351
2355
--devices 2 \
2352
2356
--max_steps 6 \
2353
2357
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2365,7 +2369,7 @@ jobs:
2365
2369
SCRIPT : |
2366
2370
2367
2371
python tests/collections/llm/gpt_finetuning.py \
2368
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2372
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2369
2373
--devices 2 \
2370
2374
--max_steps 3 \
2371
2375
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2375,7 +2379,7 @@ jobs:
2375
2379
--mbs 1 --packed
2376
2380
2377
2381
python tests/collections/llm/gpt_finetuning.py \
2378
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2382
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2379
2383
--devices 2 \
2380
2384
--max_steps 6 \
2381
2385
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2393,7 +2397,7 @@ jobs:
2393
2397
SCRIPT : |
2394
2398
2395
2399
python tests/collections/llm/gpt_finetuning.py \
2396
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2400
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2397
2401
--devices 2 \
2398
2402
--max_steps 3 \
2399
2403
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2403,7 +2407,7 @@ jobs:
2403
2407
--mbs 1
2404
2408
2405
2409
python tests/collections/llm/gpt_finetuning.py \
2406
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2410
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2407
2411
--devices 2 \
2408
2412
--max_steps 6 \
2409
2413
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2421,7 +2425,7 @@ jobs:
2421
2425
SCRIPT : |
2422
2426
2423
2427
python tests/collections/llm/gpt_finetuning.py \
2424
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2428
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2425
2429
--devices 2 \
2426
2430
--max_steps 3 \
2427
2431
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2431,7 +2435,7 @@ jobs:
2431
2435
--mbs 2
2432
2436
2433
2437
python tests/collections/llm/gpt_finetuning.py \
2434
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2438
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2435
2439
--devices 2 \
2436
2440
--max_steps 6 \
2437
2441
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2449,7 +2453,7 @@ jobs:
2449
2453
SCRIPT : |
2450
2454
2451
2455
python tests/collections/llm/gpt_finetuning.py \
2452
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2456
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2453
2457
--devices 2 \
2454
2458
--max_steps 3 \
2455
2459
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2459,7 +2463,7 @@ jobs:
2459
2463
--mbs 2
2460
2464
2461
2465
python tests/collections/llm/gpt_finetuning.py \
2462
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2466
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2463
2467
--devices 2 \
2464
2468
--max_steps 6 \
2465
2469
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2477,7 +2481,7 @@ jobs:
2477
2481
SCRIPT : |
2478
2482
2479
2483
python tests/collections/llm/gpt_finetuning.py \
2480
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2484
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2481
2485
--devices 2 \
2482
2486
--max_steps 3 \
2483
2487
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2487,7 +2491,7 @@ jobs:
2487
2491
--mbs 2
2488
2492
2489
2493
python tests/collections/llm/gpt_finetuning.py \
2490
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2494
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2491
2495
--devices 2 \
2492
2496
--max_steps 6 \
2493
2497
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2505,7 +2509,7 @@ jobs:
2505
2509
SCRIPT : |
2506
2510
2507
2511
python tests/collections/llm/gpt_finetuning.py \
2508
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2512
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2509
2513
--devices 2 \
2510
2514
--max_steps 3 \
2511
2515
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2515,7 +2519,7 @@ jobs:
2515
2519
--mbs 1 --packed
2516
2520
2517
2521
python tests/collections/llm/gpt_finetuning.py \
2518
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2522
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2519
2523
--devices 2 \
2520
2524
--max_steps 6 \
2521
2525
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2533,7 +2537,7 @@ jobs:
2533
2537
SCRIPT : |
2534
2538
2535
2539
python tests/collections/llm/gpt_finetuning.py \
2536
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2540
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2537
2541
--devices 2 \
2538
2542
--max_steps 3 \
2539
2543
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2543,7 +2547,7 @@ jobs:
2543
2547
--mbs 1 --packed
2544
2548
2545
2549
python tests/collections/llm/gpt_finetuning.py \
2546
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2550
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2547
2551
--devices 2 \
2548
2552
--max_steps 6 \
2549
2553
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2560,7 +2564,7 @@ jobs:
2560
2564
RUNNER : self-hosted-azure
2561
2565
SCRIPT : |
2562
2566
python tests/collections/llm/gpt_finetuning.py \
2563
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2567
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2564
2568
--devices 2 \
2565
2569
--max_steps 3 \
2566
2570
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2570,7 +2574,7 @@ jobs:
2570
2574
--mbs 1 --packed
2571
2575
2572
2576
python tests/collections/llm/gpt_finetuning.py \
2573
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2577
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2574
2578
--devices 2 \
2575
2579
--max_steps 6 \
2576
2580
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2588,7 +2592,7 @@ jobs:
2588
2592
SCRIPT : |
2589
2593
2590
2594
python tests/collections/llm/gpt_finetuning.py \
2591
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2595
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2592
2596
--devices 2 \
2593
2597
--max_steps 3 \
2594
2598
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2599,7 +2603,7 @@ jobs:
2599
2603
--dataset chat
2600
2604
2601
2605
python tests/collections/llm/gpt_finetuning.py \
2602
- --restore_path /home/TestData/nemo2_ckpt/llama_68M_v2 \
2606
+ --restore_path /home/TestData/nemo2_ckpt/llama_68M_v3 \
2603
2607
--devices 2 \
2604
2608
--max_steps 6 \
2605
2609
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
@@ -2726,7 +2730,8 @@ jobs:
2726
2730
2727
2731
python tests/collections/llm/peft/lora_merge.py \
2728
2732
--lora_checkpoint_path=/home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint_v2/ \
2729
- --output_path=/tmp/nemo2_lora_merge/${{ github.run_id }}
2733
+ --output_path=/tmp/nemo2_lora_merge/${{ github.run_id }} \
2734
+ --legacy_ckpt
2730
2735
2731
2736
L2_NEMO_2_LoRA_Export :
2732
2737
needs : [pre-flight, cicd-test-container-build]
@@ -2755,7 +2760,8 @@ jobs:
2755
2760
--devices 1 \
2756
2761
--top_p 0.0 \
2757
2762
--top_k 1 \
2758
- --num_tokens_to_generate 3
2763
+ --num_tokens_to_generate 3 \
2764
+ --legacy_ckpt
2759
2765
2760
2766
L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact :
2761
2767
needs : [pre-flight, cicd-test-container-build]
@@ -2775,7 +2781,7 @@ jobs:
2775
2781
SCRIPT : |
2776
2782
python tests/collections/llm/test_hf_import.py --hf_model /home/TestData/nlp/megatron_llama/llama-ci-hf --output_path /tmp/nemo2_ckpt
2777
2783
2778
- python scripts/llm/ptq.py -nc /tmp/nemo2_ckpt -algo fp8 -out /tmp/nemo2_ptq_engine
2784
+ python scripts/llm/ptq.py -nc /tmp/nemo2_ckpt -algo fp8 -out /tmp/nemo2_ptq_engine --ckpt_load_strictness log_all
2779
2785
2780
2786
AFTER_SCRIPT : |
2781
2787
rm -rf /tmp/nemo2_ckpt
@@ -2809,7 +2815,8 @@ jobs:
2809
2815
--warmup_steps 1 \
2810
2816
--val_check_interval 5 \
2811
2817
--log_interval 5 \
2812
- --limit_val_batches 2
2818
+ --limit_val_batches 2 \
2819
+ --legacy_ckpt
2813
2820
2814
2821
AFTER_SCRIPT : |
2815
2822
rm -rf /tmp/nemo2_ckpt
@@ -3058,9 +3065,9 @@ jobs:
3058
3065
- L2_VLM_HF_Transformer_PEFT
3059
3066
- L2_VLM_HF_Transformer_PEFT_FSDP
3060
3067
- L2_VLM_HF_Transformer_PEFT_4bit
3061
- - L2_VLM_HF_Transformer_SFT_FSDP2
3068
+ # - Optional_L2_VLM_HF_Transformer_SFT_FSDP2
3062
3069
- L2_HF_Transformer_SFT_2gpu_nemorun
3063
- - L2_HF_Transformer_SFT_TE_Acceleration
3070
+ # - Optional_L2_HF_Transformer_SFT_TE_Acceleration
3064
3071
- L2_HF_Transformer_PT
3065
3072
- L2_HF_Transformer_PT_nemorun
3066
3073
- L2_HF_Transformer_PT_2gpu
@@ -3110,7 +3117,7 @@ jobs:
3110
3117
- L2_NeMo_2_Export_In_Framework
3111
3118
- L2_NeMo_2_jit_callback
3112
3119
- L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING
3113
- - L2_HF_Transformer_SFT_FSDP2_2gpu
3120
+ # - Optional_L2_HF_Transformer_SFT_FSDP2_2gpu
3114
3121
- L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2
3115
3122
- L2_NeMo_2_VLLM_EXPORT
3116
3123
- L2_NeMo_2_EVAL
0 commit comments