From 63be779b4608403f956aa1ef6c9013ab78db3eeb Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Thu, 19 Sep 2024 06:39:59 -0700 Subject: [PATCH 01/50] ADLR/megatron-lm!2033 - Online eval --- examples/multimodal/config.py | 25 ++ examples/multimodal/evaluate_textvqa.py | 20 +- examples/multimodal/evaluate_vqav2.py | 12 +- examples/multimodal/model.py | 149 ++++++++++++ examples/multimodal/multimodal_args.py | 43 ++++ examples/multimodal/pretrain_mistral_clip.sh | 2 - examples/multimodal/run_text_generation.py | 183 +++++++++------ examples/multimodal/sft_mistral_clip.sh | 2 - .../text_generation_mistral_clip.sh | 7 - examples/multimodal/train.py | 214 ++++++------------ megatron/core/models/gpt/gpt_model.py | 9 +- .../core/models/multimodal/llava_model.py | 4 + megatron/core/tensor_parallel/layers.py | 30 ++- megatron/training/training.py | 30 ++- 14 files changed, 489 insertions(+), 241 deletions(-) create mode 100644 examples/multimodal/model.py create mode 100644 examples/multimodal/multimodal_args.py diff --git a/examples/multimodal/config.py b/examples/multimodal/config.py index f8c3714eb3..d4ee17db1b 100644 --- a/examples/multimodal/config.py +++ b/examples/multimodal/config.py @@ -1,4 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from dataclasses import dataclass + import torch from megatron.training.activations import quick_gelu, squared_relu @@ -107,3 +109,26 @@ def get_vision_projection_config(config, hidden_size): config.activation_func = torch.nn.functional.gelu return config + + +@dataclass +class EvaluationConfig: + """Evaluation related configuration.""" + task: str + + temperature: float = 1.0 + top_p: float = 0.0 + top_k: int = 0 + + out_seq_length: int = 32 + + output_path: str = "" + + input_image_path: str = "" + gt_path: str = "" + + num_partitions: int = 1 + partition_id: int = 0 + num_samples_per_partition: int = 0 + + prompt_format: str = "mistral" diff --git a/examples/multimodal/evaluate_textvqa.py b/examples/multimodal/evaluate_textvqa.py index b80974a893..7d0a059f4d 100644 --- a/examples/multimodal/evaluate_textvqa.py +++ b/examples/multimodal/evaluate_textvqa.py @@ -1,16 +1,23 @@ import argparse import glob import json +import os from evaluate_vqav2 import compute_vqa_accuracy def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" - output_file_path = input_path + "-TextVQA-merged.json" + # Single input file. + if os.path.exists(input_path): + input_file_paths = [input_path] + output_file_path = input_path.replace(".jsonl", "-merged.json") + # Directory of partitioned input files. 
+ else: + pattern = input_path + "-TextVQA-[0-9].*jsonl" + input_file_paths = glob.glob(pattern) - pattern = input_path + "-TextVQA-[0-9].*jsonl" - input_file_paths = glob.glob(pattern) + output_file_path = input_path + "-TextVQA-merged.json" results = [] @@ -35,7 +42,8 @@ def merge_input_files(input_path): def textvqa_eval(input_path): """Run TextVQA evaluation.""" result_file_path = merge_input_files(input_path) - compute_vqa_accuracy(result_file_path) + avg_acc = compute_vqa_accuracy(result_file_path) + return avg_acc if __name__ == "__main__": @@ -43,4 +51,6 @@ def textvqa_eval(input_path): parser.add_argument('--input-path', type=str, help="Path to input file(s)") args = parser.parse_args() - textvqa_eval(args.input_path) + avg_acc = textvqa_eval(args.input_path) + + print(f"===== TextVQA Accuracy {avg_acc:.2f}% =====") diff --git a/examples/multimodal/evaluate_vqav2.py b/examples/multimodal/evaluate_vqav2.py index 5d9dfe7844..cf10a0549d 100644 --- a/examples/multimodal/evaluate_vqav2.py +++ b/examples/multimodal/evaluate_vqav2.py @@ -55,7 +55,7 @@ def compute_vqa_accuracy(result_file, use_chartqa_metric=False): # "We consider an answer to be correct if it is within 5% of the gold answer. # For non-numeric answers, we still need an exact match to consider an answer to be correct." if use_chartqa_metric: - acc = 0. + acc = 0.0 assert len(gt) == 1, "expected exactly one groundtruth answer." gt = gt[0] @@ -74,13 +74,15 @@ def compute_vqa_accuracy(result_file, use_chartqa_metric=False): all_acc.append(acc) acc_avg = sum(all_acc) / len(all_acc) * 100 - print(f"===== Accuracy {acc_avg:.2f}% =====") + + return acc_avg def vqav2_eval(input_path): """Run VQAv2 evaluation.""" result_file = merge_input_files(input_path) - compute_vqa_accuracy(result_file) + avg_acc = compute_vqa_accuracy(result_file) + return avg_acc if __name__ == "__main__": @@ -88,4 +90,6 @@ def vqav2_eval(input_path): parser.add_argument('--input-path', type=str, help="Path to input file(s)") args = parser.parse_args() - vqav2_eval(args.input_path) + avg_acc = vqav2_eval(args.input_path) + + print(f"===== VQAv2 Accuracy {avg_acc:.2f}% =====") diff --git a/examples/multimodal/model.py b/examples/multimodal/model.py new file mode 100644 index 0000000000..b21c687525 --- /dev/null +++ b/examples/multimodal/model.py @@ -0,0 +1,149 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import warnings +from copy import deepcopy + +import torch +from config import get_language_model_config, get_vision_model_config, get_vision_projection_config +from layer_specs import get_layer_spec, get_layer_spec_te, get_mlp_module_spec + +from megatron.core.models.multimodal.llava_model import LLaVAModel +from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings +from megatron.training import get_args, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args + + +def model_provider( + pre_process=True, post_process=True, add_encoder=True, add_decoder=True, parallel_output=True +) -> LLaVAModel: + """Builds the model. + + Args: + pre_process (bool): Include the embedding layer in the gpt decoder (used with pipeline parallelism). Defaults to True. + post_process (bool): Include an output layer and a layernorm in the gpt decoder (used with pipeline parallelism). Defaults to True. + add_encoder (bool): Construct the encoder module (used with pipeline parallelism). Defaults to True. 
When we use pipelining, the encoder + will live on only a subset of the pipeline stages (specifically, only the first stage). + add_decoder (bool): Construct the decoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the decoder + will live on only a subset of the pipeline stages (specifically, every stage after the first one). + parallel_output (bool): Enable parallel model output. + + Returns: + model: A multimodal model. + """ + args = get_args() + + use_te = args.use_te + + print_rank_0('building a multimodal model ...') + + num_image_embeddings = get_num_image_embeddings( + args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1 + ) + old_seq_length = args.seq_length + args.seq_length = args.encoder_seq_length = num_image_embeddings + if torch.distributed.get_rank() == 0 and old_seq_length != args.seq_length: + warnings.warn( + f"Changed seq_length and encoder_seq_length (vision model sequence length) from {old_seq_length} to num_image_tokens ({num_image_embeddings})" + ) + + max_num_image_embeddings = (args.max_num_tiles + int(args.use_thumbnail)) * num_image_embeddings + + assert ( + args.decoder_seq_length is not None + ), "Please provide --decoder-seq-length to set the language model sequence length" + assert ( + args.decoder_seq_length > max_num_image_embeddings + ), "Language model sequence length must be greater than the maximum number of image embeddings" + if args.decoder_seq_length > args.max_position_embeddings: + args.max_position_embeddings = args.decoder_seq_length + warnings.warn( + f"Expanded max_position_embeddings to {args.max_position_embeddings} to accommodate the maximum language model sequence length" + ) + + base_config = core_transformer_config_from_args(get_args()) + base_config.language_model_type = args.language_model_type + base_config.vision_model_type = args.vision_model_type + base_config.calculate_per_token_loss = True + + language_config = deepcopy(base_config) + language_config = get_language_model_config(language_config) + + if use_te: + language_transformer_layer_spec = get_layer_spec_te( + is_vit=False + ) # TENorm detects LayerNorm/RMS automatically. + else: + language_transformer_layer_spec = get_layer_spec( + is_vit=False, normalization=language_config.normalization + ) + + vision_config = deepcopy(base_config) + vision_config = get_vision_model_config( + vision_config, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling + ) + + vision_model_type = args.vision_model_type + if vision_model_type == "clip": + if use_te: + vision_transformer_layer_spec = get_layer_spec_te( + is_vit=True + ) # TENorm detects LayerNorm/RMS automatically. + else: + vision_transformer_layer_spec = get_layer_spec( + is_vit=True, normalization=vision_config.normalization + ) + else: + raise RuntimeError("unsupported vision model type", vision_model_type) + + vision_projection_config = deepcopy(base_config) + vision_projection_config = get_vision_projection_config( + vision_projection_config, language_config.hidden_size + ) + + if args.encoder_pipeline_model_parallel_size > 0: + assert ( + args.encoder_pipeline_model_parallel_size == 1 + ), "vision model and projection can only live on 1 pipeline stage." 
+ vision_config.pipeline_model_parallel_size = args.encoder_pipeline_model_parallel_size + vision_projection_config.pipeline_model_parallel_size = ( + args.encoder_pipeline_model_parallel_size + ) + if args.encoder_tensor_model_parallel_size > 0: + vision_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size + vision_projection_config.tensor_model_parallel_size = ( + args.encoder_tensor_model_parallel_size + ) + + vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules + + model = LLaVAModel( + language_transformer_config=language_config, + language_transformer_layer_spec=language_transformer_layer_spec, + language_vocab_size=args.padded_vocab_size, + language_max_sequence_length=args.decoder_seq_length, + vision_transformer_config=vision_config, + vision_transformer_layer_spec=vision_transformer_layer_spec, + drop_vision_class_token=args.disable_vision_class_token, + vision_projection_config=vision_projection_config, + vision_projection_layer_spec=vision_projection_layer_spec, + vision_projection_type="mlp", + allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint, + parallel_output=parallel_output, + language_position_embedding_type=args.position_embedding_type, + language_rotary_percent=args.rotary_percent, + pre_process=pre_process, + post_process=post_process, + add_encoder=add_encoder, + add_decoder=add_decoder, + img_h=args.img_h, + img_w=args.img_w, + patch_dim=args.patch_dim, + language_rotary_base=args.rotary_base, + ) + + model.freeze( + freeze_language_model=args.freeze_LM, + freeze_vision_model=args.freeze_ViT, + freeze_vision_projection=False, + ) + + return model diff --git a/examples/multimodal/multimodal_args.py b/examples/multimodal/multimodal_args.py new file mode 100644 index 0000000000..a7cb4235e3 --- /dev/null +++ b/examples/multimodal/multimodal_args.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ + +def add_multimodal_extra_args(parser): + """Extra arguments.""" + group = parser.add_argument_group(title='multimodal arguments') + group.add_argument('--dataset-config', type=str, default=None) + group.add_argument("--prompt-path", type=str, default=None) + group.add_argument('--freeze-LM', action='store_true', default=False) + group.add_argument('--freeze-ViT', action='store_true', default=False) + group.add_argument('--language-model-type', type=str, required=True) + group.add_argument('--vision-model-type', type=str, default="clip") + group.add_argument("--disable-vision-class-token", action="store_true", default=False) + group.add_argument( + "--allow-missing-vision-projection-checkpoint", action="store_true", default=False + ) + group.add_argument("--use-te", action="store_true", default=False) + group.add_argument( + "--dataloader-save", type=str, default=None, help="Energon dataloader state save path" + ) + group.add_argument( + "--use-tiling", action="store_true", default=False, help="Use input image tiling" + ) + group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles") + group.add_argument( + "--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile" + ) + group.add_argument( + "--dataloader-seq-length", + type=int, + help="Make dataloader to produce sequences of specific length.", + ) + group.add_argument( + "--num-frames", + type=int, + default=1, + help="Number of frames to regularly sample from the video as input to the model.", + ) + group.add_argument( + "--online-evaluation-config", type=str, help="Config file for online evaluation." + ) + + return parser diff --git a/examples/multimodal/pretrain_mistral_clip.sh b/examples/multimodal/pretrain_mistral_clip.sh index da72c335c0..b06dbfe53c 100755 --- a/examples/multimodal/pretrain_mistral_clip.sh +++ b/examples/multimodal/pretrain_mistral_clip.sh @@ -32,7 +32,6 @@ fi CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml" -DATA_VALID="${SOURCE}/examples/multimodal/pretrain_dataset.yaml" DEBUG=0 if [[ $DEBUG -eq 1 ]]; then @@ -96,7 +95,6 @@ OPTIONS=" \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ --data-path ${DATA_TRAIN} \ - --valid-path ${DATA_VALID} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ --save-interval 1000 \ --save ${FINETUNE_DIR} \ diff --git a/examples/multimodal/run_text_generation.py b/examples/multimodal/run_text_generation.py index 391f3071d0..bc406217b7 100644 --- a/examples/multimodal/run_text_generation.py +++ b/examples/multimodal/run_text_generation.py @@ -1,13 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. """Generate text using a vision language model.""" import glob +import itertools import json import logging import os import sys from collections import defaultdict from functools import partial -import itertools # Add megatron to the path. 
sys.path.append( @@ -17,7 +17,8 @@ import datasets import numpy as np import torch -from torchvision.io import read_video +import yaml +from config import EvaluationConfig from dataset_helpers import tokenizer_image_token from image_processing import get_visual_transform from MMMU.mmmu.utils.data_utils import ( @@ -27,10 +28,13 @@ process_single_sample, ) from MMMU.mmmu.utils.eval_utils import parse_multi_choice_response +from model import model_provider +from multimodal_args import add_multimodal_extra_args from PIL import Image -from train import add_multimodal_extra_args, get_num_image_embeddings, model_provider +from torchvision.io import read_video from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN_INDEX +from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings from megatron.inference.text_generation.api import generate_and_post_process from megatron.inference.text_generation.forward_step import ForwardStep from megatron.training import get_args, get_model, get_tokenizer, print_rank_0 @@ -48,14 +52,12 @@ def add_text_generation_args(parser): group.add_argument( "--out-seq-length", type=int, default=1024, help='Length of the output generated text.' ) - group.add_argument("--output-path", type=str, required=True, help='Output file path') - group.add_argument('--input-image-path', type=str, required=True, help="Input image directory") - group.add_argument('--input-metadata-path', type=str, help="Input metadata path") + group.add_argument("--output-path", type=str, help='Output file path') + group.add_argument('--input-image-path', type=str, help="Input image directory") group.add_argument( '--num-partitions', type=int, default=0, help="Number of partitions for inputs." ) group.add_argument('--partition-id', type=int, default=0, help="Partition index") - group.add_argument("--drop-vision-class-token", action="store_true", default=False) group.add_argument("--gt-path", type=str, help="Optional ground truth file") group.add_argument( "--task", @@ -69,10 +71,11 @@ def add_text_generation_args(parser): group.add_argument( "--prompt-format", type=str, - required=True, + default="mistral", choices=["llama3", "mistral"], help="Prompting format to use", ) + group.add_argument("--config-path", type=str, help="Config file to use.") # Add common multimodal arguments needed for e.g. building the model. parser = add_multimodal_extra_args(parser) @@ -85,8 +88,9 @@ def _get_partition_bounds( ): if num_samples_per_partition == 0: samples_per_partition = [ - int(x) for x in np.linspace(0, total_num_samples, num_partitions+1)] - return samples_per_partition[partition_id], samples_per_partition[partition_id+1] + int(x) for x in np.linspace(0, total_num_samples, num_partitions + 1) + ] + return samples_per_partition[partition_id], samples_per_partition[partition_id + 1] return num_samples_per_partition * partition_id, num_samples_per_partition * (partition_id + 1) @@ -286,33 +290,34 @@ def get_evaluation_dataset( continue gt["video_path"] = video_path ground_truth.append(gt) - + ground_truth = sorted(ground_truth, key=lambda gt: gt["video_path"]) print_rank_0(f"Found {len(ground_truth)} videos to process.") if num_partitions > 0: start_idx, end_idx = _get_partition_bounds( - len(ground_truth), num_samples_per_partition, - num_partitions, partition_id + len(ground_truth), num_samples_per_partition, num_partitions, partition_id ) ground_truth = ground_truth[start_idx:end_idx] # Run image preprocessing. 
for idx, gt in enumerate(ground_truth): print_rank_0(f"Processing input video: {idx} / {len(ground_truth)}") - video, _, _ = read_video( - gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec') + video, _, _ = read_video(gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec') video = video.numpy() - selected_frames = torch.linspace( - 0, video.shape[0] - 1, num_frames).long() + selected_frames = torch.linspace(0, video.shape[0] - 1, num_frames).long() video_frames = video[selected_frames] if num_frames == 1: video_frames = video_frames[None] - imgs = list(itertools.chain.from_iterable( - get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, - use_thumbnail, augment=False) for img in video_frames)) + imgs = list( + itertools.chain.from_iterable( + get_visual_transform( + img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False + ) + for img in video_frames + ) + ) for question in gt["questions"]: # Very hacky, but we essentially re-create gt holding only the @@ -324,7 +329,7 @@ def get_evaluation_dataset( "video_category": gt["video_category"], "video_subcategory": gt["video_subcategory"], "url": gt["url"], - "questions": [question] + "questions": [question], } images.append(imgs) tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) @@ -336,26 +341,30 @@ def get_evaluation_dataset( return images, tile_counts, samples, sample_ids, questions, answers -def generate_samples(model): +def generate_samples(model, config: EvaluationConfig): """Text generation using a trained vision language model.""" args = get_args() images, tile_counts, samples, sample_ids, questions, answers = get_evaluation_dataset( - args.task, - args.input_image_path, - args.gt_path, + config.task, + config.input_image_path, + config.gt_path, args.img_h, args.img_w, args.use_tiling, args.max_num_tiles, args.use_thumbnail, - args.num_samples_per_partition, - args.num_partitions, - args.partition_id, - args.num_frames + config.num_samples_per_partition, + config.num_partitions, + config.partition_id, + args.num_frames, + ) + + num_image_embeddings_per_tile = get_num_image_embeddings( + args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1 ) num_img_embeddings_per_tile = get_num_image_embeddings( - args.img_h, args.img_w, args.patch_dim, - args.disable_vision_class_token, 1) + args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1 + ) num_samples = len(sample_ids) idx = 0 while idx < num_samples: @@ -363,21 +372,20 @@ def generate_samples(model): num_tiles = tile_counts[idx].cuda() sample_id = sample_ids[idx] - prompt = get_prompt(args.task, questions, idx, args.prompt_format) + prompt = get_prompt(config.task, questions, idx, config.prompt_format) - forward_step = partial( - VLMForwardStep, num_img_embeddings_per_tile, imgs, num_tiles) + forward_step = partial(VLMForwardStep, num_img_embeddings_per_tile, imgs, num_tiles) if torch.distributed.get_rank() == 0: resp_sentences, _, _, _ = generate_and_post_process( model, forward_step=forward_step, prompts=[prompt], - tokens_to_generate=args.out_seq_length, - top_k_sampling=args.top_k, - top_p_sampling=args.top_p, + tokens_to_generate=config.out_seq_length, + top_k_sampling=config.top_k, + top_p_sampling=config.top_p, add_BOS=False, - temperature=args.temperature, + temperature=config.temperature, random_seed=args.seed, detokenize_segments=False, ) @@ -386,29 +394,29 @@ def generate_samples(model): output = {"sample_id": sample_id, "prompt": prompt} output_name = "" - if args.task == 
"captioning": + if config.task == "captioning": output_name = "caption" - elif args.task in ("TextVQA", "VQAv2", "ChartQA"): + elif config.task in ("TextVQA", "VQAv2", "ChartQA"): output_name = "answer" - elif args.task in ("MMMU"): + elif config.task in ("MMMU"): output_name = "text" - elif args.task == "VideoMME": + elif config.task == "VideoMME": output_name = "response" output = questions[idx] - generated = get_generated(prompt, args.prompt_format, generation) - if args.task == "VideoMME": + generated = get_generated(prompt, config.prompt_format, generation) + if config.task == "VideoMME": output["questions"][0][output_name] = generated else: output[output_name] = generated - if args.task == "captioning": + if config.task == "captioning": output["ground_truth"] = answers[sample_id] - elif args.task in ("TextVQA", "VQAv2"): + elif config.task in ("TextVQA", "VQAv2"): output["gt_answer"] = [ans for ans in answers[idx]] - elif args.task == "ChartQA": + elif config.task == "ChartQA": output["gt_answer"] = [answers[idx]] - elif args.task == "MMMU": + elif config.task == "MMMU": sample = samples[idx] prediction = generated @@ -429,27 +437,63 @@ def generate_samples(model): idx += 1 -def generate_and_write_samples(model): - """Generate text and write to an output file.""" +def get_evaluation_config(): + """Get evaluation config from a config file or command-line arguments.""" args = get_args() + if args.config_path: + with open(args.config_path, "r") as f: + config_dict = yaml.safe_load(f) + + config = EvaluationConfig(**config_dict) + else: + config = EvaluationConfig( + task=args.task, + temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + out_seq_length=args.out_seq_length, + output_path=args.output_path, + input_image_path=args.input_image_path, + gt_path=args.gt_path, + num_partitions=args.num_partitions, + partition_id=args.partition_id, + num_samples_per_partition=args.num_samples_per_partition, + prompt_format=args.prompt_format, + ) + + # Default output path if not defined... 
+ if not config.output_path: + os.makedirs("generated", exist_ok=True) + config.output_path = "generated/" + args.language_model_type - for output in generate_samples(model): + return config + + +def generate_and_write_samples(model, config): + """Generate text and write to an output file.""" + for output in generate_samples(model, config): if torch.distributed.get_rank() == 0: - with open(args.output_path, 'a') as f: + with open(config.output_path, 'a') as f: f.write(json.dumps(output) + "\n") class VLMForwardStep(ForwardStep): """Inference forward step for a multimodal model.""" - def __init__(self, num_img_embeddings_per_tile, images, num_tiles, model, - max_batch_size, max_sequence_length): + def __init__( + self, + num_img_embeddings_per_tile, + images, + num_tiles, + model, + max_batch_size, + max_sequence_length, + ): """Create multimodal forward step.""" total_num_tiles = torch.sum(num_tiles).item() - num_img_embeddings = num_img_embeddings_per_tile * total_num_tiles + num_img_embeddings = num_img_embeddings_per_tile * total_num_tiles - super().__init__( - model, max_batch_size, max_sequence_length + num_img_embeddings) + super().__init__(model, max_batch_size, max_sequence_length + num_img_embeddings) self._images = images self._num_tiles = num_tiles @@ -461,6 +505,7 @@ def _forward(self, tokens, position_ids, attention_mask): attention_mask=None, inference_params=self.inference_params, num_image_tiles=self._num_tiles, + runtime_gather_output=True, ) def __call__(self, tokens, position_ids, attention_mask): @@ -532,20 +577,19 @@ def get_prompt(task, questions, idx, prompt_format): question = ( "Select the best answer to the following multiple-choice " "question based on the video. Respond with only the letter " - "(A, B, C, or D) of the correct option.\n") - question += (questions[idx]["questions"][0]["question"] + "\n") - question += (questions[idx]["questions"][0]["choices"][0] + "\n") - question += (questions[idx]["questions"][0]["choices"][1] + "\n") - question += (questions[idx]["questions"][0]["choices"][2] + "\n") - question += (questions[idx]["questions"][0]["choices"][3] + "\n") + "(A, B, C, or D) of the correct option.\n" + ) + question += questions[idx]["questions"][0]["question"] + "\n" + question += questions[idx]["questions"][0]["choices"][0] + "\n" + question += questions[idx]["questions"][0]["choices"][1] + "\n" + question += questions[idx]["questions"][0]["choices"][2] + "\n" + question += questions[idx]["questions"][0]["choices"][3] + "\n" if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|>{}<|start_header_id|>user<|end_header_id|>\n\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" prompt = prompt.format("", question) elif prompt_format == "mistral": - prompt = "\n{}".format( - question - ) + prompt = "\n{}".format(question) return prompt @@ -617,9 +661,12 @@ def wrapped_model_provider(pre_process, post_process): _ = load_checkpoint(model, None, None) model = model[0] + model.eval() - generate_and_write_samples(model) + config = get_evaluation_config() + + generate_and_write_samples(model, config) if __name__ == "__main__": diff --git a/examples/multimodal/sft_mistral_clip.sh b/examples/multimodal/sft_mistral_clip.sh index 93a0a91366..46fc996055 100755 --- a/examples/multimodal/sft_mistral_clip.sh +++ b/examples/multimodal/sft_mistral_clip.sh @@ -37,7 +37,6 @@ fi CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml" 
-DATA_VALID="${SOURCE}/examples/multimodal/sft_dataset.yaml" DEBUG=0 if [[ $DEBUG -eq 1 ]]; then @@ -101,7 +100,6 @@ OPTIONS=" \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ --data-path ${DATA_TRAIN} \ - --valid-path ${DATA_VALID} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ --save-interval 500 \ --save ${FINETUNE_DIR} \ diff --git a/examples/multimodal/text_generation_mistral_clip.sh b/examples/multimodal/text_generation_mistral_clip.sh index 30d1b06ab4..b78969ab59 100755 --- a/examples/multimodal/text_generation_mistral_clip.sh +++ b/examples/multimodal/text_generation_mistral_clip.sh @@ -4,7 +4,6 @@ export NCCL_IB_SL=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 export NVTE_APPLY_QK_LAYER_SCALING=0 -INPUT_METADATA_PATH="placeholder" GROUNDTRUTH_PATH="placeholder" NUM_FRAMES=1 @@ -15,11 +14,6 @@ while [[ $# -gt 0 ]]; do shift shift ;; - --input-metadata-path) - INPUT_METADATA_PATH="$2" - shift - shift - ;; --num-frames) NUM_FRAMES="$2" shift @@ -112,7 +106,6 @@ do --no-load-rng \ --no-load-optim \ --input-image-path ${INPUT_IMAGE_PATH} \ - --input-metadata-path ${INPUT_METADATA_PATH} \ --num-partitions ${NUM_PARTITIONS} \ --partition-id ${PARTITION_ID} \ --output-path ${OUTPUT_PATH}-${TASK}-${PARTITION_ID}.jsonl \ diff --git a/examples/multimodal/train.py b/examples/multimodal/train.py index e1cad7814e..386cdc03d0 100644 --- a/examples/multimodal/train.py +++ b/examples/multimodal/train.py @@ -1,131 +1,29 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. """Pretrain or SFT multimodal.""" -from copy import deepcopy -from functools import partial +import json import os import sys -import warnings +from functools import partial import torch +import yaml sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))) -from megatron.training import get_args, get_timers, get_tokenizer, print_rank_0 -from megatron.training.arguments import core_transformer_config_from_args +from config import EvaluationConfig +from dataloader_provider import train_valid_test_dataloaders_provider +from evaluate_textvqa import textvqa_eval +from model import model_provider +from multimodal_args import add_multimodal_extra_args +from run_text_generation import generate_samples, patch_tokenizer + from megatron.core import mpu, tensor_parallel from megatron.core.enums import ModelType -from megatron.core.parallel_state import get_tensor_model_parallel_rank -from config import get_language_model_config, get_vision_model_config, get_vision_projection_config -from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings from megatron.core.models.multimodal.llava_model import LLaVAModel -from layer_specs import get_layer_spec, get_mlp_module_spec, get_layer_spec_te -from megatron.training import pretrain -from dataloader_provider import train_valid_test_dataloaders_provider - -def model_provider( - pre_process=True, post_process=True, add_encoder=True, add_decoder=True, - parallel_output=True) -> LLaVAModel: - """Builds the model. - - Args: - pre_process (bool): Include the embedding layer in the gpt decoder (used with pipeline parallelism). Defaults to True. - post_process (bool): Include an output layer and a layernorm in the gpt decoder (used with pipeline parallelism). Defaults to True. - add_encoder (bool): Construct the encoder module (used with pipeline parallelism). Defaults to True. 
When we use pipelining, the encoder - will live on only a subset of the pipeline stages (specifically, only the first stage). - add_decoder (bool): Construct the decoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the decoder - will live on only a subset of the pipeline stages (specifically, every stage after the first one). - parallel_output (bool): Enable parallel model output. - - Returns: - model: A multimodal model. - """ - args = get_args() - - use_te = args.use_te - - print_rank_0('building a multimodal model ...') - - num_image_embeddings = get_num_image_embeddings(args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1) - old_seq_length = args.seq_length - args.seq_length = args.encoder_seq_length = num_image_embeddings - if torch.distributed.get_rank() == 0 and old_seq_length != args.seq_length: - warnings.warn(f"Changed seq_length and encoder_seq_length (vision model sequence length) from {old_seq_length} to num_image_tokens ({num_image_embeddings})") - - max_num_image_embeddings = (args.max_num_tiles + int(args.use_thumbnail)) * num_image_embeddings - - assert args.decoder_seq_length is not None, "Please provide --decoder-seq-length to set the language model sequence length" - assert args.decoder_seq_length > max_num_image_embeddings, "Language model sequence length must be greater than the maximum number of image embeddings" - if args.decoder_seq_length > args.max_position_embeddings: - args.max_position_embeddings = args.decoder_seq_length - warnings.warn(f"Expanded max_position_embeddings to {args.max_position_embeddings} to accommodate the maximum language model sequence length") - - base_config = core_transformer_config_from_args(get_args()) - base_config.language_model_type = args.language_model_type - base_config.vision_model_type = args.vision_model_type - base_config.calculate_per_token_loss = True - - language_config = deepcopy(base_config) - language_config = get_language_model_config(language_config) - - if use_te: - language_transformer_layer_spec = get_layer_spec_te(is_vit=False) # TENorm detects LayerNorm/RMS automatically. - else: - language_transformer_layer_spec = get_layer_spec(is_vit=False, normalization=language_config.normalization) - - vision_config = deepcopy(base_config) - vision_config = get_vision_model_config(vision_config, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling) - - vision_model_type = args.vision_model_type - if vision_model_type == "clip": - if use_te: - vision_transformer_layer_spec = get_layer_spec_te(is_vit=True) # TENorm detects LayerNorm/RMS automatically. - else: - vision_transformer_layer_spec = get_layer_spec(is_vit=True, normalization=vision_config.normalization) - else: - raise RuntimeError("unsupported vision model type", vision_model_type) - - vision_projection_config = deepcopy(base_config) - vision_projection_config = get_vision_projection_config(vision_projection_config, language_config.hidden_size) - - if args.encoder_pipeline_model_parallel_size > 0: - assert args.encoder_pipeline_model_parallel_size == 1, "vision model and projection can only live on 1 pipeline stage." 
- vision_config.pipeline_model_parallel_size = args.encoder_pipeline_model_parallel_size - vision_projection_config.pipeline_model_parallel_size = args.encoder_pipeline_model_parallel_size - if args.encoder_tensor_model_parallel_size > 0: - vision_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size - vision_projection_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size - - vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules - - model = LLaVAModel( - language_transformer_config=language_config, - language_transformer_layer_spec=language_transformer_layer_spec, - language_vocab_size=args.padded_vocab_size, - language_max_sequence_length=args.decoder_seq_length, - vision_transformer_config=vision_config, - vision_transformer_layer_spec=vision_transformer_layer_spec, - drop_vision_class_token=args.disable_vision_class_token, - vision_projection_config=vision_projection_config, - vision_projection_layer_spec=vision_projection_layer_spec, - vision_projection_type="mlp", - allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint, - parallel_output=parallel_output, - language_position_embedding_type=args.position_embedding_type, - language_rotary_percent=args.rotary_percent, - pre_process=pre_process, - post_process=post_process, - add_encoder=add_encoder, - add_decoder=add_decoder, - img_h=args.img_h, - img_w=args.img_w, - patch_dim=args.patch_dim, - language_rotary_base=args.rotary_base, - ) - - model.freeze(freeze_language_model=args.freeze_LM, freeze_vision_model=args.freeze_ViT, freeze_vision_projection=False) - - return model +from megatron.core.parallel_state import get_tensor_model_parallel_rank +from megatron.training import get_args, get_timers, get_tokenizer, pretrain +from megatron.training.utils import is_last_rank def get_batch(data_iterator): @@ -314,32 +212,6 @@ def forward_step(data_iterator, model: LLaVAModel): return output_tensor, partial(loss_func, loss_mask) -def add_multimodal_extra_args(parser): - """Extra arguments.""" - group = parser.add_argument_group(title='multimodal arguments') - group.add_argument('--valid-path', nargs='*', default=None, - help='Path to the training dataset. 
Accepted format:' - '1) a single data path, 2) multiple datasets in the' - 'form: dataset1-weight dataset1-path dataset2-weight ' - 'dataset2-path ...') - group.add_argument('--dataset-config', type=str, default=None) - group.add_argument("--prompt-path", type=str, default=None) - group.add_argument('--freeze-LM', action='store_true', default=False) - group.add_argument('--freeze-ViT', action='store_true', default=False) - group.add_argument('--language-model-type', type=str, required=True) - group.add_argument('--vision-model-type', type=str, default="clip") - group.add_argument("--disable-vision-class-token", action="store_true", default=False) - group.add_argument("--allow-missing-vision-projection-checkpoint", action="store_true", default=False) - group.add_argument("--use-te", action="store_true", default=False) - group.add_argument("--dataloader-save", type=str, default=None, help="Energon dataloader state save path") - group.add_argument("--use-tiling", action="store_true", default=False, help="Use input image tiling") - group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles") - group.add_argument("--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile") - group.add_argument("--dataloader-seq-length", type=int, help="Make dataloader to produce sequences of specific length.") - group.add_argument("--num-frames", type=int, default=1, help="Number of frames to regularly sample from the video as input to the model.") - - return parser - def llava_embedding_ranks(pp_ranks): """LLava's embedding ranks consist of the decoder's first and last ranks (ie, the ViT has no embeddings). @@ -375,6 +247,64 @@ def llava_position_embedding_ranks(pp_ranks): return [pp_ranks[epp]] + +def run_online_eval(model): + """Run an evaluation benchmark during training.""" + args = get_args() + + # Online evaluation config is not defined. Do nothing. + if not args.online_evaluation_config: + return [] + + with open(args.online_evaluation_config, "r") as f: + config_dict = yaml.safe_load(f) + + config = EvaluationConfig(**config_dict) + + patch_tokenizer(args) + + # The inference code assumes the first rank is the leader. + # Tensorboard writer is on the last rank. + # We must write to a storage space that all ranks see. + output_dir = os.path.join(args.save, "online_eval") + os.makedirs(output_dir, exist_ok=True) + config.output_path = os.path.join(output_dir, f"{config.task}.jsonl") + + if torch.distributed.get_rank() == 0: + output_file = open(config.output_path, "w") + + with torch.no_grad(): + for output in generate_samples(model[0].module, config): + if torch.distributed.get_rank() == 0: + output_file.write(json.dumps(output) + "\n") + + if torch.distributed.get_rank() == 0: + output_file.close() + + # Make sure the first rank is done writing so that the last rank can run eval. 
+ torch.distributed.barrier() + + if not is_last_rank(): + return [] + + if config.task.lower() == "textvqa": + avg_acc = textvqa_eval(config.output_path) + + return [{"textvqa accuracy": avg_acc}] + else: + raise NotImplementedError(f"online evaluation of {config.task} not implemented yet") + + +def write_online_eval_to_tensorboard(data, iteration, writer): + """Write online evaluation data to Tensorboard.""" + if not writer: + return + + for item in data: + for k, v in item.items(): + writer.add_scalar(k, v, iteration) + + if __name__ == "__main__": train_valid_test_dataloaders_provider.is_distributed = True @@ -385,6 +315,8 @@ def llava_position_embedding_ranks(pp_ranks): forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, extra_args_provider=add_multimodal_extra_args, + process_non_loss_data_func=write_online_eval_to_tensorboard, get_embedding_ranks=llava_embedding_ranks, get_position_embedding_ranks=llava_position_embedding_ranks, + non_loss_data_func=run_online_eval ) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 20f83976c4..b5f7ce51e9 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -185,12 +185,17 @@ def forward( inference_params: InferenceParams = None, packed_seq_params: PackedSeqParams = None, extra_block_kwargs: dict = None, + runtime_gather_output: Optional[bool] = None, ) -> Tensor: """Forward function of the GPT Model This function passes the input tensors through the embedding layer, and then the decoeder and finally into the post processing layer (optional). It either returns the Loss values if labels are given or the final hidden units + + Args: + runtime_gather_output (bool): Gather output at runtime. Default None means + `parallel_output` arg in the constructor will be used. """ # If decoder_input is provided (not None), then input_ids and position_ids are ignored. # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. @@ -230,7 +235,9 @@ def forward( output_weight = None if self.share_embeddings_and_output_weights: output_weight = self.shared_embedding_or_output_weight() - logits, _ = self.output_layer(hidden_states, weight=output_weight) + logits, _ = self.output_layer( + hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output + ) if has_config_logger_enabled(self.config): payload = OrderedDict( diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py index a8ddc94ced..68d963bdf9 100644 --- a/megatron/core/models/multimodal/llava_model.py +++ b/megatron/core/models/multimodal/llava_model.py @@ -429,6 +429,7 @@ def forward( inference_params: Optional[InferenceParams] = None, num_image_tiles: Optional[List[int]] = None, image_token_index: Optional[int] = IMAGE_TOKEN_INDEX, + runtime_gather_output: Optional[bool] = None, ) -> torch.Tensor: """Forward function of the LLaVA model. @@ -445,6 +446,8 @@ def forward( inference_params (InferenceParams): Inference-time parameters including KV cache. num_image_tiles (list of int): Number of tiles per image. Default 1 tile per image. image_token_index (int): ID for input images. + runtime_gather_output (bool): Gather output at runtime. Default None means + `parallel_output` arg in the constructor will be used. 
Returns: output (torch.Tensor): Loss of shape [b, s] if labels are provided, @@ -528,6 +531,7 @@ def forward( decoder_input=combined_embeddings, labels=new_labels, inference_params=inference_params, + runtime_gather_output=runtime_gather_output, ) if labels is None or loss_mask is None: diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index ff0be00bb8..61d9c7c34d 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -69,6 +69,8 @@ def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride): def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor): + """Set default model parallel attributes if not set explicitly already.""" + def maybe_set(attribute, value): if not hasattr(tensor, attribute): setattr(tensor, attribute, value) @@ -78,6 +80,8 @@ def maybe_set(attribute, value): def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor): + """Copy model parallel attributes from one tensor to another.""" + def maybe_copy(attribute): if hasattr(source_tensor, attribute): setattr(destination_tensor, attribute, getattr(source_tensor, attribute)) @@ -219,6 +223,11 @@ def __init__( _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1) def forward(self, input_): + """Forward. + + Args: + input_ (torch.Tensor): Input tensor. + """ if self.tensor_model_parallel_size > 1: # Build the mask. input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index) @@ -278,6 +287,7 @@ class LinearWithFrozenWeight(torch.autograd.Function): @staticmethod @custom_fwd def forward(ctx, input, weight, bias, allreduce_dgrad): + """Forward with frozen weight.""" ctx.save_for_backward(weight) ctx.allreduce_dgrad = allreduce_dgrad output = torch.matmul(input, weight.t()) @@ -288,6 +298,7 @@ def forward(ctx, input, weight, bias, allreduce_dgrad): @staticmethod @custom_bwd def backward(ctx, grad_output): + """Backward with frozen weight.""" (weight,) = ctx.saved_tensors grad_input = grad_output.matmul(weight) @@ -389,6 +400,7 @@ def forward( grad_output_buffer, wgrad_deferral_limit, ): + """Forward.""" ctx.save_for_backward(input, weight) ctx.use_bias = bias is not None ctx.gradient_accumulation_fusion = gradient_accumulation_fusion @@ -418,6 +430,7 @@ def forward( @staticmethod @custom_bwd def backward(ctx, grad_output): + """Backward.""" input, weight = ctx.saved_tensors use_bias = ctx.use_bias grad_output_buffer = ctx.grad_output_buffer @@ -847,7 +860,12 @@ def __init__( ) ) - def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): + def forward( + self, + input_: torch.Tensor, + weight: Optional[torch.Tensor] = None, + runtime_gather_output: Optional[bool] = None, + ): """Forward of ColumnParallelLinear Args: @@ -855,6 +873,8 @@ def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): 3D tensor whose order of dimension is [sequence, batch, hidden] weight (optional): weight tensor to use, compulsory when skip_weight_param_allocation is True. + runtime_gather_output (bool): Gather output at runtime. Default None means + `gather_output` arg in the constructor will be used. Returns: - output @@ -927,7 +947,13 @@ def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): ), allreduce_dgrad=allreduce_dgrad, ) - if self.gather_output: + + gather_output = self.gather_output + # Use the runtime gather output if it's set explicitly. 
+ if runtime_gather_output is not None: + gather_output = runtime_gather_output + + if gather_output: # All-gather across the partitions. assert not self.sequence_parallel output = gather_from_tensor_model_parallel_region(output_parallel) diff --git a/megatron/training/training.py b/megatron/training/training.py index 7d60f41f5c..fbe4ecf079 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -205,6 +205,7 @@ def pretrain( args_defaults={}, get_embedding_ranks=None, get_position_embedding_ranks=None, + non_loss_data_func=None, ): """Main training program. @@ -233,6 +234,10 @@ def pretrain( to it. It is used for programs to add their own arguments. args_defaults: a dictionary from argument-name to argument-value. It to set already parse arguments. + get_embedding_ranks (TODO): + get_position_embedding_ranks (TODO): + non_loss_data_func (callable): A custom function to call during evaluation. + It can run e.g. benchmarks. """ # Initalize and get arguments, timers, and Tensorboard writer. @@ -356,7 +361,8 @@ def pretrain( forward_step_func, model, optimizer, opt_param_scheduler, train_data_iterator, valid_data_iterator, - process_non_loss_data_func, config, checkpointing_context) + process_non_loss_data_func, config, checkpointing_context, + non_loss_data_func) print_datetime('after training is done') @@ -381,14 +387,16 @@ def pretrain( evaluate_and_print_results(prefix, forward_step_func, valid_data_iterator, model, iteration, process_non_loss_data_func, config, - verbose=True, write_to_tensorboard=not args.skip_train) + verbose=True, write_to_tensorboard=not args.skip_train, + non_loss_data_func=non_loss_data_func) if args.do_test: prefix = f'iteration {iteration} on test set' evaluate_and_print_results(prefix, forward_step_func, test_data_iterator, model, iteration, process_non_loss_data_func, config, - verbose=True, write_to_tensorboard=not args.skip_train) + verbose=True, write_to_tensorboard=not args.skip_train, + non_loss_data_func=non_loss_data_func) wandb_writer = get_wandb_writer() if wandb_writer: @@ -1095,7 +1103,7 @@ def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler, def train(forward_step_func, model, optimizer, opt_param_scheduler, train_data_iterator, valid_data_iterator, - process_non_loss_data_func, config, checkpointing_context): + process_non_loss_data_func, config, checkpointing_context, non_loss_data_func): """Train the model function.""" args = get_args() timers = get_timers() @@ -1331,7 +1339,8 @@ def get_e2e_base_metrics(): evaluate_and_print_results(prefix, forward_step_func, valid_data_iterator, model, iteration, process_non_loss_data_func, - config, False) + config, verbose=False, write_to_tensorboard=True, + non_loss_data_func=non_loss_data_func) eval_duration += timers('eval-time').elapsed() eval_iterations += args.eval_iters timers('eval-time').stop() @@ -1456,7 +1465,8 @@ def evaluate(forward_step_func, model, process_non_loss_data_func, config, - verbose=False): + verbose=False, + non_loss_data_func=None): """Evaluation.""" args = get_args() timers = get_timers() @@ -1534,7 +1544,9 @@ def evaluate(forward_step_func, return None, None, True collected_non_loss_data = None - if process_non_loss_data_func is not None and is_last_rank(): + if non_loss_data_func is not None: + collected_non_loss_data = non_loss_data_func(model) + elif process_non_loss_data_func is not None and is_last_rank(): collected_non_loss_data = forward_backward_func( forward_step_func=forward_step_func, data_iterator=data_iterator, @@ 
-1562,7 +1574,7 @@ def evaluate(forward_step_func, def evaluate_and_print_results(prefix, forward_step_func, data_iterator, model, iteration, process_non_loss_data_func, config, - verbose=False, write_to_tensorboard=True): + verbose=False, write_to_tensorboard=True, non_loss_data_func=None): """Helper function to evaluate and dump results on screen.""" args = get_args() if write_to_tensorboard: @@ -1574,7 +1586,7 @@ def evaluate_and_print_results(prefix, forward_step_func, total_loss_dict, collected_non_loss_data, timelimit = evaluate( forward_step_func, data_iterator, model, - process_non_loss_data_func, config, verbose) + process_non_loss_data_func, config, verbose, non_loss_data_func) # Timelimit hit during evaluation if timelimit: return From 69d4c44c7656ff3273f12eae28afbc9f5ed5c1c7 Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Thu, 19 Sep 2024 16:12:34 -0700 Subject: [PATCH 02/50] ADLR/megatron-lm!1973 - MMMU multi-image support --- examples/multimodal/run_text_generation.py | 119 ++++++++++++++------- 1 file changed, 81 insertions(+), 38 deletions(-) diff --git a/examples/multimodal/run_text_generation.py b/examples/multimodal/run_text_generation.py index bc406217b7..b4c020dcbb 100644 --- a/examples/multimodal/run_text_generation.py +++ b/examples/multimodal/run_text_generation.py @@ -5,6 +5,7 @@ import json import logging import os +import re import sys from collections import defaultdict from functools import partial @@ -257,23 +258,69 @@ def get_evaluation_dataset( for idx in range(start_idx, end_idx): sample = dataset[idx] - sample = process_single_sample(sample) - sample = construct_prompt(sample, config) - img = sample["image"] - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) + single_image = True + # Use the single image approach from the MMMU repo. + if single_image: + sample = process_single_sample(sample) + sample = construct_prompt(sample, config) - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) + img = sample["image"] + sample_imgs = get_visual_transform( + img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False + ) + sample_num_tiles = [len(sample_imgs)] + else: + sample = construct_prompt(sample, config) + + sample_imgs = [] + sample_num_tiles = [] + + img_indices = re.findall(r"" + + img = sample[img_key] + assert img is not None, f"{img_str} is in prompt but not in sample images" + + # Note: Only replace the current image tag. + sample["final_input_prompt"] = sample["final_input_prompt"].replace( + img_str, "", 1 + ) + + imgs = get_visual_transform( + img, + img_h, + img_w, + use_tiling, + adjusted_max_num_tiles, + use_thumbnail, + augment=False, + ) # List of tiles. + + sample_imgs.extend(imgs) + sample_num_tiles.append(len(imgs)) + + # Sanity check. + for i in range(1, 8): + assert ( + f"" not in sample["final_input_prompt"] + ), "prompt contains unhandled image tags" + + images.append(sample_imgs) + tile_counts.append(torch.tensor(sample_num_tiles, dtype=torch.int)) sample_ids.append(sample['id']) - # TODO: Support multiple input images and the original image position. Note: is added back in the prompt construction below. 
prompt = sample['final_input_prompt'] - for i in range(8): - prompt = prompt.replace(f"", "") + if single_image: + for i in range(8): + prompt = prompt.replace(f"", "") + prompt = f"\n{prompt}" questions.append(prompt) answers.append(sample['answer']) @@ -359,9 +406,6 @@ def generate_samples(model, config: EvaluationConfig): args.num_frames, ) - num_image_embeddings_per_tile = get_num_image_embeddings( - args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1 - ) num_img_embeddings_per_tile = get_num_image_embeddings( args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1 ) @@ -404,7 +448,7 @@ def generate_samples(model, config: EvaluationConfig): output_name = "response" output = questions[idx] - generated = get_generated(prompt, config.prompt_format, generation) + generated = get_generated(generation, args.prompt_format) if config.task == "VideoMME": output["questions"][0][output_name] = generated else: @@ -513,11 +557,11 @@ def __call__(self, tokens, position_ids, attention_mask): # On the first inference iteration, we compute image tokens. # Update the sequence length offset by the number of image tokens. - num_images = (tokens == -200).sum().item() + num_image_tokens = (tokens == -200).sum().item() num_tokens = tokens.size(1) - if num_tokens > 1 and num_images > 0: + if num_tokens > 1 and num_image_tokens > 0: self.inference_params.sequence_len_offset += ( - self.inference_params.key_value_memory_dict["image_tokens_count"] - num_images + self.inference_params.key_value_memory_dict["image_tokens_count"] - num_image_tokens ) return logits @@ -529,7 +573,9 @@ def get_prompt(task, questions, idx, prompt_format): if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\nProvide a one-sentence caption for provided image.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" elif prompt_format == "mistral": - prompt = "Give a short and clear explanation of the subsequent image.\n" + prompt = ( + "[INST] Give a short and clear explanation of the subsequent image. [/INST]" + ) elif task == "TextVQA": question = questions[idx] @@ -538,7 +584,7 @@ def get_prompt(task, questions, idx, prompt_format): question ) elif prompt_format == "mistral": - prompt = "\n{}\nAnswer the question using a single word or phrase.".format( + prompt = "[INST] \n{}\nAnswer the question using a single word or phrase. [/INST]".format( question ) elif task == "VQAv2": @@ -549,7 +595,7 @@ def get_prompt(task, questions, idx, prompt_format): question ) elif prompt_format == "mistral": - prompt = "\n{}\nAnswer the question using a single word or phrase.".format( + prompt = "[INST] \n{}\nAnswer the question using a single word or phrase. [/INST]".format( question ) elif task == "ChartQA": @@ -560,19 +606,17 @@ def get_prompt(task, questions, idx, prompt_format): questions ) elif prompt_format == "mistral": - prompt = "\n{}\nAnswer the question using a single word or phrase.".format( + prompt = "[INST] \n{}\nAnswer the question using a single word or phrase. 
[/INST]".format( question ) elif task == "MMMU": question = questions[idx] if prompt_format == "llama3": - prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|>{}<|start_header_id|>user<|end_header_id|>\n\n\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - prompt = prompt.format("", question) + prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + prompt = prompt.format(question) elif prompt_format == "mistral": - prompt = "\n{}\nAnswer the question using a single word or phrase.".format( - question - ) + prompt = "[INST] {} [/INST]".format(question) elif task == "VideoMME": question = ( "Select the best answer to the following multiple-choice " @@ -594,19 +638,17 @@ def get_prompt(task, questions, idx, prompt_format): return prompt -def get_generated(prompt, prompt_format, prompt_and_generation): +def get_generated(prompt_and_generation, prompt_format): """Strip prompt and other unnecessary text from generation.""" - start = len(prompt.replace("", "")) if prompt_format == "llama3": - start += len("<|begin_of_text|>") - start += 1 + generated = prompt_and_generation.split( + "<|start_header_id|>assistant<|end_header_id|>\n\n" + )[-1] + generated = generated.split("<|eot_id|>")[0] elif prompt_format == "mistral": - start += len(" ") + generated = prompt_and_generation.split("[/INST]")[-1] + generated = generated.split("")[0] - generated = prompt_and_generation[start:] - generated = generated.replace(" ", "") - generated = generated.split("<|eot_id|>")[0] - generated = generated.split("")[0] generated = generated.strip() generated = generated.split("\n\n")[0] generated = generated.split("\n")[0] @@ -621,15 +663,16 @@ def _decorate_tokenize(f): # When tokenizing, replace with the image token index (-200) def wrapper(prompt): tokens = tokenizer_image_token(args, prompt, f) + return tokens return wrapper def _decorate_detokenize(f): - # When detokenizing, replace image token index (-200) with a dummy value. + # When detokenizing, skip image token index. 
def wrapper(tokens): tokens = np.array(tokens) - tokens[tokens == IMAGE_TOKEN_INDEX] = 0 + tokens = tokens[tokens != IMAGE_TOKEN_INDEX] tokens = tokens.tolist() return f(tokens) From 7754f56528fc87d8140d75c10eac3e7138e5fc87 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Thu, 19 Sep 2024 19:33:57 -0700 Subject: [PATCH 03/50] ADLR/megatron-lm!2113 - build: Use multi-stage for parallel builds --- .gitlab/stages/01.tests.yml | 74 +++++++++++----------- Dockerfile.ci | 118 ++++++++++++++---------------------- 2 files changed, 80 insertions(+), 112 deletions(-) diff --git a/.gitlab/stages/01.tests.yml b/.gitlab/stages/01.tests.yml index b3cefc0fde..d087425af9 100644 --- a/.gitlab/stages/01.tests.yml +++ b/.gitlab/stages/01.tests.yml @@ -35,48 +35,44 @@ build_image: variables: STAGE: main script: + - apk add bash - | - set -x - env - eval "IMAGE=\$$IMAGE" - - docker system prune -a --filter "until=24h" -f || true - - if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then - ADDITIONAL_PARAMS="--pull" - fi - - docker pull ${IMAGE}:${CI_PIPELINE_ID} || true - docker pull ${IMAGE}:${CI_MERGE_REQUEST_IID:-noop} || true - docker pull ${IMAGE}:buildcache || true - - docker build \ - --secret id=JET_INDEX_URLS \ - --target $STAGE \ - -f $FILE \ - -t ${IMAGE}:${CI_PIPELINE_ID} \ - -t ${IMAGE}:${CI_MERGE_REQUEST_IID:-noop} \ - --build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \ - --cache-to type=inline \ - --cache-from type=registry,ref=${IMAGE}:buildcache \ - --cache-from type=registry,ref=${IMAGE}:${CI_PIPELINE_ID} \ - --cache-from type=registry,ref=${IMAGE}:${CI_MERGE_REQUEST_IID:-noop} \ - --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \ - ${ADDITIONAL_PARAMS} . - - docker push ${IMAGE}:${CI_PIPELINE_ID} - docker push ${IMAGE}:${CI_MERGE_REQUEST_IID:-noop} - - if [[ "$CI_COMMIT_BRANCH" == "ci-nightly-a100" ]]; then - docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:nightly - docker push ${IMAGE}:nightly - fi + bash -c ' + set -x + env + eval "IMAGE=\$$IMAGE" + + docker system prune -a --filter "until=24h" -f || true + + docker buildx create --name container --driver=docker-container + + ADDITIONAL_PARAMS=() + + if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then + ADDITIONAL_PARAMS+=("--pull") + ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main") + fi - if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then - docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:buildcache - docker push ${IMAGE}:buildcache - fi + if [[ "$CI_COMMIT_BRANCH" == "ci-nightly-a100" ]]; then + ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly") + fi + DOCKER_BUILDKIT=1 docker build \ + --secret id=JET_INDEX_URLS \ + --target $STAGE \ + -f $FILE \ + -t ${IMAGE}:${CI_PIPELINE_ID} \ + --builder=container \ + --build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \ + --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \ + --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \ + --cache-from type=registry,ref=${IMAGE}-buildcache:main \ + --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \ + --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \ + --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \ + --push \ + ${ADDITIONAL_PARAMS[@]} . 
+ ' retry: max: 2 diff --git a/Dockerfile.ci b/Dockerfile.ci index dfcc7381f7..40c1464154 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -1,82 +1,54 @@ -# syntax=docker/dockerfile:experimental +# syntax=docker/dockerfile:1.3-labs ARG FROM_IMAGE_NAME -FROM $FROM_IMAGE_NAME as main -ENV DEBIAN_FRONTEND=noninteractive - -RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ - /etc/apt/apt.conf.d/docker-clean - -RUN apt-get update && \ - apt-get install -y --no-install-recommends gettext && \ - apt-get clean - -RUN wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \ -chmod a+x /usr/local/bin/yq - -##### For Mamba begin ##### -RUN pip uninstall -y triton && \ - pip install triton==2.1.0 +FROM $FROM_IMAGE_NAME as build_causal_conv1d +WORKDIR /opt +RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1 -# The causal-conv1d and mamba-ssm packages below are built from scratch here -# (which takes significant time) because there are no wheels available on PyPI -# for these relatively newer versions of the packages that are compatible with -# the older NGC-variant PyTorch version (e.g. version 2.2.0.dev231106) that we -# are using (in the NGC base container). Generally, if the package is not -# compatible with the PyTorch version, then it will generate a Python import -# error. The package authors tend to only release wheels for new versions of -# these pacakges which are compatible with the versions of regular PyTorch and -# NGC-variant PyTorch that are newer at the time of release. So, to use newer -# versions of these packages with relatively older versions of the NGC PyTorch -# container, we tend to have to build the packages from scratch. +FROM $FROM_IMAGE_NAME as build_grouped_gemm +WORKDIR /opt +RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2 -RUN cd /tmp && \ - pip uninstall -y causal-conv1d && \ - git clone https://github.com/Dao-AILab/causal-conv1d.git && \ - cd causal-conv1d && \ - git checkout v1.2.2.post1 && \ - CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install . && \ - cd .. && \ - rm -rf causal-conv1d +FROM $FROM_IMAGE_NAME as build_mamba_ssm +WORKDIR /opt +RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.0.3 -RUN cd /tmp && \ - pip uninstall -y mamba-ssm && \ - git clone https://github.com/state-spaces/mamba.git && \ - cd mamba && \ - git checkout v2.0.3 && \ - MAMBA_FORCE_BUILD=TRUE pip install . && \ - cd .. && \ - rm -rf mamba -##### For Mamba end ##### - -##### For JET-API start ##### -RUN apt-get update && \ - apt-get install -y python3-venv && \ - apt-get clean -y && \ - python -m venv /opt/jet -##### For JET-API end ##### - -RUN pip3 install --no-cache-dir \ - einops \ - flask-restful \ - nltk \ - pytest \ - pytest-cov \ - pytest_mock \ - pytest-random-order \ - sentencepiece \ - wrapt \ - git+https://github.com/fanshiqing/grouped_gemm@v1.1.2 \ - zarr \ - tensorstore==0.1.45 \ - wandb - -COPY . /workspace/megatron-lm - -COPY . 
/workspace/megatron-lm -RUN cp -r /workspace/megatron-lm /opt && \ - pip install /opt/megatron-lm +FROM $FROM_IMAGE_NAME as main +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get install -y --no-install-recommends gettext python3-venv && \ + apt-get clean && \ + python -m venv /opt/jet && \ + wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \ + chmod a+x /usr/local/bin/yq + +COPY --from=build_causal_conv1d /opt/causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_grouped_gemm /opt/grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_mamba_ssm /opt/mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl ./ + +RUN pip3 install --no-cache-dir --upgrade-strategy only-if-needed -v \ +einops \ +flask-restful \ +nltk \ +pytest \ +pytest-cov \ +pytest_mock \ +pytest-random-order \ +sentencepiece \ +wrapt \ +zarr \ +wandb \ +triton==2.1.0 \ +causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl \ +mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl \ +grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl \ +tensorstore==0.1.45 && \ +rm *.whl + +# Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker +COPY . /opt/megatron-lm +RUN pip install /opt/megatron-lm ##### For NVIDIANS only ##### FROM main as jet From 121d05e0d422b2dfe899a2b29019129867de534e Mon Sep 17 00:00:00 2001 From: Deepak Narayanan Date: Fri, 20 Sep 2024 17:35:27 -0700 Subject: [PATCH 04/50] ADLR/megatron-lm!2126 - Only print warning when relevant --- megatron/training/arguments.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 3dcfe4f2b2..7a0c2d8d37 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -288,7 +288,8 @@ def validate_args(args, defaults={}): # Overlap P2P communication is disabled if not using the interleaved schedule. args.overlap_p2p_comm = False args.align_param_gather = False - if args.rank == 0: + # Only print warning if PP size > 1. 
+ if args.rank == 0 and args.pipeline_model_parallel_size > 1: print('WARNING: Setting args.overlap_p2p_comm and args.align_param_gather to False ' 'since non-interleaved schedule does not support overlapping p2p communication ' 'and aligned param AG') From 3a0ca4b0364f771e61c4c4b771cbbe61f556cc4d Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sat, 21 Sep 2024 10:45:53 -0700 Subject: [PATCH 05/50] ADLR/megatron-lm!2124 - tests: Fix location of megatron --- tests/functional_tests/jet_recipes/bert.yaml | 2 +- tests/functional_tests/jet_recipes/gpt.yaml | 2 +- .../jet_recipes/multimodal-llava.yaml | 2 +- tests/functional_tests/jet_recipes/t5.yaml | 2 +- .../shell_test_utils/run_ci_test_locally.sh | 124 ------------------ 5 files changed, 4 insertions(+), 128 deletions(-) delete mode 100644 tests/functional_tests/shell_test_utils/run_ci_test_locally.sh diff --git a/tests/functional_tests/jet_recipes/bert.yaml b/tests/functional_tests/jet_recipes/bert.yaml index 75aac2faab..717664a69e 100644 --- a/tests/functional_tests/jet_recipes/bert.yaml +++ b/tests/functional_tests/jet_recipes/bert.yaml @@ -13,7 +13,7 @@ spec: /workspace/data/bert_data: text/the_pile/bert_shard00 script: |- ls - cd /workspace/megatron-lm + cd /opt/megatron-lm ARGUMENTS=( "DATA_PATH=/workspace/data/bert_data" diff --git a/tests/functional_tests/jet_recipes/gpt.yaml b/tests/functional_tests/jet_recipes/gpt.yaml index abaef86b81..8c09d0bd13 100644 --- a/tests/functional_tests/jet_recipes/gpt.yaml +++ b/tests/functional_tests/jet_recipes/gpt.yaml @@ -12,7 +12,7 @@ spec: /workspace/data/gpt3_data: text/the_pile/shard00 script: |- ls - cd /workspace/megatron-lm + cd /opt/megatron-lm ARGUMENTS=( "DATA_PATH=/workspace/data/gpt3_data" diff --git a/tests/functional_tests/jet_recipes/multimodal-llava.yaml b/tests/functional_tests/jet_recipes/multimodal-llava.yaml index 7a20b1145a..4bf1370304 100644 --- a/tests/functional_tests/jet_recipes/multimodal-llava.yaml +++ b/tests/functional_tests/jet_recipes/multimodal-llava.yaml @@ -13,7 +13,7 @@ spec: scope: null script: |- ls - cd /workspace/megatron-lm + cd /opt/megatron-lm ARGUMENTS=( "DATA_PATH=''" diff --git a/tests/functional_tests/jet_recipes/t5.yaml b/tests/functional_tests/jet_recipes/t5.yaml index 947023b0eb..b2451a9600 100644 --- a/tests/functional_tests/jet_recipes/t5.yaml +++ b/tests/functional_tests/jet_recipes/t5.yaml @@ -13,7 +13,7 @@ spec: /workspace/data/t5_data: text/the_pile/t5_shard00 script: |- ls - cd /workspace/megatron-lm + cd /opt/megatron-lm ARGUMENTS=( "DATA_PATH=/workspace/data/t5_data" diff --git a/tests/functional_tests/shell_test_utils/run_ci_test_locally.sh b/tests/functional_tests/shell_test_utils/run_ci_test_locally.sh deleted file mode 100644 index 3ee776ce9b..0000000000 --- a/tests/functional_tests/shell_test_utils/run_ci_test_locally.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/bin/bash - -####################################################################################### -# -# Script for capturing a reference model. -# -# It will train a model until a target iteration was hit. 
-# -# -######################################################################################## - -set -exo pipefail - -echo "------ARGUMENTS LIST --------" -for ARGUMENT in "$@"; do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" - echo "$KEY=$VALUE" -done -echo "---------------------------------" - -# Check that mandatory vars are set -MANDATORY_VARS=( - "MODEL" - "VARIANT" - "TRAINING_SCRIPT_PATH" - "OUTPUT_PATH" - "IMAGE_TAG" - "NODES" - "PPP" - "PARTITION" - "ITERATIONS" - "WANDB_API_KEY" - "CLUSTER" - "DATASET" - "WANDB_EXPERIMENT" - "GPUS_PER_NODE" -) -for mandatory_var in "${MANDATORY_VARS[@]}"; do - if [[ -z "${!mandatory_var}" ]]; then - echo 'Providing $'$mandatory_var' is mandatory.' - exit 1 - fi -done - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -ROOT_DIR=$(realpath $SCRIPT_DIR/../../../) - -# Fetch dataset base path via JET and refresh DATA_BELDN -DATA_PATH=$(jet -c -tf plain -th artifacts registry list -c storages.$CLUSTER.identifier -f "key == '$DATASET'") -DATA_BLEND=$(eval echo "$DATA_BLEND") - -######################################################################################## -# Dont change below -######################################################################################## - -SLURM_LOGS=$OUTPUT_PATH/slurm_logs/ -mkdir -p $SLURM_LOGS - -# Container settings -ARGUMENTS=( - "TRAINING_SCRIPT_PATH=${TRAINING_SCRIPT_PATH}" - "TEST_CASE_PATH=./tests/functional_tests/test_cases/$MODEL/$VARIANT" - "OUTPUT_PATH=${OUTPUT_PATH}" - "TENSORBOARD_PATH=${OUTPUT_PATH}/tensorboard" - "CHECKPOINT_PATH=${OUTPUT_PATH}/checkpoints" - "DATA_PATH=${DATA_PATH}" - "DATA_CACHE_PATH=${OUTPUT_PATH}/data-cache" - "WANDB_API_KEY=${WANDB_API_KEY}" - "WANDB_EXPERIMENT=${WANDB_EXPERIMENT}" - "DATA_BLEND=\"${DATA_BLEND}\"" -) - -if [[ -n $LOAD_PATH ]]; then - ARGUMENTS+=("LOAD_PATH=${LOAD_PATH}") -fi - -echo ${ARGUMENTS[@]} - -while : -do - -if [[ $(cat "${OUTPUT_PATH}/checkpoints/latest_checkpointed_iteration.txt" || echo 0) -ge $ITERATIONS ]]; then - break -fi - -# Fire of sbatch -echo '#!/bin/bash' > sbatch.sh - -if [[ $GPUS_PER_NODE != null ]]; then - echo '#SBATCH --gres=gpu:8' >> sbatch.sh -fi -echo "#SBATCH --nodes=$NODES -#SBATCH --account $PPP -#SBATCH --partition $PARTITION -#SBATCH --ntasks-per-node=1 -#SBATCH --time "04:00:00" -#SBATCH --job-name=$PPP:mcore:release:$MODEL -#SBATCH --dependency=singleton -#SBATCH --output=/dev/null -#SBATCH --error=/dev/null -#SBATCH --exclusive - -# Prepare SLURM job -echo "SLURM_JOB_ID=\$SLURM_JOB_ID" > "$SLURM_LOGS/\${SLURM_JOB_ID}.log" - -srun \ - --ntasks-per-node=1 \ - --container-image='gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci:$IMAGE_TAG' \ - --container-mounts='${DATA_PATH}:${DATA_PATH},${OUTPUT_PATH}:${OUTPUT_PATH}' \ - --container-workdir=/workspace/megatron-lm \ - bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${ARGUMENTS[@]}>>'$SLURM_LOGS/\${SLURM_JOB_ID}.log' 2>&1" >> sbatch.sh - -set +e -sbatch -W sbatch.sh -set -e -done - -# Write golden values into repo if this run should become a reference -cp $OUTPUT_PATH/golden_values.json > ./golden_values.json From ca219ed1243dcb6a18bf7169f330b1d4c4110309 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sat, 21 Sep 2024 10:46:01 -0700 Subject: [PATCH 06/50] ADLR/megatron-lm!2127 - ci: Bump sha --- .gitlab/stages/01.tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/stages/01.tests.yml b/.gitlab/stages/01.tests.yml 
index d087425af9..94808a1921 100644 --- a/.gitlab/stages/01.tests.yml +++ b/.gitlab/stages/01.tests.yml @@ -87,7 +87,7 @@ unit_tests: parallel: matrix: - TAG: latest - - TAG: 8fc755388a03bae05cb740857008b8916e01a63c + - TAG: 63be779b4608403f956aa1ef6c9013ab78db3eeb tags: [8xL40S] variables: GIT_STRATEGY: clone From 1899bb76be32575de151e97ce3360135eae644d4 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sat, 21 Sep 2024 21:47:11 -0700 Subject: [PATCH 07/50] ADLR/megatron-lm!2128 - ci: Improve cherry pick workflow --- .gitlab/stages/00.pre.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index e0b5c579c1..a6d6319e57 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -101,8 +101,11 @@ maybe_cherry_pick_commit: - git config --global user.email "mcore-bot@nvidia.com" - git config --global user.name "Mcore Bot" - | - LABELS=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}" | jq '.labels | join(",")' | tr -d '"') - + MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}") + + LABELS=$(echo -E $MR | jq '.labels | join(",")' | tr -d '"') + AUTHOR=$(echo -E $MR | jq '.assignee.id') + TITLE=$(echo -E $MR | jq '.title') TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'core_[^,]*') if [[ $TARGET_BRANCHES == "" ]]; then @@ -134,8 +137,9 @@ maybe_cherry_pick_commit: --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \ -d "source_branch=cherry-pick-$MR_ID-$RELEASE_BRANCH" \ -d "target_branch=$RELEASE_BRANCH" \ - -d "title=Cherry-pick $MR_ID into $RELEASE_BRANCH" \ - -d "labels=cherry-pick" + -d "title=Cherry pick \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\`" \ + -d "labels=cherry-pick" \ + -d "assignee_id=$AUTHOR_ID" else URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/merge_requests/$MR_ID From 26f5c32138097ccd1524485f2b567c06c817d123 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sun, 22 Sep 2024 01:54:16 -0700 Subject: [PATCH 08/50] ADLR/megatron-lm!2034 - ci: Introduce JET Python SDK --- .gitlab-ci.yml | 30 +-- .gitlab/stages/02.functional-tests.yml | 116 ++++------ .gitlab/stages/03.convergence-tests.yml | 86 -------- .../stages/{04.publish.yml => 03.publish.yml} | 0 Dockerfile.ci | 2 + .../jet_recipes/_build-mcore.yaml | 11 + .../jet_recipes/_build-nemo.yaml | 10 + .../jet_recipes/_build-pyt.yaml | 23 -- .../jet_recipes/gpt-nemo.yaml | 4 +- .../jet_recipes/multimodal-llava.yaml | 4 +- .../python_test_utils/jet/common.py | 120 +++++++++++ .../jet/generate_jet_trigger_job.py | 79 +++++++ .../jet/launch_jet_workload.py | 200 ++++++++++++++++++ .../shell_test_utils/notify.sh | 70 +++--- .../shell_test_utils/run_ci_test.sh | 4 +- .../gpt/gpt3_15b_8t_release/model_config.yaml | 2 +- .../gpt3_15b_8t_release_sm/model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../t5/t5_release/model_config.yaml | 2 +- .../unit_tests/dist_checkpointing/test_fp8.py | 1 + .../dist_checkpointing/test_nonpersistent.py | 1 + 24 files changed, 523 insertions(+), 252 deletions(-) delete mode 100644 .gitlab/stages/03.convergence-tests.yml rename .gitlab/stages/{04.publish.yml => 03.publish.yml} (100%) create mode 100644 tests/functional_tests/jet_recipes/_build-mcore.yaml create mode 100644 
tests/functional_tests/jet_recipes/_build-nemo.yaml delete mode 100644 tests/functional_tests/jet_recipes/_build-pyt.yaml create mode 100644 tests/functional_tests/python_test_utils/jet/common.py create mode 100644 tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py create mode 100644 tests/functional_tests/python_test_utils/jet/launch_jet_workload.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e2f7725fb1..fb222e080b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -58,29 +58,22 @@ variables: - "mr" - "nightly" - "weekly" + - "pre-release" + - "release" description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)" - FUNCTIONAL_TEST_CLUSTER: + FUNCTIONAL_TEST_CLUSTER_A100: value: "dgxa100_dracooci" options: - "dgxa100_dracooci" - "dgxa100_dracooci-ord" - - "dgxh100_eos" - description: '"dgxa100_dracooci" for OCI-IAD, "dgxh100_eos" for EOS' - CONVERGENCE_TEST: - value: "no" - options: - - "yes" - - "no" - description: To run a convergence test - CONVERGENCE_TEST_SCOPE: - value: "release" + description: 'Cluster for A100 workloads' + FUNCTIONAL_TEST_CLUSTER_H100: + value: "dgxh100_eos" options: - - "release" - - "pre-release" - description: "Test suite to run (only for CONVERGENCE_TEST=yes)" - CONVERGENCE_TEST_RUN_NAME: - value: "pre-release-$$CI_PIPELINE_ID" - description: "Run directory of convergence test" + - "dgxh100_eos" + description: 'Cluster for H100 workloads' + FUNCTIONAL_TEST_NAME: + description: "Name of functional test run (only for pre-release and release)" PUBLISH: value: "no" options: @@ -105,5 +98,4 @@ include: - .gitlab/stages/00.pre.yml - .gitlab/stages/01.tests.yml - .gitlab/stages/02.functional-tests.yml - - .gitlab/stages/03.convergence-tests.yml - - .gitlab/stages/04.publish.yml + - .gitlab/stages/03.publish.yml diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml index 0c30857409..1962523d0e 100644 --- a/.gitlab/stages/02.functional-tests.yml +++ b/.gitlab/stages/02.functional-tests.yml @@ -16,91 +16,65 @@ include: ref: main file: downstreams.yml -jet-configure: - image: - name: mikefarah/yq:4.35.2 - entrypoint: [""] - extends: [.jet_common, .jet-configure] +jet-build: + extends: [build_image, .jet_common] + variables: + STAGE: jet + +jet-generate: + needs: [jet-build] + extends: [.jet_common] + image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} tags: [mcore-docker-node-small] - script: + before_script: + - git rm -r tests/functional_tests/local_recipes || true + - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes + - ls tests/functional_tests/local_recipes + script: - set -x - | - if [[ "$CI_PIPELINE_SOURCE" == "merge_request_event" && "$CI_MERGE_REQUEST_LABELS" == "*H100*" ]]; then - FUNCTIONAL_TEST_CLUSTER=$DEFAULT_H100_CLUSTER - fi + A100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_A100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER) + H100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_H100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER) - | - JET_CUSTOM_FILTER="type == 'basic'" - - if [[ $FUNCTIONAL_TEST_CLUSTER == dgxh100_eos ]]; then - JET_CI_BRANCH=mcore/eos - PLATFORM=dgx_h100 - elif [[ $FUNCTIONAL_TEST_CLUSTER == dgxa100_dracooci ]]; then - JET_CI_BRANCH=mcore/draco-oci - PLATFORM=dgx_a100 - elif [[ $FUNCTIONAL_TEST_CLUSTER == dgxa100_dracooci-ord ]]; then - JET_CI_BRANCH=mcore/draco-oci-ord - PLATFORM=dgx_a100 - fi - - # Add platform - 
JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and '$PLATFORM' in spec.platforms" - - # Add scope - JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and '$FUNCTIONAL_TEST_SCOPE' in spec.scope" - - if [[ "$JET_CUSTOM_FILTER" == "type == 'basic'" ]]; then - JET_CUSTOM_FILTER="False" + if [[ "$FUNCTIONAL_TEST_SCOPE" == "release" || "$FUNCTIONAL_TEST_SCOPE" == "pre-release" ]]; then + RELEASE_ARGS=( + "--run-name" + $FUNCTIONAL_TEST_NAME + "--wandb-experiment" + "test" + ) + else + RELEASE_ARGS=() fi - echo "JET_CI_BRANCH=$JET_CI_BRANCH" | tee -a jet.env - echo "JET_CUSTOM_FILTER=$JET_CUSTOM_FILTER" | tee -a jet.env - - | - IMAGE=${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} yq '. |= - ( - select(.spec.name == "mcore-pyt") - | .spec.source.image = env(IMAGE) - ) - ' -i tests/functional_tests/jet_recipes/_build-pyt.yaml - - IMAGE=${CI_NEMO_IMAGE}:${CI_PIPELINE_ID} yq '. |= - ( - select(.spec.name == "mcore-nemo") - | .spec.source.image = env(IMAGE) - ) - ' -i tests/functional_tests/jet_recipes/_build-pyt.yaml + python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \ + --scope $FUNCTIONAL_TEST_SCOPE \ + --a100-cluster $A100_CLUSTER \ + --h100-cluster $H100_CLUSTER \ + --container-tag ${CI_PIPELINE_ID} \ + --container-image ${CI_MCORE_IMAGE} \ + --output-path "jet-trigger-job.yaml" \ + ${RELEASE_ARGS[@]} artifacts: - reports: - dotenv: jet.env paths: - - tests/functional_tests/jet_recipes - retry: - max: 2 - when: job_execution_timeout - -jet-build: - extends: [build_image, .jet_common] - variables: - STAGE: jet + - jet-trigger-job.yaml + - tests/functional_tests/local_recipes jet-trigger: - extends: [.jet_common, .jet-trigger] - needs: [jet-configure, jet-build] + stage: functional_tests + needs: [jet-generate] trigger: - project: dl/jet/ci - branch: $JET_CI_BRANCH + include: + - artifact: jet-trigger-job.yaml + job: jet-generate strategy: depend variables: - JET_WORKLOADS_FILTER: '$JET_CUSTOM_FILTER' - JET_CUSTOM_CONFIG: | - retrier: - enabled: true - max_retries: 2 - retry_on: ['1.2', '1.2.*'] # All infra related issues - waiting_time: 60 - environment: jet-auto-retrier - builds: - jet_flavour: # An empty mapping will disable building the JET flavor + RO_API_TOKEN: $PAT + CONTAINER_TAG: $CI_PIPELINE_ID + CI_MCORE_IMAGE: $CI_MCORE_IMAGE + GITLAB_ENDPOINT: $GITLAB_ENDPOINT + PARENT_PIPELINE_ID: $CI_PIPELINE_ID inherit: variables: true diff --git a/.gitlab/stages/03.convergence-tests.yml b/.gitlab/stages/03.convergence-tests.yml deleted file mode 100644 index 5c7bd6a7a3..0000000000 --- a/.gitlab/stages/03.convergence-tests.yml +++ /dev/null @@ -1,86 +0,0 @@ -.common_release: - stage: convergence_tests - needs: [build_image] - timeout: 7d - before_script: - - git rm -r tests/functional_tests/local_recipes || true - - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes - - ls tests/functional_tests/local_recipes - - INSTALL_DIR=$(pwd)/local - - rm -rf "$INSTALL_DIR" - - mkdir -p "$INSTALL_DIR" - - wget "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-$(uname --machine).sh" -O "$INSTALL_DIR/miniconda.sh" - - bash "$INSTALL_DIR/miniconda.sh" -b -u -p "$INSTALL_DIR" - - rm -rf "$INSTALL_DIR/miniconda.sh" - - source $INSTALL_DIR/bin/activate - - pip install jet-api --upgrade $JET_INDEX_URLS - variables: - GIT_STRATEGY: clone - GIT_SUBMODULE_STRATEGY: none - script: - - | - env - set -x - - export IMAGE_TAG=${CI_PIPELINE_ID} - export WANDB_API_KEY - CONVERGENCE_TEST_RUN_NAME=$(eval echo 
$CONVERGENCE_TEST_RUN_NAME) - - if [[ -z $CONVERGENCE_TEST_RUN_NAME ]]; then - echo Please assign a CONVERGENCE_TEST_RUN_NAME - fi - - export RUN_NAME=$CONVERGENCE_TEST_RUN_NAME/$MODEL/$VARIANT - export WANDB_EXPERIMENT=$CONVERGENCE_TEST_RUN_NAME_$MODEL_$VARIANT - - bash ./tests/functional_tests/local_recipes/$MODEL/$VARIANT.sh - artifacts: - paths: - - ./golden_values.json - retry: - max: 2 - -release-test: - rules: - - if: $CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "release" - extends: [.common_release] - tags: - - ${TAG} - parallel: - matrix: - - MODEL: bert - VARIANT: bert_release - TAG: mcore-ssh-node-B - - MODEL: gpt - VARIANT: gpt3_15b_8t_release - TAG: mcore-ssh-node-B - - MODEL: mixtral - VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release - TAG: mcore-ssh-node-B - - MODEL: mixtral - VARIANT: mixtral_8x7b_tp1pp4ep8vpp8_release - TAG: mcore-ssh-agent-C - - MODEL: mixtral - VARIANT: mixtral_8x22b_tp2pp8ep8vpp1_release - TAG: mcore-ssh-agent-C - - MODEL: t5 - VARIANT: t5_release - TAG: mcore-ssh-agent-C - -pre-release-test: - rules: - - if: $CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "pre-release" - extends: [.common_release] - tags: - - ${TAG} - parallel: - matrix: - - MODEL: bert - VARIANT: bert_release - TAG: mcore-ssh-node-B - - MODEL: gpt - VARIANT: gpt3_15b_8t_release_sm - TAG: mcore-ssh-node-B - - MODEL: mixtral - VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release_sm - TAG: mcore-ssh-node-B diff --git a/.gitlab/stages/04.publish.yml b/.gitlab/stages/03.publish.yml similarity index 100% rename from .gitlab/stages/04.publish.yml rename to .gitlab/stages/03.publish.yml diff --git a/Dockerfile.ci b/Dockerfile.ci index 40c1464154..fa13c48fd4 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -49,12 +49,14 @@ rm *.whl # Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker COPY . 
/opt/megatron-lm RUN pip install /opt/megatron-lm +ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH" ##### For NVIDIANS only ##### FROM main as jet ARG CACHEBUST=0 RUN --mount=type=secret,id=JET_INDEX_URLS \ JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ + pip install jet-client --upgrade $JET_INDEX_URLS && \ /opt/jet/bin/pip install jet-api --upgrade $JET_INDEX_URLS ENV PATH="$PATH:/opt/jet/bin" ### \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/_build-mcore.yaml b/tests/functional_tests/jet_recipes/_build-mcore.yaml new file mode 100644 index 0000000000..81b38b69ce --- /dev/null +++ b/tests/functional_tests/jet_recipes/_build-mcore.yaml @@ -0,0 +1,11 @@ +type: build +format_version: 1 +maintainers: [maanug] +spec: + name: mcore-pyt + platforms: [linux/amd64] + source: + # The image tag will be added via `jet-tests.yaml` + # Tags are one of {buildcache, $CI_PIPELINE_ID} + image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci + \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/_build-nemo.yaml b/tests/functional_tests/jet_recipes/_build-nemo.yaml new file mode 100644 index 0000000000..eb2b318ab5 --- /dev/null +++ b/tests/functional_tests/jet_recipes/_build-nemo.yaml @@ -0,0 +1,10 @@ +type: build +format_version: 1 +maintainers: [maanug] +spec: + name: mcore-nemo + platforms: [linux/amd64] + source: + # The image tag will be added via `jet-tests.yaml` + # Tags are one of {buildcache, $CI_PIPELINE_ID} + image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/_build-pyt.yaml b/tests/functional_tests/jet_recipes/_build-pyt.yaml deleted file mode 100644 index d24836e44c..0000000000 --- a/tests/functional_tests/jet_recipes/_build-pyt.yaml +++ /dev/null @@ -1,23 +0,0 @@ -type: build -format_version: 1 -maintainers: [maanug] -spec: - name: mcore-pyt - platforms: [linux/amd64] - source: - # The image tag will be added via `jet-tests.yaml` - # Tags are one of {buildcache, $CI_PIPELINE_ID} - image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci - - ---- -type: build -format_version: 1 -maintainers: [maanug] -spec: - name: mcore-nemo - platforms: [linux/amd64] - source: - # The image tag will be added via `jet-tests.yaml` - # Tags are one of {buildcache, $CI_PIPELINE_ID} - image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/gpt-nemo.yaml b/tests/functional_tests/jet_recipes/gpt-nemo.yaml index 87a6fb2c23..f14d2f0afa 100644 --- a/tests/functional_tests/jet_recipes/gpt-nemo.yaml +++ b/tests/functional_tests/jet_recipes/gpt-nemo.yaml @@ -16,8 +16,8 @@ spec: cd /opt/NeMo ARGUMENTS=( - "DATA_PATH=''" - "DATA_CACHE_PATH=''" + "DATA_PATH='-'" + "DATA_CACHE_PATH='-'" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/tensorboard" "CHECKPOINT_PATH=/workspace/checkpoints" diff --git a/tests/functional_tests/jet_recipes/multimodal-llava.yaml b/tests/functional_tests/jet_recipes/multimodal-llava.yaml index 4bf1370304..3149f5664f 100644 --- a/tests/functional_tests/jet_recipes/multimodal-llava.yaml +++ b/tests/functional_tests/jet_recipes/multimodal-llava.yaml @@ -16,8 +16,8 @@ spec: cd /opt/megatron-lm ARGUMENTS=( - "DATA_PATH=''" - "DATA_CACHE_PATH=''" + "DATA_PATH='-'" + "DATA_CACHE_PATH='-'" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/tensorboard" "CHECKPOINT_PATH=/workspace/checkpoints" diff --git a/tests/functional_tests/python_test_utils/jet/common.py 
b/tests/functional_tests/python_test_utils/jet/common.py new file mode 100644 index 0000000000..5699b32324 --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/common.py @@ -0,0 +1,120 @@ +import copy +import itertools +import pathlib +from typing import List, Optional + +import jetclient +import yaml + +BASE_PATH = pathlib.Path(__file__).parent.resolve() + + +def flatten_products( + workload_manifest: jetclient.JETWorkloadManifest, +) -> jetclient.JETWorkloadManifest: + """Flattens a nested dict of products""" + workload_manifest.products = [ + dict(zip(inp.keys(), values)) + for inp in workload_manifest.products + for values in itertools.product(*inp.values()) + ] + + return workload_manifest + + +def flatten_workload( + workload_manifest: jetclient.JETWorkloadManifest, +) -> List[jetclient.JETWorkloadManifest]: + """Flattens a workload with products into a list of workloads that don't have products.""" + workload_manifest = dict(workload_manifest) + products = workload_manifest.pop("products") + workload_manifests = [] + for product in products: + workload = copy.deepcopy(workload_manifest) + workload['spec'] = {k: v for k, v in workload['spec'] if k not in product.keys()} + workload['spec'] = dict(**dict(workload['spec']), **product) + workload_manifests.append(jetclient.JETWorkloadManifest(**workload)) + return workload_manifests + + +def load_config(config_path: str) -> jetclient.JETWorkloadManifest: + """Loads and parses a yaml file into a JETWorkloadManifest""" + with open(config_path) as stream: + try: + return jetclient.JETWorkloadManifest(**yaml.safe_load(stream)) + except yaml.YAMLError as exc: + raise exc + + +def load_and_flatten(config_path: str) -> List[jetclient.JETWorkloadManifest]: + """Wrapper function for doing all the fun at once.""" + return flatten_workload(flatten_products(load_config(config_path=config_path))) + + +def filter_by_test_case( + workload_manifests: List[jetclient.JETWorkloadManifest], test_case: str +) -> jetclient.JETWorkloadManifest: + """Returns a workload with matching name. Raises an error if there no or more than a single workload.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + if workload_manifest.spec.test_case == test_case + ) + + if len(workload_manifests) > 1: + raise ValueError("Duplicate test_case found!") + + if len(workload_manifests) == 0: + raise ValueError("No test_case found!") + + return workload_manifests[0] + + +def filter_by_scope( + workload_manifests: List[jetclient.JETWorkloadManifest], scope: str +) -> List[jetclient.JETWorkloadManifest]: + """Returns all workload with matching scope.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + if workload_manifest.spec.scope == scope + ) + + if len(workload_manifests) == 0: + raise ValueError("No test_case found!") + + return workload_manifests + + +def load_workloads( + container_tag: str, + scope: Optional[str] = None, + test_case: Optional[str] = None, + container_image: Optional[str] = None, +) -> List[jetclient.JETWorkloadManifest]: + """Return all workloads from disk that match scope and platform.""" + recipes_dir = BASE_PATH / ".." / ".." / "jet_recipes" + local_dir = BASE_PATH / ".." / ".." 
/ "local_recipes" + + workloads: List[jetclient.JETWorkloadManifest] = [] + build_workloads: List[jetclient.JETClient] = [] + for file in list(recipes_dir.glob("*.yaml")) + list(local_dir.glob("*.yaml")): + workloads += load_and_flatten(config_path=file) + if file.stem.startswith("_build"): + build_workloads.append(load_config(config_path=file)) + + if scope: + workloads = filter_by_scope(workload_manifests=workloads, scope=scope) + + if test_case: + workloads = [filter_by_test_case(workload_manifests=workloads, test_case=test_case)] + + for workload in list(workloads): + for build_workload in build_workloads: + if ( + workload.spec.build == build_workload.spec.name + ) and build_workload not in workloads: + container_image = container_image or build_workload.spec.source.image + build_workload.spec.source.image = f"{container_image}:{container_tag}" + workloads.append(build_workload) + return workloads diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py new file mode 100644 index 0000000000..252cf541c7 --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py @@ -0,0 +1,79 @@ +from typing import Optional + +import click +import yaml + +from tests.functional_tests.python_test_utils.jet import common + + +@click.command() +@click.option("--scope", required=True, type=str, help="Test scope") +@click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on") +@click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on") +@click.option("--output-path", required=True, type=str, help="Path to write GitLab job to") +@click.option("--container-image", required=True, type=str, help="Container tag to use") +@click.option("--container-tag", required=True, type=str, help="Container tag to use") +@click.option( + "--run-name", required=False, type=str, help="Run name (only relevant for release tests)" +) +@click.option( + "--wandb-experiment", + required=False, + type=str, + help="Wandb experiment (only relevant for release tests)", +) +def main( + scope: str, + a100_cluster: str, + h100_cluster: str, + output_path: str, + container_image: str, + container_tag: str, + run_name: Optional[str] = None, + wandb_experiment: Optional[str] = None, +): + + gitlab_pipeline = {"stages": ["functional_tests"], "default": {"interruptible": True}} + + for test_case in common.load_workloads(scope=scope, container_tag=container_tag): + if test_case.type == "build": + continue + + if test_case.spec.platforms == "dgx_a100": + cluster = a100_cluster + elif test_case.spec.platforms == "dgx_h100": + cluster = h100_cluster + else: + raise ValueError(f"Platform {test_case.spec.platforms} unknown") + + script = [ + "python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py", + f"--model {test_case.spec.model}", + f"--test-case {test_case.spec.test_case}", + f"--container-tag {container_tag}", + f"--cluster {cluster}", + ] + + if run_name is not None and wandb_experiment is not None: + script.append(f"--run-name {run_name}") + script.append(f"--wandb-experiment {wandb_experiment}") + + gitlab_pipeline[test_case.spec.test_case] = { + "stage": "functional_tests", + "image": f"{container_image}:{container_tag}", + "tags": ["mcore-docker-node-jet"], + "rules": [ + {"if": '$CI_PIPELINE_SOURCE == "parent_pipeline"'}, + {"if": '$CI_MERGE_REQUEST_ID'}, + ], + "timeout": "7 days", + "needs": [{"pipeline": '$PARENT_PIPELINE_ID', 
"job": "jet-generate"}], + "script": [" ".join(script)], + } + + with open(output_path, 'w') as outfile: + yaml.dump(gitlab_pipeline, outfile, default_flow_style=False) + + +if __name__ == "__main__": + main() diff --git a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py new file mode 100644 index 0000000000..4e796ceb6c --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py @@ -0,0 +1,200 @@ +import os +import pathlib +import re +import signal +import sys +import tempfile +from typing import List, Optional, Tuple + +import click +import jetclient +import yaml +from jetclient.services.dtos.pipeline import PipelineStatus + +from tests.functional_tests.python_test_utils.jet import common + +BASE_PATH = pathlib.Path(__file__).parent.resolve() + + +def resolve_cluster_config(cluster: str) -> str: + if cluster == "dgxh100_eos": + return "mcore/eos" + if cluster == "dgxa100_dracooci": + return "mcore/draco-oci" + if cluster == "dgxa100_dracooci-ord": + return "mcore/draco-oci-ord" + raise ValueError(f"Unknown cluster {cluster} provided.") + + +def register_pipeline_terminator(pipeline: jetclient.JETPipeline): + def sigterm_handler(_signo, _stack_frame): + print(f"Trying to terminate pipeline {pipeline.jet_id}") + pipeline.cancel() + print(f"Pipeline {pipeline.jet_id} terminated") + sys.exit(0) + + signal.signal(signal.SIGINT, sigterm_handler) + signal.signal(signal.SIGTERM, sigterm_handler) + + +def launch_and_wait_for_completion( + test_case: str, + container_image: str, + container_tag: str, + cluster: str, + account: str, + run_name: Optional[str], + wandb_experiment: Optional[str], +) -> jetclient.JETPipeline: + pipeline = jetclient.JETClient( + customer='mcore', gitlab_ci_token=os.getenv("RO_API_TOKEN"), env="prod" + ).workloads.submit( + workloads=common.load_workloads( + test_case=test_case, container_image=container_image, container_tag=container_tag + ), + config_id=resolve_cluster_config(cluster), + custom_config={ + "retrier": { + "enabled": True, + "max_retries": 2, + "retry_on": ['1.2', '1.2.*'], + "waiting_time": 60, + "environment": "jet-auto-retrier", + }, + "builds": {"jet_flavour": None}, + "launchers": {cluster: {"account": account}}, + "executors": { + "jet-ci": { + "environments": { + cluster: { + "variables": { + "RUN_NAME": run_name or "", + "WANDB_API_KEY": os.getenv("WANDB_API_KEY") or "", + "WANDB_EXPERIMENT": wandb_experiment or "", + } + } + } + } + }, + }, + wait_for_validation=True, + ) + + register_pipeline_terminator(pipeline=pipeline) + + print( + f"Pipeline triggered; inspect it here: https://gitlab-master.nvidia.com/dl/jet/ci/-/pipelines/{pipeline.jet_id}", + flush=True, + ) + + pipeline.wait(max_wait_time=60 * 60 * 24 * 7) + print(f"Pipeline terminated; status: {pipeline.get_status()}") + return pipeline + + +def download_job_logs(job: jetclient.JETJob) -> List[str]: + logs = job.get_logs() + if not logs: + return [""] + + assets = logs[0].get_assets() + log_filename = [key for key in assets.keys() if key.endswith(".log")][0] + + with tempfile.NamedTemporaryFile() as tmp_file: + assets[log_filename].download(pathlib.Path(tmp_file.name)) + with open(pathlib.Path(tmp_file.name), "r") as fh: + return fh.readlines() + + +def parse_iterations_from_logs(logs: List[str]) -> Optional[Tuple[int, int]]: + for log_row in logs[::-1]: + match = re.search(r"iteration\s+(\d+)\s*/\s*(\d+)", log_row) + if match is not None: + return int(match.group(1)), 
int(match.group(2)) + + +@click.command() +@click.option("--model", required=True, type=str, help="Model") +@click.option("--test-case", required=True, type=str, help="Test case") +@click.option( + "--account", + required=False, + type=str, + help="Slurm account to use", + default="coreai_dlalgo_mcore", +) +@click.option("--cluster", required=True, type=str, help="Cluster to run on") +@click.option("--container-tag", required=True, type=str, help="Base image of Mcore image") +@click.option("--container-image", required=False, type=str, help="Base image of Mcore image") +@click.option( + "--run-name", required=False, type=str, help="Run name (only relevant for release tests)" +) +@click.option( + "--wandb-experiment", + required=False, + type=str, + help="Wandb experiment (only relevant for release tests)", +) +def main( + model: str, + test_case: str, + account: str, + cluster: str, + container_tag: str, + container_image: Optional[str] = None, + run_name: Optional[str] = None, + wandb_experiment: Optional[str] = None, +): + + with open( + pathlib.Path( + BASE_PATH / ".." / ".." / "test_cases" / model / test_case / "model_config.yaml" + ) + ) as stream: + try: + test_case_dict = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + + test_type = test_case_dict['TEST_TYPE'] + + if test_type == "release" and (run_name is None or wandb_experiment is None): + print(f"Not all arguments provided ({run_name=}, {wandb_experiment=})") + sys.exit(1) + + n_attempts = 0 + while True and n_attempts < 3: + pipeline = launch_and_wait_for_completion( + test_case=test_case, + container_image=container_image, + container_tag=container_tag, + cluster=cluster, + account=account, + run_name=run_name, + wandb_experiment=wandb_experiment, + ) + + logs = download_job_logs( + job=[job for job in pipeline.get_jobs() if job.name.startswith("basic")][0] + ) + concat_logs = "\n".join(logs) + print(f"Logs:\n{concat_logs}") + + if test_type != "release": + success = pipeline.get_status() == PipelineStatus.SUCCESS + sys.exit(int(not success)) # invert for exit 0 + + parsed_result = parse_iterations_from_logs(logs=logs) + if not parsed_result: + print("Weird log, no iterations found") + n_attempts += 1 + continue + + current_iteration, total_iterations = parsed_result + if current_iteration == total_iterations: + success = pipeline.get_status() == PipelineStatus.SUCCESS + sys.exit(int(not success)) # invert for exit 0 + + +if __name__ == "__main__": + main() diff --git a/tests/functional_tests/shell_test_utils/notify.sh b/tests/functional_tests/shell_test_utils/notify.sh index 277d46add1..1bb2ea5c3c 100644 --- a/tests/functional_tests/shell_test_utils/notify.sh +++ b/tests/functional_tests/shell_test_utils/notify.sh @@ -1,6 +1,6 @@ set -euxo pipefail -collect_jet_jobs () { +collect_jobs () { PAGE=1 PER_PAGE=100 RESULTS="[]" @@ -11,7 +11,7 @@ collect_jet_jobs () { -s \ --globoff \ --header "PRIVATE-TOKEN: $RO_API_TOKEN" \ - "https://${GITLAB_ENDPOINT}/api/v4/projects/70847/pipelines/${JET_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE" + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE" ) # Combine the results RESULTS=$(jq -s '.[0] + .[1]' <<< "$RESULTS $RESPONSE") @@ -85,31 +85,16 @@ if [[ $DOWNSTREAM_PIPELINE_ID == null ]]; then else set +x - JET_PIPELINE_JSON=$(curl \ - --fail \ - --silent \ - --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ - 
"https://${GITLAB_ENDPOINT}/api/v4/projects/70847/pipelines/${DOWNSTREAM_PIPELINE_ID}/bridges?per_page=100" - ) + JOBS=$(echo "$(collect_jobs)" | jq '[.[] | {id, name, status}]') + echo $JOBS set -x - JET_PIPELINE_ID=$(jq '.[0].downstream_pipeline.id' <<< "$JET_PIPELINE_JSON") - set +x - JET_LOGS=$(echo "$(collect_jet_jobs)" \ - | jq '[ - .[] - | select(.name | startswith("build/") | not) - | select(.name | contains("3 logs_after") | not) - | select(.name | contains("1 logs_before") | not) - ]' - ) - - FAILED_JET_LOGS=$(echo "$JET_LOGS" \ + FAILED_JOBS=$(echo "$JOBS" \ | jq --arg GITLAB_ENDPOINT "$GITLAB_ENDPOINT" '[ .[] | select(.status != "success") | { - "name": (.name[6:] | split(" ")[0]), + name, id, "url": ("https://" + $GITLAB_ENDPOINT + "/dl/jet/ci/-/jobs/" + (.id | tostring)), } @@ -117,29 +102,34 @@ else ) set -x - for row in $(echo "${FAILED_JET_LOGS}" | jq -r '.[] | @base64'); do + for row in $(echo "${FAILED_JOBS}" | jq -r '.[] | @base64'); do _jq() { echo ${row} | base64 --decode | jq -r ${1} } JOB_ID=$(_jq '.id') - SLURM_FAILURE=$(jet \ - -c -df json -th logs query --raw \ - -c "obj_status.s_message" \ - --eq obj_ci.l_job_id "$JOB_ID" \ - | jq '.[0].obj_status.s_message' \ - | tr -d '"' - ) - FAILED_JET_LOGS=$(echo "$FAILED_JET_LOGS" \ - | jq \ - --argjson JOB_ID "$JOB_ID" \ - --arg SLURM_FAILURE "$SLURM_FAILURE" ' - .[] |= ((select(.id==$JOB_ID) += { - "slurm_failure_reason": $SLURM_FAILURE})) - ') + FULL_LOG=$(curl \ + --location \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/jobs/${JOB_ID}/trace") + + if [[ "$FULL_LOG" == *exception* ]]; then + LAST_EXCEPTION_POS=$(echo "$FULL_LOG" | grep -o -b 'exception' | tail -1 | cut -d: -f1) + SHORT_LOG=${FULL_LOG:$LAST_EXCEPTION_POS-500:499} + else + SHORT_LOG=${FULL_LOG: -1000} + fi + + FAILED_JOBS=$(echo "$FAILED_JOBS" \ + | jq \ + --argjson JOB_ID "$JOB_ID" \ + --arg SLURM_FAILURE "$SHORT_LOG" ' + .[] |= ((select(.id==$JOB_ID) += { + "slurm_failure_reason": $SLURM_FAILURE})) + ') done - NUM_FAILED=$(echo "$FAILED_JET_LOGS" | jq 'length') - NUM_TOTAL=$(echo "$JET_LOGS" | jq 'length') + NUM_FAILED=$(echo "$FAILED_JOBS" | jq 'length') + NUM_TOTAL=$(echo "$JOBS" | jq 'length') if [[ $NUM_FAILED -eq 0 ]]; then BLOCKS='[ @@ -152,7 +142,7 @@ else } ]' else - BLOCKS=$(echo -e "$FAILED_JET_LOGS" \ + BLOCKS=$(echo "$FAILED_JOBS" \ | jq --arg DATE "$DATE" --arg CONTEXT "$CONTEXT" --arg URL "$PIPELINE_URL" --arg NUM_FAILED "$NUM_FAILED" --arg NUM_TOTAL "$NUM_TOTAL" ' [ { @@ -170,7 +160,7 @@ else "type": "mrkdwn", "text": ( "• Job: <" +.url + "|" + .name + ">" - + "\n SLURM failure reason: \n```" + .slurm_failure_reason[-2000:] + "```" + + "\n SLURM failure reason: \n```" + .slurm_failure_reason + "```" ) } diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh index 7578d25c2d..c9c16b43c6 100644 --- a/tests/functional_tests/shell_test_utils/run_ci_test.sh +++ b/tests/functional_tests/shell_test_utils/run_ci_test.sh @@ -4,11 +4,11 @@ set -exo pipefail echo "------ARGUMENTS LIST --------" for ARGUMENT in "$@"; do + echo $ARGUMENT KEY=$(echo $ARGUMENT | cut -f1 -d=) KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - + VALUE=$(eval echo ${ARGUMENT:$KEY_LENGTH+1}) export "$KEY"="$VALUE" echo "$KEY=$VALUE" done diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml index 941e8b7bdb..9453db100c 
100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml @@ -32,7 +32,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml index 941e8b7bdb..9453db100c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml @@ -32,7 +32,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml index ee149b884e..af474ac150 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml @@ -30,7 +30,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: Llama2Tokenizer --tokenizer-model: ${DATA_PATH}/tokenizer.model --data-path: ${DATA_BLEND} diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml index 1fe7611a81..585d9bb2c7 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml @@ -33,7 +33,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml index d80246eecd..22607416a3 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml @@ -33,7 +33,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml 
b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml index b2f6983a62..95b151569a 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml @@ -31,7 +31,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: Llama2Tokenizer --tokenizer-model: ${DATA_PATH}/tokenizer.model --data-path: ${DATA_BLEND} diff --git a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml index c5dbbb35ea..64784c36a6 100644 --- a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml @@ -46,7 +46,7 @@ MODEL_ARGS: --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --split: 99982,9,9 - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --vocab-extra-ids: 100 # EVAL_AND_LOGGING_ARGS diff --git a/tests/unit_tests/dist_checkpointing/test_fp8.py b/tests/unit_tests/dist_checkpointing/test_fp8.py index a93f263d50..1238d09f76 100644 --- a/tests/unit_tests/dist_checkpointing/test_fp8.py +++ b/tests/unit_tests/dist_checkpointing/test_fp8.py @@ -51,6 +51,7 @@ def get_ten(dtype: str = 'fp8'): (False, (2, 4), (2, 4), None), ], ) + @pytest.mark.skip(reason="Flaky test") def test_fp8_save_load( self, tmp_path_dist_ckpt, use_fpsl, src_tp_pp, dest_tp_pp, load_exchange_algo ): diff --git a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py index e1f3eb75f4..d5d5cdce8f 100644 --- a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py +++ b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py @@ -29,6 +29,7 @@ def teardown_method(self, method): Utils.destroy_model_parallel() @pytest.mark.parametrize(('tp,pp'), [(2, 4)]) + @pytest.mark.skip(reason="Flaky test") def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp): Utils.initialize_model_parallel(tp, pp) num_floating_point_operations_so_far = 0 From d626aebae2b5811126a9cc037f8b8b2e2bd7121b Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sun, 22 Sep 2024 03:42:37 -0700 Subject: [PATCH 09/50] ADLR/megatron-lm!2130 - ci: Improve cherry pick MR description --- .gitlab/stages/00.pre.yml | 6 ++++-- .gitlab/stages/02.functional-tests.yml | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index a6d6319e57..312f460977 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -104,7 +104,8 @@ maybe_cherry_pick_commit: MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}") LABELS=$(echo -E $MR | jq '.labels | join(",")' | tr -d '"') - AUTHOR=$(echo -E $MR | jq '.assignee.id') + AUTHOR_ID=$(echo -E $MR | jq '.assignee.id') + AUTHOR_NAME=$(echo -E $MR | jq '.assignee.username') TITLE=$(echo -E $MR | jq '.title') TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'core_[^,]*') @@ -139,7 +140,8 @@ maybe_cherry_pick_commit: -d "target_branch=$RELEASE_BRANCH" \ -d "title=Cherry pick \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\`" \ -d "labels=cherry-pick" \ - -d "assignee_id=$AUTHOR_ID" + -d 
"reviewer_ids=$AUTHOR_ID" \ + -d "description=[🤖]: Hi @$AUTHOR_NAME 👋,

we've cherry picked \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\` for you! 🚀

Please review and approve this cherry pick by your convenience\!" else URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/merge_requests/$MR_ID diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml index 1962523d0e..c930668722 100644 --- a/.gitlab/stages/02.functional-tests.yml +++ b/.gitlab/stages/02.functional-tests.yml @@ -64,6 +64,7 @@ jet-generate: jet-trigger: stage: functional_tests needs: [jet-generate] + extends: [.jet_common] trigger: include: - artifact: jet-trigger-job.yaml From ea83faa3c098aa837d6d7c0d17722f2436435f05 Mon Sep 17 00:00:00 2001 From: Huy Vu Date: Sun, 22 Sep 2024 22:34:16 -0700 Subject: [PATCH 10/50] ADLR/megatron-lm!2119 - Huvu/t5 te10 fix nemoci pr482 Co-authored-by: Huy Vu2 --- megatron/core/models/T5/t5_spec.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/core/models/T5/t5_spec.py b/megatron/core/models/T5/t5_spec.py index 42da1889a9..ecdcdbc260 100644 --- a/megatron/core/models/T5/t5_spec.py +++ b/megatron/core/models/T5/t5_spec.py @@ -52,7 +52,7 @@ def encoder_model_with_transformer_engine_default_spec() -> ModuleSpec: submodules=TransformerLayerSubmodules( self_attention=ModuleSpec( module=SelfAttention, - params={"attn_mask_type": AttnMaskType.padding}, + params={"attn_mask_type": AttnMaskType.arbitrary}, submodules=SelfAttentionSubmodules( linear_qkv=TELayerNormColumnParallelLinear, core_attention=TEDotProductAttention, @@ -94,6 +94,7 @@ def decoder_model_with_transformer_engine_default_spec() -> ModuleSpec: pre_cross_attn_layernorm=TENorm, cross_attention=ModuleSpec( module=CrossAttention, + params={"attn_mask_type": AttnMaskType.arbitrary}, submodules=CrossAttentionSubmodules( linear_q=TEColumnParallelLinear, linear_kv=TEColumnParallelLinear, @@ -122,7 +123,7 @@ def encoder_model_with_local_spec() -> ModuleSpec: input_layernorm=LNImpl, self_attention=ModuleSpec( module=SelfAttention, - params={"attn_mask_type": AttnMaskType.padding}, + params={"attn_mask_type": AttnMaskType.arbitrary}, submodules=SelfAttentionSubmodules( linear_qkv=ColumnParallelLinear, core_attention=DotProductAttention, @@ -170,6 +171,7 @@ def decoder_model_with_local_spec() -> ModuleSpec: pre_cross_attn_layernorm=LNImpl, cross_attention=ModuleSpec( module=CrossAttention, + params={"attn_mask_type": AttnMaskType.arbitrary}, submodules=CrossAttentionSubmodules( linear_q=ColumnParallelLinear, linear_kv=ColumnParallelLinear, From ae1bffb8ff983edc2308472c52d991dfdcefd92f Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 23 Sep 2024 01:34:51 -0700 Subject: [PATCH 11/50] ADLR/megatron-lm!2134 - ci: Set author and milestone for cherry-picks --- .gitlab/stages/00.pre.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index 312f460977..478c432c4a 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -104,9 +104,10 @@ maybe_cherry_pick_commit: MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}") LABELS=$(echo -E $MR | jq '.labels | join(",")' | tr -d '"') - AUTHOR_ID=$(echo -E $MR | jq '.assignee.id') - AUTHOR_NAME=$(echo -E $MR | jq '.assignee.username') + AUTHOR_ID=$(echo -E $MR | jq '.author.id') + AUTHOR_NAME=$(echo -E $MR | jq '.author.username') TITLE=$(echo -E $MR | jq '.title') + MILESTONE_ID=$(echo -E $MR | jq '.milestone.id') TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'core_[^,]*') if [[ 
$TARGET_BRANCHES == "" ]]; then
@@ -141,6 +142,7 @@ maybe_cherry_pick_commit:
 -d "title=Cherry pick \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\`" \
 -d "labels=cherry-pick" \
 -d "reviewer_ids=$AUTHOR_ID" \
+ -d "milestone_id=$MILESTONE_ID" \
 -d "description=[🤖]: Hi @$AUTHOR_NAME 👋,

we've cherry picked \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\` for you! 🚀

Please review and approve this cherry pick by your convenience\!" else From baad0ad9be945a7adc98103a0908a06667365a8e Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 23 Sep 2024 05:37:11 -0700 Subject: [PATCH 12/50] ADLR/megatron-lm!2135 - ci: Send alerts on unit-tests-extended --- .gitlab/stages/01.tests.yml | 22 +++ .../shell_test_utils/notify_unit_tests.sh | 186 ++++++++++++++++++ 2 files changed, 208 insertions(+) create mode 100644 tests/functional_tests/shell_test_utils/notify_unit_tests.sh diff --git a/.gitlab/stages/01.tests.yml b/.gitlab/stages/01.tests.yml index 94808a1921..2fe5ddafae 100644 --- a/.gitlab/stages/01.tests.yml +++ b/.gitlab/stages/01.tests.yml @@ -125,6 +125,28 @@ unit_tests: allow_failure: true - when: always +unit-tests-results-notify: + extends: [.test_mr_rules] + image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} + needs: [unit_tests] + tags: + - mcore-docker-node-small + script: + - env + - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} + - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + - export GITLAB_ENDPOINT + - export DATE=$(date +"%Y-%m-%d") + - bash tests/functional_tests/shell_test_utils/notify_unit_tests.sh ${CI_PIPELINE_ID} + artifacts: + when: always + paths: + - scripts + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "ci-unit-test-extended" + when: always + - when: never + docs_build_test: extends: [.test_mr_rules] image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} diff --git a/tests/functional_tests/shell_test_utils/notify_unit_tests.sh b/tests/functional_tests/shell_test_utils/notify_unit_tests.sh new file mode 100644 index 0000000000..46be8b078e --- /dev/null +++ b/tests/functional_tests/shell_test_utils/notify_unit_tests.sh @@ -0,0 +1,186 @@ +set -euxo pipefail + +collect_jobs () { + PAGE=1 + PER_PAGE=100 + RESULTS="[]" + + while true; do + # Fetch the paginated results + RESPONSE=$(curl \ + -s \ + --globoff \ + --header "PRIVATE-TOKEN: $RO_API_TOKEN" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE" + ) + # Combine the results + RESULTS=$(jq -s '.[0] + .[1]' <<< "$RESULTS $RESPONSE") + + # Check if there are more pages + if [[ $(jq 'length' <<< "$RESPONSE") -lt $PER_PAGE ]]; then + break + fi + + # Increment the page number + PAGE=$((PAGE + 1)) + done + + echo "$RESULTS" +} + +CI_PIPELINE_ID=${1:-16595865} +CI_PROJECT_ID=${CI_PROJECT_ID:-19378} +PIPELINE_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/pipelines/$CI_PIPELINE_ID +JOB_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/jobs/ +CONTEXT="unit-tests-extended" + +# Fetch Elastic logs +set +x +PIPELINE_JSON=$(curl \ + --fail \ + --silent \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs" + ) || ret_code=$? 
+set -x +if [[ ${ret_code:-0} -ne 0 ]]; then + echo CI_PIPELINE_ID=$CI_PIPELINE_ID does not exist + exit 1 +fi + +UNIT_TESTS_JOBS=$(echo -E $PIPELINE_JSON | jq '[.[] | select(.name | startswith("unit_tests"))]') + +if [[ $UNIT_TESTS_JOBS == null ]]; then + FAILED_JOBS=$(curl \ + --fail \ + --silent \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs?per_page=100" \ + | jq --arg JOB_URL "$JOB_URL" '[.[] | select(.status == "failed") | ("<" + $JOB_URL + (.id | tostring) + "|" + .name + ">")] | join("\n• Job: ")' | tr -d '"') + curl \ + -X POST \ + -H "Content-type: application/json" \ + --data ' + { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "<'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>:\n" + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "\n• Job: '"$FAILED_JOBS"'" + } + }, + ] + + }' \ + $WEBHOOK_URL + +else + FAILED_JOBS=$(echo -E "$UNIT_TESTS_JOBS" \ + | jq --arg GITLAB_ENDPOINT "$GITLAB_ENDPOINT" --arg JOB_URL "$JOB_URL" '[ + .[] + | select(.status != "success") + | { + name, + id, + "url": ($JOB_URL + (.id | tostring)), + } + ]' + ) + set -x + + for row in $(echo "${FAILED_JOBS}" | jq -r '.[] | @base64'); do + _jq() { + echo ${row} | base64 --decode | jq -r ${1} + } + JOB_ID=$(_jq '.id') + FULL_LOG=$(curl \ + --location \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/jobs/${JOB_ID}/trace") + + if [[ "$FULL_LOG" == *exception* ]]; then + LAST_EXCEPTION_POS=$(echo "$FULL_LOG" | grep -o -b 'exception' | tail -1 | cut -d: -f1) + SHORT_LOG=${FULL_LOG:$LAST_EXCEPTION_POS-500:499} + else + SHORT_LOG=${FULL_LOG: -1000} + fi + + FAILED_JOBS=$(echo "$FAILED_JOBS" \ + | jq \ + --argjson JOB_ID "$JOB_ID" \ + --arg SLURM_FAILURE "$SHORT_LOG" ' + .[] |= ((select(.id==$JOB_ID) += { + "slurm_failure_reason": $SLURM_FAILURE})) + ') + done + + NUM_FAILED=$(echo "$FAILED_JOBS" | jq 'length') + NUM_TOTAL=$(echo "$UNIT_TESTS_JOBS" | jq 'length') + + if [[ $NUM_FAILED -eq 0 ]]; then + BLOCKS='[ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":doge3d: <'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>: All '$NUM_TOTAL' passed" + } + } + ]' + else + BLOCKS=$(echo "$FAILED_JOBS" \ + | jq --arg DATE "$DATE" --arg CONTEXT "$CONTEXT" --arg URL "$PIPELINE_URL" --arg NUM_FAILED "$NUM_FAILED" --arg NUM_TOTAL "$NUM_TOTAL" ' + [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": (":doctorge: <" + $URL + "|Report of " + $DATE + " (" + $CONTEXT + ")>: " + $NUM_FAILED + " of " + $NUM_TOTAL + " failed") + } + } + ] + [ + .[] + | { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ( + "• Job: <" +.url + "|" + .name + ">" + + "\n SLURM failure reason: \n```" + .slurm_failure_reason + "```" + + ) + } + } + ] + [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ("===============================================") + } + } + ]' + ) + fi + + for row in $(echo "${BLOCKS}" | jq -r '.[] | @base64'); do + _jq() { + echo ${row} | base64 --decode + } + + curl \ + -X POST \ + -H "Content-type: application/json" \ + --data '{"blocks": '["$(_jq)"]'}' \ + $WEBHOOK_URL + done + +fi \ No newline at end of file From 460c6a9cf83e90a9434b3a3244d13952bd2c6f8f Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 23 Sep 2024 07:41:08 -0700 Subject: [PATCH 13/50] ADLR/megatron-lm!2133 - tests: Minor improvements to JET --- .gitlab/stages/02.functional-tests.yml 
| 3 +- .../python_test_utils/jet/common.py | 20 ++++++++ .../jet/generate_jet_trigger_job.py | 1 + .../jet/generate_local_jobs.py | 49 +++++++++++++++++++ 4 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 tests/functional_tests/python_test_utils/jet/generate_local_jobs.py diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml index c930668722..3ac0bcc0c5 100644 --- a/.gitlab/stages/02.functional-tests.yml +++ b/.gitlab/stages/02.functional-tests.yml @@ -41,13 +41,14 @@ jet-generate: "--run-name" $FUNCTIONAL_TEST_NAME "--wandb-experiment" - "test" + $(echo $FUNCTIONAL_TEST_NAME | tr '/' '-') ) else RELEASE_ARGS=() fi - | + export PYTHONPATH=$(pwd) python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \ --scope $FUNCTIONAL_TEST_SCOPE \ --a100-cluster $A100_CLUSTER \ diff --git a/tests/functional_tests/python_test_utils/jet/common.py b/tests/functional_tests/python_test_utils/jet/common.py index 5699b32324..5ee31bc232 100644 --- a/tests/functional_tests/python_test_utils/jet/common.py +++ b/tests/functional_tests/python_test_utils/jet/common.py @@ -86,9 +86,26 @@ def filter_by_scope( return workload_manifests +def filter_by_model( + workload_manifests: List[jetclient.JETWorkloadManifest], model: str +) -> List[jetclient.JETWorkloadManifest]: + """Returns all workload with matching model.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + if workload_manifest.spec.model == model + ) + + if len(workload_manifests) == 0: + raise ValueError("No test_case found!") + + return workload_manifests + + def load_workloads( container_tag: str, scope: Optional[str] = None, + model: Optional[str] = None, test_case: Optional[str] = None, container_image: Optional[str] = None, ) -> List[jetclient.JETWorkloadManifest]: @@ -106,6 +123,9 @@ def load_workloads( if scope: workloads = filter_by_scope(workload_manifests=workloads, scope=scope) + if model: + workloads = filter_by_model(workload_manifests=workloads, model=model) + if test_case: workloads = [filter_by_test_case(workload_manifests=workloads, test_case=test_case)] diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py index 252cf541c7..42030257c5 100644 --- a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py +++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py @@ -47,6 +47,7 @@ def main( raise ValueError(f"Platform {test_case.spec.platforms} unknown") script = [ + "export PYTHONPATH=$(pwd); " "python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py", f"--model {test_case.spec.model}", f"--test-case {test_case.spec.test_case}", diff --git a/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py b/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py new file mode 100644 index 0000000000..4124e1c338 --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py @@ -0,0 +1,49 @@ +import pathlib +from typing import Optional + +import click +import jetclient +import yaml + +from tests.functional_tests.python_test_utils.jet import common + + +def load_script(config_path: str) -> str: + with open(config_path) as stream: + try: + jetclient.JETWorkloadManifest(**yaml.safe_load(stream)).spec.script + except yaml.YAMLError as exc: + raise exc + + +@click.command() +@click.option("--model", required=False, 
type=str, help="Filters all tests by matching model") +@click.option("--scope", required=False, type=str, help="Filters all tests by matching scope") +@click.option( + "--test-case", required=False, type=str, help="Returns a single test-case with matching name." +) +@click.option("--output-path", required=True, type=str, help="Path to write jobs to") +def main(model: Optional[str], scope: Optional[str], test_case: Optional[str], output_path: str): + workloads = common.load_workloads( + container_image='none', scope=scope, model=model, test_case=test_case, container_tag='none' + ) + + for workload in workloads: + if workload.type == "build": + continue + magic_values = dict(workload.spec) + magic_values["assets_dir"] = "." + + file_path = ( + pathlib.Path(output_path) + / "test_cases" + / workload.spec.model + / f"{workload.spec.test_case}.sh" + ) + file_path.parent.mkdir(parents=True, exist_ok=True) + with open(file_path, "w", encoding="utf-8") as fh: + fh.write(workload.spec.script.format(**magic_values)) + + +if __name__ == "__main__": + main() From a0799f4ad4f5e3458bb632c144edf696b2061f79 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 23 Sep 2024 07:41:09 -0700 Subject: [PATCH 14/50] ADLR/megatron-lm!2136 - tests: Fix GPT test --- .../test_cases/gpt/gpt3_15b_8t_release/model_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml index 9453db100c..bf88792152 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml @@ -25,7 +25,7 @@ MODEL_ARGS: --micro-batch-size: 4 --rampup-batch-size: "384 384 97656250" --global-batch-size: 1152 - --train-samples: 4882812 + --train-samples: 19531250 --manual-gc: true # Transformer Engine args From 4c3e06ae7749592657991d1ef1266834c72f7bb1 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 23 Sep 2024 08:44:05 -0700 Subject: [PATCH 15/50] ADLR/megatron-lm!2139 - ci: Fix cherry-pick strings --- .gitlab/stages/00.pre.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index 478c432c4a..e358a6aa95 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -104,10 +104,10 @@ maybe_cherry_pick_commit: MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}") LABELS=$(echo -E $MR | jq '.labels | join(",")' | tr -d '"') - AUTHOR_ID=$(echo -E $MR | jq '.author.id') - AUTHOR_NAME=$(echo -E $MR | jq '.author.username') - TITLE=$(echo -E $MR | jq '.title') - MILESTONE_ID=$(echo -E $MR | jq '.milestone.id') + AUTHOR_ID=$(echo -E $MR | jq '.author.id' | tr -d '"') + AUTHOR_NAME=$(echo -E $MR | jq '.author.username' | tr -d '"') + TITLE=$(echo -E $MR | jq '.title' | tr -d '"') + MILESTONE_ID=$(echo -E $MR | jq '.milestone.id' | tr -d '"') TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'core_[^,]*') if [[ $TARGET_BRANCHES == "" ]]; then From 71b2aa12bcbdfecab6e72d75008d8135abe907ae Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Mon, 23 Sep 2024 12:41:50 -0700 Subject: [PATCH 16/50] ADLR/megatron-lm!2110 - Use torch dataloader in multimodal evaluation --- examples/multimodal/dataloader_provider.py | 10 +- examples/multimodal/run_text_generation.py | 721 ++++++++++++++------- 2 
files changed, 483 insertions(+), 248 deletions(-) diff --git a/examples/multimodal/dataloader_provider.py b/examples/multimodal/dataloader_provider.py index 33bcf1bf1f..4bd1b29e51 100644 --- a/examples/multimodal/dataloader_provider.py +++ b/examples/multimodal/dataloader_provider.py @@ -4,7 +4,7 @@ import torch from dataset_helpers import TaskEncoder, print_error_handler -from megatron.core import mpu +from megatron.core import parallel_state from megatron.energon import ( LimitDataset, RepeatDataset, @@ -71,9 +71,9 @@ def train_valid_test_dataloaders_provider(train_val_test_num_samples): worker_debug_path = None worker_log_level = 0 - rank = mpu.get_data_parallel_rank() - world_size = mpu.get_data_parallel_world_size() - data_parallel_group = mpu.get_data_parallel_group() + rank = parallel_state.get_data_parallel_rank() + world_size = parallel_state.get_data_parallel_world_size() + data_parallel_group = parallel_state.get_data_parallel_group() worker_config = WorkerConfig( rank=rank, @@ -88,7 +88,7 @@ def train_valid_test_dataloaders_provider(train_val_test_num_samples): train_dataloader = get_savable_loader(train_ds, worker_config=worker_config) if args.load is not None: if getattr(args, "dataloader_save", None): - dp_rank = mpu.get_data_parallel_rank() + dp_rank = parallel_state.get_data_parallel_rank() data_save_name = get_checkpoint_name( args.dataloader_save, args.iteration, diff --git a/examples/multimodal/run_text_generation.py b/examples/multimodal/run_text_generation.py index b4c020dcbb..6cf5fd6232 100644 --- a/examples/multimodal/run_text_generation.py +++ b/examples/multimodal/run_text_generation.py @@ -34,6 +34,7 @@ from PIL import Image from torchvision.io import read_video +from megatron.core import parallel_state from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN_INDEX from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings from megatron.inference.text_generation.api import generate_and_post_process @@ -95,56 +96,24 @@ def _get_partition_bounds( return num_samples_per_partition * partition_id, num_samples_per_partition * (partition_id + 1) -def get_evaluation_dataset( - task, - input_image_path, - gt_path, - img_h, - img_w, - use_tiling, - max_num_tiles, - use_thumbnail, - num_samples_per_partition, - num_partitions, - partition_id, - num_frames, -): - """Build evaluation dataset.""" - images = [] - tile_counts = [] - questions, answers = [], [] - samples, sample_ids = [], [] - - if task == "TextVQA": - samples = json.load(open(gt_path, encoding='utf-8'))['data'] - - # Optionally, process only a subset of the input files. 
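A stand-alone illustration (sample values made up) of the fixed-size partitioning computed by _get_partition_bounds above: partition_id selects one contiguous chunk of num_samples_per_partition samples.

    samples = list(range(10))
    num_samples_per_partition, partition_id = 4, 1
    lb = num_samples_per_partition * partition_id
    ub = num_samples_per_partition * (partition_id + 1)
    # Partition 1 with chunk size 4 covers samples[4:8].
    assert samples[lb:ub] == [4, 5, 6, 7]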
- if num_partitions > 0: - lb, ub = _get_partition_bounds( - len(samples), num_samples_per_partition, num_partitions, partition_id - ) - samples = samples[lb:ub] - - for i in range(len(samples)): - sample = samples[i] - - img_file = "{}/{}.jpg".format(input_image_path, sample["image_id"]) - if not os.path.exists(img_file): - img_file = img_file.replace('.jpg', '.png') - - img = Image.open(img_file) - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) - - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) - - questions.append(sample["question"]) - answers.append(sample["answers"]) - sample_ids.append(sample["question_id"]) - elif task == "VQAv2": +class VQADataset(torch.utils.data.Dataset): + def __init__( + self, + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + keys, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ): samples = json.load(open(gt_path, encoding='utf-8')) + if "data" in samples: + samples = samples["data"] # Optionally, process only a subset of the input files. if num_partitions > 0: @@ -153,50 +122,72 @@ def get_evaluation_dataset( ) samples = samples[lb:ub] - for i in range(len(samples)): - sample = samples[i] + self._keys = keys + self._samples = samples + self._input_image_path = input_image_path + self._img_h = img_h + self._img_w = img_w + self._use_tiling = use_tiling + self._max_num_tiles = max_num_tiles + self._use_thumbnail = use_thumbnail - img_file = "{}/{}".format(input_image_path, sample["image"]) + def __len__(self): + return len(self._samples) - img = Image.open(img_file) - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) + def __getitem__(self, idx): + sample = self._samples[idx] - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) + img_file = "{}/{}".format(self._input_image_path, sample[self._keys["image_id"]]) + if not os.path.exists(img_file): + img_file += ".jpg" - questions.append(sample["question"]) - answers.append(sample["answer"]) - sample_ids.append(sample["question_id"]) - elif task == "ChartQA": - samples = json.load(open(gt_path, encoding='utf-8')) + if not os.path.exists(img_file): + img_file = img_file.replace('.jpg', '.png') - # Optionally, process only a subset of the input files. - if num_partitions > 0: - lb, ub = _get_partition_bounds( - len(samples), num_samples_per_partition, num_partitions, partition_id - ) - samples = samples[lb:ub] + img = Image.open(img_file) + imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) + tile_count = torch.tensor([len(imgs)], dtype=torch.int) - for i in range(len(samples)): - sample = samples[i] + sample_id = idx + if "sample_id" in self._keys: + sample_id = sample[self._keys["sample_id"]] - img_file = "{}/{}".format(input_image_path, sample["imgname"]) + metadata = "" # Not used. 
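The keys mapping passed to VQADataset is what lets a single dataset class serve TextVQA, VQAv2 and ChartQA; a minimal sketch with made-up field values:

    # Each task names its image/question/answer fields differently.
    keys = {"image_id": "image", "question": "question", "answer": "answer"}
    sample = {"image": "0001.jpg", "question": "What is shown?", "answer": "a cat"}
    img_file = "{}/{}".format("/data/images", sample[keys["image_id"]])
    assert img_file == "/data/images/0001.jpg"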
- img = Image.open(img_file) - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) + return ( + torch.stack(imgs), + tile_count, + sample_id, + sample[self._keys["question"]], + sample[self._keys["answer"]], + metadata, + ) - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) - questions.append(sample["query"]) - answers.append(sample["label"]) - sample_ids.append(i) - elif task == "captioning": +class CaptioningDataset(torch.utils.data.Dataset): + def __init__( + self, + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ): image_files = sorted(glob.glob(input_image_path + "/*")) + # Optionally, process only a subset of the input files. if num_partitions > 0: lb, ub = _get_partition_bounds( @@ -209,20 +200,54 @@ def get_evaluation_dataset( for gt in gts["annotations"]: answers[gt["image_id"]].append(gt['caption']) - # Run image preprocessing. - for i in range(len(image_files)): - image_file = image_files[i] - img = Image.open(image_file) - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) + self._image_files = image_files + self._answers = answers + self._img_h = img_h + self._img_w = img_w + self._use_tiling = use_tiling + self._max_num_tiles = max_num_tiles + self._use_thumbnail = use_thumbnail + + def __len__(self): + return len(self._image_files) + + def __getitem__(self, idx): + img_file = self._image_files[idx] + image_id = int(img_file.split("_")[-1].split(".")[0]) + + img = Image.open(img_file) + imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) + tile_count = torch.tensor([len(imgs)], dtype=torch.int) - image_id = int(image_file.split("_")[-1].split(".")[0]) - sample_ids.append(image_id) - elif task == 'MMMU': + question = "" # Fixed for all samples. + metadata = "" # Not used. + + return torch.stack(imgs), tile_count, image_id, question, self._answers[image_id], metadata + + +class MMMUDataset(torch.utils.data.Dataset): + def __init__( + self, + input_image_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + single_image, + ): # The following downloads the MMMU dataset from HuggingFace and uses the API from the MMMU github repo to run MMMU evaluation. all_mmmu_datasets = [] @@ -230,9 +255,22 @@ def get_evaluation_dataset( assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE." for subject in CAT_SHORT2LONG.values(): - subject_dataset = datasets.load_dataset( - "MMMU/MMMU", subject, split=datasets.Split.VALIDATION, cache_dir=hf_datasets_cache - ) + # Use a local copy of the dataset if exists (can be faster) or the HF one. 
+ if os.path.exists(input_image_path): + subject_dataset = datasets.load_dataset( + os.path.join(input_image_path, subject), + split=datasets.Split.VALIDATION, + cache_dir=hf_datasets_cache, + verification_mode="no_checks", + ) + else: + subject_dataset = datasets.load_dataset( + "MMMU/MMMU", + subject, + split=datasets.Split.VALIDATION, + cache_dir=hf_datasets_cache, + ) + all_mmmu_datasets.append(subject_dataset) dataset = datasets.concatenate_datasets(all_mmmu_datasets) @@ -240,14 +278,11 @@ def get_evaluation_dataset( dataset = [s for s in dataset if s['id'].startswith("val")] # Optionally, process only a subset of the input files. - start_idx = 0 - end_idx = len(dataset) if num_partitions > 0: - start_idx, end_idx = _get_partition_bounds( + lb, ub = _get_partition_bounds( len(dataset), num_samples_per_partition, num_partitions, partition_id ) - - end_idx = min(len(dataset), end_idx) + dataset = dataset[lb:ub] # Using the LLaVA config from the MMMU repo. config = load_yaml("examples/multimodal/MMMU/mmmu/configs/llava1.5.yaml") @@ -256,76 +291,119 @@ def get_evaluation_dataset( assert len(v) == 1, "only one value supported." config[k] = v[0] - for idx in range(start_idx, end_idx): - sample = dataset[idx] + self._config = config + + self._dataset = dataset + + self._img_h = img_h + self._img_w = img_w + self._use_tiling = use_tiling + self._max_num_tiles = max_num_tiles + self._use_thumbnail = use_thumbnail + self._single_image = single_image + + def __len__(self): + return len(self._dataset) + + def __getitem__(self, idx): + sample = self._dataset[idx] + + # Use the single image approach from the MMMU repo. + if self._single_image: + sample = process_single_sample(sample) + sample = construct_prompt(sample, self._config) + + img = sample["image"] + sample_imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) + sample_num_tiles = [len(sample_imgs)] + else: + sample = construct_prompt(sample, self._config) + + sample_imgs = [] + sample_num_tiles = [] + + img_indices = re.findall(r"" - img = sample["image"] - sample_imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False + img = sample[img_key] + assert img is not None, f"{img_str} is in prompt but not in sample images" + + # Note: Only replace the current image tag. + sample["final_input_prompt"] = sample["final_input_prompt"].replace( + img_str, "", 1 ) - sample_num_tiles = [len(sample_imgs)] - else: - sample = construct_prompt(sample, config) - - sample_imgs = [] - sample_num_tiles = [] - - img_indices = re.findall(r"" - - img = sample[img_key] - assert img is not None, f"{img_str} is in prompt but not in sample images" - - # Note: Only replace the current image tag. - sample["final_input_prompt"] = sample["final_input_prompt"].replace( - img_str, "", 1 - ) - - imgs = get_visual_transform( - img, - img_h, - img_w, - use_tiling, - adjusted_max_num_tiles, - use_thumbnail, - augment=False, - ) # List of tiles. - - sample_imgs.extend(imgs) - sample_num_tiles.append(len(imgs)) - - # Sanity check. 
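Assuming MMMU-style image tags of the form <image 1>, <image 2>, ... in the prompt (the exact tag literals are an assumption here), the tag lookup and one-at-a-time replacement above amounts to roughly this:

    import re

    prompt = "Compare <image 1> with <image 2>."
    for i in re.findall(r"<image (\d+)>", prompt):
        # Replace only the first occurrence of the current tag.
        prompt = prompt.replace(f"<image {i}>", "", 1)
    assert prompt == "Compare  with ."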
- for i in range(1, 8): - assert ( - f"" not in sample["final_input_prompt"] - ), "prompt contains unhandled image tags" - - images.append(sample_imgs) - tile_counts.append(torch.tensor(sample_num_tiles, dtype=torch.int)) - - sample_ids.append(sample['id']) - - prompt = sample['final_input_prompt'] - if single_image: - for i in range(8): - prompt = prompt.replace(f"", "") - prompt = f"\n{prompt}" - questions.append(prompt) - - answers.append(sample['answer']) - samples.append(sample) - elif task == "VideoMME": + + imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + adjusted_max_num_tiles, + self._use_thumbnail, + augment=False, + ) # List of tiles. + + sample_imgs.extend(imgs) + sample_num_tiles.append(len(imgs)) + + # Sanity check. + for i in range(1, 8): + assert ( + f"" not in sample["final_input_prompt"] + ), "prompt contains unhandled image tags" + + # MMMU specific metadata. + metadata = {"question_type": sample["question_type"]} + if sample["question_type"] == "multiple-choice": + metadata["index2ans"] = sample["index2ans"] + metadata["all_choices"] = sample["all_choices"] + + prompt = sample['final_input_prompt'] + if self._single_image: + for i in range(8): + prompt = prompt.replace(f"", "") + prompt = f"\n{prompt}" + + tile_count = torch.tensor(sample_num_tiles, dtype=torch.int) + + return ( + torch.stack(sample_imgs), + tile_count, + sample["id"], + prompt, + sample["answer"], + metadata, + ) + + +class VideoMMMEDataset(torch.utils.data.Dataset): + def __init__( + self, + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + num_frames, + ): ground_truth_original = json.load(open(gt_path)) ground_truth = [] for gt in ground_truth_original: @@ -347,51 +425,210 @@ def get_evaluation_dataset( ) ground_truth = ground_truth[start_idx:end_idx] - # Run image preprocessing. - for idx, gt in enumerate(ground_truth): - print_rank_0(f"Processing input video: {idx} / {len(ground_truth)}") - video, _, _ = read_video(gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec') - video = video.numpy() - selected_frames = torch.linspace(0, video.shape[0] - 1, num_frames).long() - video_frames = video[selected_frames] - if num_frames == 1: - video_frames = video_frames[None] - - imgs = list( - itertools.chain.from_iterable( - get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) - for img in video_frames + self._ground_truth = ground_truth + self._img_h = img_h + self._img_w = img_w + self._use_tiling = use_tiling + self._max_num_tiles = max_num_tiles + self._use_thumbnail = use_thumbnail + self._num_frames = num_frames + + def __len__(self): + return len(self._ground_truth) + + def __getitem__(self, idx): + gt = self._ground_truth[idx] + + video, _, _ = read_video(gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec') + video = video.numpy() + selected_frames = torch.linspace(0, video.shape[0] - 1, self._num_frames).long() + video_frames = video[selected_frames] + if self._num_frames == 1: + video_frames = video_frames[None] + + imgs = list( + itertools.chain.from_iterable( + get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, ) + for img in video_frames ) + ) + + for question in gt["questions"]: + # Very hacky, but we essentially re-create gt holding only the + # question of interest. 
This is the make this generation script + # compatible with the Video MME evaluation script. + question_dict = { + "video_id": gt["video_id"], + "duration_category": gt["duration_category"], + "video_category": gt["video_category"], + "video_subcategory": gt["video_subcategory"], + "url": gt["url"], + "questions": [question], + } + + num_tiles = torch.tensor([len(imgs)], dtype=torch.int) + + answer = "" + metadata = "" + + return ( + torch.stack(imgs), + num_tiles, + question["question_id"], + question_dict, + answer, + metadata, + ) + - for question in gt["questions"]: - # Very hacky, but we essentially re-create gt holding only the - # question of interest. This is the make this generation script - # compatible with the Video MME evaluation script. - question_dict = { - "video_id": gt["video_id"], - "duration_category": gt["duration_category"], - "video_category": gt["video_category"], - "video_subcategory": gt["video_subcategory"], - "url": gt["url"], - "questions": [question], - } - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) - questions.append(question_dict) - sample_ids.append(question["question_id"]) +def get_evaluation_dataloader( + task, + input_image_path, + gt_path, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + num_samples_per_partition, + num_partitions, + partition_id, + num_frames, + num_workers, +): + """Build evaluation dataset.""" + if task == "TextVQA": + keys = { + "image_id": "image_id", + "sample_id": "question_id", + "question": "question", + "answer": "answers", + } + + dataset = VQADataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + keys, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ) + elif task == "VQAv2": + keys = { + "image_id": "image", + "sample_id": "question_id", + "question": "question", + "answer": "answer", + } + + dataset = VQADataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + keys, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ) + elif task == "ChartQA": + keys = {"image_id": "imgname", "question": "query", "answer": "label"} + + dataset = VQADataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + keys, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ) + elif task == "captioning": + dataset = CaptioningDataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ) + elif task == 'MMMU': + # Note: single_image=True uses only one image like in the MMMU repo example. + # single_image=False uses all images in the sample. 
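Returning to the VideoMME dataset above: the uniform frame selection with torch.linspace can be sketched in isolation (frame counts made up):

    import torch

    total_frames, num_frames = 100, 8
    # Evenly spaced indices over the whole video, truncated to integers.
    selected = torch.linspace(0, total_frames - 1, num_frames).long()
    assert selected.tolist() == [0, 14, 28, 42, 56, 70, 84, 99]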
+ dataset = MMMUDataset( + input_image_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + single_image=True, + ) + elif task == "VideoMME": + dataset = VideoMMMEDataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + num_frames, + ) else: - raise NotImplementedError("unsupported task") + raise NotImplementedError(f"unsupported task {task}") - return images, tile_counts, samples, sample_ids, questions, answers + dp_rank = parallel_state.get_data_parallel_rank() + dp_world_size = parallel_state.get_data_parallel_world_size() + + sampler = torch.utils.data.DistributedSampler( + dataset, shuffle=False, num_replicas=dp_world_size, rank=dp_rank + ) + # TODO: Batched inference is not supported yet. + dataloader = torch.utils.data.DataLoader( + dataset, batch_size=None, num_workers=num_workers, sampler=sampler, pin_memory=True + ) + + return dataloader def generate_samples(model, config: EvaluationConfig): """Text generation using a trained vision language model.""" args = get_args() - images, tile_counts, samples, sample_ids, questions, answers = get_evaluation_dataset( + + rank = torch.distributed.get_rank() + + dataloader = get_evaluation_dataloader( config.task, config.input_image_path, config.gt_path, @@ -404,23 +641,22 @@ def generate_samples(model, config: EvaluationConfig): config.num_partitions, config.partition_id, args.num_frames, + args.num_workers, ) num_img_embeddings_per_tile = get_num_image_embeddings( args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1 ) - num_samples = len(sample_ids) - idx = 0 - while idx < num_samples: - imgs = torch.stack(images[idx]).cuda() - num_tiles = tile_counts[idx].cuda() - sample_id = sample_ids[idx] - prompt = get_prompt(config.task, questions, idx, config.prompt_format) + for idx, (imgs, num_tiles, sample_id, question, answers, metadata) in enumerate(dataloader): + imgs = imgs.to("cuda") + num_tiles = num_tiles.to("cuda") + + prompt = get_prompt(config.task, question, config.prompt_format) forward_step = partial(VLMForwardStep, num_img_embeddings_per_tile, imgs, num_tiles) - if torch.distributed.get_rank() == 0: + if rank == 0: resp_sentences, _, _, _ = generate_and_post_process( model, forward_step=forward_step, @@ -435,6 +671,9 @@ def generate_samples(model, config: EvaluationConfig): ) for prompt, generation in zip([prompt], resp_sentences): + if isinstance(sample_id, torch.Tensor): + sample_id = sample_id.item() + output = {"sample_id": sample_id, "prompt": prompt} output_name = "" @@ -446,27 +685,25 @@ def generate_samples(model, config: EvaluationConfig): output_name = "text" elif config.task == "VideoMME": output_name = "response" - output = questions[idx] + output = question - generated = get_generated(generation, args.prompt_format) + generated = get_generated(generation, config.prompt_format) if config.task == "VideoMME": output["questions"][0][output_name] = generated else: output[output_name] = generated if config.task == "captioning": - output["ground_truth"] = answers[sample_id] + output["ground_truth"] = answers elif config.task in ("TextVQA", "VQAv2"): - output["gt_answer"] = [ans for ans in answers[idx]] + output["gt_answer"] = [ans for ans in answers] elif config.task == "ChartQA": - output["gt_answer"] = [answers[idx]] + output["gt_answer"] = [answers] elif config.task == "MMMU": - sample = samples[idx] - 
prediction = generated - if sample["question_type"] == "multiple-choice": + if metadata["question_type"] == "multiple-choice": prediction = parse_multi_choice_response( - generated, sample["all_choices"], sample["index2ans"] + generated, metadata["all_choices"], metadata["index2ans"] ) output["prediction"] = prediction @@ -515,10 +752,16 @@ def get_evaluation_config(): def generate_and_write_samples(model, config): """Generate text and write to an output file.""" + rank = torch.distributed.get_rank() + + if rank == 0: + output_file = open(config.output_path, "w") + print(f"output path: {output_file.name}") + for output in generate_samples(model, config): - if torch.distributed.get_rank() == 0: - with open(config.output_path, 'a') as f: - f.write(json.dumps(output) + "\n") + if rank == 0: + output_file.write(json.dumps(output) + "\n") + output_file.flush() class VLMForwardStep(ForwardStep): @@ -567,7 +810,7 @@ def __call__(self, tokens, position_ids, attention_mask): return logits -def get_prompt(task, questions, idx, prompt_format): +def get_prompt(task, question, prompt_format): """Get a prompt for the evaluation task.""" if task == "captioning": if prompt_format == "llama3": @@ -577,8 +820,6 @@ def get_prompt(task, questions, idx, prompt_format): "[INST] Give a short and clear explanation of the subsequent image. [/INST]" ) elif task == "TextVQA": - question = questions[idx] - if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format( question @@ -588,8 +829,6 @@ def get_prompt(task, questions, idx, prompt_format): question ) elif task == "VQAv2": - question = questions[idx] - if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format( question @@ -599,41 +838,37 @@ def get_prompt(task, questions, idx, prompt_format): question ) elif task == "ChartQA": - question = questions[idx] - if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format( - questions + question ) elif prompt_format == "mistral": prompt = "[INST] \n{}\nAnswer the question using a single word or phrase. [/INST]".format( question ) elif task == "MMMU": - question = questions[idx] - if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" prompt = prompt.format(question) elif prompt_format == "mistral": prompt = "[INST] {} [/INST]".format(question) elif task == "VideoMME": - question = ( + q = ( "Select the best answer to the following multiple-choice " "question based on the video. 
Respond with only the letter " "(A, B, C, or D) of the correct option.\n" ) - question += questions[idx]["questions"][0]["question"] + "\n" - question += questions[idx]["questions"][0]["choices"][0] + "\n" - question += questions[idx]["questions"][0]["choices"][1] + "\n" - question += questions[idx]["questions"][0]["choices"][2] + "\n" - question += questions[idx]["questions"][0]["choices"][3] + "\n" + q += question["questions"][0]["question"] + "\n" + q += question["questions"][0]["choices"][0] + "\n" + q += question["questions"][0]["choices"][1] + "\n" + q += question["questions"][0]["choices"][2] + "\n" + q += question["questions"][0]["choices"][3] + "\n" if prompt_format == "llama3": - prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|>{}<|start_header_id|>user<|end_header_id|>\n\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - prompt = prompt.format("", question) + prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + prompt = prompt.format(q) elif prompt_format == "mistral": - prompt = "\n{}".format(question) + prompt = "[INST] \n{} [/INST]".format(q) return prompt From 3ab6da9a0e2e4bbbde0a51ccae7f46e06d7cefd1 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 23 Sep 2024 14:27:37 -0700 Subject: [PATCH 17/50] ADLR/megatron-lm!2137 - ci: Enable dev container for new features --- .gitlab-ci.yml | 1 + .gitlab/stages/01.tests.yml | 4 ++ .gitlab/stages/02.functional-tests.yml | 1 + Dockerfile.ci.dev | 62 +++++++++++++++++++ .../jet/generate_jet_trigger_job.py | 26 +++++++- 5 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 Dockerfile.ci.dev diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index fb222e080b..52ae2a886e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -89,6 +89,7 @@ variables: # CI wide variables CI_MCORE_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci + CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci LINTING_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_linting UNIT_TEST_TIMEOUT: 15 diff --git a/.gitlab/stages/01.tests.yml b/.gitlab/stages/01.tests.yml index 2fe5ddafae..68c1afcc6d 100644 --- a/.gitlab/stages/01.tests.yml +++ b/.gitlab/stages/01.tests.yml @@ -21,6 +21,10 @@ build_image: FILE: Dockerfile.ci BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3 TAG: mcore-docker-node-large + - IMAGE: CI_MCORE_DEV_IMAGE + FILE: Dockerfile.ci.dev + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3 + TAG: mcore-docker-node-large - IMAGE: CI_NEMO_IMAGE FILE: Dockerfile.ci BASE_IMAGE: nvcr.io/nvidian/nemo:nightly diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml index 3ac0bcc0c5..531527b8b4 100644 --- a/.gitlab/stages/02.functional-tests.yml +++ b/.gitlab/stages/02.functional-tests.yml @@ -55,6 +55,7 @@ jet-generate: --h100-cluster $H100_CLUSTER \ --container-tag ${CI_PIPELINE_ID} \ --container-image ${CI_MCORE_IMAGE} \ + --container-image-dev ${CI_MCORE_DEV_IMAGE} \ --output-path "jet-trigger-job.yaml" \ ${RELEASE_ARGS[@]} artifacts: diff --git a/Dockerfile.ci.dev b/Dockerfile.ci.dev new file mode 100644 index 0000000000..fa13c48fd4 --- /dev/null +++ b/Dockerfile.ci.dev @@ -0,0 +1,62 @@ +# syntax=docker/dockerfile:1.3-labs + +ARG FROM_IMAGE_NAME +FROM $FROM_IMAGE_NAME as build_causal_conv1d +WORKDIR /opt +RUN 
CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1 + +FROM $FROM_IMAGE_NAME as build_grouped_gemm +WORKDIR /opt +RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2 + +FROM $FROM_IMAGE_NAME as build_mamba_ssm +WORKDIR /opt +RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.0.3 + +FROM $FROM_IMAGE_NAME as main +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends gettext python3-venv && \ + apt-get clean && \ + python -m venv /opt/jet && \ + wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \ + chmod a+x /usr/local/bin/yq + +COPY --from=build_causal_conv1d /opt/causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_grouped_gemm /opt/grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_mamba_ssm /opt/mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl ./ + +RUN pip3 install --no-cache-dir --upgrade-strategy only-if-needed -v \ +einops \ +flask-restful \ +nltk \ +pytest \ +pytest-cov \ +pytest_mock \ +pytest-random-order \ +sentencepiece \ +wrapt \ +zarr \ +wandb \ +triton==2.1.0 \ +causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl \ +mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl \ +grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl \ +tensorstore==0.1.45 && \ +rm *.whl + +# Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker +COPY . /opt/megatron-lm +RUN pip install /opt/megatron-lm +ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH" + +##### For NVIDIANS only ##### +FROM main as jet +ARG CACHEBUST=0 +RUN --mount=type=secret,id=JET_INDEX_URLS \ + JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ + pip install jet-client --upgrade $JET_INDEX_URLS && \ + /opt/jet/bin/pip install jet-api --upgrade $JET_INDEX_URLS +ENV PATH="$PATH:/opt/jet/bin" +### \ No newline at end of file diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py index 42030257c5..beeb31860d 100644 --- a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py +++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py @@ -1,3 +1,4 @@ +import pathlib from typing import Optional import click @@ -5,13 +6,16 @@ from tests.functional_tests.python_test_utils.jet import common +BASE_PATH = pathlib.Path(__file__).parent.resolve() + @click.command() @click.option("--scope", required=True, type=str, help="Test scope") @click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on") @click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on") @click.option("--output-path", required=True, type=str, help="Path to write GitLab job to") -@click.option("--container-image", required=True, type=str, help="Container tag to use") +@click.option("--container-image", required=True, type=str, help="LTS Container tag to use") +@click.option("--container-image-dev", required=True, type=str, help="Dev Container tag to use") @click.option("--container-tag", required=True, type=str, help="Container tag to use") @click.option( "--run-name", required=False, type=str, help="Run name (only relevant for release tests)" @@ -28,6 +32,7 @@ def main( h100_cluster: str, output_path: str, container_image: str, + 
container_image_dev: str, container_tag: str, run_name: Optional[str] = None, wandb_experiment: Optional[str] = None, @@ -55,6 +60,25 @@ def main( f"--cluster {cluster}", ] + with open( + pathlib.Path( + BASE_PATH + / ".." + / ".." + / "test_cases" + / test_case.spec.model + / test_case.spec.test_case + / "model_config.yaml" + ) + ) as stream: + try: + test_case_dict = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + + if 'EXPERIMENTAL' in test_case_dict and test_case_dict['EXPERIMENTAL']: + script.append(f"--container-image {container_image_dev}") + if run_name is not None and wandb_experiment is not None: script.append(f"--run-name {run_name}") script.append(f"--wandb-experiment {wandb_experiment}") From ff89e91bc5af2fca9ee9f1073c5e896deaea295a Mon Sep 17 00:00:00 2001 From: Xuwen Chen Date: Tue, 24 Sep 2024 07:58:30 -0700 Subject: [PATCH 18/50] ADLR/megatron-lm!2005 - Fix performance regression brought by torch.bincount --- megatron/core/transformer/moe/moe_utils.py | 6 +++- megatron/core/transformer/moe/router.py | 11 +++++-- .../core/transformer/moe/token_dispatcher.py | 33 ++++++++++++++----- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index ee4bb690b7..02a2cccca5 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -327,6 +327,7 @@ def topk_softmax_with_capacity( pad_to_capacity: bool = False, drop_policy: str = "probs", use_pre_softmax: bool = False, + deterministic_mode: bool = False, ): """Apply capacity and padding to the top-k selection. Args: @@ -366,7 +367,10 @@ def topk_softmax_with_capacity( if capacity_factor is None: # TopK without capacity - tokens_per_expert = torch.bincount(top_indices.view(-1), minlength=num_experts) + if deterministic_mode: + tokens_per_expert = torch.bincount(top_indices.view(-1), minlength=num_experts) + else: + tokens_per_expert = torch.histc(top_indices, bins=num_experts, min=0, max=num_experts) return probs, top_indices, tokens_per_expert else: # TopK with capacity diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index 8894dc1df3..3e85ec53c5 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -74,7 +74,8 @@ def routing(self, logits: torch.Tensor): logits (torch.Tensor): Logits tensor. Returns: - Tuple[torch.Tensor, torch.Tensor]: Tuple of tensors representing max probs and the indices. + Tuple[torch.Tensor, torch.Tensor]: + Tuple of tensors representing max probs and the indices. """ raise NotImplementedError("Routing function not implemented.") @@ -155,6 +156,7 @@ def aux_loss_load_balancing(self, logits: torch.Tensor): pad_to_capacity=self.config.moe_pad_expert_input_to_capacity, drop_policy=self.config.moe_token_drop_policy, use_pre_softmax=self.config.moe_router_pre_softmax, + deterministic_mode=self.config.deterministic_mode, ) if self.training: @@ -172,8 +174,10 @@ def apply_load_balancing_loss( """Applies auxiliary loss to the MoE layer. Args: - probs (torch.Tensor): The probs output by the router for each token. [num_tokens, num_experts] - num_local_tokens_per_expert (torch.Tensor): The number of tokens per expert. [num_experts] + probs (torch.Tensor): + The probs output by the router for each token. [num_tokens, num_experts] + num_local_tokens_per_expert (torch.Tensor): + The number of tokens per expert. 
[num_experts] activation (torch.Tensor): The activation tensor to attach the gradient function to. Returns: @@ -279,6 +283,7 @@ def routing(self, logits: torch.Tensor): pad_to_capacity=self.config.moe_pad_expert_input_to_capacity, drop_policy=self.config.moe_token_drop_policy, use_pre_softmax=self.config.moe_router_pre_softmax, + deterministic_mode=self.config.deterministic_mode, ) else: raise ValueError(f"Unsupported MoE routing type: {self.routing_type}") diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index e23ea4ea0f..db1b1920fa 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -184,13 +184,23 @@ def token_permutation( self.global_local_map = None with torch.no_grad(): - tokens_per_expert = torch.bincount( - local_indices.view(-1), minlength=self.config.num_moe_experts - ) - if self.num_local_experts < self.config.num_moe_experts: - tokens_per_expert = tokens_per_expert[ - self.local_expert_indices[0] : self.local_expert_indices[-1] + 1 - ] + # The indices of local_indices that give its sorted order along dim 0. + self.indices = torch.argsort(local_indices, dim=0) + if self.config.deterministic_mode: + tokens_per_expert = torch.bincount( + local_indices.view(-1), minlength=self.config.num_moe_experts + ) + if self.num_local_experts < self.config.num_moe_experts: + tokens_per_expert = tokens_per_expert[ + self.local_expert_indices[0] : self.local_expert_indices[-1] + 1 + ] + else: + tokens_per_expert = torch.histc( + local_indices, + bins=self.num_local_experts, + min=self.local_expert_indices[0], + max=self.local_expert_indices[-1], + ) tokens_per_expert = tokens_per_expert.cpu().to(torch.long) # Stage2: permute the tokens locally so that they are grouped by their expert assignment @@ -382,7 +392,14 @@ def preprocess(self, indices: torch.Tensor) -> torch.Tensor: Returns: torch.Tensor: Tensor containing the number of tokens assigned to local expert. """ - num_local_tokens_per_expert = torch.bincount(indices.view(-1), minlength=self.num_experts) + if self.config.deterministic_mode: + num_local_tokens_per_expert = torch.bincount( + indices.view(-1), minlength=self.num_experts + ) + else: + num_local_tokens_per_expert = torch.histc( + indices, bins=self.num_experts, min=0, max=self.num_experts + ) # num_local_tokens_per_expert: [num_experts] tp_rank = parallel_state.get_tensor_model_parallel_rank() From 6543004f44a68beedaf623afe4eb7fc35da3c714 Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Tue, 24 Sep 2024 12:50:37 -0700 Subject: [PATCH 19/50] ADLR/megatron-lm!2073 - Multimodal batched bug fix --- .../core/models/multimodal/llava_model.py | 8 +- .../golden_values.json | 2 +- .../golden_values.json | 2 +- .../golden_values.json | 2 +- tests/unit_tests/models/test_llava_model.py | 102 +++++++++++------- 5 files changed, 72 insertions(+), 44 deletions(-) diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py index 68d963bdf9..6573e6f048 100644 --- a/megatron/core/models/multimodal/llava_model.py +++ b/megatron/core/models/multimodal/llava_model.py @@ -351,7 +351,9 @@ def _preprocess_data( ] # Put image embeddings to image positions. 
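On the torch.bincount to torch.histc change above: for integer expert indices in [0, num_experts), both give the same per-expert token counts, and the patch keeps bincount only behind deterministic_mode. A small CPU-side check (indices cast to float here, since histc operates on floating-point input):

    import torch

    num_experts = 8
    indices = torch.randint(0, num_experts, (1024,))
    by_bincount = torch.bincount(indices, minlength=num_experts)
    # bins of width 1 over [0, num_experts); histc returns float counts.
    by_histc = torch.histc(indices.float(), bins=num_experts, min=0, max=num_experts)
    assert torch.equal(by_bincount, by_histc.long())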
- final_embedding[images_mask] = image_embeddings.reshape(-1, embed_dim).contiguous() + final_embedding[images_mask] = ( + image_embeddings.permute(1, 0, 2).reshape(-1, embed_dim).contiguous() + ) # Create the final labels and loss mask (if this is the last language model stage). final_labels, final_loss_mask = None, None @@ -466,7 +468,9 @@ def forward( image_embeddings = None elif self.add_encoder and not has_images: # If no images provided, use an empty image embeddings tensor. - image_embeddings = torch.tensor([], dtype=images.dtype, device=images.device) + image_embeddings = torch.tensor([], dtype=images.dtype, device=images.device).reshape( + 0, 0, 0 + ) elif self.add_encoder and has_images: image_embeddings = self.vision_model(images) # [num_tiles, img_seq_len, h_vision] if self._drop_vision_class_token: diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json index bd193a724d..f4b39082a6 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13442, 9.13256, 9.12852, 9.11273, 9.05533, 9.04358, 8.98427, 8.93519, 8.89295, 8.79396]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3478477.0, 3585145.0, 3475635.0, 3384010.0, 3700478.0, 3480110.0, 3398548.0, 3454436.0, 3425849.0, 3585758.0]},"iteration_timing_avg": 0.2253964705882353} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13495, 9.13325, 9.12905, 9.11323, 9.05401, 9.04233, 8.98255, 8.93258, 8.88937, 8.78788]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3477473.0, 3584371.0, 3475194.0, 3382773.0, 3699802.0, 3478715.0, 3397967.0, 3453615.0, 3424973.0, 3585127.0]},"iteration_timing_avg": 0.2253964705882353} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values.json index de82457c30..03e0dd0e9b 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values.json +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values.json @@ -1 +1 @@ -{"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3558381.0, 3664861.0, 3555505.0, 3463866.0, 3780904.0, 3560200.0, 3478189.0, 3534510.0, 3506002.0, 3665772.0]},"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.16219, 9.16263, 9.15739, 9.1412, 9.09523, 9.07236, 9.01592, 8.96749, 8.92204, 8.8314]}} \ No newline at end of file +{"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3557301.0, 3663955.0, 3555196.0, 3462888.0, 3780083.0, 3559007.0, 3477262.0, 3533752.0, 3505033.0, 3665096.0]},"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.16173, 9.16211, 9.15686, 9.14022, 9.09396, 9.07146, 9.01401, 8.9651, 8.91881, 8.82578]}} \ No 
newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values.json index 0ce1048997..96f345a702 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values.json +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.19789, 9.20022, 9.19547, 9.17248, 9.11862, 9.10315, 9.0418, 8.98727, 8.9443, 8.84512]},"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3718539.0, 3825032.0, 3715374.0, 3623934.0, 3940675.0, 3720162.0, 3638165.0, 3695121.0, 3666164.0, 3825842.0]}, "iteration_timing_avg": 0.5847132352941178} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.19864, 9.20112, 9.19598, 9.17297, 9.1171, 9.10232, 9.04013, 8.98432, 8.94016, 8.83862]},"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3717564.0, 3824205.0, 3714643.0, 3622971.0, 3939727.0, 3718836.0, 3637293.0, 3694227.0, 3665382.0, 3825257.0]}, "iteration_timing_avg": 0.5847132352941178} \ No newline at end of file diff --git a/tests/unit_tests/models/test_llava_model.py b/tests/unit_tests/models/test_llava_model.py index e246ef466a..0110ad4e8b 100644 --- a/tests/unit_tests/models/test_llava_model.py +++ b/tests/unit_tests/models/test_llava_model.py @@ -18,16 +18,22 @@ def setup_method(self, method): Utils.initialize_model_parallel(1, 1) model_parallel_cuda_manual_seed(123) + self.language_hidden_size = 64 + self.language_num_attention_heads = 4 + language_config = TransformerConfig( - num_layers=3, hidden_size=128, num_attention_heads=8, use_cpu_initialization=False + num_layers=3, + hidden_size=self.language_hidden_size, + num_attention_heads=self.language_num_attention_heads, + use_cpu_initialization=False, ) vision_config = TransformerConfig( - num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=False + num_layers=2, hidden_size=16, num_attention_heads=2, use_cpu_initialization=False ) vision_projection_config = TransformerConfig( num_layers=2, - hidden_size=128, - ffn_hidden_size=72, + hidden_size=self.language_hidden_size, + ffn_hidden_size=32, num_attention_heads=1, use_cpu_initialization=False, ) @@ -39,7 +45,7 @@ def setup_method(self, method): self.model = LLaVAModel( language_transformer_config=language_config, language_transformer_layer_spec=language_layer_spec, - language_vocab_size=2048, + language_vocab_size=8192, language_max_sequence_length=4096, vision_transformer_config=vision_config, vision_transformer_layer_spec=vision_layer_spec, @@ -60,7 +66,7 @@ def test_constructor(self): assert isinstance(self.model, LLaVAModel) num_weights = sum([p.numel() for p in self.model.parameters()]) - assert num_weights == 1832520 + assert num_weights == 1488736 @pytest.mark.internal def test_set_input_tensor(self): @@ -73,12 +79,18 @@ def test_set_input_tensor(self): def test_preprocess_data(self): self.model.cuda() - image_embedding_value = torch.tensor(123.0) + hidden_size = 72 + # 3 images with 1 tile and 2 image with 2 tiles = 7 tiles. 
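The permute added in llava_model.py above changes how the [img_seq_len, num_tiles, hidden] image embeddings are flattened: after permuting to [num_tiles, img_seq_len, hidden], each tile's embeddings stay contiguous, matching the consecutive per-tile blocks that the updated test expectations check. In miniature:

    import torch

    img_seq_len, num_tiles, h = 3, 2, 1
    emb = torch.arange(6, dtype=torch.float).reshape(img_seq_len, num_tiles, h)
    interleaved = emb.reshape(-1, h).squeeze(-1)                  # tensor([0., 1., 2., 3., 4., 5.])
    tile_major = emb.permute(1, 0, 2).reshape(-1, h).squeeze(-1)  # tensor([0., 2., 4., 1., 3., 5.])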
- image_embeddings = image_embedding_value * torch.ones((577, 7, 128)).cuda() + image_embeddings = ( + 1e-5 + * torch.arange(577 * 7 * hidden_size, dtype=torch.float) + .reshape(577, 7, hidden_size) + .cuda() + ) image_token_index = -200 - input_ids = torch.arange(0, 1024, dtype=torch.int).expand(5, 1024).cuda() + input_ids = torch.arange(1024).expand(5, 1024).cuda() input_ids[0, 0] = image_token_index # image before text input_ids[1, 100] = image_token_index # image in between input_ids[2, -1] = image_token_index # image at the end @@ -86,8 +98,14 @@ def test_preprocess_data(self): input_ids[4, 50] = image_token_index # two images in between input_ids[4, 150] = image_token_index - language_embedding_value = torch.tensor(999.0) - language_embeddings = language_embedding_value * torch.ones((5, 1024, 128)).cuda() + # Offset by 1000 to distinguish from image embeddings. + language_embeddings = ( + 1000.0 + + 1e-5 + * torch.arange(5 * 1024 * hidden_size, dtype=torch.float) + .reshape(5, 1024, hidden_size) + .cuda() + ) # Labels are input_ids shifted to left by one. labels = torch.arange(1, 1025, dtype=torch.int).expand(5, 1024).cuda() @@ -121,14 +139,14 @@ def test_preprocess_data(self): # The fifth sample has 2 images with 3 tiles and 1024 text tokens. max_seq_len = 3 * img_seq_len - 2 + 1024 - assert embeddings.shape == torch.Size((max_seq_len, 5, 128)) + assert embeddings.shape == torch.Size((max_seq_len, 5, hidden_size)) assert labels.shape == torch.Size((5, max_seq_len)) assert loss_mask.shape == labels.shape # First sample where image is before text (index 0). - expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:577] = image_embedding_value - expected_embeddings[577:1600] = language_embedding_value + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:577] = image_embeddings[:, 0] + expected_embeddings[577:1600] = language_embeddings[0, 1:] expected_embeddings[1600:] = 0 # padding expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() @@ -144,15 +162,16 @@ def test_preprocess_data(self): expected_loss_mask[696:1600] = 1 expected_loss_mask[1600:] = 0 - assert torch.allclose(embeddings[:, 0], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 0], expected_embeddings) assert torch.allclose(labels[0], expected_labels) assert torch.allclose(loss_mask[0], expected_loss_mask) # Second sample where image is in between (index 100). The image has 2 tiles. - expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:100] = language_embedding_value - expected_embeddings[100:1254] = image_embedding_value - expected_embeddings[1254:2177] = language_embedding_value + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:100] = language_embeddings[1, :100] + expected_embeddings[100:677] = image_embeddings[:, 1] + expected_embeddings[677:1254] = image_embeddings[:, 2] + expected_embeddings[1254:2177] = language_embeddings[1, 101:] expected_embeddings[2177:] = 0 # padding expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() @@ -172,14 +191,14 @@ def test_preprocess_data(self): expected_loss_mask[1273:2177] = 1 expected_loss_mask[2177:] = 0 # padding - assert torch.allclose(embeddings[:, 1], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 1], expected_embeddings) assert torch.allclose(labels[1], expected_labels) assert torch.allclose(loss_mask[1], expected_loss_mask) # Third sample where image is at the end. 
- expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:1023] = language_embedding_value - expected_embeddings[1023:1600] = image_embedding_value + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:1023] = language_embeddings[2, :1023] + expected_embeddings[1023:1600] = image_embeddings[:, 3] expected_embeddings[1600:] = 0 # padding expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() @@ -195,13 +214,13 @@ def test_preprocess_data(self): expected_loss_mask[1023:1600] = 0 expected_loss_mask[1600:] = 0 # padding - assert torch.allclose(embeddings[:, 2], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 2], expected_embeddings) assert torch.allclose(labels[2], expected_labels) assert torch.allclose(loss_mask[2], expected_loss_mask) # Fourth sample where there is no image. - expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:1024] = language_embedding_value + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:1024] = language_embeddings[3] expected_embeddings[1024:] = 0 # padding expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() @@ -212,17 +231,18 @@ def test_preprocess_data(self): expected_loss_mask[:1024] = 1 expected_loss_mask[1024:] = 0 # padding - assert torch.allclose(embeddings[:, 3], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 3], expected_embeddings) assert torch.allclose(labels[3], expected_labels) assert torch.allclose(loss_mask[3], expected_loss_mask) - # Fifth sample has two images in between. The first image has two tiles. - expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:50] = language_embedding_value - expected_embeddings[50:1204] = image_embedding_value # two tiles - expected_embeddings[1204:1303] = language_embedding_value - expected_embeddings[1303:1880] = image_embedding_value - expected_embeddings[1880:] = language_embedding_value + # Fifth sample has two images in between (indices 50 and 150). The first image has two tiles. + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:50] = language_embeddings[4, :50] + expected_embeddings[50:627] = image_embeddings[:, 4] # two tiles + expected_embeddings[627:1204] = image_embeddings[:, 5] + expected_embeddings[1204:1303] = language_embeddings[4, 51:150] + expected_embeddings[1303:1880] = image_embeddings[:, 6] + expected_embeddings[1880:] = language_embeddings[4, 151:] expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() expected_labels[:49] = torch.arange(1, 50) @@ -238,7 +258,7 @@ def test_preprocess_data(self): expected_loss_mask[1302:1880] = 0 expected_loss_mask[1880:] = 1 - assert torch.allclose(embeddings[:, 4], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 4], expected_embeddings) assert torch.allclose(labels[4], expected_labels) assert torch.allclose(loss_mask[4], expected_loss_mask) @@ -309,7 +329,7 @@ def test_forward(self): loss_mask=None, num_image_tiles=num_image_tiles, ) - assert logits.shape == torch.Size((5, max_seq_len, 2048)) + assert logits.shape == torch.Size((5, max_seq_len, 8192)) # Try without labels and with inference params. 
inference_params = InferenceParams(5, max_seq_len) @@ -323,7 +343,7 @@ def test_forward(self): num_image_tiles=num_image_tiles, inference_params=inference_params, ) - assert logits.shape == torch.Size((5, max_seq_len, 2048)) + assert logits.shape == torch.Size((5, max_seq_len, 8192)) # Check KV cache got populated correctly. kv_dict = inference_params.key_value_memory_dict @@ -332,7 +352,11 @@ def test_forward(self): for layer_no in range(1, 4): # 3 layers in the model. layer_kv = kv_dict[layer_no] # Expected shape is [sequence_len, batch_size, num_heads, hidden_size_per_head] - assert layer_kv[0].shape == layer_kv[1].shape == torch.Size((max_seq_len, 5, 8, 16)) + assert ( + layer_kv[0].shape + == layer_kv[1].shape + == torch.Size((max_seq_len, 5, self.language_num_attention_heads, 16)) + ) @pytest.mark.internal def test_save_load(self, tmp_path): From 29793cfddd051ff0d05ceda22c8cafe764becb3d Mon Sep 17 00:00:00 2001 From: Boxiang Wang Date: Tue, 24 Sep 2024 15:06:27 -0700 Subject: [PATCH 20/50] ADLR/megatron-lm!1581 - Add MLA support into MCore Co-authored-by: Shunkang Co-authored-by: BoxiangW --- .../core/extensions/transformer_engine.py | 16 +- .../core/models/common/embeddings/__init__.py | 5 + .../models/common/embeddings/rope_utils.py | 191 +++++++++ .../common/embeddings/rotary_pos_embedding.py | 151 +------ .../embeddings/yarn_rotary_pos_embedding.py | 169 ++++++++ megatron/core/models/gpt/gpt_layer_specs.py | 147 +++++-- megatron/core/models/gpt/gpt_model.py | 4 +- megatron/core/transformer/__init__.py | 2 +- megatron/core/transformer/attention.py | 10 +- .../core/transformer/dot_product_attention.py | 11 +- .../transformer/multi_latent_attention.py | 375 ++++++++++++++++++ .../core/transformer/transformer_config.py | 56 ++- megatron/legacy/model/rms_norm.py | 3 +- megatron/legacy/model/transformer.py | 8 +- megatron/training/arguments.py | 29 +- megatron/training/yaml_arguments.py | 7 +- pretrain_gpt.py | 4 +- .../test_multi_latent_attention.py | 173 ++++++++ 18 files changed, 1161 insertions(+), 200 deletions(-) create mode 100644 megatron/core/models/common/embeddings/rope_utils.py create mode 100644 megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py create mode 100644 megatron/core/transformer/multi_latent_attention.py create mode 100644 tests/unit_tests/transformer/test_multi_latent_attention.py diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 751bcedb13..e5ff55849f 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -459,6 +459,9 @@ def __init__( attn_mask_type: AttnMaskType, attention_type: str, attention_dropout: float = None, + softmax_scale: float = None, + k_channels: int = None, + v_channels: int = None, ): self.config = config self.te_forward_mask_type = False @@ -522,9 +525,20 @@ def __init__( ) extra_kwargs['window_size'] = config.window_size + if _te_version >= packaging.version.Version("1.10.0"): + # TE 1.10.0 introduces the ability to set the different k and v channels + kv_channels = ( + (k_channels, v_channels) + if k_channels is not None and v_channels is not None + else self.config.kv_channels + ) + extra_kwargs['softmax_scale'] = softmax_scale + else: + kv_channels = self.config.kv_channels + super().__init__( num_attention_heads=self.config.num_attention_heads, - kv_channels=self.config.kv_channels, + kv_channels=kv_channels, attention_dropout=( self.config.attention_dropout if attention_dropout is None else 
attention_dropout ), diff --git a/megatron/core/models/common/embeddings/__init__.py b/megatron/core/models/common/embeddings/__init__.py index e69de29bb2..865f96da5d 100644 --- a/megatron/core/models/common/embeddings/__init__.py +++ b/megatron/core/models/common/embeddings/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from .rope_utils import apply_rotary_pos_emb +from .rotary_pos_embedding import RotaryEmbedding +from .yarn_rotary_pos_embedding import YarnRotaryEmbedding, _yarn_get_mscale diff --git a/megatron/core/models/common/embeddings/rope_utils.py b/megatron/core/models/common/embeddings/rope_utils.py new file mode 100644 index 0000000000..037377c530 --- /dev/null +++ b/megatron/core/models/common/embeddings/rope_utils.py @@ -0,0 +1,191 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from megatron.core.transformer.transformer_config import TransformerConfig + +import logging + +import torch +from torch import Tensor + +from megatron.core import parallel_state + +logger = logging.getLogger(__name__) + +try: + from apex.transformer.functional import ( + fused_apply_rotary_pos_emb, + fused_apply_rotary_pos_emb_thd, + ) + + HAVE_APPLY_ROPE_FUSION = True +except ImportError: + HAVE_APPLY_ROPE_FUSION = False + + +def get_pos_emb_on_this_cp_rank(pos_emb: Tensor, seq_dim: int) -> Tensor: + """Get the position embedding on the current context parallel rank. + + Args: + pos_emb (Tensor): Positional embedding tensor + seq_dim (int): Sequence dimension + """ + cp_size = parallel_state.get_context_parallel_world_size() + cp_rank = parallel_state.get_context_parallel_rank() + cp_idx = torch.tensor( + [cp_rank, (2 * cp_size - cp_rank - 1)], device="cpu", pin_memory=True + ).cuda(non_blocking=True) + pos_emb = pos_emb.view( + *pos_emb.shape[:seq_dim], 2 * cp_size, -1, *pos_emb.shape[(seq_dim + 1) :] + ) + pos_emb = pos_emb.index_select(seq_dim, cp_idx) + pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :]) + return pos_emb + + +def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor: + """Change sign so the last dimension becomes [-odd, +even] + + Args: + x (Tensor): Input tensor + + Returns: + Tensor: Tensor rotated half + """ + if not rotary_interleaved: + x1, x2 = torch.chunk(x, 2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + else: + x1 = x[:, :, :, ::2] + x2 = x[:, :, :, 1::2] + x_new = torch.stack((-x2, x1), dim=-1) + return x_new.view(x_new.shape[0], x_new.shape[1], x_new.shape[2], -1) + + +def _apply_rotary_pos_emb_bshd( + t: Tensor, + freqs: Tensor, + rotary_interleaved: bool = False, + multi_latent_attention: bool = False, + mscale: float = 1.0, +) -> Tensor: + """Apply rotary positional embedding to input tensor T. + + check https://kexue.fm/archives/8265 for detailed formulas + + Args: + t (Tensor): Input tensor T is of shape [seq_length, ... 
, dim] + freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim] + + Returns: + Tensor: The input tensor after applying RoPE + """ + rot_dim = freqs.shape[-1] + + # ideally t_pass is empty so rotary pos embedding is applied to all tensor t + t, t_pass = t[..., :rot_dim], t[..., rot_dim:] + + if multi_latent_attention: + x1 = t[..., 0::2] + x2 = t[..., 1::2] + t = torch.cat((x1, x2), dim=-1) + + # first part is cosine component + # second part is sine component, need to change signs with _rotate_half method + cos_ = (torch.cos(freqs) * mscale).to(t.dtype) + sin_ = (torch.sin(freqs) * mscale).to(t.dtype) + + t = (t * cos_) + (_rotate_half(t, rotary_interleaved) * sin_) + return torch.cat((t, t_pass), dim=-1) + + +def _apply_rotary_pos_emb_thd( + t: Tensor, + cu_seqlens: Tensor, + freqs: Tensor, + rotary_interleaved: bool = False, + multi_latent_attention: bool = False, + mscale: float = 1.0, +) -> Tensor: + """A baseline implementation of applying RoPE for `thd` format. + + Args: + t (Tensor): Input tensor T is of shape [t, h, d] + cu_seqlens(Tensor): Cumulative sum of sequence lengths in a batch for `t`, + with shape [b + 1] and dtype torch.int32. + freqs (Tensor): Rotary Positional embedding tensor freq is of shape [max_s, 1, 1, d] + + Returns: + Tensor: Shape [t, h, d]. The input tensor after applying RoPE. + """ + + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return torch.cat( + [ + _apply_rotary_pos_emb_bshd( + x.unsqueeze(1), + freqs[: x.size(0)], + rotary_interleaved=rotary_interleaved, + multi_latent_attention=multi_latent_attention, + mscale=mscale, + ) + for x in torch.split(t, seqlens) + ] + ).squeeze(1) + + +def apply_rotary_pos_emb( + t: Tensor, + freqs: Tensor, + config: TransformerConfig, + cu_seqlens: Optional[Tensor] = None, + mscale: float = 1.0, +): + """ + Reroute to the appropriate apply_rotary_pos_emb function depending on + fused/unfused kernels, or bshd (conventional) / thd (packed seq) format + """ + if config.apply_rope_fusion and not HAVE_APPLY_ROPE_FUSION: + # setting apply_rope_fusion in config to False + # so that subsequent queries to this config also return False + config.apply_rope_fusion = False + if not getattr(apply_rotary_pos_emb, "printed_fused_warning", False): + logger.warning( + "Setting apply_rope_fusion to false because its implementation" + " is not included in Apex. 
Try upgrading to the latest version" + ) + apply_rotary_pos_emb.printed_fused_warning = True + + if config.multi_latent_attention and config.rotary_interleaved: + logger.warning( + "rotary_interleaved is not supported with multi_latent_attention, setting it to False" + ) + config.rotary_interleaved = False + + if config.apply_rope_fusion: + if cu_seqlens is None: + return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) + else: + return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) + else: + if cu_seqlens is None: + return _apply_rotary_pos_emb_bshd( + t, + freqs, + rotary_interleaved=config.rotary_interleaved, + multi_latent_attention=config.multi_latent_attention, + mscale=mscale, + ) + else: + return _apply_rotary_pos_emb_thd( + t, + cu_seqlens, + freqs, + rotary_interleaved=config.rotary_interleaved, + multi_latent_attention=config.multi_latent_attention, + mscale=mscale, + ) diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py index 0a4e5bf6de..6be71d87c6 100644 --- a/megatron/core/models/common/embeddings/rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/rotary_pos_embedding.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING if TYPE_CHECKING: from megatron.core.transformer.transformer_config import TransformerConfig @@ -14,46 +14,30 @@ from torch import Tensor, nn from megatron.core import parallel_state +from megatron.core.models.common.embeddings.rope_utils import get_pos_emb_on_this_cp_rank logger = logging.getLogger(__name__) -try: - from apex.transformer.functional import ( - fused_apply_rotary_pos_emb, - fused_apply_rotary_pos_emb_thd, - ) - HAVE_APPLY_ROPE_FUSION = True -except ImportError: - HAVE_APPLY_ROPE_FUSION = False - - -__all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb'] - - -def get_pos_emb_on_this_cp_rank(pos_emb, seq_dim): - cp_size = parallel_state.get_context_parallel_world_size() - cp_rank = parallel_state.get_context_parallel_rank() - cp_idx = torch.tensor( - [cp_rank, (2 * cp_size - cp_rank - 1)], device="cpu", pin_memory=True - ).cuda(non_blocking=True) - pos_emb = pos_emb.view( - *pos_emb.shape[:seq_dim], 2 * cp_size, -1, *pos_emb.shape[(seq_dim + 1) :] - ) - pos_emb = pos_emb.index_select(seq_dim, cp_idx) - pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :]) - return pos_emb +__all__ = ['RotaryEmbedding'] class RotaryEmbedding(nn.Module): """Rotary Embedding for language model. Args: - kv_channels (int): Projection weights dimension in multi-head attention. Obtained from transformer config - rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. - seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE for longer sequences. The value must be a float larger than 1.0. Defaults to None - rotary_base (int, optional): Base period for rotary position embeddings. Defaults to 10000. - use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly on the GPU. Defaults to False + kv_channels (int): Projection weights dimension in multi-head attention. Obtained + from transformer config + rotary_percent (float): Percent of rotary dimension to use for rotary position + embeddings. + rotary_interleaved (bool, optional): If True, interleaved rotary position embeddings. + Defaults to False. 
+ seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE + for longer sequences. The value must be a float larger than 1.0. Defaults to None + rotary_base (int, optional): Base period for rotary position embeddings. Defaults to + 10000. + use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly + on the GPU. Defaults to False """ def __init__( @@ -111,7 +95,8 @@ def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: # emb [seq_length, .., dim] emb = emb[:, None, None, :] if parallel_state.get_context_parallel_world_size() > 1: - # slice rotary_pos_emb along sequence dimension and select the parition of the current CP rank + # slice rotary_pos_emb along sequence dimension + # and select the parition of the current CP rank emb = get_pos_emb_on_this_cp_rank(emb, 0) return emb @@ -130,7 +115,8 @@ def get_rotary_seq_len( Args: inference_params : Used during Inference time - transformer (TransformerBlock): The transformer block (decoder/encoder) used by the model + transformer (TransformerBlock): The transformer block (decoder/encoder) used + by the model transformer_input (Tensor): _description_ transformer_config (TransformerConfig): Transformer config used by the model @@ -151,102 +137,3 @@ def get_rotary_seq_len( rotary_seq_len *= transformer_config.context_parallel_size return rotary_seq_len - - -def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor: - """Change sign so the last dimension becomes [-odd, +even] - - Args: - x (Tensor): Input tensor - - Returns: - Tensor: Tensor rotated half - """ - if not rotary_interleaved: - x1, x2 = torch.chunk(x, 2, dim=-1) - return torch.cat((-x2, x1), dim=-1) - else: - x1 = x[:, :, :, ::2] - x2 = x[:, :, :, 1::2] - x_new = torch.stack((-x2, x1), dim=-1) - return x_new.view(x_new.shape[0], x_new.shape[1], x_new.shape[2], -1) - - -def apply_rotary_pos_emb_bshd(t: Tensor, freqs: Tensor, rotary_interleaved: bool = False) -> Tensor: - """Apply rotary positional embedding to input tensor T. - - check https://kexue.fm/archives/8265 for detailed formulas - - Args: - t (Tensor): Input tensor T is of shape [seq_length, ... , dim] - freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim] - - Returns: - Tensor: The input tensor after applying RoPE - """ - rot_dim = freqs.shape[-1] - - # ideally t_pass is empty so rotary pos embedding is applied to all tensor t - t, t_pass = t[..., :rot_dim], t[..., rot_dim:] - - # first part is cosine component - # second part is sine component, need to change signs with _rotate_half method - cos_ = torch.cos(freqs).to(t.dtype) - sin_ = torch.sin(freqs).to(t.dtype) - - t = (t * cos_) + (_rotate_half(t, rotary_interleaved) * sin_) - return torch.cat((t, t_pass), dim=-1) - - -def apply_rotary_pos_emb_thd( - t: Tensor, cu_seqlens: Tensor, freqs: Tensor, rotary_interleaved: bool = False -) -> Tensor: - """A baseline implementation of applying RoPE for `thd` format. - - Args: - t (Tensor): Input tensor T is of shape [t, h, d] - cu_seqlens(Tensor): Cumulative sum of sequence lengths in a batch for `t`, - with shape [b + 1] and dtype torch.int32. - freqs (Tensor): Rotary Positional embedding tensor freq is of shape [max_s, 1, 1, d] - - Returns: - Tensor: Shape [t, h, d]. The input tensor after applying RoPE. 
- """ - - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() - return torch.cat( - [ - apply_rotary_pos_emb_bshd(x.unsqueeze(1), freqs[: x.size(0)]) - for x in torch.split(t, seqlens) - ] - ).squeeze(1) - - -def apply_rotary_pos_emb( - t: Tensor, freqs: Tensor, config: TransformerConfig, cu_seqlens: Optional[Tensor] = None -): - """ - Reroute to the appropriate apply_rotary_pos_emb function depending on - fused/unfused kernels, or bshd (conventional) / thd (packed seq) format - """ - if config.apply_rope_fusion and not HAVE_APPLY_ROPE_FUSION: - # setting apply_rope_fusion in config to False so that subsequent queries to this config also return False - config.apply_rope_fusion = False - if not getattr(apply_rotary_pos_emb, "printed_fused_warning", False): - logger.warning( - "Setting apply_rope_fusion to false because its implementation" - " is not included in Apex. Try upgrading to the latest version" - ) - apply_rotary_pos_emb.printed_fused_warning = True - if config.apply_rope_fusion: - if cu_seqlens is None: - return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) - else: - return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) - else: - if cu_seqlens is None: - return apply_rotary_pos_emb_bshd(t, freqs, rotary_interleaved=config.rotary_interleaved) - else: - return apply_rotary_pos_emb_thd( - t, cu_seqlens, freqs, rotary_interleaved=config.rotary_interleaved - ) diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py new file mode 100644 index 0000000000..14d147ea34 --- /dev/null +++ b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py @@ -0,0 +1,169 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from __future__ import annotations + +import logging +import math + +import torch +from torch import Tensor + +from megatron.core import parallel_state +from megatron.core.models.common.embeddings.rope_utils import get_pos_emb_on_this_cp_rank +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding + +logger = logging.getLogger(__name__) + + +class YarnRotaryEmbedding(RotaryEmbedding): + """Yarn Rotary Embedding for language model. + + Args: + kv_channels (int): Projection weights dimension in multi-head attention. Obtained from + transformer config + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + rotary_interleaved (bool, optional): If True, interleaved rotary position embeddings. + Defaults to False. + seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE for + longer sequences. The value must be a float larger than 1.0. Defaults to None + rotary_base (float, optional): Base period for rotary position embeddings. Defaults to + 10000. + use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly on + the GPU. Defaults to False + scaling_factor (float, optional): Scaling factor for Yarn RoPE. Defaults to 1.0. + original_max_position_embeddings (int, optional): Original maximum position embeddings + length. Defaults to 4096. + beta_fast (float, optional): Fast beta value for Yarn RoPE. Defaults to 32. + beta_slow (float, optional): Slow beta value for Yarn RoPE. Defaults to 1. + mscale (float, optional): Mscale value for Yarn RoPE. Defaults to 1. + mscale_all_dim (float, optional): Mscale all dim value for Yarn RoPE. Defaults to 0. 
+ """ + + def __init__( + self, + kv_channels: int, + rotary_percent: float = 1.0, + rotary_interleaved: bool = False, + seq_len_interpolation_factor: float = None, + rotary_base: float = 10000.0, + use_cpu_initialization: bool = False, + scaling_factor: float = 1.0, + original_max_position_embeddings: int = 4096, + beta_fast: float = 32.0, + beta_slow: float = 1.0, + mscale: float = 1.0, + mscale_all_dim: float = 0.0, + ): + self.dim = kv_channels + self.rotary_base = rotary_base + self.scaling_factor = scaling_factor + self.original_max_position_embeddings = original_max_position_embeddings + self.beta_fast = beta_fast + self.beta_slow = beta_slow + self.mscale = mscale + self.mscale_all_dim = mscale_all_dim + + device = 'cpu' if use_cpu_initialization else torch.cuda.current_device() + self.inv_freq_extra = 1.0 / ( + self.rotary_base + ** (torch.arange(0, self.dim, 2, dtype=torch.float32, device=device) / self.dim) + ) + self.inv_freq_inter = 1.0 / ( + self.scaling_factor + * self.rotary_base + ** (torch.arange(0, self.dim, 2, dtype=torch.float32, device=device) / self.dim) + ) + super().__init__( + kv_channels, + rotary_percent, + rotary_interleaved, + seq_len_interpolation_factor, + rotary_base, + use_cpu_initialization, + ) + + def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: + + assert ( + not self.rotary_interleaved + ), "Yarn RoPE does not support interleaved rotary embeddings" + + if self.inv_freq_extra.device.type == 'cpu': + # move `inv_freq_extra` to GPU once at the first micro-batch forward pass + self.inv_freq_extra = self.inv_freq_extra.to(device=torch.cuda.current_device()) + + if self.inv_freq_inter.device.type == 'cpu': + # move `inv_freq_inter` to GPU once at the first micro-batch forward pass + self.inv_freq_inter = self.inv_freq_inter.to(device=torch.cuda.current_device()) + + low, high = _yarn_find_correction_range( + self.beta_fast, + self.beta_slow, + self.dim, + self.rotary_base, + self.original_max_position_embeddings, + ) + inv_freq_mask = 1.0 - _yarn_linear_ramp_mask(low, high, self.dim // 2).to( + device=self.inv_freq_extra.device, dtype=torch.float32 + ) + inv_freq = self.inv_freq_inter * (1 - inv_freq_mask) + self.inv_freq_extra * inv_freq_mask + + seq = ( + torch.arange( + max_seq_len, device=self.inv_freq_extra.device, dtype=self.inv_freq_extra.dtype + ) + + offset + ) + + freqs = torch.outer(seq, inv_freq) + + _mscale = float( + _yarn_get_mscale(self.scaling_factor, self.mscale) + / _yarn_get_mscale(self.scaling_factor, self.mscale_all_dim) + ) + + emb = torch.cat((freqs, freqs), dim=-1) + # emb [seq_length, .., dim] + emb = emb[:, None, None, :] + if parallel_state.get_context_parallel_world_size() > 1: + # slice rotary_pos_emb along sequence dimension + # and select the parition of the current CP rank + emb = get_pos_emb_on_this_cp_rank(emb, 0) + return emb, _mscale + + +# Inverse dim formula to find dim based on number of rotations +def _yarn_find_correction_dim( + num_rotations: float, dim: int, rotary_base: float = 10000, max_position_embeddings: int = 2048 +) -> float: + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / ( + 2 * math.log(rotary_base) + ) + + +# Find dim range bounds based on rotations +def _yarn_find_correction_range( + low_rot: float, + high_rot: float, + dim: int, + rotary_base: float = 10000, + max_position_embeddings: int = 2048, +) -> tuple[int, int]: + low = math.floor(_yarn_find_correction_dim(low_rot, dim, rotary_base, max_position_embeddings)) + high = 
math.ceil(_yarn_find_correction_dim(high_rot, dim, rotary_base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def _yarn_linear_ramp_mask(min: float, max: float, dim: int) -> Tensor: + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +def _yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py index d469f5e4ce..1db68dc886 100755 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -11,6 +11,10 @@ from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules from megatron.core.transformer.moe.shared_experts import SharedExpertMLP +from megatron.core.transformer.multi_latent_attention import ( + MLASelfAttention, + MLASelfAttentionSubmodules, +) from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules @@ -49,6 +53,7 @@ def get_gpt_layer_with_transformer_engine_spec( num_experts: Optional[int] = None, moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, + multi_latent_attention: Optional[bool] = False, fp8: Optional[str] = None, ) -> ModuleSpec: """Use this spec to use lower-level Transformer Engine modules (required for fp8 training). @@ -66,34 +71,63 @@ def get_gpt_layer_with_transformer_engine_spec( mlp = _get_mlp_module_spec( use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=fp8 ) - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=TELayerNormColumnParallelLinear, - core_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, - # TENorm significantly harms convergence when used - # for QKLayerNorm; we instead use the Apex implementation. 
- q_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, - k_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, + + if multi_latent_attention: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=MLASelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=MLASelfAttentionSubmodules( + linear_q_proj=TEColumnParallelLinear, + linear_q_down_proj=TEColumnParallelLinear, + linear_q_up_proj=TEColumnParallelLinear, + linear_kv_down_proj=TEColumnParallelLinear, + linear_kv_up_proj=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=TENorm if qk_layernorm else IdentityOp, + kv_layernorm=TENorm if qk_layernorm else IdentityOp, + ), ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm if num_experts else IdentityOp, + input_layernorm=TENorm if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=TENorm if num_experts else IdentityOp, - mlp=mlp, - mlp_bda=get_bias_dropout_add, - ), - ) + ) + else: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + # TENorm significantly harms convergence when used + # for QKLayerNorm; we instead use the Apex implementation. + q_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, + k_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) def get_gpt_layer_local_spec( num_experts: Optional[int] = None, moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, + multi_latent_attention: Optional[bool] = False, ) -> ModuleSpec: """Use this spec for an implementation using only modules in Megatron-Core. 
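For reference, a minimal sketch of how the new multi_latent_attention switch in these spec helpers is exercised. It closely follows the unit test added later in this patch; the concrete sizes are illustrative only, and it assumes megatron.core.parallel_state has already been initialized (tensor/pipeline parallel size 1), as in the test setup.

from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.transformer.multi_latent_attention import MLASelfAttention
from megatron.core.transformer.transformer_config import MLATransformerConfig

# Illustrative configuration; mirrors tests/unit_tests/transformer/test_multi_latent_attention.py.
config = MLATransformerConfig(
    num_layers=2,
    hidden_size=12,
    num_attention_heads=4,
    use_cpu_initialization=True,
    q_lora_rank=32,
    kv_lora_rank=32,
    qk_head_dim=128,
    v_head_dim=128,
    qk_pos_emb_head_dim=64,
    rotary_base=10000,
)

# multi_latent_attention=True selects the MLASelfAttention spec instead of the standard SelfAttention spec.
spec = get_gpt_layer_with_transformer_engine_spec(multi_latent_attention=True)
attention = MLASelfAttention(
    config,
    spec.submodules.self_attention.submodules,
    layer_number=1,
)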
@@ -109,31 +143,58 @@ def get_gpt_layer_local_spec( mlp = _get_mlp_module_spec( use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm ) - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - input_layernorm=LNImpl, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=ColumnParallelLinear, - core_attention=DotProductAttention, - linear_proj=RowParallelLinear, - q_layernorm=LNImpl if qk_layernorm else IdentityOp, - k_layernorm=LNImpl if qk_layernorm else IdentityOp, + if multi_latent_attention: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=MLASelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=MLASelfAttentionSubmodules( + linear_q_proj=ColumnParallelLinear, + linear_q_down_proj=ColumnParallelLinear, + linear_q_up_proj=ColumnParallelLinear, + linear_kv_down_proj=ColumnParallelLinear, + linear_kv_up_proj=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=LNImpl if qk_layernorm else IdentityOp, + kv_layernorm=LNImpl if qk_layernorm else IdentityOp, + ), ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl if num_experts else IdentityOp, + input_layernorm=LNImpl if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=LNImpl, - mlp=mlp, - mlp_bda=get_bias_dropout_add, - sharded_state_dict_keys_map={ - 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', - 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', - }, - ), - ) + ) + else: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=LNImpl if qk_layernorm else IdentityOp, + k_layernorm=LNImpl if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ) def _get_mlp_module_spec( diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index b5f7ce51e9..cabd97672a 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -102,7 +102,7 @@ def __init__( position_embedding_type=position_embedding_type, ) - if self.position_embedding_type == 'rope': + if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention: self.rotary_pos_emb = RotaryEmbedding( kv_channels=self.config.kv_channels, rotary_percent=rotary_percent, @@ -212,7 +212,7 @@ def forward( # Rotary positional embeddings (embedding is None for PP intermediate devices) rotary_pos_emb = None - if self.position_embedding_type == 'rope': + if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention: rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( inference_params, self.decoder, decoder_input, self.config ) diff --git a/megatron/core/transformer/__init__.py 
b/megatron/core/transformer/__init__.py index 7cc10776b7..0e3cdcfa57 100644 --- a/megatron/core/transformer/__init__.py +++ b/megatron/core/transformer/__init__.py @@ -2,5 +2,5 @@ from .module import MegatronModule from .spec_utils import ModuleSpec, build_module -from .transformer_config import TransformerConfig +from .transformer_config import MLATransformerConfig, TransformerConfig from .transformer_layer import TransformerLayer, TransformerLayerSubmodules diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 6f81787f67..850dec88e1 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -6,7 +6,7 @@ import torch from megatron.core import parallel_state, tensor_parallel -from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.models.common.embeddings import apply_rotary_pos_emb from megatron.core.parallel_state import ( get_data_parallel_group, get_data_parallel_rank, @@ -146,14 +146,14 @@ def custom_forward(*inputs): return hidden_states - def _allocate_memory(self, inference_max_sequence_length, batch_size, dtype): + def _allocate_memory(self, inference_max_sequence_length, batch_size, dim, dtype): """Allocate memory to store kv cache during inference.""" return torch.empty( inference_max_sequence_length, batch_size, self.num_query_groups_per_partition, - self.hidden_size_per_attention_head, + dim, dtype=dtype, device=torch.cuda.current_device(), ) @@ -178,10 +178,10 @@ def _adjust_key_value_for_inference(self, inference_params, key, value, rotary_p inf_max_seq_length = inference_params.max_sequence_length inf_max_batch_size = inference_params.max_batch_size inference_key_memory = self._allocate_memory( - inf_max_seq_length, inf_max_batch_size, key.dtype + inf_max_seq_length, inf_max_batch_size, key.shape[-1], key.dtype ) inference_value_memory = self._allocate_memory( - inf_max_seq_length, inf_max_batch_size, value.dtype + inf_max_seq_length, inf_max_batch_size, value.shape[-1], value.dtype ) inference_params.key_value_memory_dict[self.layer_number] = ( inference_key_memory, diff --git a/megatron/core/transformer/dot_product_attention.py b/megatron/core/transformer/dot_product_attention.py index bbac3fa4a2..d5c014cabf 100644 --- a/megatron/core/transformer/dot_product_attention.py +++ b/megatron/core/transformer/dot_product_attention.py @@ -40,6 +40,7 @@ def __init__( attn_mask_type: AttnMaskType, attention_type: str, attention_dropout: float = None, + softmax_scale: float = None, ): super().__init__(config=config) @@ -67,10 +68,14 @@ def __init__( self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size) coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if softmax_scale is None: + self.softmax_scale = 1.0 / math.sqrt(self.hidden_size_per_attention_head) + else: + self.softmax_scale = softmax_scale + if self.config.apply_query_key_layer_scaling: coeff = self.layer_number - self.norm_factor *= coeff + self.softmax_scale /= coeff self.scale_mask_softmax = FusedScaleMaskSoftmax( input_in_fp16=self.config.fp16, @@ -143,7 +148,7 @@ def forward( query.transpose(0, 1), # [b * np, sq, hn] key.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] beta=0.0, - alpha=(1.0 / self.norm_factor), + alpha=self.softmax_scale, ) # change view to [b, np, sq, sk] diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py new file mode 100644 
index 0000000000..d637e2b448 --- /dev/null +++ b/megatron/core/transformer/multi_latent_attention.py @@ -0,0 +1,375 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + + +import math +from dataclasses import dataclass +from typing import Union + +import torch + +from megatron.core import parallel_state +from megatron.core.models.common.embeddings import ( + YarnRotaryEmbedding, + _yarn_get_mscale, + apply_rotary_pos_emb, +) +from megatron.core.transformer.attention import Attention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import MLATransformerConfig + + +@dataclass +class MLASelfAttentionSubmodules: + """Submodules for the MLA self-attention layer.""" + + linear_q_proj: Union[ModuleSpec, type] = None + linear_q_down_proj: Union[ModuleSpec, type] = None + linear_q_up_proj: Union[ModuleSpec, type] = None + linear_kv_down_proj: Union[ModuleSpec, type] = None + linear_kv_up_proj: Union[ModuleSpec, type] = None + core_attention: Union[ModuleSpec, type] = None + linear_proj: Union[ModuleSpec, type] = None + q_layernorm: Union[ModuleSpec, type] = None + kv_layernorm: Union[ModuleSpec, type] = None + + +class MultiLatentAttention(Attention): + """Multi-Latent Attention layer abstract class. + + This layer only contains common modules required for the "self attn" and + "cross attn" specializations. + """ + + def __init__( + self, + config: MLATransformerConfig, + submodules: Union[MLASelfAttentionSubmodules], + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + ) -> None: + world_size = parallel_state.get_tensor_model_parallel_world_size() + assert ( + world_size == 1 + ), "MLA is not supported with Tensor Parallelism yet, \ + use Expert Parallelism and Pipeline Parallelism for better performance." + + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attention_type=attention_type, + attn_mask_type=attn_mask_type, + ) + + self.query_projection_size = self.config.v_head_dim * self.config.num_attention_heads + + self.q_head_dim = self.config.qk_head_dim + self.config.qk_pos_emb_head_dim + + mscale = _yarn_get_mscale(self.config.rotary_scaling_factor, self.config.mscale) + self.softmax_scale = mscale * mscale / math.sqrt(self.q_head_dim) + + self.rotary_pos_emb = YarnRotaryEmbedding( + self.config.qk_pos_emb_head_dim, + rotary_base=self.config.rotary_base, + scaling_factor=self.config.rotary_scaling_factor, + original_max_position_embeddings=self.config.max_position_embeddings, + beta_fast=self.config.beta_fast, + beta_slow=self.config.beta_slow, + mscale=self.config.mscale, + mscale_all_dim=self.config.mscale_all_dim, + ) + + self.core_attention = build_module( + submodules.core_attention, + config=self.config, + layer_number=self.layer_number, + attn_mask_type=self.attn_mask_type, + attention_type=self.attention_type, + softmax_scale=self.softmax_scale, + k_channels=self.q_head_dim, + v_channels=self.config.v_head_dim, + ) + + # Output. 
+ self.linear_proj = build_module( + submodules.linear_proj, + self.query_projection_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='proj', + ) + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + position_ids=None, + ): + assert rotary_pos_emb is None, "Rotary position embeddings should not be passed into MLA." + + # hidden_states: [sq, b, h] + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. + # query: [96, 1, 16, 128], key:[96, 1, 16, 128], value:[96, 1, 16, 128] + query, key, value = self.get_query_key_value_tensors( + hidden_states, + key_value_states, + position_ids, + packed_seq_params, + inference_params=inference_params, + ) + + # =================================================== + # Adjust key, value for inference + # =================================================== + # rotary_pos_emb = None + key, value, _, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb=None + ) + + # ================================== + # core attention computation + # ================================== + # Need corresponding TE change + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, key, value, attention_mask, packed_seq_params=packed_seq_params + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + packed_seq_params=packed_seq_params, + attn_mask_type=attn_mask_type, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + # ================= + # Output. [sq, b, h] + # ================= + output, bias = self.linear_proj(core_attn_out) + + return output, bias + + +class MLASelfAttention(MultiLatentAttention): + """MLA Self-attention layer class + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. 
+ """ + + def __init__( + self, + config: MLATransformerConfig, + submodules: MLASelfAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + attention_type="self", + ) + + if self.config.q_lora_rank is None: + # Not projectiing query + self.linear_q_proj = build_module( + submodules.linear_q_proj, + self.config.hidden_size, + self.config.num_attention_heads * self.q_head_dim, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + else: + + self.linear_q_down_proj = build_module( + submodules.linear_q_down_proj, + self.config.hidden_size, + self.config.q_lora_rank, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_q_up_proj = build_module( + submodules.linear_q_up_proj, + self.config.q_lora_rank, + self.config.num_attention_heads * self.q_head_dim, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_kv_down_proj = build_module( + submodules.linear_kv_down_proj, + self.config.hidden_size, + self.config.kv_lora_rank + self.config.qk_pos_emb_head_dim, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_kv_up_proj = build_module( + submodules.linear_kv_up_proj, + self.config.kv_lora_rank, + self.config.num_attention_heads * (self.config.qk_head_dim + self.config.v_head_dim), + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + if self.config.q_lora_rank is not None: + self.q_layernorm = build_module( + submodules.q_layernorm, + hidden_size=self.config.q_lora_rank, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + + self.kv_layernorm = build_module( + submodules.kv_layernorm, + hidden_size=self.config.kv_lora_rank, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + + def get_query_key_value_tensors( + self, + hidden_states, + key_value_states=None, + position_ids=None, + packed_seq_params=None, + inference_params=None, + ): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. 
+ """ + # s = sequence length, b = batch size, h = hidden size, n = num attention heads + # Attention heads [s, b, n*h] + assert ( + hidden_states.ndim == 3 + ), f"hidden_states should be 3D, [s, b, n*h], got {hidden_states.ndim}D" + q_len, bsz, _ = hidden_states.size() + + if self.config.q_lora_rank is not None: + q_compressed, _ = self.linear_q_down_proj(hidden_states) + q_compressed = self.q_layernorm(q_compressed) + q, _ = self.linear_q_up_proj(q_compressed) + else: + # hidden_states:[s, b, 2048], q: [s, b, n * 192] + q, _ = self.linear_q_proj(hidden_states) + + # q: [s, b, n, 192] + q = q.view(q_len, bsz, self.num_attention_heads_per_partition, self.q_head_dim) + + # q: [s, b, n, 128], q_pos_emb: [s, b, n, 64] + q_no_pe, q_pos_emb = torch.split( + q, [self.config.qk_head_dim, self.config.qk_pos_emb_head_dim], dim=-1 + ) + + # kv_combined: [s, b, 576] + kv_combined, _ = self.linear_kv_down_proj(hidden_states) + + # kv_compressed:[s, b, 512], k_pos_emb: [s, b, 64] + kv_compressed, k_pos_emb = torch.split( + kv_combined, [self.config.kv_lora_rank, self.config.qk_pos_emb_head_dim], dim=-1 + ) + + # kv: [s, b, 2048] + kv, _ = self.linear_kv_up_proj(self.kv_layernorm(kv_compressed)) + + # kv: [s, b, n, 256] + kv = kv.view( + q_len, + bsz, + self.num_attention_heads_per_partition, + self.config.qk_head_dim + self.config.v_head_dim, + ) + + # k_no_pe: [s, b, n, 128], value: [s, b, n, 128] + k_no_pe, value = torch.split(kv, [self.config.qk_head_dim, self.config.v_head_dim], dim=-1) + + # rotary_pos_emb:[s, b, 1, 64] + rotary_pos_emb = self.rotary_pos_emb(max_seq_len=self.config.max_position_embeddings) + + if len(rotary_pos_emb) == 2: + mscale = rotary_pos_emb[1] + rotary_pos_emb = rotary_pos_emb[0] + + if inference_params is not None: + # add offset to the sequence start for inference + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + q_len + rotary_pos_emb = rotary_pos_emb[sequence_start:sequence_end] + + # [s, b, 64] -> [s, b, 1, 64] + k_pos_emb = torch.unsqueeze(k_pos_emb, 2) + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + + # q_pos_emb: [s, b, n, 64], k_pos_emb:[s, b, 1, 64] + q_pos_emb = apply_rotary_pos_emb( + q_pos_emb, rotary_pos_emb, config=self.config, cu_seqlens=cu_seqlens_q, mscale=mscale + ) + k_pos_emb = apply_rotary_pos_emb( + k_pos_emb, rotary_pos_emb, config=self.config, cu_seqlens=cu_seqlens_kv, mscale=mscale + ) + + # query: [s, b, n, 192] + query = torch.cat([q_no_pe, q_pos_emb], dim=-1) + + # key: [s, b, n, 192] + key = torch.cat([k_no_pe, k_pos_emb], dim=-1) + + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + + return query, key, value diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index f16a0117a3..c5ce7bc6dc 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -112,6 +112,9 @@ class TransformerConfig(ModelParallelConfig): """Whether cross entropy loss is calculated over the actual number of non-padded tokens in the global batch, versus the default behavior of assuming all tokens are non-padded.""" + multi_latent_attention: bool = False + """Whether to use multi-latent attention.""" + #################### # initialization #################### @@ -262,7 +265,6 @@ class TransformerConfig(ModelParallelConfig): """When there are multiple 
experts per rank, compress multiple local (potentially small) gemms in a single kernel launch to improve the utilization and performance by leveraging the Grouped GEMM feature introduced since CUTLASS 2.8 (https://github.com/fanshiqing/grouped_gemm). - """ moe_aux_loss_coeff: float = 0 # 1e-2 would be a good start value for load balance loss. @@ -282,6 +284,7 @@ class TransformerConfig(ModelParallelConfig): moe_token_dispatcher_type: str = "allgather" """The type of token dispatcher to use. The default is 'allgather'. Options are 'allgather' and 'alltoall'.""" + moe_per_layer_logging: bool = False """Enable per-layer logging for MoE, currently supports auxiliary loss and z loss.""" @@ -513,3 +516,54 @@ def __post_init__(self): if self.moe_grouped_gemm: raise ValueError("Grouped GEMM of MoE not support fp8 for now.") + + +@dataclass +class MLATransformerConfig(TransformerConfig): + """Configuration object for megatron-core Multi-Latent Attention (MLA) transformers. + + The initialization function has an argument for each parameter, including those in + ModelParallelConfig. Included YaRN RoPE parameters that is fused in MLA. + """ + + multi_latent_attention: bool = True + """Whether to use Multi-Latent Attention.""" + + q_lora_rank: int = 512 + """Rank of Query tensor's low rank representation.""" + + kv_lora_rank: int = 512 + """Rank of Key and Value tensors' low rank representation.""" + + qk_head_dim: int = 128 + """Dimension of the head in the QK projection. q_head_dim = qk_head_dim + qk_pos_emb_head_dim""" + + qk_pos_emb_head_dim: int = 64 + """Dimension of the position embedding in the QK projection.""" + + v_head_dim: int = 128 + """Dimension of the head in the V projection.""" + + rotary_base: float = 10000 + """Rotary base for the rotary embeddings.""" + + rotary_scaling_factor: float = 40 + """Rotary scaling factor for the rotary embeddings.""" + + normalization: str = "RMSNorm" + """Default normalization layer for MLA models is RMSNorm.""" + + max_position_embeddings: int = 163840 + """Maximum position embeddings for the original model.""" + + beta_fast: float = 32 + """Beta fast for YaRN RoPE.""" + + beta_slow: float = 1 + """Beta slow for YaRN RoPE.""" + + mscale: float = 0.707 + """Mscale for YaRN RoPE in Multi-Latent Attention.""" + + mscale_all_dim: float = 0.707 + """Mscale all dimensions for YaRN RoPE in Multi-Latent Attention.""" diff --git a/megatron/legacy/model/rms_norm.py b/megatron/legacy/model/rms_norm.py index 7e4424c7b0..21ba00c600 100644 --- a/megatron/legacy/model/rms_norm.py +++ b/megatron/legacy/model/rms_norm.py @@ -8,7 +8,8 @@ class RMSNorm(torch.nn.Module): def __init__(self, dim: int, eps: float = 1e-6, - sequence_parallel: bool = False): + sequence_parallel: bool = False, + config: dict = None): """RMS Normaliation module Args: diff --git a/megatron/legacy/model/transformer.py b/megatron/legacy/model/transformer.py index 7d723df024..9dfc7f7ed8 100644 --- a/megatron/legacy/model/transformer.py +++ b/megatron/legacy/model/transformer.py @@ -13,11 +13,11 @@ from megatron import core from megatron.core import mpu, tensor_parallel from megatron.core.enums import ModelType +from megatron.legacy.model.enums import AttnMaskType, LayerType, AttnType +from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.legacy.model.fused_bias_gelu import bias_gelu_impl +from megatron.core.models.common.embeddings import apply_rotary_pos_emb from megatron.core.jit import jit_fuser -from megatron.core.models.common.embeddings.rotary_pos_embedding 
import ( - RotaryEmbedding, - apply_rotary_pos_emb, -) from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.parallel_state import ( get_tensor_and_expert_parallel_group, diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 7a0c2d8d37..4d5dc48014 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -17,7 +17,7 @@ get_config_path as get_retro_config_path, get_gpt_data_dir as get_retro_data_dir, ) -from megatron.core.transformer import TransformerConfig +from megatron.core.transformer import TransformerConfig, MLATransformerConfig from megatron.training.activations import squared_relu from megatron.training.utils import update_use_dist_ckpt @@ -42,6 +42,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False): parser = _add_biencoder_args(parser) parser = _add_vision_args(parser) parser = _add_moe_args(parser) + parser = _add_mla_args(parser) parser = _add_logging_args(parser) parser = _add_straggler_detector_args(parser) parser = _add_inference_args(parser) @@ -655,10 +656,13 @@ def _check_arg_is_not_none(args, arg): def core_transformer_config_from_args(args, config_class=None): - + # Config class. config_class = config_class or TransformerConfig + if args.multi_latent_attention: + config_class = MLATransformerConfig + # Translate args to core transformer configuration kw_args = {} for f in dataclasses.fields(config_class): @@ -877,7 +881,9 @@ def _add_network_size_args(parser): help='Disable BERT binary head.', dest='bert_binary_head') group.add_argument('--untie-embeddings-and-output-weights', action='store_true', - help='Untie embeddings and output weights.'), + help='Untie embeddings and output weights.') + group.add_argument('--multi-latent-attention', action='store_true', + help='Use multi-latent attention for model.') return parser @@ -1911,6 +1917,23 @@ def _add_moe_args(parser): return parser +def _add_mla_args(parser): + group = parser.add_argument_group(title="mla") + group.add_argument('--q-lora-rank', type=int, default=None, + help="Rank of Query tensor's low rank representation.") + group.add_argument('--kv-lora-rank', type=int, default=32, + help="Rank of Key and Value tensors' low rank representation.") + group.add_argument('--qk-head-dim', type=int, default=128, + help="Dimension of the head in the QK projection. 
q_head_dim = qk_head_dim + qk_pos_emb_head_dim") + group.add_argument('--qk-pos-emb-head-dim', type=int, default=64, + help="Dimension of the position embedding in the QK projection.") + group.add_argument('--v-head-dim', type=int, default=128, + help="Dimension of the head in the V projection.") + group.add_argument('--rotary-scaling-factor', type=float, default=1.0, + help="Rotary scaling factor for the rotary embeddings.") + + return parser + def _add_experimental_args(parser): group = parser.add_argument_group(title='experimental') diff --git a/megatron/training/yaml_arguments.py b/megatron/training/yaml_arguments.py index f81d4dee5d..3c6c39b07f 100644 --- a/megatron/training/yaml_arguments.py +++ b/megatron/training/yaml_arguments.py @@ -16,7 +16,7 @@ import torch.nn.functional as F -from megatron.core.transformer import TransformerConfig +from megatron.core.transformer import TransformerConfig, MLATransformerConfig # Taken from https://stackoverflow.com/questions/65414773/parse-environment-variable-from-yaml-with-pyyaml # Allows for yaml to use environment variables @@ -442,7 +442,10 @@ def squared_relu(x): kw_args['scaled_init_method'] = torch.nn.init.xavier_uniform_ # Return Transformer config. - return TransformerConfig(**kw_args) + if getattr(args, "multi_latent_attention", False): + return MLATransformerConfig(**kw_args) + else: + return TransformerConfig(**kw_args) def load_yaml(yaml_path): print(f"warning using experimental yaml arguments feature, argparse arguments will be ignored") diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 0bd85b76e1..96563a3acb 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -73,9 +73,9 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat transformer_layer_spec = import_module(args.spec) else: if use_te: - transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm, args.fp8) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention, args.fp8) else: - transformer_layer_spec = get_gpt_layer_local_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm) + transformer_layer_spec = get_gpt_layer_local_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention) build_model_context = nullcontext build_model_context_args = {} diff --git a/tests/unit_tests/transformer/test_multi_latent_attention.py b/tests/unit_tests/transformer/test_multi_latent_attention.py new file mode 100644 index 0000000000..4117ba6aa0 --- /dev/null +++ b/tests/unit_tests/transformer/test_multi_latent_attention.py @@ -0,0 +1,173 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +import os +from importlib.metadata import version + +import pytest +import torch +import transformer_engine as te +from pkg_resources import packaging + +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.multi_latent_attention import MLASelfAttention +from megatron.core.transformer.transformer_config import MLATransformerConfig +from tests.unit_tests.test_utilities import Utils + + +def get_te_version(): + def get_te_version_str(): + if hasattr(te, '__version__'): + return str(te.__version__) + else: + return version("transformer-engine") + + return packaging.version.Version(get_te_version_str()) + + +_te_version = get_te_version() + + +class TestParallelMLAAttention: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + self.transformer_config = MLATransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + use_cpu_initialization=True, + q_lora_rank=32, + kv_lora_rank=32, + qk_head_dim=128, + v_head_dim=128, + qk_pos_emb_head_dim=64, + rotary_base=10000, + ) + self.parallel_attention = MLASelfAttention( + self.transformer_config, + get_gpt_layer_with_transformer_engine_spec( + multi_latent_attention=True + ).submodules.self_attention.submodules, + layer_number=1, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.parallel_attention, MLASelfAttention) + assert self.parallel_attention.layer_number == 1 + + num_weights = sum([p.numel() for p in self.parallel_attention.parameters()]) + assert num_weights == 65036 + + def test_cpu_forward(self): + # we can't currently do this because the global memory buffer is on GPU + pass + + def test_gpu_forward(self): + if _te_version >= packaging.version.Version("1.10.0"): + + # use flash attention for hopper, future may support fused attention for ampere + os.environ['NVTE_FUSED_ATTN'] = "0" + os.environ['NVTE_FLASH_ATTN'] = "1" + + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 2 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + output, bias = self.parallel_attention(hidden_states, attention_mask) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + + def test_fused_rope_gpu_forward(self): + if _te_version >= packaging.version.Version("1.10.0"): + # use flash attention for hopper, future may support fused attention for ampere + os.environ['NVTE_FUSED_ATTN'] = "0" + os.environ['NVTE_FLASH_ATTN'] = "1" + + self.parallel_attention.config.apply_rope_fusion = True + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 2 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, 
sequence_length), dtype=bool).cuda() + rotary_pos_emb = torch.ones( + sequence_length, 1, 1, self.parallel_attention.config.kv_channels + ).cuda() + output, bias = self.parallel_attention( + hidden_states, attention_mask, rotary_pos_emb=rotary_pos_emb + ) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + self.parallel_attention.config.apply_rope_fusion = False + + def test_checkpointed_gpu_forward(self): + if _te_version >= packaging.version.Version("1.10.0"): + # use flash attention for hopper, future may support fused attention for ampere + os.environ['NVTE_FUSED_ATTN'] = "0" + os.environ['NVTE_FLASH_ATTN'] = "1" + + transformer_config = self.transformer_config + transformer_config.recompute_granularity = 'selective' + checkpointed_parallel_attention = MLASelfAttention( + transformer_config, + get_gpt_layer_with_transformer_engine_spec( + multi_latent_attention=True + ).submodules.self_attention.submodules, + layer_number=1, + ) + config = checkpointed_parallel_attention.config + + sequence_length = 32 + micro_batch_size = 2 + + checkpointed_parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + ( + sequence_length, + micro_batch_size, + checkpointed_parallel_attention.config.hidden_size, + ) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + output, bias = checkpointed_parallel_attention(hidden_states, attention_mask) + + assert config.recompute_granularity == 'selective' + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size From 754e0f01c79120aa8d4c8c5553fcd25d79d0b3a2 Mon Sep 17 00:00:00 2001 From: Tuomas Rintamaki Date: Tue, 24 Sep 2024 17:11:31 -0700 Subject: [PATCH 21/50] ADLR/megatron-lm!1995 - Add freeze options to pretrain_vlm --- pretrain_vlm.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pretrain_vlm.py b/pretrain_vlm.py index 7777603e53..c71cc7c19c 100644 --- a/pretrain_vlm.py +++ b/pretrain_vlm.py @@ -137,6 +137,12 @@ def model_provider( patch_dim=args.patch_dim, ) + model.freeze( + freeze_language_model=args.freeze_LM, + freeze_vision_model=args.freeze_ViT, + freeze_vision_projection=False, + ) + return model @@ -270,7 +276,18 @@ def forward_step(data_iterator, model: LLaVAModel): def add_vlm_extra_args(parser): """Extra arguments.""" group = parser.add_argument_group(title='vision language model specific arguments') - group.add_argument("--disable-vision-class-token", action="store_true", default=False) + group.add_argument( + '--freeze-LM', action='store_true', default=False, help="Freeze language model weights" + ) + group.add_argument( + '--freeze-ViT', action='store_true', default=False, help="Freeze vision model (ViT) weights" + ) + group.add_argument( + "--disable-vision-class-token", + action="store_true", + default=False, + help="Drop vision model class token", + ) return parser From 10350b6f4e32955f4f9295c4275b6fff82c5d5db Mon Sep 17 00:00:00 2001 From: Deepak Narayanan Date: Tue, 24 Sep 2024 18:42:40 -0700 Subject: [PATCH 22/50] ADLR/megatron-lm!2145 - Improve logging when decreasing batch size --- megatron/core/num_microbatches_calculator.py | 19 +++++++++++++------ 
megatron/training/checkpointing.py | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/megatron/core/num_microbatches_calculator.py b/megatron/core/num_microbatches_calculator.py index 16bd95a7b4..5850e512ca 100644 --- a/megatron/core/num_microbatches_calculator.py +++ b/megatron/core/num_microbatches_calculator.py @@ -320,6 +320,8 @@ def __init__( if rank == 0: logger.info( f'decreasing batch size from {global_batch_size} to {running_global_batch_size}' + f'to keep divisiblity by micro_batch_size={micro_batch_size} * ' + f'data_parallel_size={data_parallel_size}' ) self.num_micro_batches = ( running_global_batch_size // micro_batch_times_data_parallel_size @@ -424,7 +426,7 @@ def __init__( self.rampup_samples_per_increment = self.ramup_samples / num_increments # Initialize number of microbatches. - self.update(0, False) + self.update(0, consistency_check=False, verbose=True) def update(self, consumed_samples: int, consistency_check: bool, verbose: bool = False) -> None: """Update number of microbatches. @@ -450,10 +452,13 @@ def update(self, consumed_samples: int, consistency_check: bool, verbose: bool = if old_current_global_batch_size != self.current_global_batch_size: global_batch_size_changed = True if self.rank == 0 and global_batch_size_changed and verbose: - logger.info( - f'ramping up batch size from {old_current_global_batch_size} to ' - f'{self.current_global_batch_size}' - ) + if old_current_global_batch_size is None: + logger.info(f'setting initial batch size to {self.current_global_batch_size}') + else: + logger.info( + f'ramping up batch size from {old_current_global_batch_size} to ' + f'{self.current_global_batch_size}' + ) # Check consistency of the current global batch size. if consistency_check and not self.decrease_batch_size_if_needed: @@ -477,7 +482,9 @@ def update(self, consumed_samples: int, consistency_check: bool, verbose: bool = if self.rank == 0 and global_batch_size_changed and verbose: logger.info( f'decreasing batch size from {self.current_global_batch_size} to ' - f'{self.current_running_global_batch_size}' + f'{self.current_running_global_batch_size} to keep divisiblity by ' + f'micro_batch_size={self.micro_batch_size} * ' + f'data_parallel_size={self.data_parallel_size}' ) assert ( self.current_running_global_batch_size % self.micro_batch_times_data_parallel_size diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index cb4b7ace4d..3de49f6c57 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -1131,7 +1131,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri 'consumed_train_samples', 0) args.skipped_train_samples = getattr(checkpoint_args, 'skipped_train_samples', 0) - update_num_microbatches(consumed_samples=args.consumed_train_samples) + update_num_microbatches(consumed_samples=args.consumed_train_samples, verbose=True) args.consumed_valid_samples = getattr(checkpoint_args, 'consumed_valid_samples', 0) else: From f54686ac83786d1fc5537d527373bfe483860096 Mon Sep 17 00:00:00 2001 From: Helen Ngo Date: Tue, 24 Sep 2024 18:42:44 -0700 Subject: [PATCH 23/50] ADLR/megatron-lm!2148 - Add model.eval() to run_text_generation_server.py --- tools/run_text_generation_server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/run_text_generation_server.py b/tools/run_text_generation_server.py index 861d8d6d73..5c99bf2908 100644 --- a/tools/run_text_generation_server.py +++ b/tools/run_text_generation_server.py @@ -122,6 +122,8 @@ def 
add_text_generate_args(parser): assert len(model) == 1, "Above condition should have caught this" model = model[0] + model.eval() + if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: server = MegatronServer(model) server.run("0.0.0.0",port=args.port) From b301e5ff9efe472a365862884b936b6c5ce5249d Mon Sep 17 00:00:00 2001 From: Jon Barker Date: Wed, 25 Sep 2024 18:06:48 -0700 Subject: [PATCH 24/50] ADLR/megatron-lm!2111 - Mcore llama3.1 support Co-authored-by: Jon Barker --- docs/llama_mistral.md | 106 ++++++++++++++++++ .../llama_mistral/huggingface_reference.py | 1 + .../run_text_generation_llama3.1.sh | 56 +++++++++ .../common/embeddings/rotary_pos_embedding.py | 45 +++++++- megatron/core/models/gpt/gpt_model.py | 2 + .../core/models/multimodal/llava_model.py | 2 + megatron/training/arguments.py | 2 + pretrain_gpt.py | 3 +- pretrain_vlm.py | 1 + tools/checkpoint/loader_llama_mistral.py | 9 +- tools/run_text_generation_server.py | 3 +- 11 files changed, 219 insertions(+), 11 deletions(-) create mode 100755 examples/inference/llama_mistral/run_text_generation_llama3.1.sh diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md index 01e55c4a23..11601fd44f 100644 --- a/docs/llama_mistral.md +++ b/docs/llama_mistral.md @@ -282,6 +282,104 @@ If loading for either inference or finetuning, use the following arguments: --bf16 \ ``` +# Llama-3.1 + +Llama-3 checkpoints can be loaded into Megatron for inference and for finetuning. Loading these checkpoints consists of several steps: + +1. Get access to download the checkpoints (weights and tokenizer). +2. Convert the checkpoints from Huggingface format to Megatron format. +3. (Optional) Validate converted checkpoints +4. Setup arguments for launching the model. + +The following sections detail these steps. + +## Contents + * [Download Huggingface checkpoints](#download-huggingface-checkpoints) + * [Convert checkpoint format](#convert-checkpoint-format) + * [Huggingface format](#huggingface-format) + * [Validate checkpoint](#optional-validate-checkpoint) + * [Launch model](#launch-model) + +## Download Huggingface checkpoints + +Users must first apply for access to download the Llama-3 checkpoints from [Huggingface](https://huggingface.co/meta-llama). + +## Convert checkpoint format + +We recommend passing `--dtype bf16` for training or finetuning. Inference can be done in bfloat16 or float16. + +### Huggingface format + +The HF checkpoints can be converted to Megatron format by using Megatron's own Llama-3 checkpoint converter for HF format (see script `tools/checkpoint/loader_llama_mistral.py`). One important argument that must be set correctly is the tensor parallel size (`TP`) for each model. 
The following table shows these values: + +| Model size | Tensor parallel size (`TP`) | +| ---------- | --------------------------- | +| 8B | 1 | +| 70B | 8 | + +Using these values for `TP`, along with the path to the Llama-3 tokenizer model (automatically downloaded with original checkpoint download; see `${TOKENIZER_MODEL}` below), run the following command from the root of your Megatron source code to convert from HF format to Megatron format: + +``` +$>: python tools/checkpoint/convert.py \ + > --bf16 \ + > --model-type GPT \ + > --loader llama_mistral \ + > --saver mcore \ + > --target-tensor-parallel-size ${TP} \ + > --checkpoint-type hf + > --load-dir ${HF_FORMAT_DIR} \ + > --save-dir ${MEGATRON_FORMAT_DIR} \ + > --tokenizer-model ${TOKENIZER_MODEL} + > --model-size llama3-8B \ +``` + +Valid values for `--model-size` are `llama3.1-8B` and `llama3.1-70B` (for pretrained-only models), and `llama3.1-8Bf` and `llama3.1-70Bf` (for chat-finetuned models). + +After this conversion, we are ready to load the checkpoints into a Megatron GPT model. + +## (Optional) Validate checkpoints + +A Megatron-LM text generation server for Llama3.1 can be launched using the script `examples/llama_mistral/run_text_generation_llama3.1.sh `. + +Once running, query the server with `curl 'http://:5000/api' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' -d '{"prompts":[""], "tokens_to_generate":100, "top_k":1}'`. + +A reference generation for comparison can be obtained from the Huggingface transformers library by running `python examples/llama_mistral/huggingface_reference.py --model_path --prompt `. + +## Launch model + +If loading for either inference or finetuning, use the following arguments: + +``` +--tensor-model-parallel-size ${TP} \ +--pipeline-model-parallel-size 1 \ +--seq-length 8192 \ +--max-position-embeddings 131072 \ +--tokenizer-type HuggingFaceTokenizer \ +--tokenizer-model ${TOKENIZER_MODEL} \ +--load ${CHECKPOINT_DIR} \ +--exit-on-missing-checkpoint \ +--use-checkpoint-args \ +--no-load-optim \ +--no-load-rng \ +--untie-embeddings-and-output-weights \ +--normalization RMSNorm \ +--position-embedding-type rope \ +--no-masked-softmax-fusion \ +--attention-softmax-in-fp32 \ +--disable-bias-linear \ +--transformer-impl transformer_engine \ +--group-query-attention 8 \ +--attention-dropout 0.0 \ +--hidden-dropout 0.0 \ +--rotary-base 500000 \ +--rotary-percent 1.0 \ +--use-rope-scaling \ +--ffn-hidden-size 14336 \ +--num-attention-heads 32 \ +--swiglu \ +--bf16 \ +``` + # Mistral-7b Megatron currently supports loading the v0.3 release of Mistral-7b (which does not use sliding window attention and offers a larger 32768 vocabulary) for inference and finetuning. Loading these checkpoints consists of several steps: @@ -372,3 +470,11 @@ If loading for either inference or finetuning, use the following arguments: *Note: Experimental* Many models such as Yi-34B use the Llama architecture and may be converted from HuggingFace to Megatron using the commands in [Llama3](#llama-3). + +# Known numerical differences + +It is not expected that the megatron and Huggingface implementations of llama3.x and mistral models will produce numerically identical results. There are multiple points where small numerical differences are expected. This is a non-exhaustive list: + +1. TransformerEngine (TE) uses the model params_dtype inside RMSNorm whereas the Huggingface implementation uses fp32. See for details: https://github.com/NVIDIA/TransformerEngine/issues/1132 +2. 
Huggingface `transformers` implements the q, k and v projections in self-attention as separate GEMMs whereas mcore combines them into a single GEMM for efficiency. This leads to small numerical differences. + diff --git a/examples/inference/llama_mistral/huggingface_reference.py b/examples/inference/llama_mistral/huggingface_reference.py index 7b583612a5..9d8f4465f6 100644 --- a/examples/inference/llama_mistral/huggingface_reference.py +++ b/examples/inference/llama_mistral/huggingface_reference.py @@ -20,5 +20,6 @@ for key in inputs: inputs[key] = inputs[key].cuda() # top_k, top_p and do_sample are set for greedy argmax based sampling + outputs = model.generate(**inputs, max_length=100, do_sample=False, top_p=0, top_k=0, temperature=1.0) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) \ No newline at end of file diff --git a/examples/inference/llama_mistral/run_text_generation_llama3.1.sh b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh new file mode 100755 index 0000000000..06584f0917 --- /dev/null +++ b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# This example will start serving the Llama3.1-8B model +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_APPLY_QK_LAYER_SCALING=0 + +DISTRIBUTED_ARGS="--nproc_per_node 1 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr 0.0.0.0 \ + --master_port 6000" + +# Ensure CHECKPOINT and TOKENIZER_MODEL are provided +if [ -z "$1" ] || [ -z "$2" ]; then + echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments." + echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model" + exit 1 +fi + +# Assign command-line arguments to variables +CHECKPOINT=$1 +TOKENIZER_MODEL=$2 + +pip install flask-restful + +torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ + --use-checkpoint-args \ + --disable-bias-linear \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --transformer-impl transformer_engine \ + --normalization RMSNorm \ + --group-query-attention \ + --num-query-groups 8 \ + --no-masked-softmax-fusion \ + --attention-softmax-in-fp32 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --rotary-percent 1.0 \ + --rotary-base 500000 \ + --use-rope-scaling \ + --use-rotary-position-embeddings \ + --swiglu \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 32 \ + --hidden-size 4096 \ + --ffn-hidden-size 14336 \ + --load ${CHECKPOINT} \ + --num-attention-heads 32 \ + --max-position-embeddings 131072 \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length 8192 diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py index 6be71d87c6..f3c854275c 100644 --- a/megatron/core/models/common/embeddings/rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/rotary_pos_embedding.py @@ -9,6 +9,7 @@ from megatron.core.transformer.transformer_block import TransformerBlock import logging +import math import torch from torch import Tensor, nn @@ -36,6 +37,7 @@ class RotaryEmbedding(nn.Module): for longer sequences. The value must be a float larger than 1.0. Defaults to None rotary_base (int, optional): Base period for rotary position embeddings. Defaults to 10000. 
+ rope_scaling (bool, optional): Apply rope scaling as used in llama 3.1 use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly on the GPU. Defaults to False """ @@ -47,6 +49,7 @@ def __init__( rotary_interleaved: bool = False, seq_len_interpolation_factor: float = None, rotary_base: int = 10000, + rope_scaling: bool = False, use_cpu_initialization: bool = False, ) -> None: super().__init__() @@ -62,6 +65,44 @@ def __init__( rotary_base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim) ) + if rope_scaling: + self.inv_freq = self._apply_scaling(self.inv_freq) + + def _apply_scaling( + self, + freqs, + factor=8, + low_freq_factor=1, + high_freq_factor=4, + original_max_position_embeddings=8192, + ): + # This implementation is adapted from: + # https://github.com/huggingface/transformers/blob/2a5a6ad18aa22e98429bb5ecb880660328030ea0/src/transformers/modeling_rope_utils.py#L303-L343 + + factor = factor # `8` in the original implementation + low_freq_factor = low_freq_factor # `1` in the original implementation + high_freq_factor = high_freq_factor # `4` in the original implementation + old_context_len = original_max_position_embeddings # `8192` in the original implementation + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + wavelen = 2 * math.pi / freqs + # wavelen < high_freq_wavelen: do nothing + # wavelen > low_freq_wavelen: divide by factor + inv_freq_llama = torch.where(wavelen > low_freq_wavelen, freqs / factor, freqs) + # otherwise: interpolate between the two, using a smooth factor + smooth_factor = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor + ) + smoothed_inv_freq = ( + 1 - smooth_factor + ) * inv_freq_llama / factor + smooth_factor * inv_freq_llama + is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen) + inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama) + + return inv_freq_llama + def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: """Forward pass of RoPE embedding. 
@@ -115,8 +156,8 @@ def get_rotary_seq_len( Args: inference_params : Used during Inference time - transformer (TransformerBlock): The transformer block (decoder/encoder) used - by the model + transformer (TransformerBlock): The transformer block + (decoder/encoder) used by the model transformer_input (Tensor): _description_ transformer_config (TransformerConfig): Transformer config used by the model diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index cabd97672a..ea4bd181af 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -69,6 +69,7 @@ def __init__( position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute', rotary_percent: float = 1.0, rotary_base: int = 10000, + rope_scaling: bool = False, seq_len_interpolation_factor: Optional[float] = None, ) -> None: super().__init__(config=config) @@ -109,6 +110,7 @@ def __init__( rotary_interleaved=self.config.rotary_interleaved, seq_len_interpolation_factor=seq_len_interpolation_factor, rotary_base=rotary_base, + rope_scaling=rope_scaling, use_cpu_initialization=self.config.use_cpu_initialization, ) diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py index 6573e6f048..32527f9dea 100644 --- a/megatron/core/models/multimodal/llava_model.py +++ b/megatron/core/models/multimodal/llava_model.py @@ -76,6 +76,7 @@ def __init__( img_w: int = 336, patch_dim: int = 14, language_rotary_base: int = 10000, + language_rope_scaling: bool = False, ) -> None: super().__init__(config=language_transformer_config) @@ -112,6 +113,7 @@ def __init__( pre_process=self.pre_process, post_process=self.post_process, rotary_base=language_rotary_base, + rope_scaling=language_rope_scaling, ) self.share_embeddings_and_output_weights = ( self.language_model.share_embeddings_and_output_weights diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 4d5dc48014..162d719314 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -847,6 +847,8 @@ def _add_network_size_args(parser): help='Use interleaved rotary embedding.') group.add_argument('--rotary-seq-len-interpolation-factor', type=int, default=None, help='Sequence length interpolation factor for rotary embeddings.') + group.add_argument('--use-rope-scaling', action='store_true', + help='Apply rope scaling as used in llama3.1') group.add_argument('--no-position-embedding', action='store_false', help='Disable position embedding. 
Deprecated: use --position-embedding-type', diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 96563a3acb..3b7f8db012 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -105,7 +105,8 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, position_embedding_type=args.position_embedding_type, rotary_percent=args.rotary_percent, - rotary_base=args.rotary_base + rotary_base=args.rotary_base, + rope_scaling=args.use_rope_scaling ) return model diff --git a/pretrain_vlm.py b/pretrain_vlm.py index c71cc7c19c..b0b9d21d97 100644 --- a/pretrain_vlm.py +++ b/pretrain_vlm.py @@ -128,6 +128,7 @@ def model_provider( parallel_output=parallel_output, language_position_embedding_type=args.position_embedding_type, language_rotary_percent=args.rotary_percent, + language_rope_scaling=args.use_rope_scaling, pre_process=pre_process, post_process=post_process, add_encoder=add_encoder, diff --git a/tools/checkpoint/loader_llama_mistral.py b/tools/checkpoint/loader_llama_mistral.py index 1b5fec9afd..ea803c5543 100644 --- a/tools/checkpoint/loader_llama_mistral.py +++ b/tools/checkpoint/loader_llama_mistral.py @@ -385,15 +385,10 @@ def load_checkpoint_to_model(args): '''Set model params.''' from pretrain_gpt import model_provider - if "llama" in args.model_size or "yi" in args.model_size: - from transformers import LlamaForCausalLM as ModelForCausalLM - elif "mistral" in args.model_size: - from transformers import MistralForCausalLM as ModelForCausalLM - else: - raise AttributeError(f"args.model_size={args.model_size} not supported") + from transformers import AutoModelForCausalLM # Load Huggingface model. - hf_model = ModelForCausalLM.from_pretrained(args.load, torch_dtype=args.params_dtype, low_cpu_mem_usage=True, device_map="cpu") + hf_model = AutoModelForCausalLM.from_pretrained(args.load, torch_dtype=args.params_dtype, low_cpu_mem_usage=True, device_map="cpu") # Init Megatron model. model = model_provider(True, True).to(args.params_dtype) diff --git a/tools/run_text_generation_server.py b/tools/run_text_generation_server.py index 5c99bf2908..e5b3f08a58 100644 --- a/tools/run_text_generation_server.py +++ b/tools/run_text_generation_server.py @@ -83,7 +83,8 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, position_embedding_type=args.position_embedding_type, rotary_percent=args.rotary_percent, - rotary_base=args.rotary_base + rotary_base=args.rotary_base, + rope_scaling=args.use_rope_scaling ) return model From dcdf804b6dc6d5256077b4861bd2588170b41824 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Wed, 25 Sep 2024 18:06:50 -0700 Subject: [PATCH 25/50] ADLR/megatron-lm!2151 - ci: Run experimental UTs on dev image --- .gitlab/stages/01.tests.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.gitlab/stages/01.tests.yml b/.gitlab/stages/01.tests.yml index 68c1afcc6d..3a667cbe02 100644 --- a/.gitlab/stages/01.tests.yml +++ b/.gitlab/stages/01.tests.yml @@ -85,13 +85,17 @@ unit_tests: # the current code. This is a form of backwards compatibility testing # and helps in providing stable interfaces. 
extends: [.test_mr_rules] - image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} + image: ${IMAGE}:${CI_PIPELINE_ID} needs: [build_image] timeout: 180m parallel: matrix: - TAG: latest + IMAGE: ${CI_MCORE_IMAGE} + # - TAG: latest + # IMAGE: ${CI_MCORE_DEV_IMAGE} - TAG: 63be779b4608403f956aa1ef6c9013ab78db3eeb + IMAGE: ${CI_MCORE_IMAGE} tags: [8xL40S] variables: GIT_STRATEGY: clone @@ -112,11 +116,14 @@ unit_tests: for i in $(seq $UNIT_TEST_REPEAT); do SEED=$((RANDOM % 9000 + 1000)); - SKIPPED=() + ARGS=() if [[ $TAG != latest ]]; then - SKIPPED+=(-m "not internal") + ARGS+=(-m "not internal") + fi + if [[ $IMAGE == ${CI_MCORE_DEV_IMAGE} ]]; then + ARGS+=(-m "experimental") fi - timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${SKIPPED[@]}" tests/unit_tests + timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests done artifacts: paths: From 32b395ec228f6561b3e2f61730f6c564b9d21d2f Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy Date: Wed, 25 Sep 2024 21:33:02 -0700 Subject: [PATCH 26/50] ADLR/megatron-lm!1953 - Mcore export to export models to TRTLLM (GPU and CPU version) Co-authored-by: Shanmugam Ramasamy Co-authored-by: Shanmugam Ramasamy --- examples/export/README.md | 10 + .../ptq_and_trtllm_export}/README.md | 0 .../ptq_trtllm_llama2_7b.sh | 0 .../ptq_trtllm_llama3_1_8b.sh | 0 .../ptq_trtllm_llama3_8b.sh | 0 .../ptq_trtllm_minitron_8b.sh | 0 .../ptq_trtllm_mistral_12b.sh | 0 .../text_generation_ptq.py | 0 .../trtllm_text_generation.py | 0 examples/export/trtllm_export/README.md | 161 ++++++ .../gpt_distributed_gpu_export.py | 117 +++++ .../gpt_single_device_cpu_export.py | 118 +++++ megatron/core/export/__init__.py | 1 + megatron/core/export/data_type.py | 5 + megatron/core/export/export_config.py | 19 + megatron/core/export/model_type.py | 7 + megatron/core/export/trtllm/__init__.py | 1 + .../export/trtllm/engine_builder/__init__.py | 1 + .../engine_builder/trtllm_engine_builder.py | 148 ++++++ .../trtllm/model_to_trllm_mapping/__init__.py | 1 + .../default_conversion_dict.py | 17 + .../model_to_trllm_mapping/falcon_model.py | 25 + .../model_to_trllm_mapping/gemma_model.py | 21 + .../model_to_trllm_mapping/gpt_model.py | 28 ++ .../model_to_trllm_mapping/llama_model.py | 22 + .../model_to_trllm_mapping/starcoder_model.py | 30 ++ .../core/export/trtllm/trt_model_config.py | 15 + megatron/core/export/trtllm/trt_model_type.py | 13 + megatron/core/export/trtllm/trtllm_helper.py | 463 ++++++++++++++++++ megatron/core/export/trtllm/trtllm_layers.py | 157 ++++++ .../trtllm_weights_converter/__init__.py | 1 + ...tributed_trtllm_model_weights_converter.py | 250 ++++++++++ ...e_device_trtllm_model_weights_converter.py | 441 +++++++++++++++++ megatron/core/models/gpt/gpt_model.py | 1 + tests/unit_tests/export/trtllm/__init__.py | 0 .../test_trtllm_distributed_gpu_converter.py | 100 ++++ .../export/trtllm/test_trtllm_layers.py | 111 +++++ .../test_trtllm_single_device_converter.py | 169 +++++++ 38 files changed, 2453 insertions(+) create mode 100644 examples/export/README.md rename examples/{inference/quantization => export/ptq_and_trtllm_export}/README.md (100%) rename examples/{inference/quantization => export/ptq_and_trtllm_export}/ptq_trtllm_llama2_7b.sh (100%) rename examples/{inference/quantization 
=> export/ptq_and_trtllm_export}/ptq_trtllm_llama3_1_8b.sh (100%) rename examples/{inference/quantization => export/ptq_and_trtllm_export}/ptq_trtllm_llama3_8b.sh (100%) rename examples/{inference/quantization => export/ptq_and_trtllm_export}/ptq_trtllm_minitron_8b.sh (100%) rename examples/{inference/quantization => export/ptq_and_trtllm_export}/ptq_trtllm_mistral_12b.sh (100%) rename examples/{inference/quantization => export/ptq_and_trtllm_export}/text_generation_ptq.py (100%) rename examples/{inference/quantization => export/ptq_and_trtllm_export}/trtllm_text_generation.py (100%) create mode 100644 examples/export/trtllm_export/README.md create mode 100644 examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py create mode 100644 examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py create mode 100644 megatron/core/export/__init__.py create mode 100644 megatron/core/export/data_type.py create mode 100644 megatron/core/export/export_config.py create mode 100644 megatron/core/export/model_type.py create mode 100644 megatron/core/export/trtllm/__init__.py create mode 100644 megatron/core/export/trtllm/engine_builder/__init__.py create mode 100644 megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py create mode 100644 megatron/core/export/trtllm/trt_model_config.py create mode 100644 megatron/core/export/trtllm/trt_model_type.py create mode 100644 megatron/core/export/trtllm/trtllm_helper.py create mode 100644 megatron/core/export/trtllm/trtllm_layers.py create mode 100644 megatron/core/export/trtllm/trtllm_weights_converter/__init__.py create mode 100644 megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py create mode 100644 megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py create mode 100644 tests/unit_tests/export/trtllm/__init__.py create mode 100644 tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py create mode 100644 tests/unit_tests/export/trtllm/test_trtllm_layers.py create mode 100644 tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py diff --git a/examples/export/README.md b/examples/export/README.md new file mode 100644 index 0000000000..ddb8216f94 --- /dev/null +++ b/examples/export/README.md @@ -0,0 +1,10 @@ +# Megatron Core Export + +This module is used to export megatron core models to different inference frameworks. +Currently we support TRTLLM export . In the future we will be adding support for VLLM etc. + +## PTQ AND EXPORT +Follow the instructions in [ptq_and_trtllm_export](./ptq_and_trtllm_export) to do post training quantization, followed by an export to TRTLLM format. + +# TRTLLM EXPORT +Follow the instructions in [trtllm_export](./trtllm_export/) to do export to TRTLLM checkpoint format alone. 
\ No newline at end of file diff --git a/examples/inference/quantization/README.md b/examples/export/ptq_and_trtllm_export/README.md similarity index 100% rename from examples/inference/quantization/README.md rename to examples/export/ptq_and_trtllm_export/README.md diff --git a/examples/inference/quantization/ptq_trtllm_llama2_7b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh similarity index 100% rename from examples/inference/quantization/ptq_trtllm_llama2_7b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh diff --git a/examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh similarity index 100% rename from examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh diff --git a/examples/inference/quantization/ptq_trtllm_llama3_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh similarity index 100% rename from examples/inference/quantization/ptq_trtllm_llama3_8b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh diff --git a/examples/inference/quantization/ptq_trtllm_minitron_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh similarity index 100% rename from examples/inference/quantization/ptq_trtllm_minitron_8b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh diff --git a/examples/inference/quantization/ptq_trtllm_mistral_12b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh similarity index 100% rename from examples/inference/quantization/ptq_trtllm_mistral_12b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh diff --git a/examples/inference/quantization/text_generation_ptq.py b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py similarity index 100% rename from examples/inference/quantization/text_generation_ptq.py rename to examples/export/ptq_and_trtllm_export/text_generation_ptq.py diff --git a/examples/inference/quantization/trtllm_text_generation.py b/examples/export/ptq_and_trtllm_export/trtllm_text_generation.py similarity index 100% rename from examples/inference/quantization/trtllm_text_generation.py rename to examples/export/ptq_and_trtllm_export/trtllm_text_generation.py diff --git a/examples/export/trtllm_export/README.md b/examples/export/trtllm_export/README.md new file mode 100644 index 0000000000..50177382c9 --- /dev/null +++ b/examples/export/trtllm_export/README.md @@ -0,0 +1,161 @@ +# Megatron Core To TRTLLM Export Documentation +This guide will walk you through how you can use the megatron core export for exporting models to trtllm format + +### Contents +- [Megatron Core To TRTLLM Export Documentation](#megatron-core-to-trtllm-export-documentation) +- [Contents](#contents) + - [1. Quick Start](#1-quick-start) + - [1.1 Understanding The Code](#11-understanding-the-code) + - [1.2 Running The Code](#12-running-the-code) + - [2. GPU Export](#2-gpu-export) + - [3. Future work](#4-future-work) + +#### 1. Quick Start +This will walk you through the flow of converting an mcore gpt model to trtllm format using single device mode. The file can be found at [gpt_single_device_cpu_export.py](./single_device_export/gpt_single_device_cpu_export.py) + +NOTE: For faster performance, if your entire model will fit into gpu memory, pre transfer the model state dict to gpu and then call the get_trtllm_pretrained_config_and_model_weights function. 
The default behaviour is to transfer one layer at a time to CUDA and convert it there if a GPU is available, otherwise the conversion falls back to CPU.
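A minimal sketch of that pre-transfer step is shown below, assuming the full state dict fits in GPU memory; the `gpt_model` name is the one used in the example script, and the filtering mirrors the `_extra_state` handling shown in step 4 further down.

```python
import torch

# Hypothetical pre-transfer: move every tensor in the model state dict to the GPU
# before conversion; non-tensor entries (e.g. None "_extra_state" values) are skipped.
model_state_dict = {
    key: val.cuda()
    for key, val in gpt_model.state_dict().items()
    if isinstance(val, torch.Tensor)
}
```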
+ +##### 1.1 Understanding The Code +***STEP 1 - We initialize model parallel and other default arguments*** +We initalize tp and pp to 1 so that we can get the full model state dict on cpu +```python + initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1) +``` + +***STEP 2 - We load the model using the model_provider_function*** +NOTE: We create a simple gpt model + +```python + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, # Needs to be atleast 32 times num_attn_heads + num_attention_heads=2, + use_cpu_initialization=True, + pipeline_dtype=torch.float32, + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=100, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + # Optionally you can also load a model using this code + # sharded_state_dict=gpt_model.sharded_state_dict(prefix='') + # checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + # gpt_model.load_state_dict(checkpoint) + +``` + +***STEP 3 - Instantiate the TRTLLM Helper*** +We instantiate the [TRTLLM Helper](../../../megatron/core/export/trtllm/trtllm_helper.py) For the GPT model we instantiate trtllm_helper as shown below. +```python + if hasattr(gpt_model, "rotary_pos_emb"): + seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor + + trtllm_helper = TRTLLMHelper( + transformer_config=gpt_model.config, + model_type=ModelType.gpt, + position_embedding_type = gpt_model.position_embedding_type, + max_position_embeddings = gpt_model.max_position_embeddings, + rotary_percentage = gpt_model.rotary_percent, + rotary_base = gpt_model.rotary_base, + moe_tp_mode = 2, + multi_query_mode = False, + activation = "gelu", + seq_len_interpolation_factor = seq_len_interpolation_factor, + share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights + ) +``` + +***STEP 4 - Get the TRTLLM Weights and configs*** +To convert model weights to trtllm weights and configs, we use the [single_device_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py). We pass as inputs the model state dict, and export config. In this example we use inference tp size as 2 for the export. + +```python + model_state_dict={} + for key , val in gpt_model.state_dict().items(): + # val is non for _extra_state layers . We filter it out + if val is not None: + model_state_dict[key] = val + + export_config = ExportConfig(inference_tp_size = 2) + weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict= model_state_dict, + dtype = DataType.bfloat16, + export_config=export_config + ) +``` + +***STEP 5 - Build the TRTLLM Engine*** +Following code is used to build the TRTLLM Engine. 
+ +```python + for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list): + trtllm_helper.build_and_save_engine( + max_input_len=256, + max_output_len=256, + max_batch_size=8, + engine_dir='/opt/megatron-lm/engine', + trtllm_model_weights=trtllm_model_weights, + trtllm_model_config=trtllm_model_config, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank=64, + lora_target_modules=None, + max_prompt_embedding_table_size=0, + paged_kv_cache=True, + remove_input_padding=True, + paged_context_fmha=False, + use_refit=False, + max_num_tokens=None, + max_seq_len=512, + opt_num_tokens=None, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=False, + gpt_attention_plugin="auto", + gemm_plugin="auto", + ) +``` +
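As an optional sanity check, the saved engine can be loaded back with the TensorRT-LLM Python runtime. The snippet below is only a rough sketch under the assumption that `ModelRunner.from_dir` and `generate` behave as in recent TensorRT-LLM releases; the runtime API differs between versions, so verify it against the TensorRT-LLM documentation for your installed version.

```python
import torch
from tensorrt_llm.runtime import ModelRunner

# Assumed usage of the TensorRT-LLM runtime; engine_dir must match the one used above.
runner = ModelRunner.from_dir(engine_dir='/opt/megatron-lm/engine')

# The token ids below are placeholders; a real run would tokenize a prompt instead.
input_ids = [torch.tensor([1, 2, 3, 4], dtype=torch.int32)]
output_ids = runner.generate(batch_input_ids=input_ids, max_new_tokens=8, end_id=0, pad_id=0)
print(output_ids)
```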
+ +##### 1.2 Running The Code +An example run script is shown below. + +``` +# In a workstation +MLM_PATH=/path/to/megatron-lm +CONTAINER_IMAGE=gitlab-master.nvidia.com:5005/dl/joc/nemo-ci/trtllm_0.12/train:pipe.17669124-x86 + +docker run -it --gpus=all --ipc=host -v $MLM_PATH/:/opt/megatron-lm $CONTAINER_IMAGE bash + +# Inside the container run the following. + +cd /opt/megatron-lm/ + +CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1 examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py +``` + +
+ +#### 2. GPU Export +You can use the [gpt_distributed_gpu_export.py](./distributed_export/gpt_distributed_gpu_export.py) script to run a more optimized, on-device, distributed version of trtllm export. Internally this uses the [distributed_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py) to convert model weights on device. +In the single device version you collect all the model weights on CPU/GPU, convert them to trtllm format, and then store the engine back on disk. In the GPU version you load each individual state dict on the GPUs, convert it on the device itself and store the engine on disk. A sketch of the key API difference follows the command below. + +To run the gpu version: + +``` +CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc-per-node 2 examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py +``` + +
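For reference, the conversion call used in the GPU path is sketched below, mirroring [gpt_distributed_gpu_export.py](./distributed_export/gpt_distributed_gpu_export.py); the `_VOCAB_SIZE` constant and `gpus_per_node=2` are simply the values that example uses.

```python
# On-device distributed conversion: no ExportConfig is passed. The conversion follows
# the current model parallel layout, so vocab size and GPUs per node are given explicitly.
trtllm_model_weights, trtllm_model_config = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
    model_state_dict=gpt_model.state_dict(),
    dtype=DataType.bfloat16,
    on_device_distributed_conversion=True,
    vocab_size=_VOCAB_SIZE,
    gpus_per_node=2,
)
```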
+ +#### 3. Future work +The following are planned for the future releases . +* Pipeline parallellism for export (Work in progress) +* GPU Export for more models (Work in progress for some models) +* Refit functionality +* VLLM Support \ No newline at end of file diff --git a/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py b/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py new file mode 100644 index 0000000000..57d44f9f62 --- /dev/null +++ b/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py @@ -0,0 +1,117 @@ +import os +import torch +from megatron.core import parallel_state +from megatron.core import dist_checkpointing +from megatron.core.export.model_type import ModelType +from megatron.core.export.data_type import DataType +from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec + + +_SEQUENCE_LENGTH = 64 +_VOCAB_SIZE = 256 + +def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1): + parallel_state.destroy_model_parallel() + + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron core distributed training initialization + parallel_state.initialize_model_parallel(tensor_model_parallel_size = tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size) + +def model_provider(): + """Build the model.""" + + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, + num_attention_heads=2, + use_cpu_initialization=True, + pipeline_dtype=torch.float32 + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=_VOCAB_SIZE, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + return gpt_model + +def load_distributed_checkpoint(checkpoint_path, gpt_model): + sharded_state_dict=gpt_model.sharded_state_dict(prefix='') + checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + gpt_model.load_state_dict(checkpoint) + return gpt_model + +if __name__ == "__main__": + initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1) + model_parallel_cuda_manual_seed(123) + + gpt_model = model_provider() + device = torch.device("cuda") + gpt_model.to(device) + + # Optionally you can also load a gpt model from ckpt_path using this code below + # gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path) + + seq_len_interpolation_factor = None + if hasattr(gpt_model, "rotary_pos_emb"): + seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor + + trtllm_helper = TRTLLMHelper( + transformer_config=gpt_model.config, + model_type=ModelType.gpt, + position_embedding_type = gpt_model.position_embedding_type, + max_position_embeddings = gpt_model.max_position_embeddings, + rotary_percentage = gpt_model.rotary_percent, + rotary_base = gpt_model.rotary_base, + moe_tp_mode = 2, + multi_query_mode = False, + activation = "gelu", + seq_len_interpolation_factor = 
seq_len_interpolation_factor, + share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights + ) + + + trtllm_model_weights, trtllm_model_config = trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict= gpt_model.state_dict(), + dtype = DataType.bfloat16, + on_device_distributed_conversion=True, + vocab_size=_VOCAB_SIZE, + gpus_per_node=2, + ) + + trtllm_helper.build_and_save_engine( + max_input_len=256, + max_output_len=256, + max_batch_size=8, + engine_dir='/opt/megatron-lm/engine', + trtllm_model_weights=trtllm_model_weights[0], + trtllm_model_config=trtllm_model_config[0], + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank=64, + lora_target_modules=None, + max_prompt_embedding_table_size=0, + paged_kv_cache=True, + remove_input_padding=True, + paged_context_fmha=False, + use_refit=False, + max_num_tokens=None, + max_seq_len=512, + opt_num_tokens=None, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=False, + gpt_attention_plugin="auto", + gemm_plugin="auto", + ) diff --git a/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py b/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py new file mode 100644 index 0000000000..587e7cfdd3 --- /dev/null +++ b/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py @@ -0,0 +1,118 @@ +import os +import torch +from megatron.core import parallel_state +from megatron.core import dist_checkpointing +from megatron.core.export.model_type import ModelType +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec + + +_SEQUENCE_LENGTH = 64 + + +def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1): + parallel_state.destroy_model_parallel() + + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron core distributed training initialization + parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size) + +def model_provider(): + """Build the model.""" + + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, # Needs to be atleast 32 times num_attn_heads + num_attention_heads=2, + use_cpu_initialization=True, + pipeline_dtype=torch.float32, + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=100, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + return gpt_model + +def load_distributed_checkpoint(checkpoint_path, gpt_model): + sharded_state_dict=gpt_model.sharded_state_dict(prefix='') + checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + gpt_model.load_state_dict(checkpoint) + return gpt_model + +if __name__ == "__main__": + # Need to use TP1 PP1 for export on single device + initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1) + 
model_parallel_cuda_manual_seed(123) + + gpt_model = model_provider() + + # Optionally you can also load a gpt model from ckpt_path using this code below + # gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path) + + seq_len_interpolation_factor = None + if hasattr(gpt_model, "rotary_pos_emb"): + seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor + + trtllm_helper = TRTLLMHelper( + transformer_config=gpt_model.config, + model_type=ModelType.gpt, + position_embedding_type = gpt_model.position_embedding_type, + max_position_embeddings = gpt_model.max_position_embeddings, + rotary_percentage = gpt_model.rotary_percent, + rotary_base = gpt_model.rotary_base, + moe_tp_mode = 2, + multi_query_mode = False, + activation = "gelu", + seq_len_interpolation_factor = seq_len_interpolation_factor, + share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights + ) + + + export_config = ExportConfig(inference_tp_size = 2) + # NOTE : For faster performance, if your entire model will fit in gpu memory, transfer model state dict to GPU and then call this api + weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict= gpt_model.state_dict(), + dtype = DataType.bfloat16, + export_config=export_config + ) + + for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list): + trtllm_helper.build_and_save_engine( + max_input_len=256, + max_output_len=256, + max_batch_size=8, + engine_dir='/opt/megatron-lm/engine', + trtllm_model_weights=trtllm_model_weights, + trtllm_model_config=trtllm_model_config, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank=64, + lora_target_modules=None, + max_prompt_embedding_table_size=0, + paged_kv_cache=True, + remove_input_padding=True, + paged_context_fmha=False, + use_refit=False, + max_num_tokens=None, + max_seq_len=512, + opt_num_tokens=None, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=False, + gpt_attention_plugin="auto", + gemm_plugin="auto", + ) \ No newline at end of file diff --git a/megatron/core/export/__init__.py b/megatron/core/export/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/data_type.py b/megatron/core/export/data_type.py new file mode 100644 index 0000000000..38fbdea8f6 --- /dev/null +++ b/megatron/core/export/data_type.py @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from enum import Enum + +DataType = Enum('DataType', ["bfloat16", "float16", "float32"]) diff --git a/megatron/core/export/export_config.py b/megatron/core/export/export_config.py new file mode 100644 index 0000000000..2cc1e208be --- /dev/null +++ b/megatron/core/export/export_config.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from dataclasses import dataclass + + +@dataclass +class ExportConfig: + """Base configuration for Megatron Core Export + + These parameters control the export setting for trtllm + """ + + inference_tp_size: int = 1 + + inference_pp_size: int = 1 + + use_parallel_embedding: bool = False + + use_embedding_sharing: bool = False diff --git a/megatron/core/export/model_type.py b/megatron/core/export/model_type.py new file mode 100644 index 0000000000..6a33d6440e --- /dev/null +++ b/megatron/core/export/model_type.py @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from enum import Enum + +ModelType = Enum( + 'ModelType', ["gpt", "gptnext", "llama", "falcon", "starcoder", "mixtral", "gemma"] +) diff --git a/megatron/core/export/trtllm/__init__.py b/megatron/core/export/trtllm/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/trtllm/engine_builder/__init__.py b/megatron/core/export/trtllm/engine_builder/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/engine_builder/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py b/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py new file mode 100644 index 0000000000..e729fec410 --- /dev/null +++ b/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py @@ -0,0 +1,148 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import tensorrt_llm +from tensorrt_llm._common import check_max_num_tokens +from tensorrt_llm.builder import BuildConfig +from tensorrt_llm.commands.build import build as build_trtllm +from tensorrt_llm.logger import logger +from tensorrt_llm.lora_manager import LoraConfig +from tensorrt_llm.models.modeling_utils import optimize_model, preprocess_weights +from tensorrt_llm.plugin import PluginConfig + + +class TRTLLMEngineBuilder: + """A utility class to build TRTLLM engine""" + + @staticmethod + def build_and_save_engine( + engine_dir: str, + trtllm_model_weights: dict, + trtllm_model_config, + max_input_len: int = 1024, + max_output_len: int = 1024, + max_batch_size: int = 4, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank: int = 64, + lora_target_modules=None, + max_prompt_embedding_table_size: int = 0, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, + paged_context_fmha: bool = False, + use_refit: bool = False, + max_num_tokens: int = None, + max_seq_len: int = None, + opt_num_tokens: int = None, + max_beam_width: int = 1, + tokens_per_block: int = 128, + multiple_profiles: bool = False, + gpt_attention_plugin: str = "auto", + gemm_plugin: str = "auto", + ): + """Method to build the TRTLLM Engine + + This method uses the TRTLLMEngineBuilder to build and save the engine to engine dir + + Args: + engine_dir (str): The file path to save the engine + trtllm_model_weights (dict): The TRTLLM converted model weights dict + trtllm_model_config : The TRTLLM Config + max_input_len (int, optional): Max input length. Defaults to 1024. + max_output_len (int, optional): Max output length. Defaults to 1024. + max_batch_size (int, optional): Max batch size. Defaults to 4. + model_type (ModelType, optional): ModelType enum. Defaults to ModelType.gpt. 
+ lora_ckpt_list (_type_, optional): Lora checkpoint list. Defaults to None. + use_lora_plugin (_type_, optional): Use lora plugin. Defaults to None. + max_lora_rank (int, optional): Max lora rank. Defaults to 64. + lora_target_modules (_type_, optional): Lora target modules. Defaults to None. + max_prompt_embedding_table_size (int, optional): Defaults to 0. + paged_kv_cache (bool, optional): Use Paged KV cache. Defaults to True. + remove_input_padding (bool, optional): Remove input padding. Defaults to True. + paged_context_fmha (bool, optional): Paged context fmha. Defaults to False. + use_refit (bool, optional): Use refit. Defaults to False. + max_num_tokens (int, optional): Max num of tokens. Defaults to None. + max_seq_len (int, optional): Max seq length. Defaults to None. + opt_num_tokens (int, optional): Opt number of tokens. Defaults to None. + max_beam_width (int, optional): Max beam width. Defaults to 1. + tokens_per_block (int, optional): Nmber of tokens per block. Defaults to 128. + multiple_profiles (bool, optional): Use multiple profiles. Defaults to False. + gpt_attention_plugin (str, optional): Gpt attention plugin to use. Defaults to "auto". + gemm_plugin (str, optional): Gemma plugin to use. Defaults to "auto". + """ + architecture = ( + "LLaMAForCausalLM" + if trtllm_model_config.architecture == "LlamaForCausalLM" + else trtllm_model_config.architecture + ) + try: + model_cls = getattr(tensorrt_llm.models, architecture) + except: + raise AttributeError(f"Could not find TRTLLM model for architecture: {architecture}!") + + logger.set_level("info") + plugin_config = PluginConfig() + plugin_config.gpt_attention_plugin = gpt_attention_plugin + plugin_config.gemm_plugin = gemm_plugin + if paged_kv_cache: + plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block) + else: + plugin_config.paged_kv_cache = False + plugin_config.remove_input_padding = remove_input_padding + plugin_config.use_paged_context_fmha = paged_context_fmha + plugin_config.multiple_profiles = multiple_profiles + + if max_seq_len is None: + max_seq_len = max_input_len + max_output_len + + max_num_tokens, opt_num_tokens = check_max_num_tokens( + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, + max_seq_len=max_seq_len, + max_batch_size=max_batch_size, + max_input_len=max_input_len, + max_beam_width=max_beam_width, + remove_input_padding=remove_input_padding, + enable_context_fmha=plugin_config.context_fmha, + tokens_per_block=tokens_per_block, + multiple_profiles=multiple_profiles, + ) + + build_dict = { + 'max_input_len': max_input_len, + 'max_output_len': max_output_len, + 'max_batch_size': max_batch_size, + 'max_beam_width': max_beam_width, + 'max_seq_len': max_seq_len, + 'max_num_tokens': max_num_tokens, + 'opt_num_tokens': opt_num_tokens, + 'max_prompt_embedding_table_size': max_prompt_embedding_table_size, + 'gather_context_logits': False, + 'gather_generation_logits': False, + 'strongly_typed': False, + 'builder_opt': None, + 'use_refit': use_refit, + 'multiple_profiles': multiple_profiles, + } + build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config) + + if use_lora_plugin is not None: + # build_config.plugin_config.set_lora_plugin(use_lora_plugin) + # build_config.plugin_config._lora_plugin = use_lora_plugin + lora_config = LoraConfig( + lora_dir=lora_ckpt_list, + lora_ckpt_source='nemo', # TODO : NEED TO SEE HOW TO HANDLE THIS FOR MCORE + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + ) + build_config.lora_config = 
lora_config + + model = model_cls.from_config(trtllm_model_config) + model = optimize_model( + model, + use_parallel_embedding=trtllm_model_config.use_parallel_embedding, + share_embedding_table=trtllm_model_config.share_embedding_table, + ) + preprocess_weights(trtllm_model_weights, trtllm_model_config) + model.load(trtllm_model_weights) + engine = build_trtllm(model, build_config) + engine.save(engine_dir) diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py b/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py b/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py new file mode 100644 index 0000000000..924dda4bc8 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.model_type import ModelType +from megatron.core.export.trtllm.model_to_trllm_mapping.falcon_model import FALCON_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.gemma_model import GEMMA_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_model import GPT_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.llama_model import LLAMA_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.starcoder_model import STARCODER_DICT + +DEFAULT_CONVERSION_DICT = { + ModelType.llama: LLAMA_DICT, + ModelType.falcon: FALCON_DICT, + ModelType.gemma: GEMMA_DICT, + ModelType.starcoder: STARCODER_DICT, + ModelType.gpt: GPT_DICT, + ModelType.gptnext: GPT_DICT, # TODO : Check if this is right +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py new file mode 100644 index 0000000000..1640f992a1 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
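# Illustrative sketch (not part of the patch): the per-model dictionaries in this directory feed
# DEFAULT_CONVERSION_DICT above. Extra or renamed layers can be mapped by passing a custom dict
# to TRTLLMHelper, which merges it on top of the defaults for the chosen ModelType; layer numbers
# are omitted from the keys, as the helper docstring notes. `transformer_config` below is assumed
# to be an existing TransformerConfig from the caller.
#
#     from megatron.core.export.model_type import ModelType
#     from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
#     from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers
#
#     custom_mapping = {
#         'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight,
#     }
#     helper = TRTLLMHelper(
#         transformer_config=transformer_config,
#         model_type=ModelType.gpt,
#         trtllm_conversion_dict=custom_mapping,   # merged on top of GPT_DICT
#     )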
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +FALCON_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + 'embedding.position_embeddings.weight': TRTLLMLayers.position_embedding, + # ATTENTION + 'decoder.layers.input_layernorm.weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.input_layernorm.bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + # MLP + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py new file mode 100644 index 0000000000..47a0211706 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py @@ -0,0 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +GEMMA_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + # ATTENTION + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + # MLP + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py new file mode 100644 index 0000000000..eda27600c6 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +GPT_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + 'embedding.position_embeddings.weight': TRTLLMLayers.position_embedding, + # ATTENTION + 'decoder.layers.input_layernorm.weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.input_layernorm.bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.bias': TRTLLMLayers.attention_qkv_bias, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + 'decoder.layers.self_attention.linear_proj.bias': TRTLLMLayers.attention_dense_bias, + # MLP + 'decoder.layers.pre_mlp_layernorm.weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.pre_mlp_layernorm.bias': TRTLLMLayers.post_layernorm_bias, + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc1.bias': TRTLLMLayers.mlp_fc_bias, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc2.bias': TRTLLMLayers.mlp_projection_bias, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py new file mode 100644 index 0000000000..5fd2067081 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +LLAMA_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + 'embedding.position_embeddings.weight': TRTLLMLayers.position_embedding, + # ATTENTION + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + # MLP + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py new file mode 100644 index 0000000000..dce61d26c5 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +STARCODER_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + # ATTENTION + 'decoder.layers.input_layernorm.weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.input_layernorm.bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.bias': TRTLLMLayers.attention_qkv_bias, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + 'decoder.layers.self_attention.linear_proj.bias': TRTLLMLayers.attention_dense_bias, + # MLP + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc1.bias': TRTLLMLayers.mlp_fc_bias, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc2.bias': TRTLLMLayers.mlp_projection_bias, + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_bias': TRTLLMLayers.post_layernorm_bias, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/trt_model_config.py b/megatron/core/export/trtllm/trt_model_config.py new file mode 100644 index 0000000000..2ed09398c2 --- /dev/null +++ b/megatron/core/export/trtllm/trt_model_config.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import tensorrt_llm + +from megatron.core.export.model_type import ModelType + +TRT_MODEL_CONFIG = { + ModelType.gpt: tensorrt_llm.models.gpt.config.GPTConfig, + ModelType.gptnext: tensorrt_llm.models.gpt.config.GPTConfig, + ModelType.starcoder: tensorrt_llm.models.gpt.config.GPTConfig, + ModelType.mixtral: tensorrt_llm.models.llama.config.LLaMAConfig, + ModelType.llama: tensorrt_llm.models.llama.config.LLaMAConfig, + ModelType.gemma: tensorrt_llm.models.GemmaConfig, + ModelType.falcon: tensorrt_llm.models.falcon.config.FalconConfig, +} diff --git a/megatron/core/export/trtllm/trt_model_type.py b/megatron/core/export/trtllm/trt_model_type.py new file mode 100644 index 0000000000..f45ff1786e --- /dev/null +++ b/megatron/core/export/trtllm/trt_model_type.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.model_type import ModelType + +TRT_MODEL_TYPE_STRING = { + ModelType.gpt: 'GPTForCausalLM', + ModelType.gptnext: 'GPTForCausalLM', + ModelType.starcoder: 'GPTForCausalLM', + ModelType.mixtral: 'LlamaForCausalLM', + ModelType.llama: 'LlamaForCausalLM', + ModelType.gemma: 'GemmaForCausalLM', + ModelType.falcon: 'FalconForCausalLM', +} diff --git a/megatron/core/export/trtllm/trtllm_helper.py b/megatron/core/export/trtllm/trtllm_helper.py new file mode 100644 index 0000000000..9db8d246fc --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_helper.py @@ -0,0 +1,463 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
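# Illustrative note (not part of the patch): the two mappings defined just above are what the
# helper below uses to pick a TensorRT-LLM pretrained-config class and architecture string,
# e.g. for ModelType.llama:
#
#     from megatron.core.export.model_type import ModelType
#     from megatron.core.export.trtllm.trt_model_config import TRT_MODEL_CONFIG
#     from megatron.core.export.trtllm.trt_model_type import TRT_MODEL_TYPE_STRING
#
#     TRT_MODEL_CONFIG[ModelType.llama]       # tensorrt_llm.models.llama.config.LLaMAConfig
#     TRT_MODEL_TYPE_STRING[ModelType.llama]  # 'LlamaForCausalLM'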
+ +import tensorrt_llm +from tensorrt_llm.functional import non_gated_version +from tensorrt_llm.layers import MoeConfig + +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.model_type import ModelType +from megatron.core.export.trtllm.engine_builder.trtllm_engine_builder import TRTLLMEngineBuilder +from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import ( + DEFAULT_CONVERSION_DICT, +) +from megatron.core.export.trtllm.trt_model_config import TRT_MODEL_CONFIG +from megatron.core.export.trtllm.trt_model_type import TRT_MODEL_TYPE_STRING + +# pylint: disable=line-too-long +from megatron.core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter import ( + DistributedTRTLLMModelWeightsConverter, +) +from megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter import ( + SingleDeviceTRTLLMModelWeightsConverter, +) +from megatron.core.transformer.transformer_config import TransformerConfig + + +class TRTLLMHelper: + """TRTLLM Helper class to convert export and build TRTLLM model.""" + + def __init__( + self, + transformer_config: TransformerConfig, + model_type: ModelType, + trtllm_conversion_dict: dict = {}, + position_embedding_type: str = 'learned_absolute', + max_position_embeddings: int = None, + rotary_percentage: int = 1.0, + rotary_base: int = 10000, + moe_tp_mode: int = 2, + multi_query_mode: bool = False, + activation: str = "gelu", + seq_len_interpolation_factor: float = None, + moe_renorm_mode=None, + share_embeddings_and_output_weights=False, + ): + """Constructor for the TRTLLMHelper + + There are two public API's supported by this helper. + a) get_trtllm_pretrained_config_and_model_weights + b) build_and_save_engine + + Args: + transformer_config (TransformerConfig): The transformer config + model_type (ModelType): The type of the input model. Enum (megatron.core.export.model_type.ModelType) + conversion_dict (dict, optional): A conversion dictionary that will map your model layer names to trtllm equivalent layer names. Sample dictionaries are given megatron/core/export/model_mapping. NOTE: Ingore layer numbers in the model layer names. (e.g) decoder.layers.0.attention_qkv.weight will be decoder.layers.attention_qkv.weight in the mapping dictionary. Defaults to {}. + position_embedding_type (str, optional): The position embedding type. Defaults to None. + max_position_embeddings (int, optional): Max posistion embeddings value. Defaults to None. + rotary_percentage (int, optional): The rotary percentage if using rope embedding. Defaults to 1.0. + rotary_base (int, optional): The rotary base (theta value) if using rope embeddings. Defaults to 10000. + moe_tp_mode (int, optional): TRTLLM Config. Defaults to 2. + multi_query_mode (bool, optional): Defaults to False. + activation (str, optional): Defaults to "gelu". + seq_len_interpolation_factor (float, optional): The sequence length interpolation factor if using rope embeddings. Defaults to None. + moe_renorm_mode (optional) : Renormalization mode if using mixture of experts. Defaults to None. + share_embeddings_and_output_weights (bool, optional): True if input and output layers share weights. Defaults to False. 
+ """ + + self.transformer_config = transformer_config + self.model_type = model_type + self.trtllm_conversion_dict = DEFAULT_CONVERSION_DICT[model_type] + self.trtllm_conversion_dict.update(trtllm_conversion_dict) + assert position_embedding_type in [ + 'learned_absolute', + 'rope', + ], f"Position embedding type should be one of learned_absolute, rope. You entered {position_embedding_type}" + self.position_embedding_type = position_embedding_type + self.max_position_embeddings = max_position_embeddings + self.rotary_percentage = rotary_percentage + self.rotary_base = rotary_base + self.moe_tp_mode = moe_tp_mode + self.multi_query_mode = multi_query_mode + self.activation = activation + self.seq_len_interpolation_factor = seq_len_interpolation_factor + self.moe_renorm_mode = moe_renorm_mode + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + + def _get_trtllm_config( + self, + export_config: ExportConfig, + world_size: int, + gpus_per_node: int, + vocab_size_padded: int, + dtype: DataType, + ): + """Get TRTLLM Config + + Returns appropriate TRTLLM PretrainedConfig used by TRTLLM for building engine + + Args: + export_config (ExportConfig): The export config that defines inference tp , pp size etc. + world_size (int): The number of gpus (Mostly TP * PP) + gpus_per_node (int): Num gpus per node + vocab_size_padded (int): Padded vocab size + dtype (DataType): The datatype or model precision + + Returns: + GPTConfig or the LLamaConfig or the PretrainedConfig constructed from your model config + """ + hidden_act = self.activation + hidden_act = ( + hidden_act.split("-")[-1] + if self.transformer_config.num_moe_experts + else non_gated_version(hidden_act) + ) + + config = { + 'architecture': TRT_MODEL_TYPE_STRING[self.model_type], + 'dtype': dtype.name, + 'num_hidden_layers': self.transformer_config.num_layers, + 'num_attention_heads': self.transformer_config.num_attention_heads, + 'num_key_value_heads': ( + self.transformer_config.num_query_groups + if self.transformer_config.num_query_groups + else self.transformer_config.num_attention_heads + ), + 'head_size': self.transformer_config.kv_channels, + 'hidden_size': self.transformer_config.hidden_size, + 'intermediate_size': self.transformer_config.ffn_hidden_size, + 'norm_epsilon': self.transformer_config.layernorm_epsilon, + 'vocab_size': vocab_size_padded, + 'position_embedding_type': ( + "rope_gpt_neox" if self.position_embedding_type == "rope" else "learned_absolute" + ), + 'max_position_embeddings': self.max_position_embeddings, + 'hidden_act': hidden_act, + 'use_parallel_embedding': export_config.use_parallel_embedding, + 'embedding_sharding_dim': 0, + 'share_embedding_table': export_config.use_embedding_sharing, + 'quantization': {'quant_algo': None, 'kv_cache_quant_algo': None}, + 'bias': self.transformer_config.add_bias_linear, + 'apply_query_key_layer_scaling': False, + 'rotary_pct': self.rotary_percentage, + 'rotary_base': self.rotary_base, + 'moe_num_experts': ( + 0 + if self.transformer_config.moe_router_topk == 0 + else (self.transformer_config.num_moe_experts or 1) + ), + 'moe_top_k': self.transformer_config.moe_router_topk, + 'moe_normalization_mode': self.moe_renorm_mode + or MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE, + 'moe_tp_mode': self.moe_tp_mode, + 'logits_dtype': 'float32', + 'world_size': world_size, + 'tp_size': export_config.inference_tp_size, + 'pp_size': export_config.inference_pp_size, + 'gpus_per_node': gpus_per_node, + } + + if self.model_type == ModelType.falcon: + 
config["new_decoder_architecture"] = ( + False if self.transformer_config.num_layers == 32 else True + ) + config["parallel_attention"] = True + + if self.seq_len_interpolation_factor is not None: + config["rotary_scaling"] = { + "type": "linear", + "factor": float(self.seq_len_interpolation_factor), + } + + config_cls = TRT_MODEL_CONFIG[self.model_type] + return config_cls(**config) + + # pylint: disable=line-too-long + def get_trtllm_pretrained_config_and_model_weights( + self, + model_state_dict, + dtype: DataType, + export_config: ExportConfig = None, + on_device_distributed_conversion: bool = False, + vocab_size: int = None, + gpus_per_node: int = None, + state_dict_split_by_layer_numbers: bool = True, + ): + """Get TRTLLM Config and Converted Model Weights + + This function returns the trtllm model weights as a list. + There are two modes for conversion. The default is to use a single device cpu/gpu for conversion. + In the single device mode, we use cuda device automatically if available, if not we convert on CPU. + NOTE: For faster performance, if your entire model will fit in memory, pre transfer the model state dict to cuda device and then call this function. + Default behaviour is to transfer one layer at a time to cuda and convert if available, else do cpu conversion. + For on device conversion it returns weights which will be used on the device itself. + Same thing happens with the pretrained config + + Args: + model_state_dict (dict, optional): The input model state dictionary (Entire model state loaded on CPU). Used only when on device conversion is set to False. Defaults to None. + False, or the model state dict of each GPU in the case of on_device conversion) + export_config (ExportConfig): The export config used to define inference tp size, pp size etc. Used only for on device conversion. + dtype (DataType): The data type of model precision + on_device_distributed_conversion (bool, optional): Convert on gpus in distributed setting. This assumes that the model state dict is sharded according to required inference model parallelism and that each gpu gets its part of the model state dict . Defaults to False. + vocab_size (int, optional): The vocabulary size. Defaults to None. + gpus_per_node (int, optional): The number of gpus per node. Used for on device conversion. + state_dict_split_by_layer_numbers (bool, optional): Are the model layers split by layer numbers in state dict. For example : mlp.fc1.weight can be represented like mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim]} or it can be like mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight ... for all layers. If you use represenation 2 set this to True. Defaults to True + + Returns: + Two lists . First list of trtllm converted model weights(Either on device, or a list of weights for each gpu) and the trtllm_model_configs. + """ + assert not ( + self.share_embeddings_and_output_weights and not export_config.use_parallel_embedding + ), "Found share_embeddings_and_output_weights is True in the model. So set export_config.use_embedding_sharing to True" + + if on_device_distributed_conversion: + assert (vocab_size is not None, "Need to pass in vocab_size for on device") + assert ( + self.model_type in [ModelType.gpt, ModelType.gptnext, ModelType.llama], + "On device conversion only supported for model types gptnext and llama", + ) + assert ( + export_config is None, + "Export config is inferred based on the parallel state. 
If you want to set inference tp 2, then load the model with this TP2 setting and just pass in the model state dict. ", + ) + assert ( + gpus_per_node is not None + ), "Need to pass in gpus_per_node for on device conversion" + trtllm_model_weights_on_device, trtllm_model_config = ( + self._get_trtllm_pretrained_config_and_model_weights_in_distributed_setting( + model_state_dict, dtype, vocab_size, gpus_per_node + ) + ) + return [trtllm_model_weights_on_device], [trtllm_model_config] + + else: + assert ( + vocab_size is None + ), "Vocab size is inferred from the input layer for cpu conversion. So leave it as None" + trtllm_model_weights_list, trtllm_model_config_list = ( + self._get_trtllm_pretrained_config_and_model_weights_list_on_single_device( + export_config, + model_state_dict, + dtype, + gpus_per_node, + state_dict_split_by_layer_numbers, + ) + ) + + return trtllm_model_weights_list, trtllm_model_config_list + + def _get_trtllm_pretrained_config_and_model_weights_in_distributed_setting( + self, model_state_dict: dict, dtype: DataType, vocab_size: int, gpus_per_node: int + ): + """Get the TRTLLM Pretrained config and model weights list in a distributed setting + + This function assumes the model state dict is distributed according to model parallelism . + Each device gets its own model state dict + + Args: + export_config (ExportConfig): The export config to set inference tp, pp size etc. + model_state_dict (dict): The model state dictionary (All collected on cpu) + dtype (DataType): The data type or model precision + vocab_size (int): Tokenizer vocab size + gpus_per_node (int): The number of gpus per node + + Returns: + Two lists . List of trtllm converted model weights and trtllm model configs (One for each gpu). + """ + + distributed_trtllm_model_weights_converter = DistributedTRTLLMModelWeightsConverter( + transformer_config=self.transformer_config, + dtype=dtype, + multi_query_mode=self.multi_query_mode, + activation=self.activation, + ) + distributed_trtllm_model_weights_converter.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=self.trtllm_conversion_dict, + tokenizer_vocab_size=vocab_size, + ) + + export_config = ExportConfig( + inference_pp_size=distributed_trtllm_model_weights_converter.inference_pp_size, + inference_tp_size=distributed_trtllm_model_weights_converter.inference_tp_size, + use_parallel_embedding=True, + ) + + world_size = export_config.inference_tp_size * export_config.inference_pp_size + + trtllm_model_config = self._get_trtllm_config( + export_config=export_config, + world_size=world_size, + gpus_per_node=gpus_per_node, + vocab_size_padded=vocab_size, + dtype=dtype, + ) + + model_parallel_rank = ( + distributed_trtllm_model_weights_converter.pp_rank + * distributed_trtllm_model_weights_converter.inference_tp_size + + distributed_trtllm_model_weights_converter.tp_rank + ) + + trtllm_model_config.mapping = tensorrt_llm.Mapping( + world_size=world_size, + rank=model_parallel_rank, + tp_size=export_config.inference_tp_size, + pp_size=export_config.inference_pp_size, + ) + + return distributed_trtllm_model_weights_converter.trtllm_model_weights, trtllm_model_config + + def _get_trtllm_pretrained_config_and_model_weights_list_on_single_device( + self, + export_config: ExportConfig, + model_state_dict: dict, + dtype: DataType, + gpus_per_node=None, + state_dict_split_by_layer_numbers=True, + ): + """Get the TRTLLM Pretrained config and model weights list (one per gpu rank) on single device (CPU/GPU) + + This function assumes the entire model 
state dict is present in CPU or on one GPU + + Args: + export_config (ExportConfig): The export config to set inference tp, pp size etc. + model_state_dict (dict): The model state dictionary (All collected on cpu) + dtype (DataType): The data type or model precision + gpus_per_node (int, optional): Number of gpus per node + state_dict_split_by_layer_numbers (bool, optional): Are the model layers split by layer numbers in state dict. For example : mlp.fc1.weight can be represented like mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim]} or it can be like mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight ... for all layers. If you use represenation 2 set this to True. Defaults to True + + Returns: + Two lists . List of trtllm converted model weights and trtllm model configs (One for each gpu). + """ + trtllm_model_configs_list = [] + trtllm_model_weights_list = [] + + single_device_trtllm_model_weights_converter = SingleDeviceTRTLLMModelWeightsConverter( + export_config=export_config, + transformer_config=self.transformer_config, + dtype=dtype, + activation=self.activation, + multi_query_mode=self.multi_query_mode, + ) + # Convert the input model state dict to trtllm model weights dictionary + single_device_trtllm_model_weights_converter.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=self.trtllm_conversion_dict, + state_dict_split_by_layer_numbers=state_dict_split_by_layer_numbers, + ) + + vocab_size_padded = single_device_trtllm_model_weights_converter.get_padded_vocab_size() + world_size = export_config.inference_tp_size * export_config.inference_pp_size + gpus_per_node = gpus_per_node or export_config.inference_tp_size + + for gpu_rank in range(world_size): + mapping = tensorrt_llm.Mapping( + world_size=world_size, + rank=gpu_rank, + tp_size=export_config.inference_tp_size, + pp_size=export_config.inference_pp_size, + ) + + # Important to create a new instance everytime so that the list elements have differnt rank values in the mapping object + trtllm_model_config = self._get_trtllm_config( + export_config=export_config, + world_size=world_size, + gpus_per_node=gpus_per_node, + vocab_size_padded=vocab_size_padded, + dtype=dtype, + ) + trtllm_model_config.mapping = mapping + trtllm_model_configs_list.append(trtllm_model_config) + + # Get the model weights for each rank and append it to the trtllm_model_weights_list + trtllm_model_weights_per_gpu = ( + single_device_trtllm_model_weights_converter.get_local_model_weights_per_gpu( + mapping, trtllm_model_config + ) + ) + trtllm_model_weights_list.append(trtllm_model_weights_per_gpu) + + return trtllm_model_weights_list, trtllm_model_configs_list + + def build_and_save_engine( + self, + engine_dir: str, + trtllm_model_weights: dict, + trtllm_model_config, + max_input_len: int = 1024, + max_output_len: int = 1024, + max_batch_size: int = 4, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank: int = 64, + lora_target_modules=None, + max_prompt_embedding_table_size: int = 0, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, + paged_context_fmha: bool = False, + use_refit: bool = False, + max_num_tokens: int = None, + max_seq_len: int = None, + opt_num_tokens: int = None, + max_beam_width: int = 1, + tokens_per_block: int = 128, + multiple_profiles: bool = False, + gpt_attention_plugin: str = "auto", + gemm_plugin: str = "auto", + ): + """Method to build the TRTLLM Engine + + This method uses the TRTLLMEngineBuilder to build and save the 
engine to engine dir + + Args: + engine_dir (str): The file path to save the engine + trtllm_model_weights (dict): The TRTLLM converted model weights dict + trtllm_model_config : The TRTLLM Config + max_input_len (int, optional): Max input length. Defaults to 1024. + max_output_len (int, optional): Max output length. Defaults to 1024. + max_batch_size (int, optional): Max batch size. Defaults to 4. + lora_ckpt_list (_type_, optional): Lora checkpoint list. Defaults to None. + use_lora_plugin (_type_, optional): Use lora plugin. Defaults to None. + max_lora_rank (int, optional): Max lora rank. Defaults to 64. + lora_target_modules (_type_, optional): Lora target modules. Defaults to None. + max_prompt_embedding_table_size (int, optional): Max size of prompt embedding table. Defaults to 0. + paged_kv_cache (bool, optional): Use Paged KV cache. Defaults to True. + remove_input_padding (bool, optional): Remove input padding. Defaults to True. + paged_context_fmha (bool, optional): Paged context fmha. Defaults to False. + use_refit (bool, optional): Use refit. Defaults to False. + max_num_tokens (int, optional): Max num of tokens. Defaults to None. + max_seq_len (int, optional): Max seq length. Defaults to None. + opt_num_tokens (int, optional): Opt number of tokens. Defaults to None. + max_beam_width (int, optional): Max beam width. Defaults to 1. + tokens_per_block (int, optional): Nmber of tokens per block. Defaults to 128. + multiple_profiles (bool, optional): Use multiple profiles. Defaults to False. + gpt_attention_plugin (str, optional): Gpt attention plugin to use. Defaults to "auto". + gemm_plugin (str, optional): Gemma plugin to use. Defaults to "auto". + """ + + TRTLLMEngineBuilder.build_and_save_engine( + engine_dir, + trtllm_model_weights, + trtllm_model_config, + max_input_len, + max_output_len, + max_batch_size, + lora_ckpt_list, + use_lora_plugin, + max_lora_rank, + lora_target_modules, + max_prompt_embedding_table_size, + paged_kv_cache, + remove_input_padding, + paged_context_fmha, + use_refit, + max_num_tokens, + max_seq_len, + opt_num_tokens, + max_beam_width, + tokens_per_block, + multiple_profiles, + gpt_attention_plugin, + gemm_plugin, + ) diff --git a/megatron/core/export/trtllm/trtllm_layers.py b/megatron/core/export/trtllm/trtllm_layers.py new file mode 100644 index 0000000000..0cf805dcb6 --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_layers.py @@ -0,0 +1,157 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
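# Illustrative sketch (not part of the patch): the distributed (on-device) conversion path of
# TRTLLMHelper defined above. It assumes `trtllm_helper` was constructed as in the example
# script earlier in this patch, the model is already loaded with the desired inference TP/PP
# layout, and `tokenizer` / `gpus_per_node` are supplied by the caller. Each rank gets back
# exactly one (weights, config) pair.
#
#     weights_list, configs_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
#         model_state_dict=model.state_dict(),
#         dtype=DataType.bfloat16,
#         on_device_distributed_conversion=True,
#         vocab_size=tokenizer.vocab_size,   # caller-provided tokenizer (assumption)
#         gpus_per_node=8,                   # illustrative value
#     )
#     trtllm_helper.build_and_save_engine(
#         engine_dir="/tmp/trtllm_engine",   # illustrative path
#         trtllm_model_weights=weights_list[0],
#         trtllm_model_config=configs_list[0],
#     )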
+ +import re +from enum import Enum +from typing import Tuple + + +class TRTLLMLayers(Enum): + """TRTLLM Layer names + + This Enum will be used to map input model layer names to TRTLLM Layer names + """ + + # ONE TIME LAYERS (NOT ASSOCIATED TO TRANSFORMER BLOCK) + # Input layers + position_embedding = 'transformer.position_embedding.weight' + vocab_embedding = 'transformer.vocab_embedding.weight' + lm_head = 'lm_head.weight' + + # Output layers + final_layernorm_weight = 'transformer.ln_f.weight' + final_layernorm_bias = 'transformer.ln_f.bias' + + # TRANSFORMER LAYERS + # Attention block related layers + input_layernorm_weight = 'transformer.layers.input_layernorm.weight' + input_layernorm_bias = 'transformer.layers.input_layernorm.bias' + attention_qkv_weight = 'transformer.layers.attention.qkv.weight' + attention_qkv_bias = 'transformer.layers.attention.qkv.bias' + attention_dense_weight = 'transformer.layers.attention.dense.weight' + attention_dense_bias = 'transformer.layers.attention.dense.bias' + + # mlp layers + mlp_fc_weight = 'transformer.layers.mlp.fc.weight' + mlp_fc_bias = 'transformer.layers.mlp.fc.bias' + post_layernorm_weight = 'transformer.layers.post_layernorm.weight' + post_layernorm_bias = 'transformer.layers.post_layernorm.bias' + mlp_projection_weight = 'transformer.layers.mlp.proj.weight' + mlp_projection_bias = 'transformer.layers.mlp.proj.bias' + + # mixture of expert layers + mlp_router_weight = 'transformer.layers.mlp.router.weight' + mlp_fc_weight_mixture_of_experts = 'transformer.layers.mlp.fc.weight.expert' + mlp_projection_weight_mixture_of_experts = 'transformer.layers.mlp.proj.weight.expert' + + @staticmethod + def return_layer_name_and_number(layer_name: str) -> Tuple[str, int]: + """Helper function to return layer name and number + Given an input layer e.g decoder.layers.2.self_attention.linear_qkv.weight, + this function returns decoder.layers.self_attention.linear_qkv.weight and layernumber 2. + In case no layer number is present, it returns None for the layer number + Args: + layer_name (dict): The input layer name + + Returns: + Tuple[str, int]: The layer name , layer number (layer number could be None) + """ + # Use regular expression to find the number specifically after 'layers.' + match = re.search(r'(?<=layers\.)\d+(?=\.)', layer_name) + if match: + # Extract the number and remove it from the layer name + number = match.group(0) + layer_name_without_number = re.sub(r'\.{}\.'.format(number), '.', layer_name) + return layer_name_without_number, int(number) + else: + # Return the original name if no number is found + return layer_name, None + + # pylint: disable=line-too-long + @staticmethod + def rename_input_layer_names_to_trtllm_layer_names( + model_state_dict: dict, + trtllm_conversion_dict: dict, + state_dict_split_by_layer_numbers: bool = True, + ) -> dict: + """Helper function to rename model layer names to TRTLLM Layer names + + We go through each layer (keys) in the model state dict, + and map it to the equivalent TRTLLMLayer name (megatron/core/export/trtllm/trtllm). + If we have a layer number associated with layer, we extract it out, + map the original layer name to equivalent trtllm layer name and add layer number back. + CPU Conversion will pass in model state dict without layer numbers + (i.e decoder.layers.mlp.linear_fc1.weight of shape [num_layers, hidden_dim, 4 * hidden_dim]) . + GPU conversion will pass model state dict with each layer seperated + (i.e decoder.layers.2.mlp.linear_fc1.weight of shape [hidden_dim, 4 * hidden_dim]). 
+ + Args: + model_state_dict (dict): The original model state dict + trtllm_conversion_dict (dict): The conversion dictionary mapping input model layer names to trtllm layer names + state_dict_split_by_layer_numbers (bool, optional): Are the model layers split by layer numbers in state dict. For example : mlp.fc1.weight can be represented like mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim]} or it can be like mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight ... for all layers. If you use represenation 2 set this to True. Defaults to True + + Raises: + ValueError: In case the keys dont match to trtllm keys or if all model layers are not mapped to equivalent trtllm keys + + Returns: + dict: The model state dict with the key (i.e original model layer name) replaced by trtllm layer names + """ + for original_model_layer_name in list(model_state_dict.keys()): + if "_extra_state" in original_model_layer_name: + del model_state_dict[original_model_layer_name] + continue + + original_layer_name_without_number, layer_number = ( + TRTLLMLayers.return_layer_name_and_number(original_model_layer_name) + ) + if 'layers' in original_layer_name_without_number and state_dict_split_by_layer_numbers: + assert ( + layer_number is not None + ), f"Layer number is None for {original_model_layer_name} and state_dict_split_by_layer_numbers is set to True. Consider setting it False" + + if original_layer_name_without_number not in trtllm_conversion_dict: + raise ValueError( + f'Unable to rename key {original_layer_name_without_number}. Provide an appropriate mapping in the trtllm_conversion_dict when you initialize TRTLLMHelper' + ) + + trtllm_layer = trtllm_conversion_dict[original_layer_name_without_number] + assert isinstance( + trtllm_layer, TRTLLMLayers + ), f"{trtllm_layer} is not supported for conversion. Please use one of the TRTLLMLayerNames we provided in megatron/core/export/trtllm/trtllm_layer_names" + + value = model_state_dict.pop(original_model_layer_name) + + if layer_number is not None: + trtllm_layer_name_with_number = re.sub( + r'(?<=layers\.)', f'{layer_number}.', trtllm_layer.value + ) + model_state_dict[trtllm_layer_name_with_number] = value + else: + model_state_dict[trtllm_layer.value] = value + + return model_state_dict + + +# These layers are not associated within the transformer block. +# So they dont have a layer number (i.e independant of number of layers in the model) +NON_TRANSFORMER_LAYERS_NAMES = [ + TRTLLMLayers.vocab_embedding.value, + TRTLLMLayers.position_embedding.value, + TRTLLMLayers.lm_head.value, + TRTLLMLayers.final_layernorm_weight.value, + TRTLLMLayers.final_layernorm_bias.value, +] + + +def get_layer_name_without_prefix(layer: TRTLLMLayers) -> str: + """Get TRTLayer name without prefix + + Given a layer e.g TRTLLMLayers.attention_qkv_weight it returns 'attention.qkv.weight' + + Args: + layer (TRTLLMLayers): The TRTLLMLayer + + Returns: + str: The TRTLLMLayers suffix (i.e Removing transformer.layers. fromt he layer name) + """ + layer_name_without_prefix = layer.value.replace("transformer.layers.", "") + return layer_name_without_prefix diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py b/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
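# Illustrative sketch (not part of the patch): how the helpers in trtllm_layers.py above behave
# on a typical Megatron layer name, using GPT_DICT from the mapping files earlier in this patch
# (the tensor shape below is arbitrary):
#
#     import torch
#     from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_model import GPT_DICT
#     from megatron.core.export.trtllm.trtllm_layers import (
#         TRTLLMLayers, get_layer_name_without_prefix)
#
#     name, num = TRTLLMLayers.return_layer_name_and_number(
#         'decoder.layers.2.self_attention.linear_qkv.weight')
#     # -> ('decoder.layers.self_attention.linear_qkv.weight', 2)
#
#     renamed = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names(
#         {'decoder.layers.2.self_attention.linear_qkv.weight': torch.zeros(8, 8)},
#         trtllm_conversion_dict=GPT_DICT)
#     # -> {'transformer.layers.2.attention.qkv.weight': tensor(...)}
#
#     get_layer_name_without_prefix(TRTLLMLayers.attention_qkv_weight)
#     # -> 'attention.qkv.weight'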
diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py new file mode 100644 index 0000000000..07b47411cc --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py @@ -0,0 +1,250 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import torch +from tqdm import tqdm + +from megatron.core import parallel_state +from megatron.core.export.data_type import DataType +from megatron.core.export.trtllm.trtllm_layers import NON_TRANSFORMER_LAYERS_NAMES, TRTLLMLayers +from megatron.core.export.trtllm.trtllm_layers import get_layer_name_without_prefix as suffix +from megatron.core.tensor_parallel.utils import VocabUtility +from megatron.core.transformer.transformer_config import TransformerConfig + + +def str_dtype_to_torch(dtype: DataType): + """Get torch datatype from input datatype""" + from tensorrt_llm._utils import str_dtype_to_torch + + return str_dtype_to_torch(dtype.name) + + +# pylint: disable=line-too-long +class DistributedTRTLLMModelWeightsConverter: + """The TRTLLM Converter class used for GPU (on device) conversion + + This class is used to convert models sharded and on gpus. (It assumes that the model is already sharded appropriate to how you want to export it). (i.e) If you want to export to tp2pp2, then load the model in tp2pp2 setting and pass in their respective state dictionaries + """ + + def __init__( + self, + transformer_config: TransformerConfig, + dtype: DataType, + multi_query_mode: bool = False, + activation: str = "gelu", + ): + """Constructor for the TRTLLMModelWeightsConverterGPU class + + This class is responsible to convert the model weights to TRTLLM equivalent weights. + + Args: + transformer_config (TransformerConfig): The transformer config + dtype (DataType): The data type or model precision + multi_query_mode (bool, optional): Defaults to False. + activation (str, optional): Defaults to "gelu". + """ + self.transformer_config = transformer_config + self.trtllm_model_weights = {} + self.storage_type = str_dtype_to_torch(dtype) + self.activation = activation + num_kv_heads = self.transformer_config.num_query_groups + if num_kv_heads == 0: + if multi_query_mode: + num_kv_heads = 1 + else: + num_kv_heads = self.transformer_config.num_attention_heads + self.num_kv_heads = num_kv_heads + + self.inference_pp_size = parallel_state.get_pipeline_model_parallel_world_size() + self.inference_tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.pp_rank = parallel_state.get_pipeline_model_parallel_rank() + self.tp_group = parallel_state.get_tensor_model_parallel_group() + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + assert ( + vp_size is None or vp_size == 1 + ), "Virtual parallelism is not supported in GPU Converter. Gather the VP chunks and use PP config." 
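    # Illustrative sketch (not part of the patch): direct use of this converter, which is
    # normally driven by TRTLLMHelper. It assumes megatron's parallel_state is already
    # initialized with the target inference TP/PP layout and that `model.state_dict()` is this
    # rank's shard; the vocab size below is illustrative.
    #
    #     from megatron.core.export.data_type import DataType
    #     from megatron.core.export.model_type import ModelType
    #     from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import (
    #         DEFAULT_CONVERSION_DICT)
    #
    #     converter = DistributedTRTLLMModelWeightsConverter(
    #         transformer_config=transformer_config, dtype=DataType.bfloat16)
    #     converter.convert(
    #         model_state_dict=model.state_dict(),
    #         trtllm_conversion_dict=DEFAULT_CONVERSION_DICT[ModelType.gpt],
    #         tokenizer_vocab_size=50304)
    #     rank_shard = converter.trtllm_model_weights   # TRTLLM-named weights for this rank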
+ + def _save_val(self, val: torch.Tensor, layer_name: str): + assert torch.is_tensor(val), f"Expected a tensor for {layer_name} but got {type(val)}" + val = val.to(self.storage_type) + val = val.detach().contiguous() + if val.ndim >= 2: + val = torch.transpose(val.reshape(val.shape[0], -1), 0, 1) + if layer_name not in self.trtllm_model_weights: + self.trtllm_model_weights[layer_name] = torch.empty( + val.size(), dtype=val.dtype, layout=val.layout, device="cpu", pin_memory=True + ) + self.trtllm_model_weights[layer_name] = val + + def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): + """Convert Transformer layers to TRTLLM weights + + Transformer layers referes to layers within the transformber block. They have a layer number associated with them. Depending on the layer we either directly save it to trtllm_model_weights, or split it across some dimension and save the splits + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer (TRTLLMLayerNames): The TRTLLM Layer that we want to change + """ + if val.ndim == 2: + val = val.T + + if ( + layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_router_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_weight)) + ): + self._save_val(val=val, layer_name=layer_name) + + elif layer_name.endswith(suffix(TRTLLMLayers.mlp_fc_weight)) or layer_name.endswith( + suffix(TRTLLMLayers.mlp_fc_bias) + ): + + split_gated_activation = self.activation in [ + "swiglu", + "geglu", + "fast-swiglu", + "fast-geglu", + ] + if split_gated_activation: + vals, gates = [[n] for n in torch.chunk(val, 2, axis=-1)] + gate_layer_name = layer_name.replace("fc", "gate") + self._save_val(val=gates[0], layer_name=gate_layer_name) + val = vals[0] + + self._save_val(val=val, layer_name=layer_name) + + elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_bias)): + qkv_hidden_dim = val.shape[0] + size_per_head = ( + qkv_hidden_dim + // (self.transformer_config.num_attention_heads + 2 * self.num_kv_heads) + * self.inference_tp_size + ) + q_num = self.transformer_config.num_attention_heads // self.num_kv_heads + + # We first concat all sub weights per tp rank together. 
+ val = val.reshape(self.num_kv_heads // self.inference_tp_size, q_num + 2, size_per_head) + qkv = torch.split(val, [q_num, 1, 1], dim=1) + split_vals = torch.concatenate( + [qkv[0].reshape(-1), qkv[1].reshape(-1), qkv[2].reshape(-1)], dim=0 + ) + self._save_val(val=split_vals, layer_name=layer_name) + + # TODO : Should add a atten layer dimension "qkvqkv, qqkkvv etc to see how to reshape here" + elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_weight)): + hidden_dim = val.shape[0] + size_per_head = self.transformer_config.kv_channels + if size_per_head is None: + size_per_head = hidden_dim // self.transformer_config.num_attention_heads + q_num = self.transformer_config.num_attention_heads // self.num_kv_heads + + val = val.reshape( + hidden_dim, self.num_kv_heads // self.inference_tp_size, q_num + 2, size_per_head + ) + qkv = torch.split(val, [q_num, 1, 1], dim=2) + split_vals = torch.concatenate( + [ + qkv[0].reshape(hidden_dim, -1), + qkv[1].reshape(hidden_dim, -1), + qkv[2].reshape(hidden_dim, -1), + ], + dim=1, + ) + self._save_val(val=split_vals, layer_name=layer_name) + + else: + raise ValueError(f"{layer_name} cannot be handled by GPU converter") + + def _convert_non_transformer_layer(self, model_state_dict: dict, layer_name: str): + """Convert Non Transformer layers to TRTLLM weights + + Non transformer layers referes to layers that occur only once in the model (e.g Embedding , final output layer etc. ) They dont have any layer number associated with them. We remove this layer from the original state dict and cast it to storage type and convert to numpy and add it to trtllm_model_weights + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer (TRTLLMLayerNames): The TRTLLM Layer that we want to change + """ + if layer_name in model_state_dict: + val = model_state_dict.pop(layer_name) + self._save_val(val=val, layer_name=layer_name) + + # ----------------Convert Embeddings---------------- + def _get_remove_vocab_padding(self, layer_name, model_state_dict, tokenizer_vocab_size): + val = model_state_dict.get(layer_name, None) + if val is None: + return None + + if self.inference_tp_size > 1: # Gather padded tensor chunks + vocab_size_padded = val.shape[0] * self.inference_tp_size + vocab_start_index, vocab_end_index = VocabUtility.vocab_range_from_global_vocab_size( + vocab_size_padded, self.tp_rank, self.inference_tp_size + ) + dim_size = list(val.size()) + dim_size[0] = vocab_size_padded + gathered_val = torch.zeros( + dim_size, dtype=val.dtype, device=torch.cuda.current_device() + ) + gathered_val[vocab_start_index:vocab_end_index] = val + torch.distributed.all_reduce(gathered_val, group=self.tp_group) + val = gathered_val + unpadded = val[:tokenizer_vocab_size] + if self.inference_tp_size > 1: # Split gathered val for val parallel embedding + vocab_start_index, vocab_end_index = VocabUtility.vocab_range_from_global_vocab_size( + tokenizer_vocab_size, self.tp_rank, self.inference_tp_size + ) + unpadded = unpadded[vocab_start_index:vocab_end_index] + return unpadded.T # TRTLLM expects (vocab_size, hidden_size) so need extra transpose + + @torch.no_grad() + def convert( + self, model_state_dict: dict, trtllm_conversion_dict: dict, tokenizer_vocab_size: int + ): + """Convert model weights to trtllm model weights + + This method goes through each layer in the model state dict and converts to equivalent trtllm model weights. It also handles splitting across TP dimension , expert split etc. 
+ + Args: + model_state_dict (dict): The full model state dict (all on CPU) + trtllm_conversion_dict (dict): The conversion dictionary used to convert model layer names to trtllm layer names + tokenizer_vocab_size (int): The vocab size of the tokenizer + """ + + # First step is to convert input model layer names to equivalent trtllm layer names + model_state_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=model_state_dict, trtllm_conversion_dict=trtllm_conversion_dict + ) + + # Convert the non transformer layers + for layer_name in NON_TRANSFORMER_LAYERS_NAMES: + if ( + layer_name in TRTLLMLayers.vocab_embedding.value + or layer_name in TRTLLMLayers.lm_head.value + ): + # For embedding layers alone we do some pre processing + embed_val = self._get_remove_vocab_padding( + layer_name, model_state_dict, tokenizer_vocab_size + ) + model_state_dict[layer_name] = embed_val + # TODO : Check if this handling of position embedding is right. + if layer_name == TRTLLMLayers.position_embedding.value: + position_embedding = model_state_dict[layer_name] + req_position_embedding = position_embedding.chunk(self.inference_tp_size)[ + self.tp_rank + ] + model_state_dict[layer_name] = req_position_embedding.T + self._convert_non_transformer_layer( + model_state_dict=model_state_dict, layer_name=layer_name + ) + + for layer_name, value in tqdm( + model_state_dict.items(), desc="Converting to TRTLLM Weights" + ): + self._convert_transformer_layer(layer_name, value) diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py new file mode 100644 index 0000000000..b8ec02ff61 --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py @@ -0,0 +1,441 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import re + +import torch +from tqdm import tqdm + +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.trtllm.trtllm_layers import NON_TRANSFORMER_LAYERS_NAMES, TRTLLMLayers +from megatron.core.export.trtllm.trtllm_layers import get_layer_name_without_prefix as suffix +from megatron.core.transformer.transformer_config import TransformerConfig + + +# pylint: disable=line-too-long +# TODO: Writing TRT imports this way so that it can be mocked in the test_trtllm_cpu_converter.py unit test +# TODO: Figure out how to patch it directly from the trtllm library +def pad_vocab_size(vocab_size: int, tp_size: int): + """Pad vocab size based on inference size""" + from tensorrt_llm._utils import pad_vocab_size + + return pad_vocab_size(vocab_size, tp_size) + + +def str_dtype_to_torch(dtype: DataType): + """Get torch datatype from input datatype""" + from tensorrt_llm._utils import str_dtype_to_torch + + return str_dtype_to_torch(dtype.name) + + +class SingleDeviceTRTLLMModelWeightsConverter: + """Class to convert Model weights to TRTLLM weights on CPU""" + + def __init__( + self, + export_config: ExportConfig, + transformer_config: TransformerConfig, + dtype: DataType, + multi_query_mode: bool = False, + activation: str = "gelu", + ): + """Constructor for the TRTLLMModelWeightsConverterCPU class + + This class is responsible to convert the model weights to TRTLLM equivalent weights and also split them for each GPU rank and return as a list. 
+ + Args: + export_config (ExportConfig): The export config with inference tp size, pp size etc. + transformer_config (TransformerConfig): The transformer config + dtype (DataType): The data type or model precision + multi_query_mode (bool, optional): Defaults to False. + activation (str, optional): Defaults to "gelu". + """ + self.export_config = export_config + self.transformer_config = transformer_config + self.trtllm_model_weights = {} + self.storage_type = str_dtype_to_torch(dtype) + self.activation = activation + num_kv_heads = self.transformer_config.num_query_groups + if num_kv_heads == 0: + if multi_query_mode: + num_kv_heads = 1 + else: + num_kv_heads = self.transformer_config.num_attention_heads + self.num_kv_heads = num_kv_heads + + def _convert_non_transformer_layer(self, model_state_dict: dict, layer_name: str): + """Convert Non Transformer layers to TRTLLM weights + + Non transformer layers referes to layers that occur only once in the model (e.g Embedding , final output layer etc. ) They dont have any layer number associated with them. We remove this layer from the original state dict and cast it to storage type and convert to numpy and add it to trtllm_model_weights + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer_name (str): The TRTLLM Layer name that we want to convert + """ + if layer_name in model_state_dict: + val = model_state_dict.pop(layer_name) + val = val.to(self.storage_type).detach().contiguous() + self.trtllm_model_weights[layer_name] = val + + def _transfer_tensor_to_cuda_if_available(self, val: torch.tensor): + """Transfer to cuda device if available + + This function transfers the tensor to cuda and returns it + """ + if torch.cuda.is_available() and not val.is_cuda: + val = val.cuda() + return val + + def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): + """Convert Transformer layers to TRTLLM weights + + Transformer layers referes to layers within the transformber block. They have a layer number associated with them. Depending on the layer we either directly save it to trtllm_model_weights, or split it across some dimension and save the splits + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer (TRTLLMLayerNames): The TRTLLM Layer that we want to change + """ + + def _add_to_trtllm_model_weights(val: torch.Tensor, layer_name: str, split_type=None): + """Add the input weight to trtllm_model_weights + + Depending on split (Expert split/Tensor split/None) we split the input data and add accordingly + + Args: + val (torch.Tensor): The model weight to be added + layer_name (str): The TRTLLMlayername as a string + split_type (str, optional): The split type. Defaults to None. 
+ """ + if split_type == 'expert_split': + for split_num, split_val in enumerate(val): + self.trtllm_model_weights[f'{layer_name}.{split_num}.bin'] = ( + split_val.to(self.storage_type).detach().contiguous() + ) + elif split_type == 'tensor_split': + for split_num, split_val in enumerate(val): + if split_val.ndim >= 2: + split_val = torch.transpose(split_val.reshape(split_val.shape[0], -1), 1, 0) + + self.trtllm_model_weights[f'{layer_name}.{split_num}.bin'] = ( + split_val.to(self.storage_type).detach().contiguous() + ) + else: + if val.ndim >= 2: + val = torch.transpose(val.reshape(val.shape[0], -1), 1, 0) + self.trtllm_model_weights[layer_name] = ( + val.to(self.storage_type).detach().contiguous() + ) + + val = self._transfer_tensor_to_cuda_if_available(val) + + if val.ndim == 2: + val = val.T + + if ( + layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_router_weight)) + ): + _add_to_trtllm_model_weights(val=val, layer_name=layer_name, split_type=None) + + elif layer_name.endswith( + suffix(TRTLLMLayers.attention_dense_weight) + ) or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_weight)): + split_vals = torch.chunk(val, self.export_config.inference_tp_size, axis=0) + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='tensor_split' + ) + + elif layer_name.endswith(suffix(TRTLLMLayers.mlp_fc_weight)) or layer_name.endswith( + suffix(TRTLLMLayers.mlp_fc_bias) + ): + split_gated_activation = self.activation in [ + "swiglu", + "geglu", + "fast-swiglu", + "fast-geglu", + ] + if split_gated_activation: + val, gate = torch.chunk(val, 2, axis=-1) + gate_layer_name = layer_name.replace("fc", "gate") + split_vals = torch.chunk(gate, self.export_config.inference_tp_size, axis=-1) + _add_to_trtllm_model_weights( + val=split_vals, layer_name=gate_layer_name, split_type='tensor_split' + ) + + split_vals = torch.chunk(val, self.export_config.inference_tp_size, axis=-1) + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='tensor_split' + ) + + elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_bias)): + qkv_hidden_dim = val.shape[0] + size_per_head = qkv_hidden_dim // ( + self.transformer_config.num_attention_heads + 2 * self.num_kv_heads + ) + q_num = self.transformer_config.num_attention_heads // self.num_kv_heads + + # We first concat all sub weights per tp rank together. 
+            val = val.reshape(self.num_kv_heads, q_num + 2, size_per_head)
+
+            qkv = torch.split(val, [q_num, 1, 1], dim=1)
+            q_split = torch.chunk(qkv[0], self.export_config.inference_tp_size, axis=0)
+            k_split = torch.chunk(qkv[1], self.export_config.inference_tp_size, axis=0)
+            v_split = torch.chunk(qkv[2], self.export_config.inference_tp_size, axis=0)
+
+            # Concatenate Q, K, and V together
+            split_vals = [
+                torch.concatenate(
+                    [q_split[i].reshape(-1), k_split[i].reshape(-1), v_split[i].reshape(-1)], dim=0
+                )
+                for i in range(self.export_config.inference_tp_size)
+            ]
+            _add_to_trtllm_model_weights(
+                val=split_vals, layer_name=layer_name, split_type='tensor_split'
+            )
+
+        # TODO: Should add an attention layout dimension ("qkvqkv", "qqkkvv", etc.) to know how to reshape here
+        elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_weight)):
+            hidden_dim = val.shape[0]
+            size_per_head = self.transformer_config.kv_channels
+            if size_per_head is None:
+                size_per_head = hidden_dim // self.transformer_config.num_attention_heads
+            q_num = self.transformer_config.num_attention_heads // self.num_kv_heads
+
+            # The fused QKV weight is stored interleaved per query group, i.e. [Q..Q, K, V]
+            # repeated for each of the num_kv_heads groups (GQA), which reduces to [Q, K, V]
+            # per head for MHA. Reshape so the query-group dimension is explicit.
+            val = val.reshape(hidden_dim, self.num_kv_heads, q_num + 2, size_per_head)
+
+            # Split the QKV to separate variables.
+            qkv = torch.split(val, [q_num, 1, 1], dim=2)
+
+            query_groups_shape = qkv[0].shape
+            if len(query_groups_shape) > 1:
+                if (query_groups_shape[1] % self.export_config.inference_tp_size) != 0:
+                    raise Exception(
+                        "Number of query groups of the model is {0}. Please select a tensor "
+                        "parallelism size that distributes the query groups evenly across "
+                        "the GPUs.".format(query_groups_shape[1])
+                    )
+
+            q_split = torch.chunk(qkv[0], self.export_config.inference_tp_size, axis=1)
+            k_split = torch.chunk(qkv[1], self.export_config.inference_tp_size, axis=1)
+            v_split = torch.chunk(qkv[2], self.export_config.inference_tp_size, axis=1)
+
+            # Concatenate Q, K, and V together
+            split_vals = [
+                torch.concatenate(
+                    [
+                        q_split[i].reshape(hidden_dim, -1),
+                        k_split[i].reshape(hidden_dim, -1),
+                        v_split[i].reshape(hidden_dim, -1),
+                    ],
+                    dim=1,
+                )
+                for i in range(self.export_config.inference_tp_size)
+            ]
+            _add_to_trtllm_model_weights(
+                val=split_vals, layer_name=layer_name, split_type='tensor_split'
+            )
+
+        elif layer_name.endswith(suffix(TRTLLMLayers.mlp_fc_weight_mixture_of_experts)):
+            w1, w3 = torch.chunk(val, 2, axis=1)
+            # w1 splits
+            split_w1s = torch.chunk(w1, self.export_config.inference_tp_size, axis=1)
+            # w3 splits
+            split_w3s = torch.chunk(w3, self.export_config.inference_tp_size, axis=1)
+
+            split_vals = [torch.concatenate(item, dim=1) for item in zip(split_w3s, split_w1s)]
+            layer_name = layer_name.replace(".expert", "")  # Remove suffix .expert from key
+            _add_to_trtllm_model_weights(
+                val=split_vals, layer_name=layer_name, split_type='expert_split'
+            )
+
+        elif layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_weight_mixture_of_experts)):
+            split_vals = torch.chunk(val, self.export_config.inference_tp_size, axis=-1)
+            layer_name = layer_name.replace(".expert", "")  # Remove suffix .expert from key
+            _add_to_trtllm_model_weights(
+                val=split_vals, layer_name=layer_name, split_type='expert_split'
+            )
+        else:
+            raise ValueError(f"{layer_name} cannot be handled by converter")
+
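The QKV branches above are the trickiest part of the converter because the fused weight is laid out per query group. The standalone sketch below is an illustrative aside (not part of the patch): it reproduces the reshape/split/concat on a toy tensor with made-up sizes, mirroring the layout after the 2-D transpose at the top of `_convert_transformer_layer`.

import torch

hidden_dim = 8
num_attention_heads = 4
num_kv_heads = 2  # query groups (GQA)
size_per_head = hidden_dim // num_attention_heads
inference_tp_size = 2
q_num = num_attention_heads // num_kv_heads

# Fused QKV weight after the 2-D transpose: [hidden_dim, (heads + 2 * kv_heads) * size_per_head].
qkv_weight = torch.randn(hidden_dim, (num_attention_heads + 2 * num_kv_heads) * size_per_head)

# Make the query-group dimension explicit: [hidden_dim, kv_heads, q_num + 2, size_per_head].
grouped = qkv_weight.reshape(hidden_dim, num_kv_heads, q_num + 2, size_per_head)
q, k, v = torch.split(grouped, [q_num, 1, 1], dim=2)

# Each TP rank takes a contiguous slice of the query groups and re-concatenates
# its Q, K and V slabs along the output dimension.
per_rank = [
    torch.cat(
        [
            q.chunk(inference_tp_size, dim=1)[rank].reshape(hidden_dim, -1),
            k.chunk(inference_tp_size, dim=1)[rank].reshape(hidden_dim, -1),
            v.chunk(inference_tp_size, dim=1)[rank].reshape(hidden_dim, -1),
        ],
        dim=1,
    )
    for rank in range(inference_tp_size)
]

expected_cols = (num_attention_heads + 2 * num_kv_heads) * size_per_head // inference_tp_size
assert all(w.shape == (hidden_dim, expected_cols) for w in per_rank)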
+    @torch.no_grad()
+    def convert(
+        self, model_state_dict: dict, trtllm_conversion_dict, state_dict_split_by_layer_numbers=True
+    ):
+        """Convert model weights to trtllm model weights
+
+        This method goes through each layer in the model state dict and converts it to the equivalent trtllm model weight. It also handles splitting across the TP dimension, expert splits, etc.
+
+        Args:
+            model_state_dict (dict): The full model state dict (all on CPU)
+            trtllm_conversion_dict (dict): The conversion dictionary used to convert model layer names to trtllm layer names
+            state_dict_split_by_layer_numbers (bool, optional): Whether the model layers are split by layer numbers in the state dict. For example, mlp.fc1.weight can be represented as a single mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim] (representation 1), or as mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight, and so on for all layers (representation 2). If you use representation 2, set this to True. Defaults to True
+        """
+
+        # First step is to convert input model layer names to equivalent trtllm layer names
+        model_state_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names(
+            model_state_dict=model_state_dict,
+            trtllm_conversion_dict=trtllm_conversion_dict,
+            state_dict_split_by_layer_numbers=state_dict_split_by_layer_numbers,
+        )
+
+        # Convert the non-transformer layers
+        for layer_name in NON_TRANSFORMER_LAYERS_NAMES:
+            # For the vocab embedding layer alone we pad the weights to be divisible by the inference tp size
+            if (
+                layer_name == TRTLLMLayers.vocab_embedding.value
+                and self.export_config.use_parallel_embedding
+            ):
+                val = model_state_dict[TRTLLMLayers.vocab_embedding.value]
+                val = self._transfer_tensor_to_cuda_if_available(val)
+                vocab_size = val.shape[0]
+                if vocab_size % self.export_config.inference_tp_size != 0:
+                    vocab_size_padded = pad_vocab_size(
+                        vocab_size, self.export_config.inference_tp_size
+                    )
+                    pad_width = vocab_size_padded - vocab_size
+                    val = torch.nn.functional.pad(val, (0, 0, 0, pad_width), value=0)
+                    model_state_dict[layer_name] = val
+
+            self._convert_non_transformer_layer(
+                model_state_dict=model_state_dict, layer_name=layer_name
+            )
+
+        transformer_layers_dict = {}
+        # Convert the transformer layers
+        if state_dict_split_by_layer_numbers:
+            # The model dict is already split by layer numbers
+            transformer_layers_dict = model_state_dict
+        else:
+            # Here we split the model state dict into individual layers
+            for layer_name in list(model_state_dict.keys()):
+                value = model_state_dict.pop(layer_name)
+                for layer_number in range(self.transformer_config.num_layers):
+                    # e.g. transformer.layers.mlp.fc.bias => transformer.layers.2.mlp.fc.bias
+                    layer_name_with_layer_number = re.sub(
+                        r'(?<=layers\.)', f'{layer_number}.', layer_name
+                    )
+                    transformer_layers_dict[layer_name_with_layer_number] = value[layer_number]
+
+        for layer_name, value in tqdm(
+            transformer_layers_dict.items(), desc="Converting to TRTLLM Weights"
+        ):
+            self._convert_transformer_layer(layer_name, value)
+
+    def get_padded_vocab_size(self) -> int:
+        """Return the padded vocab size
+
+        We extract the lm head and vocab embedding and use them to determine the padded vocab size.
+
+        Returns:
+            int: Padded vocab size
+        """
+        lm_head_weight = self.trtllm_model_weights.get(TRTLLMLayers.lm_head.value, None)
+        vocab_size = self.trtllm_model_weights[TRTLLMLayers.vocab_embedding.value].shape[0]
+        vocab_size_padded = (
+            vocab_size
+            if lm_head_weight is None
+            else pad_vocab_size(vocab_size, self.export_config.inference_tp_size)
+        )
+        return vocab_size_padded
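For reference, the vocab-padding step inside `convert` only rounds the vocabulary up to a multiple of the inference TP size and zero-pads the embedding rows so every rank gets an equally sized slice. A minimal sketch of that step follows; `_pad_to_multiple` is a hypothetical stand-in for the TensorRT-LLM `pad_vocab_size` helper wrapped above and is assumed here to round up to the next multiple.

import torch


def _pad_to_multiple(vocab_size: int, tp_size: int) -> int:
    # Hypothetical stand-in: round the vocab size up to the next multiple of tp_size.
    return ((vocab_size + tp_size - 1) // tp_size) * tp_size


inference_tp_size = 8
embedding = torch.randn(50, 16)  # [vocab_size, hidden_dim], toy values

vocab_size = embedding.shape[0]
padded_vocab_size = _pad_to_multiple(vocab_size, inference_tp_size)  # 56 for this example
if padded_vocab_size != vocab_size:
    # Zero-pad extra rows at the bottom of the embedding table.
    embedding = torch.nn.functional.pad(
        embedding, (0, 0, 0, padded_vocab_size - vocab_size), value=0
    )

assert embedding.shape[0] % inference_tp_size == 0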
+ + def get_local_model_weights_per_gpu(self, mapping, trtllm_model_config: dict): + """Get the trtllm model weights split per gpu + + Given the trtllm mapping information (tp, pp rank etc) we split the model weights in a list, with each element of the list corresponding to the weights of each gpu rank + + Args: + mapping : The trtllm mapping information + trtllm_model_config (dict): The trtllm model config + """ + + def _split(torch_tensor, tp_size, idx, dim=0): + """Splits the np tensor v on dim and return the idx's slice.""" + if tp_size == 1: + return torch_tensor + if len(torch_tensor.shape) == 1: + return torch.chunk(torch_tensor, tp_size)[idx].contiguous() + else: + return torch.chunk(torch_tensor, tp_size, axis=dim)[idx].contiguous() + + pp_layer_range = mapping.pp_layers(self.transformer_config.num_layers) + + trtllm_model_weights_per_gpu = {} + for layer_name, value in self.trtllm_model_weights.items(): + if layer_name in NON_TRANSFORMER_LAYERS_NAMES: + continue + + # Happens in the case of TP split or expert split + if layer_name.endswith(".bin"): + if layer_name.endswith(f"{mapping.tp_rank}.bin"): + layer_name = layer_name.replace(f".{mapping.tp_rank}.bin", "") + else: + continue + + layer_num = int(layer_name.split(".")[2]) + if layer_num in pp_layer_range: + layer_name = layer_name.replace( + f"layers.{layer_num}", f"layers.{layer_num - pp_layer_range[0]}" + ) + else: + continue + if ( + hasattr(trtllm_model_config, 'new_decoder_architecture') + and trtllm_model_config.new_decoder_architecture + and "post_layernorm" in layer_name + ): + layer_name = layer_name.replace("post_layernorm", "mlp_layernorm") + + trtllm_model_weights_per_gpu[layer_name] = value + + if mapping.is_first_pp_rank(): + embedding_weight = ( + _split( + self.trtllm_model_weights[TRTLLMLayers.vocab_embedding.value], + mapping.tp_size, + mapping.tp_rank, + ) + if self.export_config.use_parallel_embedding + else self.trtllm_model_weights[TRTLLMLayers.vocab_embedding.value] + ) + + trtllm_model_weights_per_gpu[TRTLLMLayers.vocab_embedding.value] = embedding_weight + + pos_embedding_weight = self.trtllm_model_weights.get( + TRTLLMLayers.position_embedding.value + ) + if pos_embedding_weight is not None: + if self.export_config.use_parallel_embedding: + pos_embedding_weight = _split( + pos_embedding_weight, mapping.tp_size, mapping.tp_rank + ) + + trtllm_model_weights_per_gpu[TRTLLMLayers.position_embedding.value] = ( + pos_embedding_weight + ) + + if mapping.is_last_pp_rank(): + lm_head_weight = self.trtllm_model_weights.get(TRTLLMLayers.lm_head.value, None) + if lm_head_weight is not None: + trtllm_model_weights_per_gpu[TRTLLMLayers.lm_head.value] = _split( + lm_head_weight, mapping.tp_size, mapping.tp_rank + ) + + trtllm_model_weights_per_gpu[TRTLLMLayers.final_layernorm_weight.value] = ( + self.trtllm_model_weights[TRTLLMLayers.final_layernorm_weight.value] + ) + + ln_f_bias = self.trtllm_model_weights.get(TRTLLMLayers.final_layernorm_bias.value) + if ln_f_bias is not None: + trtllm_model_weights_per_gpu[TRTLLMLayers.final_layernorm_bias.value] = ln_f_bias + + return trtllm_model_weights_per_gpu diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index ea4bd181af..7ee6dde182 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -94,6 +94,7 @@ def __init__( # These 2 attributes are needed for TensorRT-LLM export. 
self.max_position_embeddings = max_sequence_length self.rotary_percent = rotary_percent + self.rotary_base = rotary_base if self.pre_process: self.embedding = LanguageModelEmbedding( diff --git a/tests/unit_tests/export/trtllm/__init__.py b/tests/unit_tests/export/trtllm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py b/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py new file mode 100644 index 0000000000..50c33ec9eb --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py @@ -0,0 +1,100 @@ +import pytest +import torch +from pytest_mock import mocker + +from megatron.core.export.data_type import DataType +from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_model import GPT_DICT +from megatron.core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter import ( + DistributedTRTLLMModelWeightsConverter, +) +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + +_SEQUENCE_LENGTH = 64 +_VOCAB_SIZE = 256 + + +class TestTRTLLMGPUConverter: + + def setup_method(self, method): + Utils.initialize_model_parallel(2, 1) + model_parallel_cuda_manual_seed(123) + + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, + num_attention_heads=2, + use_cpu_initialization=True, + pipeline_dtype=torch.float32, + add_qkv_bias=False, + add_bias_linear=False, + ) + self.gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=_VOCAB_SIZE, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_get_model_weights_converter(self, mocker): + device = torch.device("cuda") + self.gpt_model.to(device) + + transformer_config = self.gpt_model.config + + mocker.patch( + "megatron.core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.str_dtype_to_torch", + return_value=torch.float32, + ) + + dtype = DataType.bfloat16 + distributed_converter = DistributedTRTLLMModelWeightsConverter( + transformer_config, dtype, activation="gelu" + ) + + model_state_dict = {} + for key, val in self.gpt_model.state_dict().items(): + # val is non for _extra_state layers . 
We filter it out + if val is not None: + model_state_dict[key] = val + + distributed_converter.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=GPT_DICT, + tokenizer_vocab_size=_VOCAB_SIZE, + ) + + expected_result = { + 'transformer.vocab_embedding.weight': torch.Size([128, 64]), + 'transformer.position_embedding.weight': torch.Size([32, 64]), + 'lm_head.weight': torch.Size([128, 64]), + 'transformer.ln_f.weight': torch.Size([64]), + 'transformer.ln_f.bias': torch.Size([64]), + 'transformer.layers.0.input_layernorm.weight': torch.Size([64]), + 'transformer.layers.0.input_layernorm.bias': torch.Size([64]), + 'transformer.layers.0.attention.dense.weight': torch.Size([64, 32]), + 'transformer.layers.0.attention.qkv.weight': torch.Size([96, 64]), + 'transformer.layers.0.post_layernorm.weight': torch.Size([64]), + 'transformer.layers.0.post_layernorm.bias': torch.Size([64]), + 'transformer.layers.0.mlp.fc.weight': torch.Size([128, 64]), + 'transformer.layers.0.mlp.proj.weight': torch.Size([64, 128]), + 'transformer.layers.1.input_layernorm.weight': torch.Size([64]), + 'transformer.layers.1.input_layernorm.bias': torch.Size([64]), + 'transformer.layers.1.attention.dense.weight': torch.Size([64, 32]), + 'transformer.layers.1.attention.qkv.weight': torch.Size([96, 64]), + 'transformer.layers.1.post_layernorm.weight': torch.Size([64]), + 'transformer.layers.1.post_layernorm.bias': torch.Size([64]), + 'transformer.layers.1.mlp.fc.weight': torch.Size([128, 64]), + 'transformer.layers.1.mlp.proj.weight': torch.Size([64, 128]), + } + + for key, value in distributed_converter.trtllm_model_weights.items(): + assert ( + expected_result[key] == value.shape + ), f"Shape mismatch for {key}. Expected {expected_result[key]} but got {value.shape}" diff --git a/tests/unit_tests/export/trtllm/test_trtllm_layers.py b/tests/unit_tests/export/trtllm/test_trtllm_layers.py new file mode 100644 index 0000000000..b2e88852e5 --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_layers.py @@ -0,0 +1,111 @@ +import pytest + +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers, get_layer_name_without_prefix + + +class TestTRTLLMLayers: + + def test_rename_input_layer_names_to_trtllm_layer_names_without_layer_numbers(self): + + conversion_dict = { + "transformer.layers.attn.dense.bias": TRTLLMLayers.attention_dense_bias, + "transformer.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + + converted_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + assert ( + converted_dict[TRTLLMLayers.attention_dense_bias.value] == 0 + ), "Something wrong with conversion dict" + assert ( + converted_dict[TRTLLMLayers.mlp_fc_weight.value] == 1 + ), "Something wrong with conversion dict" + + def test_rename_input_layer_names_to_trtllm_layer_names_exception(self): + + with pytest.raises(AssertionError): + conversion_dict = { + "transformer.layers.attn.dense.bias": "randomValue", + "transformer.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + with 
pytest.raises(Exception): + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + del conversion_dict["attn.dense.bias"] + TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + with pytest.raises(Exception): + conversion_dict = { + "transformer.layers.attn.dense.bias": TRTLLMLayers.attention_dense_bias, + "transformer.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + + TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=True, + ) + + def test_rename_input_layer_names_to_trtllm_layer_names_with_layer_numbers(self): + + conversion_dict = { + "decoder.lm_head.weight": TRTLLMLayers.lm_head, + "decoder.layers.attn.dense.bias": TRTLLMLayers.attention_dense_bias, + "deocder.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "decoder.lm_head.weight": 2, + "decoder.layers.0.attn.dense.bias": 0, + "deocder.layers.43.mlp.fc1.weight": 1, + } + + converted_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + assert ( + converted_dict['transformer.layers.0.attention.dense.bias'] == 0 + ), "Something wrong with conversion of layer names" + assert ( + converted_dict['transformer.layers.43.mlp.fc.weight'] == 1 + ), "Something wrong with conversion of layer names" + assert ( + converted_dict['lm_head.weight'] == 2 + ), "Something wrong with conversion of layer names" + + def test_get_layer_name_without_prefix(self): + layer_name_without_prefix = get_layer_name_without_prefix( + TRTLLMLayers.attention_dense_weight + ) + assert ( + layer_name_without_prefix == "attention.dense.weight" + ), f"get_layer_name_without_prefix returned {layer_name_without_prefix}, expected attention.dense.weight" diff --git a/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py b/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py new file mode 100644 index 0000000000..0dad81d77b --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py @@ -0,0 +1,169 @@ +import torch +from pytest_mock import mocker + +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers +from megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter import ( + SingleDeviceTRTLLMModelWeightsConverter, +) +from megatron.core.transformer.transformer_config import TransformerConfig + + +class TestTRTLLMCPUConverter: + def test_get_model_weights_converter(self, mocker): + + export_config = ExportConfig(inference_tp_size=2) + + vocab_size = 10 + hidden_dim = 4 + seq_len = 8 + num_layers = 2 + num_attn_heads = 2 + + model_config = TransformerConfig( + num_layers=num_layers, + num_attention_heads=num_attn_heads, + num_query_groups=0, + hidden_size=hidden_dim, + ffn_hidden_size=hidden_dim * 4, + ) + + dtype = DataType.bfloat16 + + model_state_dict = { + "decoder.position_embedding.weight": torch.randn(seq_len, hidden_dim), + "decoder.word_embedding.weight": 
torch.randn(vocab_size, hidden_dim), + "decoder.lm_head.weight": torch.randn(vocab_size, hidden_dim), + "decoder.final_layernorm.weight": torch.randn(hidden_dim), + "decoder.layers.input_layernorm.weight": torch.randn(num_layers, hidden_dim), + "decoder.layers.attention.qkv.weight": torch.randn( + num_layers, hidden_dim * 3, hidden_dim + ), + "decoder.layers.attention.qkv.bias": torch.randn(num_layers, hidden_dim * 3), + "decoder.layers.attention.dense.weight": torch.randn( + num_layers, hidden_dim, hidden_dim + ), + "deocder.layers.mlp.fc.weight": torch.randn(num_layers, 4 * hidden_dim, hidden_dim), + "decoder.layers.mlp.fc.expert": torch.randn(num_layers, hidden_dim, hidden_dim * 4), + "decoder.layers.mlp.proj.expert": torch.randn(num_layers, hidden_dim * 4, hidden_dim), + } + + trtllm_conversion_dict = { + "decoder.position_embedding.weight": TRTLLMLayers.position_embedding, + "decoder.word_embedding.weight": TRTLLMLayers.vocab_embedding, + "decoder.final_layernorm.weight": TRTLLMLayers.final_layernorm_weight, + "decoder.lm_head.weight": TRTLLMLayers.lm_head, + "decoder.layers.input_layernorm.weight": TRTLLMLayers.input_layernorm_weight, + "decoder.layers.attention.qkv.weight": TRTLLMLayers.attention_qkv_weight, + "decoder.layers.attention.qkv.bias": TRTLLMLayers.attention_qkv_bias, + "decoder.layers.attention.dense.weight": TRTLLMLayers.attention_dense_weight, + "deocder.layers.mlp.fc.weight": TRTLLMLayers.mlp_fc_weight, + "decoder.layers.mlp.fc.expert": TRTLLMLayers.mlp_fc_weight_mixture_of_experts, + "decoder.layers.mlp.proj.expert": TRTLLMLayers.mlp_projection_weight_mixture_of_experts, + } + + mocker.patch( + "megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.str_dtype_to_torch", + return_value=torch.float32, + ) + + trtllm_model_weights_converter_cpu = SingleDeviceTRTLLMModelWeightsConverter( + export_config, model_config, dtype, activation="swiglu" + ) + + mocker.patch( + "megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.pad_vocab_size", + return_value=10, + ) + + trtllm_model_weights_converter_cpu.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=trtllm_conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + expected_shapes = { + 'transformer.vocab_embedding.weight': (10, 4), + 'transformer.position_embedding.weight': (8, 4), + 'lm_head.weight': (10, 4), + 'transformer.ln_f.weight': (4,), + 'transformer.layers.0.input_layernorm.weight': (4,), + 'transformer.layers.1.input_layernorm.weight': (4,), + 'transformer.layers.0.attention.qkv.weight.0.bin': (6, 4), + 'transformer.layers.0.attention.qkv.weight.1.bin': (6, 4), + 'transformer.layers.1.attention.qkv.weight.0.bin': (6, 4), + 'transformer.layers.1.attention.qkv.weight.1.bin': (6, 4), + 'transformer.layers.0.attention.qkv.bias.0.bin': (6,), + 'transformer.layers.0.attention.qkv.bias.1.bin': (6,), + 'transformer.layers.1.attention.qkv.bias.0.bin': (6,), + 'transformer.layers.1.attention.qkv.bias.1.bin': (6,), + 'transformer.layers.0.attention.dense.weight.0.bin': (4, 2), + 'transformer.layers.0.attention.dense.weight.1.bin': (4, 2), + 'transformer.layers.1.attention.dense.weight.0.bin': (4, 2), + 'transformer.layers.1.attention.dense.weight.1.bin': (4, 2), + 'transformer.layers.0.mlp.gate.weight.0.bin': (4, 4), + 'transformer.layers.0.mlp.gate.weight.1.bin': (4, 4), + 'transformer.layers.0.mlp.fc.weight.0.bin': (16, 2), + 'transformer.layers.0.mlp.fc.weight.1.bin': (16, 2), + 
'transformer.layers.1.mlp.gate.weight.0.bin': (4, 4), + 'transformer.layers.1.mlp.gate.weight.1.bin': (4, 4), + 'transformer.layers.1.mlp.fc.weight.0.bin': (16, 2), + 'transformer.layers.1.mlp.fc.weight.1.bin': (16, 2), + 'transformer.layers.0.mlp.proj.weight.0.bin': (4, 8), + 'transformer.layers.0.mlp.proj.weight.1.bin': (4, 8), + 'transformer.layers.1.mlp.proj.weight.0.bin': (4, 8), + 'transformer.layers.1.mlp.proj.weight.1.bin': (4, 8), + } + + for key, value in trtllm_model_weights_converter_cpu.trtllm_model_weights.items(): + assert ( + expected_shapes[key] == value.shape + ), f"Shape mismatch for {key}. Expected {expected_shapes[key]} but got {value.shape}" + + class SampleMapping: + + def __init__(self): + self.tp_size = 2 + self.tp_rank = 1 + + def pp_layers(self, num_layers): + return [0, 1] + + def is_first_pp_rank(self): + return True + + def is_last_pp_rank(self): + return True + + trtllm_model_weights_per_gpu = ( + trtllm_model_weights_converter_cpu.get_local_model_weights_per_gpu( + mapping=SampleMapping(), trtllm_model_config=None + ) + ) + + expected_result_per_gpu = { + 'transformer.layers.0.input_layernorm.weight': (4,), + 'transformer.layers.1.input_layernorm.weight': (4,), + 'transformer.layers.0.attention.qkv.weight': (6, 4), + 'transformer.layers.1.attention.qkv.weight': (6, 4), + 'transformer.layers.0.attention.qkv.bias': (6,), + 'transformer.layers.1.attention.qkv.bias': (6,), + 'transformer.layers.0.attention.dense.weight': (4, 2), + 'transformer.layers.1.attention.dense.weight': (4, 2), + 'transformer.layers.0.mlp.gate.weight': (4, 4), + 'transformer.layers.0.mlp.fc.weight': (16, 2), + 'transformer.layers.1.mlp.gate.weight': (4, 4), + 'transformer.layers.1.mlp.fc.weight': (16, 2), + 'transformer.layers.0.mlp.proj.weight': (4, 8), + 'transformer.layers.1.mlp.proj.weight': (4, 8), + 'transformer.vocab_embedding.weight': (10, 4), + 'transformer.position_embedding.weight': (8, 4), + 'lm_head.weight': (5, 4), + 'transformer.ln_f.weight': (4,), + } + + for key, value in trtllm_model_weights_per_gpu.items(): + assert ( + expected_result_per_gpu[key] == value.shape + ), f"Shape mismatch for {key}. 
Expected {expected_result_per_gpu[key]} but got {value.shape}" From 8fbb30c224a6a3c108f56766a04a24e4c373fd2f Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Thu, 26 Sep 2024 02:50:09 -0700 Subject: [PATCH 27/50] ADLR/megatron-lm!2154 - ci: Prune docker cache of `mcore-docker-node-jet` --- .gitlab/stages/00.pre.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index e358a6aa95..a91436be87 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -76,9 +76,10 @@ clean_docker_node: matrix: - node: 8xL40S - node: mcore-docker-node-small + - node: mcore-docker-node-jet script: - export DOCKER_HOST='unix:///var/run/docker.sock' - - docker system prune -a --filter "until=48h" -f || true + - docker system prune -a --filter "until=36h" -f || true maybe_cherry_pick_commit: rules: From 9f06f064d117def13920c5bff8304d2185a2fe67 Mon Sep 17 00:00:00 2001 From: Xuwen Chen Date: Thu, 26 Sep 2024 07:20:42 -0700 Subject: [PATCH 28/50] ADLR/megatron-lm!2155 - Resolve release test failure caused by GroupedMLP distributed checkpointing --- .../mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml | 1 + .../mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml index af474ac150..9516076dc6 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml @@ -88,6 +88,7 @@ MODEL_ARGS: --auto-detect-ckpt-format: true --load: ${LOAD_PATH} --save: ${OUTPUT_PATH}/checkpoints + --no-ckpt-fully-parallel-save: true --save-interval: 500 # Add initialization args diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml index 95b151569a..39421a887e 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml @@ -89,6 +89,7 @@ MODEL_ARGS: --auto-detect-ckpt-format: true --load: ${LOAD_PATH} --save: ${OUTPUT_PATH}/checkpoints + --no-ckpt-fully-parallel-save: true --save-interval: 500 # Add initialization args From 015bffc58f3f1be68373e700b99f4ee8dc99258a Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Thu, 26 Sep 2024 15:17:42 -0700 Subject: [PATCH 29/50] ADLR/megatron-lm!2156 - tests: Set better name for Wandb logging --- .../python_test_utils/jet/generate_jet_trigger_job.py | 5 ++++- .../python_test_utils/jet/launch_jet_workload.py | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py index beeb31860d..b67d856464 100644 --- a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py +++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py @@ -81,7 +81,10 @@ def main( if run_name is not None and wandb_experiment is not None: script.append(f"--run-name {run_name}") - script.append(f"--wandb-experiment {wandb_experiment}") + test_case.spec.model + 
script.append( + f"--wandb-experiment {wandb_experiment}-{test_case.spec.model}-{test_case.spec.test_case}" + ) gitlab_pipeline[test_case.spec.test_case] = { "stage": "functional_tests", diff --git a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py index 4e796ceb6c..9c8ccb0bc0 100644 --- a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py +++ b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py @@ -195,6 +195,8 @@ def main( success = pipeline.get_status() == PipelineStatus.SUCCESS sys.exit(int(not success)) # invert for exit 0 + sys.exit(1) + if __name__ == "__main__": main() From 34f0f98da8f935fa5e322d7a74b8936458f87017 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Thu, 26 Sep 2024 17:46:20 -0700 Subject: [PATCH 30/50] ADLR/megatron-lm!1950 - Remove pkg_resources package Co-authored-by: Xin Yao Co-authored-by: Deepak Narayanan --- .../dist_checkpointing/strategies/torch.py | 7 +- .../core/extensions/transformer_engine.py | 74 ++++++++----------- megatron/core/models/bert/bert_model.py | 26 +++++-- megatron/core/models/retro/config.py | 7 +- megatron/core/requirements.txt | 3 +- megatron/core/tensor_parallel/random.py | 4 +- .../core/transformer/transformer_block.py | 9 +-- .../core/transformer/transformer_config.py | 9 +-- megatron/core/utils.py | 29 ++++++++ megatron/legacy/model/transformer.py | 12 +-- megatron/training/initialize.py | 2 - .../models/test_sequential_mlp.py | 10 +-- tests/unit_tests/models/test_bert_model.py | 63 +++++++++------- .../transformer/moe/test_grouped_mlp.py | 8 +- .../transformer/moe/test_sequential_mlp.py | 12 ++- .../test_multi_latent_attention.py | 21 +----- .../transformer/test_spec_customization.py | 6 +- tools/checkpoint/saver_mcore.py | 8 +- 18 files changed, 151 insertions(+), 159 deletions(-) diff --git a/megatron/core/dist_checkpointing/strategies/torch.py b/megatron/core/dist_checkpointing/strategies/torch.py index 96f2c316c5..077d94eb77 100644 --- a/megatron/core/dist_checkpointing/strategies/torch.py +++ b/megatron/core/dist_checkpointing/strategies/torch.py @@ -10,7 +10,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast import torch -from pkg_resources import packaging +from packaging.version import Version as PkgVersion from torch.distributed import checkpoint from torch.distributed._shard.metadata import ShardMetadata from torch.distributed._shard.sharded_tensor import Shard, ShardedTensorMetadata, TensorProperties @@ -448,8 +448,9 @@ def __init__( nd_flattened_global_shapes: Optional[Dict[str, Tuple[int, ...]]] = None, **kwargs, ) -> None: - # `dedup_replicated_tensors` was deprecated in 2.3 - avoids tons of warnings during saving - if packaging.version.Version(torch.__version__) <= packaging.version.Version("2.2"): + # `dedup_replicated_tensors` was deprecated in 2.3; this check avoids warnings + # during saving. 
+ if PkgVersion(torch.__version__) <= PkgVersion("2.2"): kwargs['dedup_replicated_tensors'] = dedup_replicated_tensors super().__init__(*args, **kwargs) self.nd_flattened_global_shapes = nd_flattened_global_shapes or {} diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index e5ff55849f..36781f9cca 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -3,12 +3,11 @@ import dataclasses import os import warnings -from importlib.metadata import version from typing import Callable import torch import transformer_engine as te -from pkg_resources import packaging +from packaging.version import Version as PkgVersion from torch import Tensor from megatron.core import ModelParallelConfig, parallel_state @@ -25,27 +24,13 @@ from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint - - -def get_te_version(): - """Get TE version from __version__; if not available use pip's. Use caching.""" - - def get_te_version_str(): - if hasattr(te, '__version__'): - return str(te.__version__) - else: - return version("transformer-engine") - - return packaging.version.Version(get_te_version_str()) - - -_te_version = get_te_version() +from megatron.core.utils import get_te_version, is_te_min_version def _get_extra_te_kwargs(config: TransformerConfig): extra_transformer_engine_kwargs = {"params_dtype": config.params_dtype} - if _te_version >= packaging.version.Version("0.12.0"): + if is_te_min_version("0.12.0"): if config.use_cpu_initialization: extra_transformer_engine_kwargs["device"] = 'cpu' else: @@ -131,9 +116,9 @@ def __init__( extra_kwargs = _get_extra_te_kwargs(config) - if _te_version >= packaging.version.Version("0.8.0"): + if is_te_min_version("0.8.0"): if self.config.tp_comm_overlap: - if _te_version > packaging.version.Version("1.5.0"): + if is_te_min_version("1.5.0"): # Use old overlap flags if they were supplied instead extra_kwargs["ub_overlap_ag"] = ( self.config.tp_comm_overlap_ag @@ -160,7 +145,7 @@ def __init__( extra_kwargs["ub_atomic_gemm_ag"] = False extra_kwargs["ub_split_rs"] = False extra_kwargs["ub_atomic_gemm_rs"] = False - if _te_version > packaging.version.Version("1.0.0"): + if is_te_min_version("1.0.0", check_equality=False): assert ( tp_comm_buffer_name is not None ), "Buffer name should be set to configure communication overlap settings" @@ -171,7 +156,7 @@ def __init__( rng_tracker_name = get_expert_parallel_rng_tracker_name() else: rng_tracker_name = None - if _te_version >= packaging.version.Version("1.7.0"): + if is_te_min_version("1.7.0"): extra_kwargs["rng_tracker_name"] = rng_tracker_name # Disable communications in TE when using SP or EP by making TE agnostic of model parallel. @@ -268,25 +253,26 @@ def __init__( extra_kwargs = _get_extra_te_kwargs(config) # Only Transformer-Engine version >= 0.11.0 supports `RMSNorm` - if _te_version >= packaging.version.Version("0.11.0"): + if is_te_min_version("0.11.0"): extra_kwargs["normalization"] = self.config.normalization elif self.config.normalization != "LayerNorm": + te_version = get_te_version() raise ValueError( - f"Transformer Engine v{_te_version} does not support {self.config.normalization}." + f"Transformer Engine v{te_version} does not support {self.config.normalization}." 
) - if _te_version >= packaging.version.Version("0.8.0"): + if is_te_min_version("0.8.0"): if self.config.tp_comm_overlap: extra_kwargs["ub_bulk_wgrad"] = self.config.tp_comm_bulk_wgrad extra_kwargs["ub_bulk_dgrad"] = self.config.tp_comm_bulk_dgrad - if _te_version > packaging.version.Version("1.5.0"): + if is_te_min_version("1.5.0", check_equality=False): # Use old overlap flags if they were supplied instead extra_kwargs["ub_overlap_ag"] = ( self.config.tp_comm_overlap_ag if hasattr(self.config, "tp_comm_overlap_ag") else self.config.tp_comm_split_ag or self.config.tp_comm_atomic_ag ) - if _te_version > packaging.version.Version("1.6.0.dev0"): + if is_te_min_version("1.6.0.dev0", check_equality=False): extra_kwargs["ub_overlap_rs_dgrad"] = ( self.config.tp_comm_overlap_rs_dgrad if hasattr(self.config, "tp_comm_overlap_rs_dgrad") @@ -302,7 +288,7 @@ def __init__( else: extra_kwargs["ub_atomic_gemm_ag"] = self.config.tp_comm_atomic_ag extra_kwargs["ub_split_ag"] = self.config.tp_comm_split_ag - if _te_version > packaging.version.Version("1.0.0"): + if is_te_min_version("1.0.0", check_equality=False): assert ( tp_comm_buffer_name is not None ), "Buffer name should be set to configure communication overlap settings" @@ -478,25 +464,25 @@ def __init__( ) extra_kwargs = {} - if _te_version >= packaging.version.Version("0.11.0"): + if is_te_min_version("0.11.0"): extra_kwargs["num_gqa_groups"] = self.config.num_query_groups elif self.config.num_query_groups != self.config.num_attention_heads: raise ValueError( - f"Transformer Engine v{_te_version} does not support Grouped Query Attention, " + f"Transformer Engine v{get_te_version()} does not support Grouped Query Attention, " f"use a newer version of Transformer Engine. " f"(num_query_groups ({self.config.num_query_groups}) != " f"num_attention_heads ({self.config.num_attention_heads}))" ) - if _te_version >= packaging.version.Version("0.10.0"): + if is_te_min_version("0.10.0"): extra_kwargs["attention_type"] = attention_type # older version don't need attention_type - if _te_version > packaging.version.Version("0.12.0"): + if is_te_min_version("0.12.0", check_equality=False): self.te_forward_mask_type = True # Only Transformer-Engine version >= 1.0.0 supports context parallelism - if _te_version >= packaging.version.Version("1.0.0"): + if is_te_min_version("1.0.0"): if getattr(TEDotProductAttention, "cp_stream") is None: TEDotProductAttention.cp_stream = torch.cuda.Stream() extra_kwargs["cp_group"] = get_context_parallel_group(check_initialized=False) @@ -519,13 +505,13 @@ def __init__( if config.window_size is not None: # Check version - assert _te_version >= packaging.version.Version("1.2.0"), ( - f"Transformer-Engine version ({str(_te_version)}) must be >= 1.2.0 to support" + assert is_te_min_version("1.2.0"), ( + f"Transformer-Engine v{get_te_version()} must be >= 1.2.0 to support" "sliding window attention." 
) extra_kwargs['window_size'] = config.window_size - if _te_version >= packaging.version.Version("1.10.0"): + if is_te_min_version("1.10.0"): # TE 1.10.0 introduces the ability to set the different k and v channels kv_channels = ( (k_channels, v_channels) @@ -568,12 +554,12 @@ def forward( ) # overwrite self.qkv_format depending on self.config.apply_rope_fusion, which can be set # after init - if self.config.apply_rope_fusion and _te_version > packaging.version.Version("0.13.0"): + if self.config.apply_rope_fusion and is_te_min_version("0.13.0", check_equality=False): self.qkv_format = 'bshd' qkv_format = packed_seq_kwargs.get('qkv_format', self.qkv_format) - if _te_version < packaging.version.Version("1.3.0"): + if get_te_version() < PkgVersion("1.3.0"): # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H # copies (#555) # These two arguments did not exist prior to 1.3.0 @@ -592,7 +578,7 @@ def forward( value = value.as_strided(value.shape, key.stride()) if self.te_forward_mask_type: - if qkv_format == 'thd' and _te_version >= packaging.version.Version("1.7.0"): + if qkv_format == 'thd' and is_te_min_version("1.7.0"): # thd format uses flash attention with cuDNN kernel which requires is_padding=True, # so the only acceptable mask types are `padding_causal` and `padding`. These do not # necessarily indicate there are padded tokens in the sequence. @@ -617,7 +603,7 @@ def forward( return core_attn_out -if _te_version >= packaging.version.Version("1.9.0.dev0"): +if is_te_min_version("1.9.0.dev0"): class TEGroupedLinear(te.pytorch.GroupedLinear): """ @@ -879,10 +865,10 @@ def __init__( override_linear_precision: tuple = (False, False, False), ): extra_kwargs = _get_extra_te_kwargs(config) - if _te_version >= packaging.version.Version("1.6.0.dev0"): + if is_te_min_version("1.6.0.dev0"): extra_kwargs["fp8_dpa"] = config.fp8_dot_product_attention extra_kwargs["fp8_mha"] = config.fp8_multi_head_attention - if _te_version < packaging.version.Version("1.8.0"): + if get_te_version() < PkgVersion("1.8.0"): extra_kwargs["interval"] = config.fp8_interval elif config.fp8_interval != 1: warnings.warn("fp8_interval is deprecated and ignored from Transformer-Engine v1.8.0.") @@ -935,7 +921,7 @@ def te_checkpoint( """Checkpointing with Transformer-Engine.""" from transformer_engine.pytorch.distributed import checkpoint - if _te_version >= packaging.version.Version("1.5.0"): + if is_te_min_version("1.5.0"): return checkpoint( forward_func, hidden_states, @@ -981,7 +967,7 @@ def get_cpu_offload_context( enabled, num_layers, model_layers, activation_offloading, weight_offloading ): """Get CPU offload context and sync function.""" - if _te_version >= packaging.version.Version("1.10.0.dev0"): + if is_te_min_version("1.10.0.dev0"): context, sync_func = _get_cpu_offload_context( enabled, num_layers, model_layers, activation_offloading, weight_offloading ) diff --git a/megatron/core/models/bert/bert_model.py b/megatron/core/models/bert/bert_model.py index d9d1be449c..541d05d905 100644 --- a/megatron/core/models/bert/bert_model.py +++ b/megatron/core/models/bert/bert_model.py @@ -1,10 +1,9 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
import os -from importlib.metadata import version +import warnings from typing import Literal, Optional import torch -from pkg_resources import packaging from torch import Tensor from megatron.core import parallel_state, tensor_parallel @@ -20,11 +19,14 @@ from megatron.core.transformer.transformer_block import TransformerBlock from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import get_linear_layer +from megatron.core.utils import get_te_version as _get_te_version +from megatron.core.utils import is_te_min_version def get_te_version(): - """Returns the installed version of transformer engine""" - return packaging.version.Version(version("transformer-engine")) + """Included for backwards compatibility.""" + warnings.warn("`get_te_version` will be deprecated in a future release") + return _get_te_version() class BertModel(LanguageModule): @@ -177,19 +179,27 @@ def _santiy_check_attention_and_get_attn_mask_dimension( """ attn_mask_dimensions = "b1ss" if transformer_layer_spec == bert_layer_with_transformer_engine_spec: - if get_te_version() >= packaging.version.Version("1.7.0"): - # pylint: disable=line-too-long + if is_te_min_version("1.7.0"): if os.getenv('NVTE_FLASH_ATTN') == '0' and os.getenv('NVTE_FUSED_ATTN') == '0': assert ( transformer_layer_spec.submodules.self_attention.params['attn_mask_type'] == AttnMaskType.arbitrary - ), "Both NVTE_FLASH_ATTN and NVTE_FUSED_ATTN env flag set to 0. Either unset both of them or set one of them to 1 to use a more optimized attention kernal. Currently using unfused attention path. If you want to proceed with this path set AttnMaskType in module spec to be arbitrary" + ), ( + "Both NVTE_FLASH_ATTN and NVTE_FUSED_ATTN env flag set to 0. Either unset " + "both of them or set one of them to 1 to use a more optimized attention " + "kernel. Currently using unfused attention path. If you want to proceed " + "with this path set AttnMaskType in module spec to be arbitrary" + ) else: attn_mask_dimensions = "b11s" else: assert ( os.getenv('NVTE_FLASH_ATTN') == '0' and os.getenv('NVTE_FUSED_ATTN') == '0' - ), "Flash and fused attention is not supported with transformer engine version < 1.7. Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer engine >= 1.7" + ), ( + "Flash and fused attention is not supported with transformer engine version " + "< 1.7. Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer " + "engine >= 1.7" + ) return attn_mask_dimensions def bert_extended_attention_mask(self, attention_mask: Tensor) -> Tensor: diff --git a/megatron/core/models/retro/config.py b/megatron/core/models/retro/config.py index f9ed05f470..d4b5c9684b 100644 --- a/megatron/core/models/retro/config.py +++ b/megatron/core/models/retro/config.py @@ -4,11 +4,9 @@ import os from dataclasses import dataclass -from importlib.metadata import version - -from pkg_resources import packaging from megatron.core.transformer import TransformerConfig +from megatron.core.utils import is_te_min_version @dataclass @@ -65,8 +63,7 @@ def __post_init__(self) -> None: super().__post_init__() # Validate Transformer Engine version. 
- te_version = packaging.version.Version(version("transformer-engine")) - if te_version >= packaging.version.Version("1.3"): + if is_te_min_version("1.3"): try: assert os.getenv("NVTE_FLASH_ATTN") == "0" assert os.getenv("NVTE_FUSED_ATTN") == "0" diff --git a/megatron/core/requirements.txt b/megatron/core/requirements.txt index 08ed5eeb4b..a03ef133e7 100644 --- a/megatron/core/requirements.txt +++ b/megatron/core/requirements.txt @@ -1 +1,2 @@ -torch \ No newline at end of file +torch +packaging diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py index 3724f81648..4b144d4163 100644 --- a/megatron/core/tensor_parallel/random.py +++ b/megatron/core/tensor_parallel/random.py @@ -16,7 +16,7 @@ get_expert_model_parallel_rank, get_tensor_model_parallel_rank, ) -from megatron.core.utils import safely_set_viewless_tensor_data +from megatron.core.utils import is_te_min_version, safely_set_viewless_tensor_data from .utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks @@ -175,6 +175,8 @@ def initialize_rng_tracker(use_te_rng_tracker: bool = False): return if use_te_rng_tracker: + if not is_te_min_version("1.5.0"): + raise RuntimeError("use_te_rng_tracker requires TransformerEngine version >= 1.5") from megatron.core.extensions.transformer_engine import TECudaRNGStatesTracker _CUDA_RNG_STATE_TRACKER = TECudaRNGStatesTracker() diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 46f6796909..3a88f1ab22 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -2,10 +2,8 @@ from contextlib import nullcontext from dataclasses import dataclass -from importlib.metadata import version from typing import List, Optional, Union -import packaging import torch from torch import Tensor @@ -19,7 +17,7 @@ from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import BaseTransformerLayer from megatron.core.transformer.utils import sharded_state_dict_default -from megatron.core.utils import make_viewless_tensor +from megatron.core.utils import is_te_min_version, make_viewless_tensor try: from megatron.core.extensions.transformer_engine import ( @@ -375,10 +373,9 @@ def get_cuda_graph_optional_args( optional_inputs = {} optional_inputs['is_first_microbatch'] = self.current_microbatch == 0 try: - import transformer_engine.pytorch as te + import transformer_engine.pytorch as te # pylint: disable=unused-import - _te_version = packaging.version.Version(version("transformer-engine")) - if _te_version < packaging.version.Version("1.10.0"): + if is_te_min_version("1.10.0", check_equality=False): assert not any( [attention_mask, context, context_mask, rotary_pos_emb] ), "Keyword Arguments not supported with CUDA graph." diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index c5ce7bc6dc..a63171686a 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -1,14 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
from dataclasses import dataclass -from importlib.metadata import version from typing import Callable, Optional, Tuple import torch.nn.functional as F -from pkg_resources import packaging from ..model_parallel_config import ModelParallelConfig -from ..utils import init_method_normal, scaled_init_method_normal +from ..utils import get_te_version, init_method_normal, is_te_min_version, scaled_init_method_normal @dataclass @@ -507,11 +505,10 @@ def __post_init__(self): if self.num_moe_experts and self.fp8: # TE version below 1.7.0 will raise Error when handle zeros tokens for expert - te_version = packaging.version.Version(version("transformer-engine")) - if te_version < packaging.version.Version("1.7.0.dev0"): + if not is_te_min_version("1.7.0.dev0"): raise ValueError( "Only transformer-engine>=1.7.0 supports MoE FP8 training, " - f"but your version is {te_version}." + f"but your version is {get_te_version()}." ) if self.moe_grouped_gemm: diff --git a/megatron/core/utils.py b/megatron/core/utils.py index b0de950ef6..f3910926ab 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -15,10 +15,12 @@ from dataclasses import dataclass from datetime import datetime from functools import reduce +from importlib.metadata import version from types import TracebackType from typing import Any, Dict, List, Optional, Tuple, Type, Union import torch +from packaging.version import Version as PkgVersion from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedTensor @@ -26,6 +28,33 @@ logger = logging.getLogger(__name__) +_te_version = None + + +def get_te_version(): + """Get TE version from __version__; if not available use pip's. Use caching.""" + + def get_te_version_str(): + import transformer_engine as te + + if hasattr(te, '__version__'): + return str(te.__version__) + else: + return version("transformer-engine") + + global _te_version + if _te_version is None: + _te_version = PkgVersion(get_te_version_str()) + return _te_version + + +def is_te_min_version(version, check_equality=True): + """Check if minimum version of `transformer-engine` is installed.""" + if check_equality: + return get_te_version() >= PkgVersion(version) + return get_te_version() > PkgVersion(version) + + def ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator) diff --git a/megatron/legacy/model/transformer.py b/megatron/legacy/model/transformer.py index 9dfc7f7ed8..dda550551a 100644 --- a/megatron/legacy/model/transformer.py +++ b/megatron/legacy/model/transformer.py @@ -1406,21 +1406,15 @@ def __init__(self, config, self.transformer_engine_v_0_8 = False if self.transformer_impl == 'transformer_engine': global transformer_engine - from importlib.metadata import version - import transformer_engine - from pkg_resources import packaging - te_version = packaging.version.Version(version("transformer-engine")) - if te_version >= packaging.version.Version("0.8.0"): + if core.utils.is_te_min_version("0.8.0"): self.transformer_engine_v_0_8 = True - if te_version >= packaging.version.Version("0.10.0"): + if core.utils.is_te_min_version("0.10.0"): self.transformer_engine_v_0_10 = True - if te_version >= packaging.version.Version("0.11.0"): + if core.utils.is_te_min_version("0.11.0"): self.transformer_engine_v_0_11 = True - del version, packaging - assert not args.squared_relu, ("TransformerEngine does not support squared " "relu 
activation.") diff --git a/megatron/training/initialize.py b/megatron/training/initialize.py index b2ef8a8f45..8e4877c8b5 100644 --- a/megatron/training/initialize.py +++ b/megatron/training/initialize.py @@ -4,8 +4,6 @@ import logging import random import os -import packaging -import packaging.version import time import numpy as np diff --git a/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py b/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py index d42b73b8af..5a31d9d3d4 100644 --- a/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py +++ b/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py @@ -1,10 +1,7 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -from importlib.metadata import version - import pytest import torch -from pkg_resources import packaging from megatron.core import parallel_state from megatron.core.dist_checkpointing import load, load_plain_tensors, save @@ -21,11 +18,10 @@ from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.moe.experts import SequentialMLP, TEGroupedMLP from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version from tests.unit_tests.dist_checkpointing import TempNamedDir from tests.unit_tests.test_utilities import Utils -_te_version = packaging.version.Version(version("transformer-engine")) - def initialize_expert_layer(seed, glu=True, moe_grouped_gemm=False, **config_kwargs): torch.manual_seed(seed) @@ -69,7 +65,7 @@ def get_pp_offsets(): moe_grouped_gemm_options = [False] -if _te_version >= packaging.version.Version("1.9.0.dev0"): +if is_te_min_version("1.9.0.dev0"): moe_grouped_gemm_options.append(True) @@ -155,7 +151,7 @@ def test_parallel_reconfiguration_e2e( assert not any(map(bool, diffs)), diffs @pytest.mark.skipif( - _te_version < packaging.version.Version("1.9.0.dev0"), + not is_te_min_version("1.9.0.dev0"), reason="TE Grouped MLP is only supported in TE 1.9.0.dev0 and later.", ) @pytest.mark.parametrize( diff --git a/tests/unit_tests/models/test_bert_model.py b/tests/unit_tests/models/test_bert_model.py index 30d4aec024..75fbf914a2 100644 --- a/tests/unit_tests/models/test_bert_model.py +++ b/tests/unit_tests/models/test_bert_model.py @@ -5,17 +5,16 @@ import pytest import torch -from pkg_resources import packaging +from packaging.version import Version as PkgVersion from pytest_mock import mocker from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec from megatron.core.models.bert.bert_model import BertModel from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version from tests.unit_tests.test_utilities import Utils -_te_version = packaging.version.Version(version("transformer-engine")) - class TestBertModel: @@ -113,10 +112,7 @@ def test_te_assertions_te_less_than_1_7(self, mocker): ) with pytest.raises(Exception) as exc_info: - mocker.patch( - "megatron.core.models.bert.bert_model.get_te_version", - return_value=packaging.version.Version("1.4"), - ) + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.4")) self.bert_model = BertModel( config=transformer_config, num_tokentypes=0, @@ -125,9 +121,9 @@ def test_te_assertions_te_less_than_1_7(self, mocker): max_sequence_length=4, ) - assert ( - str(exc_info.value) - == "Flash and 
fused attention is not supported with transformer engine version < 1.7. Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer engine >= 1.7" + assert str(exc_info.value) == ( + "Flash and fused attention is not supported with transformer engine version < 1.7. " + "Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer engine >= 1.7" ) @pytest.mark.internal @@ -150,10 +146,7 @@ def test_te_assertions_te_equal_to_1_7_exception(self, mocker): ) with pytest.raises(Exception) as exc_info: - mocker.patch( - "megatron.core.models.bert.bert_model.get_te_version", - return_value=packaging.version.Version("1.7"), - ) + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.7")) self.bert_model = BertModel( config=transformer_config, num_tokentypes=0, @@ -162,9 +155,10 @@ def test_te_assertions_te_equal_to_1_7_exception(self, mocker): max_sequence_length=4, ) - assert ( - str(exc_info.value) - == "Both NVTE_FLASH_ATTN and NVTE_FUSED_ATTN env flag set to 0. Either unset both of them or set one of them to 1 to use a more optimized attention kernal. Currently using unfused attention path. If you want to proceed with this path set AttnMaskType in module spec to be arbitrary" + assert str(exc_info.value) == ( + "Both NVTE_FLASH_ATTN and NVTE_FUSED_ATTN env flag set to 0. Either unset both of them or set " + "one of them to 1 to use a more optimized attention kernel. Currently using unfused attention " + "path. If you want to proceed with this path set AttnMaskType in module spec to be arbitrary" ) @pytest.mark.internal @@ -186,15 +180,28 @@ def test_te_assertions_te_equal_to_1_7_no_exception(self, mocker): pipeline_dtype=torch.bfloat16, ) - mocker.patch( - "megatron.core.models.bert.bert_model.get_te_version", - return_value=packaging.version.Version("1.7"), - ) - self.bert_model = BertModel( - config=transformer_config, - num_tokentypes=0, - transformer_layer_spec=bert_layer_with_transformer_engine_spec, - vocab_size=100, - max_sequence_length=4, - ) + if is_te_min_version("1.7"): # If TE version >= 1.7, no exception should be raised + self.bert_model = BertModel( + config=transformer_config, + num_tokentypes=0, + transformer_layer_spec=bert_layer_with_transformer_engine_spec, + vocab_size=100, + max_sequence_length=4, + ) + else: # If TE version < 1.7, an exception should be raised in other files + with pytest.raises(Exception) as exc_info: + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.7")) + self.bert_model = BertModel( + config=transformer_config, + num_tokentypes=0, + transformer_layer_spec=bert_layer_with_transformer_engine_spec, + vocab_size=100, + max_sequence_length=4, + ) + assert str(exc_info.value) == ( + "Linear.__init__() got an unexpected keyword argument 'rng_tracker_name' when " + "instantiating TERowParallelLinear when instantiating SelfAttention when " + "instantiating TransformerLayer" + ) + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/transformer/moe/test_grouped_mlp.py b/tests/unit_tests/transformer/moe/test_grouped_mlp.py index a78921ad10..043bdc8c58 100644 --- a/tests/unit_tests/transformer/moe/test_grouped_mlp.py +++ b/tests/unit_tests/transformer/moe/test_grouped_mlp.py @@ -1,17 +1,15 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
-from importlib.metadata import version - import pytest import torch import torch.nn.functional as F -from pkg_resources import packaging from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.transformer.moe import grouped_gemm_util as gg from megatron.core.transformer.moe.experts import TEGroupedMLP from megatron.core.transformer.moe.moe_layer import MoELayer from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version from megatron.legacy.model import Float16Module from megatron.training.arguments import parse_args from megatron.training.initialize import _set_random_seed @@ -21,8 +19,6 @@ if torch.cuda.is_available(): DEVICE_CAPABILITY = torch.cuda.get_device_capability() -_te_version = packaging.version.Version(version("transformer-engine")) - class TestParallelGroupedMLP: @@ -218,7 +214,7 @@ def test_gradient_with_no_tokens_allocated(self): @pytest.mark.skipif( - _te_version < packaging.version.Version("1.9.0.dev0"), + not is_te_min_version("1.9.0.dev0"), reason="TE Grouped MLP is only supported in TE 1.9.0.dev0 and later.", ) class TestTEGroupedMLP: diff --git a/tests/unit_tests/transformer/moe/test_sequential_mlp.py b/tests/unit_tests/transformer/moe/test_sequential_mlp.py index 40a0caf31a..514e098bfd 100644 --- a/tests/unit_tests/transformer/moe/test_sequential_mlp.py +++ b/tests/unit_tests/transformer/moe/test_sequential_mlp.py @@ -1,7 +1,6 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. from importlib.metadata import version -import packaging import pytest import torch @@ -16,10 +15,9 @@ from megatron.core.transformer.moe.experts import SequentialMLP from megatron.core.transformer.moe.moe_layer import MoELayer from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version from tests.unit_tests.test_utilities import Utils -te_version = packaging.version.Version(version("transformer-engine")) - class TestParallelSequentialMLP: @@ -117,7 +115,7 @@ def setup_method(self, method): ) @pytest.mark.skipif( - te_version < packaging.version.Version("1.7.0"), + not is_te_min_version("1.7.0"), reason="Transformer Engine under v1.7.0 doesn't support MoE training.", ) @pytest.mark.internal @@ -133,7 +131,7 @@ def test_constructor(self): ) @pytest.mark.skipif( - te_version < packaging.version.Version("1.7.0"), + not is_te_min_version("1.7.0"), reason="Transformer Engine under v1.7.0 doesn't support MoE training.", ) @pytest.mark.internal @@ -155,7 +153,7 @@ def test_gpu_forward(self): assert torch.equal(output_local, output_te) @pytest.mark.skipif( - te_version < packaging.version.Version("1.7.0"), + not is_te_min_version("1.7.0"), reason="Transformer Engine under v1.7.0 doesn't support MoE training.", ) @pytest.mark.internal @@ -179,7 +177,7 @@ def test_gpu_forward_with_one_local_expert(self): assert torch.equal(output_local, output_te) @pytest.mark.skipif( - te_version < packaging.version.Version("1.7.0"), + not is_te_min_version("1.7.0"), reason="Transformer Engine under v1.7.0 doesn't support MoE training.", ) @pytest.mark.internal diff --git a/tests/unit_tests/transformer/test_multi_latent_attention.py b/tests/unit_tests/transformer/test_multi_latent_attention.py index 4117ba6aa0..4188d7b069 100644 --- a/tests/unit_tests/transformer/test_multi_latent_attention.py +++ b/tests/unit_tests/transformer/test_multi_latent_attention.py @@ -6,28 +6,15 @@ import pytest import torch import 
transformer_engine as te -from pkg_resources import packaging from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.multi_latent_attention import MLASelfAttention from megatron.core.transformer.transformer_config import MLATransformerConfig +from megatron.core.utils import is_te_min_version from tests.unit_tests.test_utilities import Utils -def get_te_version(): - def get_te_version_str(): - if hasattr(te, '__version__'): - return str(te.__version__) - else: - return version("transformer-engine") - - return packaging.version.Version(get_te_version_str()) - - -_te_version = get_te_version() - - class TestParallelMLAAttention: def setup_method(self, method): @@ -68,7 +55,7 @@ def test_cpu_forward(self): pass def test_gpu_forward(self): - if _te_version >= packaging.version.Version("1.10.0"): + if is_te_min_version("1.10.0"): # use flash attention for hopper, future may support fused attention for ampere os.environ['NVTE_FUSED_ATTN'] = "0" @@ -97,7 +84,7 @@ def test_gpu_forward(self): assert bias.shape[0] == config.hidden_size def test_fused_rope_gpu_forward(self): - if _te_version >= packaging.version.Version("1.10.0"): + if is_te_min_version("1.10.0"): # use flash attention for hopper, future may support fused attention for ampere os.environ['NVTE_FUSED_ATTN'] = "0" os.environ['NVTE_FLASH_ATTN'] = "1" @@ -131,7 +118,7 @@ def test_fused_rope_gpu_forward(self): self.parallel_attention.config.apply_rope_fusion = False def test_checkpointed_gpu_forward(self): - if _te_version >= packaging.version.Version("1.10.0"): + if is_te_min_version("1.10.0"): # use flash attention for hopper, future may support fused attention for ampere os.environ['NVTE_FUSED_ATTN'] = "0" os.environ['NVTE_FLASH_ATTN'] = "1" diff --git a/tests/unit_tests/transformer/test_spec_customization.py b/tests/unit_tests/transformer/test_spec_customization.py index 80c3bf7577..a9a245b861 100755 --- a/tests/unit_tests/transformer/test_spec_customization.py +++ b/tests/unit_tests/transformer/test_spec_customization.py @@ -2,12 +2,10 @@ import sys from dataclasses import dataclass, fields -from importlib.metadata import version import pytest import torch import transformer_engine as te -from pkg_resources import packaging from megatron.core.extensions.transformer_engine import ( TEDotProductAttention, @@ -26,6 +24,7 @@ from megatron.core.transformer.transformer_block import TransformerBlock, TransformerBlockSubmodules from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules +from megatron.core.utils import is_te_min_version from tests.unit_tests.test_utilities import Utils @@ -134,8 +133,7 @@ def test_build_module(self): assert id(bda_op) == id(get_bias_dropout_add) def test_sliding_window_attention(self): - te_version = packaging.version.Version(version("transformer-engine")) - if te_version < packaging.version.Version("1.2.0"): + if not is_te_min_version("1.2.0"): print("SWA not tested because TE version is not >= 1.2.0", file=sys.stderr) return diff --git a/tools/checkpoint/saver_mcore.py b/tools/checkpoint/saver_mcore.py index aea481abed..6aec90e41b 100644 --- a/tools/checkpoint/saver_mcore.py +++ b/tools/checkpoint/saver_mcore.py @@ -3,11 +3,10 @@ import os import sys import torch -from importlib.metadata import version -from pkg_resources import packaging from 
setter import ModelSetter from utils import get_mcore_transformer_block_key, print_memory_usage +from megatron.core.utils import get_te_version, is_te_min_version class MCoreSetter(ModelSetter): @@ -288,9 +287,8 @@ def add_arguments(parser): def save_checkpoint(queue, args): # Transformer engine >= 0.12.0, for CPU initialization. - te_version = packaging.version.Version(version("transformer-engine")) - assert te_version >= packaging.version.Version("0.12.0"), \ - "transformer engine version: %s (>=0.12.0 required)." % te_version + assert is_te_min_version("0.12.0"), \ + "transformer engine version: %s (>=0.12.0 required)." % get_te_version() # Search in directory above this sys.path.append(os.path.abspath( From 8103c4ced712e74ab09ecc74e828d548d809468f Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Fri, 27 Sep 2024 16:56:29 -0700 Subject: [PATCH 31/50] ADLR/megatron-lm!2142 - ci: Onboard CW --- .gitlab-ci.yml | 19 +++++++++++++------ .../jet/launch_jet_workload.py | 10 ++-------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 52ae2a886e..c99b97f697 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -13,22 +13,28 @@ workflow: FUNCTIONAL_TEST: "no" - if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - FUNCTIONAL_TEST: "yes" - FUNCTIONAL_TEST_SCOPE: mr UNIT_TEST_REPEAT: 5 UNIT_TEST_TIMEOUT: 50 + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: mr + FUNCTIONAL_TEST_CLUSTER_A100: "" + FUNCTIONAL_TEST_CLUSTER_H100: "" - if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - FUNCTIONAL_TEST: "yes" - FUNCTIONAL_TEST_SCOPE: nightly UNIT_TEST_REPEAT: 5 UNIT_TEST_TIMEOUT: 50 + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: nightly + FUNCTIONAL_TEST_CLUSTER_A100: "" + FUNCTIONAL_TEST_CLUSTER_H100: "" - if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - FUNCTIONAL_TEST: "yes" - FUNCTIONAL_TEST_SCOPE: weekly UNIT_TEST_REPEAT: 5 UNIT_TEST_TIMEOUT: 50 + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: weekly + FUNCTIONAL_TEST_CLUSTER_A100: "" + FUNCTIONAL_TEST_CLUSTER_H100: "" - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: FUNCTIONAL_TEST: "no" @@ -70,6 +76,7 @@ variables: FUNCTIONAL_TEST_CLUSTER_H100: value: "dgxh100_eos" options: + - "dgxh100_coreweave" - "dgxh100_eos" description: 'Cluster for H100 workloads' FUNCTIONAL_TEST_NAME: diff --git a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py index 9c8ccb0bc0..123c322677 100644 --- a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py +++ b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py @@ -23,6 +23,8 @@ def resolve_cluster_config(cluster: str) -> str: return "mcore/draco-oci" if cluster == "dgxa100_dracooci-ord": return "mcore/draco-oci-ord" + if cluster == "dgxh100_coreweave": + return "mcore/coreweave" raise ValueError(f"Unknown cluster {cluster} provided.") @@ -54,14 +56,6 @@ def launch_and_wait_for_completion( ), config_id=resolve_cluster_config(cluster), custom_config={ - "retrier": { - "enabled": True, - "max_retries": 2, - "retry_on": ['1.2', '1.2.*'], - "waiting_time": 60, - "environment": "jet-auto-retrier", - }, - "builds": {"jet_flavour": None}, "launchers": {cluster: {"account": account}}, "executors": { "jet-ci": { From 
30aafee61de997f95bac7d3f14aa8472e91c7045 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy Date: Fri, 27 Sep 2024 21:26:21 -0700 Subject: [PATCH 32/50] ADLR/megatron-lm!2158 - Small changes to export Co-authored-by: Shanmugam Ramasamy Co-authored-by: Shanmugam Ramasamy --- examples/export/trtllm_export/README.md | 2 +- .../default_conversion_dict.py | 3 +- .../model_to_trllm_mapping/falcon_model.py | 3 +- .../model_to_trllm_mapping/gpt_next_model.py | 24 ++++++ megatron/core/export/trtllm/trtllm_helper.py | 10 +-- ...tributed_trtllm_model_weights_converter.py | 22 ++++-- ...e_device_trtllm_model_weights_converter.py | 20 ++--- .../test_trtllm_distributed_gpu_converter.py | 2 +- .../export/trtllm/test_trtllm_helper.py | 73 +++++++++++++++++++ .../test_trtllm_single_device_converter.py | 2 +- 10 files changed, 131 insertions(+), 30 deletions(-) create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py create mode 100644 tests/unit_tests/export/trtllm/test_trtllm_helper.py diff --git a/examples/export/trtllm_export/README.md b/examples/export/trtllm_export/README.md index 50177382c9..52cad78583 100644 --- a/examples/export/trtllm_export/README.md +++ b/examples/export/trtllm_export/README.md @@ -13,7 +13,7 @@ This guide will walk you through how you can use the megatron core export for ex #### 1. Quick Start This will walk you through the flow of converting an mcore gpt model to trtllm format using single device mode. The file can be found at [gpt_single_device_cpu_export.py](./single_device_export/gpt_single_device_cpu_export.py) -NOTE: For faster performance, if your entire model will fit into gpu memory, pre transfer the model state dict to gpu and then call the get_trtllm_pretrained_config_and_model_weights function. Default behaviour is to transfer one layer at a time to cuda and convert if available, else do cpu conversion. +NOTE: For faster performance, if your entire model will fit into gpu memory, pre transfer the model state dict to gpu and then call the get_trtllm_pretrained_config_and_model_weights function.
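
A rough sketch of that pre-transfer step, assuming `model`, `trtllm_helper`, `export_config`, and `dtype` are already constructed as in the export examples (illustrative only, not part of the diff):

    import torch

    # Move the state dict to GPU up front so the weight conversion runs on
    # device instead of on CPU (non-tensor entries are passed through as-is).
    gpu_state_dict = {
        k: v.cuda() if torch.is_tensor(v) else v
        for k, v in model.state_dict().items()
    }
    weights, configs = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
        model_state_dict=gpu_state_dict,
        dtype=dtype,
        export_config=export_config,
    )
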
diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py b/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py index 924dda4bc8..cad9315034 100644 --- a/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py @@ -4,6 +4,7 @@ from megatron.core.export.trtllm.model_to_trllm_mapping.falcon_model import FALCON_DICT from megatron.core.export.trtllm.model_to_trllm_mapping.gemma_model import GEMMA_DICT from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_model import GPT_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_next_model import GPT_NEXT_DICT from megatron.core.export.trtllm.model_to_trllm_mapping.llama_model import LLAMA_DICT from megatron.core.export.trtllm.model_to_trllm_mapping.starcoder_model import STARCODER_DICT @@ -13,5 +14,5 @@ ModelType.gemma: GEMMA_DICT, ModelType.starcoder: STARCODER_DICT, ModelType.gpt: GPT_DICT, - ModelType.gptnext: GPT_DICT, # TODO : Check if this is right + ModelType.gptnext: GPT_NEXT_DICT, } diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py index 1640f992a1..d1469d02ba 100644 --- a/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py @@ -14,9 +14,10 @@ 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, # MLP + 'decoder.layers.pre_mlp_layernorm.weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.pre_mlp_layernorm.bias': TRTLLMLayers.post_layernorm_bias, 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, - 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, # FINAL LAYER NORM 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py new file mode 100644 index 0000000000..ac5f84ef1b --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +GPT_NEXT_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + # ATTENTION + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + # MLP + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_bias': TRTLLMLayers.post_layernorm_bias, + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/trtllm_helper.py b/megatron/core/export/trtllm/trtllm_helper.py index 9db8d246fc..d8bef18b33 100644 --- a/megatron/core/export/trtllm/trtllm_helper.py +++ b/megatron/core/export/trtllm/trtllm_helper.py @@ -187,9 +187,7 @@ def get_trtllm_pretrained_config_and_model_weights( This function returns the trtllm model weights as a list. There are two modes for conversion. The default is to use a single device cpu/gpu for conversion. - In the single device mode, we use cuda device automatically if available, if not we convert on CPU. NOTE: For faster performance, if your entire model will fit in memory, pre transfer the model state dict to cuda device and then call this function. - Default behaviour is to transfer one layer at a time to cuda and convert if available, else do cpu conversion. For on device conversion it returns weights which will be used on the device itself. Same thing happens with the pretrained config @@ -206,10 +204,6 @@ def get_trtllm_pretrained_config_and_model_weights( Returns: Two lists . First list of trtllm converted model weights(Either on device, or a list of weights for each gpu) and the trtllm_model_configs. """ - assert not ( - self.share_embeddings_and_output_weights and not export_config.use_parallel_embedding - ), "Found share_embeddings_and_output_weights is True in the model. So set export_config.use_embedding_sharing to True" - if on_device_distributed_conversion: assert (vocab_size is not None, "Need to pass in vocab_size for on device") assert ( @@ -231,6 +225,9 @@ def get_trtllm_pretrained_config_and_model_weights( return [trtllm_model_weights_on_device], [trtllm_model_config] else: + assert not ( + self.share_embeddings_and_output_weights and not export_config.use_embedding_sharing + ), "Found share_embeddings_and_output_weights is True in the model. So set export_config.use_embedding_sharing to True" assert ( vocab_size is None ), "Vocab size is inferred from the input layer for cpu conversion. 
So leave it as None" @@ -281,6 +278,7 @@ def _get_trtllm_pretrained_config_and_model_weights_in_distributed_setting( inference_pp_size=distributed_trtllm_model_weights_converter.inference_pp_size, inference_tp_size=distributed_trtllm_model_weights_converter.inference_tp_size, use_parallel_embedding=True, + use_embedding_sharing=self.share_embeddings_and_output_weights, ) world_size = export_config.inference_tp_size * export_config.inference_pp_size diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py index 07b47411cc..035e23a16c 100644 --- a/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +++ b/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py @@ -65,7 +65,7 @@ def __init__( vp_size is None or vp_size == 1 ), "Virtual parallelism is not supported in GPU Converter. Gather the VP chunks and use PP config." - def _save_val(self, val: torch.Tensor, layer_name: str): + def _add_to_trtllm_model_weights(self, val: torch.Tensor, layer_name: str): assert torch.is_tensor(val), f"Expected a tensor for {layer_name} but got {type(val)}" val = val.to(self.storage_type) val = val.detach().contiguous() @@ -101,7 +101,15 @@ def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_weight)) or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_weight)) ): - self._save_val(val=val, layer_name=layer_name) + # Same as layernorm1p in NeMo + if ( + self.transformer_config.layernorm_zero_centered_gamma + and self.transformer_config.normalization == "LayerNorm" + and 'layernorm.weight' in layer_name + ): + val = val + 1.0 + + self._add_to_trtllm_model_weights(val=val, layer_name=layer_name) elif layer_name.endswith(suffix(TRTLLMLayers.mlp_fc_weight)) or layer_name.endswith( suffix(TRTLLMLayers.mlp_fc_bias) @@ -116,10 +124,10 @@ def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): if split_gated_activation: vals, gates = [[n] for n in torch.chunk(val, 2, axis=-1)] gate_layer_name = layer_name.replace("fc", "gate") - self._save_val(val=gates[0], layer_name=gate_layer_name) + self._add_to_trtllm_model_weights(val=gates[0], layer_name=gate_layer_name) val = vals[0] - self._save_val(val=val, layer_name=layer_name) + self._add_to_trtllm_model_weights(val=val, layer_name=layer_name) elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_bias)): qkv_hidden_dim = val.shape[0] @@ -136,7 +144,7 @@ def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): split_vals = torch.concatenate( [qkv[0].reshape(-1), qkv[1].reshape(-1), qkv[2].reshape(-1)], dim=0 ) - self._save_val(val=split_vals, layer_name=layer_name) + self._add_to_trtllm_model_weights(val=split_vals, layer_name=layer_name) # TODO : Should add a atten layer dimension "qkvqkv, qqkkvv etc to see how to reshape here" elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_weight)): @@ -158,7 +166,7 @@ def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): ], dim=1, ) - self._save_val(val=split_vals, layer_name=layer_name) + self._add_to_trtllm_model_weights(val=split_vals, layer_name=layer_name) else: raise ValueError(f"{layer_name} cannot be handled by GPU converter") @@ -174,7 +182,7 @@ def _convert_non_transformer_layer(self, model_state_dict: dict, layer_name: str """ if 
layer_name in model_state_dict: val = model_state_dict.pop(layer_name) - self._save_val(val=val, layer_name=layer_name) + self._add_to_trtllm_model_weights(val=val, layer_name=layer_name) # ----------------Convert Embeddings---------------- def _get_remove_vocab_padding(self, layer_name, model_state_dict, tokenizer_vocab_size): diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py index b8ec02ff61..c7a98972d2 100644 --- a/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +++ b/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py @@ -78,15 +78,6 @@ def _convert_non_transformer_layer(self, model_state_dict: dict, layer_name: str val = val.to(self.storage_type).detach().contiguous() self.trtllm_model_weights[layer_name] = val - def _transfer_tensor_to_cuda_if_available(self, val: torch.tensor): - """Transfer to cuda device if available - - This function transfers the tensor to cuda and returns it - """ - if torch.cuda.is_available() and not val.is_cuda: - val = val.cuda() - return val - def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): """Convert Transformer layers to TRTLLM weights @@ -127,8 +118,6 @@ def _add_to_trtllm_model_weights(val: torch.Tensor, layer_name: str, split_type= val.to(self.storage_type).detach().contiguous() ) - val = self._transfer_tensor_to_cuda_if_available(val) - if val.ndim == 2: val = val.T @@ -142,6 +131,14 @@ def _add_to_trtllm_model_weights(val: torch.Tensor, layer_name: str, split_type= or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_bias)) or layer_name.endswith(suffix(TRTLLMLayers.mlp_router_weight)) ): + # Same as layernorm1p in NeMo + if ( + self.transformer_config.layernorm_zero_centered_gamma + and self.transformer_config.normalization == "LayerNorm" + and 'layernorm.weight' in layer_name + ): + val = val + 1.0 + _add_to_trtllm_model_weights(val=val, layer_name=layer_name, split_type=None) elif layer_name.endswith( @@ -295,7 +292,6 @@ def convert( layer_name == TRTLLMLayers.vocab_embedding.value and self.export_config.use_parallel_embedding ): - val = self._transfer_tensor_to_cuda_if_available(val) val = model_state_dict[TRTLLMLayers.vocab_embedding.value] vocab_size = val.shape[0] if vocab_size % self.export_config.inference_tp_size != 0: diff --git a/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py b/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py index 50c33ec9eb..5a0aa0e9c5 100644 --- a/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py +++ b/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py @@ -17,7 +17,7 @@ _VOCAB_SIZE = 256 -class TestTRTLLMGPUConverter: +class TestTRTLLMDistributedGPUConverter: def setup_method(self, method): Utils.initialize_model_parallel(2, 1) diff --git a/tests/unit_tests/export/trtllm/test_trtllm_helper.py b/tests/unit_tests/export/trtllm/test_trtllm_helper.py new file mode 100644 index 0000000000..53c0a5ffea --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_helper.py @@ -0,0 +1,73 @@ +import pytest + +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.model_type import ModelType + + +# TODO : Remove importorskip and handle with mocker +class TestTRTLLMHelper: + + def test_exceptions(self, mocker): + 
pytest.importorskip('tensorrt_llm') + + from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper + + trtllm_helper = TRTLLMHelper( + transformer_config=None, + model_type=ModelType.gpt, + share_embeddings_and_output_weights=True, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + on_device_distributed_conversion=True, + vocab_size=None, + gpus_per_node=2, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + on_device_distributed_conversion=True, + ModelType=ModelType.falcon, + vocab_size=100, + gpus_per_node=2, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + export_config=ExportConfig(), + on_device_distributed_conversion=True, + vocab_size=100, + gpus_per_node=2, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + vocab_size=100, + on_device_distributed_conversion=True, + gpus_per_node=None, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + export_config=ExportConfig(use_embedding_sharing=False), + on_device_distributed_conversion=False, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + export_config=ExportConfig(use_embedding_sharing=True), + vocab_size=100, + ) diff --git a/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py b/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py index 0dad81d77b..e431326f0b 100644 --- a/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py +++ b/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py @@ -10,7 +10,7 @@ from megatron.core.transformer.transformer_config import TransformerConfig -class TestTRTLLMCPUConverter: +class TestTRTLLMSingleDeviceConverter: def test_get_model_weights_converter(self, mocker): export_config = ExportConfig(inference_tp_size=2) From 30445f83617db10d4dee3246a4a93517b4330a1f Mon Sep 17 00:00:00 2001 From: Boxiang Wang Date: Mon, 30 Sep 2024 16:21:01 -0700 Subject: [PATCH 33/50] ADLR/megatron-lm!2152 - Fix rope backward compatibility --- .../models/common/embeddings/rope_utils.py | 2 +- .../common/embeddings/rotary_pos_embedding.py | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/megatron/core/models/common/embeddings/rope_utils.py b/megatron/core/models/common/embeddings/rope_utils.py index 037377c530..accb251961 100644 --- a/megatron/core/models/common/embeddings/rope_utils.py +++ b/megatron/core/models/common/embeddings/rope_utils.py @@ -160,7 +160,7 @@ def apply_rotary_pos_emb( ) apply_rotary_pos_emb.printed_fused_warning = True - if config.multi_latent_attention and config.rotary_interleaved: + if getattr(config, "multi_latent_attention", False) and config.rotary_interleaved: logger.warning( "rotary_interleaved is not supported with multi_latent_attention, setting it to False" ) diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py index f3c854275c..5232faec60 100644 --- a/megatron/core/models/common/embeddings/rotary_pos_embedding.py +++ 
b/megatron/core/models/common/embeddings/rotary_pos_embedding.py @@ -15,7 +15,13 @@ from torch import Tensor, nn from megatron.core import parallel_state -from megatron.core.models.common.embeddings.rope_utils import get_pos_emb_on_this_cp_rank +from megatron.core.models.common.embeddings.rope_utils import ( # for backward compatibility; pylint: disable=unused-import + _apply_rotary_pos_emb_bshd, + _apply_rotary_pos_emb_thd, + _rotate_half, + apply_rotary_pos_emb, + get_pos_emb_on_this_cp_rank, +) logger = logging.getLogger(__name__) @@ -136,8 +142,8 @@ def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: # emb [seq_length, .., dim] emb = emb[:, None, None, :] if parallel_state.get_context_parallel_world_size() > 1: - # slice rotary_pos_emb along sequence dimension - # and select the parition of the current CP rank + # slice rotary_pos_emb along sequence dimension and select the parition of the current + # CP rank emb = get_pos_emb_on_this_cp_rank(emb, 0) return emb @@ -156,9 +162,9 @@ def get_rotary_seq_len( Args: inference_params : Used during Inference time - transformer (TransformerBlock): The transformer block - (decoder/encoder) used by the model - transformer_input (Tensor): _description_ + transformer (TransformerBlock): The transformer block (decoder/encoder) used + by the model + transformer_input (Tensor): Input tensor to the transformer transformer_config (TransformerConfig): Transformer config used by the model Returns: From c6a0ec84338e87ac70651ff6bff23d2fbf207c60 Mon Sep 17 00:00:00 2001 From: "Jimmy Zhang (Engrg-Hardware 1)" Date: Mon, 30 Sep 2024 17:18:24 -0700 Subject: [PATCH 34/50] ADLR/megatron-lm!2140 - [Bug fix] Don't trace graphs during inference --- megatron/core/transformer/cuda_graphs.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index a60a22c0f3..2588980b5b 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -6,6 +6,8 @@ import torch +from megatron.core.transformer.module import MegatronModule + try: from transformer_engine.pytorch import make_graphed_callables from transformer_engine.pytorch.fp8 import FP8GlobalStateManager @@ -233,11 +235,16 @@ def __call__(self, megatron_module, args, kwargs): break if runner is None: - runner = self.create_cudagraph_module(megatron_module, args, kwargs) - self.cudagraph_runners.append(runner) - logging.getLogger(__name__).info( - f"Creating cudagraph; now have {len(self.cudagraph_runners)}" - ) + if self.training and torch.is_grad_enabled(): + runner = self.create_cudagraph_module(megatron_module, args, kwargs) + self.cudagraph_runners.append(runner) + logging.getLogger(__name__).info( + f"Creating cudagraph; now have {len(self.cudagraph_runners)}" + ) + else: + # No cudagraphs were found in inference mode, so fallback to eager since + # tensor.requires_grad is needed to correctly trace the backward graph. 
+ return super(MegatronModule, megatron_module).__call__(*args, **kwargs) tensor_args, tensor_kwargs = self.get_tensor_args(args, kwargs) out = runner(tensor_args, tensor_kwargs, is_first_microbatch=self.is_first_microbatch) From d90956cce1168b7877588e43eeb976c728049493 Mon Sep 17 00:00:00 2001 From: Huy Vu Date: Mon, 30 Sep 2024 23:36:10 -0700 Subject: [PATCH 35/50] ADLR/megatron-lm!2109 - Adding more MR tests for T5 (e.g., transformer_engine, distributed_checkpoint) Co-authored-by: Huy Vu2 --- tests/functional_tests/jet_recipes/t5.yaml | 6 + .../golden_values.json | 1 + .../model_config.yaml | 55 ++ .../model_config.yaml | 55 ++ .../golden_values.json | 1 + .../model_config.yaml | 55 ++ .../model_config.yaml | 55 ++ .../golden_values.json | 763 ++++++++++++++++++ .../model_config.yaml | 55 ++ .../model_config.yaml | 55 ++ 10 files changed, 1101 insertions(+) create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml diff --git a/tests/functional_tests/jet_recipes/t5.yaml b/tests/functional_tests/jet_recipes/t5.yaml index b2451a9600..dbbbc508d2 100644 --- a/tests/functional_tests/jet_recipes/t5.yaml +++ b/tests/functional_tests/jet_recipes/t5.yaml @@ -31,6 +31,12 @@ products: - scope: [mr] time_limit: [12000] test_case: + - t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G + - t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G + - t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G + - t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G + - t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G + - t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G - t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G - t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G - scope: [weekly] diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values.json new file mode 100644 index 0000000000..bcff777664 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [19.39068, 0.66038, 0.65673, 0.66493, 0.65894, 0.6473, 0.65746, 0.64942, 0.66259, 0.65247, 0.65165, 0.64944, 0.81313, 0.65069, 0.64982, 0.65247, 0.65149, 0.65284, 0.64913, 0.6496]}, "forward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3.63253, 0.27412, 0.26777, 0.27338, 0.26922, 0.26445, 0.27043, 0.26308, 0.27178, 0.26246, 
0.26565, 0.26691, 0.42095, 0.26741, 0.26653, 0.26546, 0.26547, 0.26403, 0.26266, 0.26606]}, "backward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.0264, 0.24005, 0.23751, 0.24162, 0.24102, 0.23888, 0.24027, 0.23829, 0.24182, 0.24308, 0.24109, 0.23964, 0.23841, 0.24005, 0.23898, 0.23896, 0.24052, 0.23894, 0.24242, 0.23863]}, "forward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [8.32911, 0.07441, 0.07755, 0.07578, 0.07557, 0.07223, 0.0737, 0.07404, 0.07108, 0.07174, 0.07137, 0.07162, 0.07437, 0.07185, 0.07129, 0.07247, 0.0719, 0.07573, 0.07292, 0.07122]}, "forward-send-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.47287, 0.00053, 0.00063, 0.00048, 0.00045, 0.00047, 0.00046, 0.00045, 0.00046, 0.00063, 0.00044, 0.00046, 0.00047, 0.00045, 0.00056, 0.00046, 0.00045, 0.00046, 0.00045, 0.00044]}, "backward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.1444, 0.13179, 0.12767, 0.13592, 0.1279, 0.12912, 0.13033, 0.1328, 0.13106, 0.13249, 0.12957, 0.12877, 0.13334, 0.12829, 0.12815, 0.13128, 0.12985, 0.13117, 0.12901, 0.1277]}, "backward-send-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00065, 0.00056, 0.00066, 0.00067, 0.0006, 0.00059, 0.00064, 0.00067, 0.00068, 0.0006, 0.00056, 0.00058, 0.00059, 0.00056, 0.00064, 0.00058, 0.00049, 0.00079, 0.00081, 0.0006]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [12.49425, 0.23291, 0.228, 0.22475, 0.22786, 0.22525, 0.22534, 0.22597, 0.23004, 0.22656, 0.22342, 0.22577, 0.38374, 0.22857, 0.22673, 0.22371, 0.22908, 0.23017, 0.23145, 0.23191]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5.02478, 0.00608, 0.00441, 0.00414, 0.0093, 0.00347, 0.00363, 0.00527, 0.0093, 0.00705, 0.00369, 0.00633, 0.00834, 0.00352, 0.0034, 0.00565, 0.00346, 0.00354, 0.00341, 0.0035]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 2e-05, 2e-05, 3e-05, 3e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.47745, 0.00052, 0.00064, 0.00053, 0.00052, 0.0006, 0.00052, 0.00062, 0.00052, 0.00056, 0.00065, 0.00056, 0.00054, 0.00053, 0.00058, 0.00052, 0.00052, 0.00052, 0.00055, 0.00053]}, "all-grads-sync-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.43086, 0.00036, 0.00041, 0.00037, 0.00032, 0.00037, 0.00048, 0.00044, 0.00043, 0.00045, 0.00034, 0.00044, 0.00037, 0.00043, 0.00044, 0.00032, 0.00032, 0.00045, 0.00045, 0.00045]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00053, 0.00034, 0.00032, 0.00033, 0.00034, 0.00031, 0.00033, 0.00035, 0.00032, 0.00033, 0.00036, 0.00035, 0.00033, 0.00033, 0.00034, 0.00035, 0.00033, 0.00034, 0.00032, 0.00035]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.26638, 0.00127, 0.00123, 0.00144, 0.00125, 0.00123, 0.00128, 0.00162, 0.00128, 0.00131, 0.00138, 0.00133, 0.00142, 0.0013, 0.00136, 0.00137, 0.00133, 0.00135, 0.00129, 0.00136]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.01282, 0.00738, 0.00728, 0.00736, 0.00738, 0.00733, 0.00738, 0.00735, 0.00731, 0.00727, 0.00897, 
0.00755, 0.0073, 0.00721, 0.00734, 0.00746, 0.00736, 0.00734, 0.00737, 0.00726]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00984, 0.00108, 0.00105, 0.00108, 0.00105, 0.00105, 0.00107, 0.00104, 0.00105, 0.00106, 0.00106, 0.00105, 0.0012, 0.00106, 0.00105, 0.00105, 0.00105, 0.00106, 0.00104, 0.00106]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0011, 0.00101, 0.00102, 0.00102, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.0015, 0.00102, 0.00101, 0.00101, 0.00102, 0.00268, 0.00101, 0.00101]}, "optimizer-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.29197, 0.01172, 0.01152, 0.01191, 0.01165, 0.01156, 0.0117, 0.01199, 0.01159, 0.01161, 0.0134, 0.01194, 0.01269, 0.01155, 0.01172, 0.01186, 0.01173, 0.01343, 0.01172, 0.01165]}, "learning-rate": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "batch-size": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.41489, 9.20451, 8.62156, 8.34435, 8.08472, 7.96931, 7.68116, 7.39495, 7.26108, 7.19145, 7.31028, 7.16653, 7.05979, 6.99436, 6.85568, 6.93225, 6.95525, 7.02522, 6.66561, 6.93924]}, "lm loss vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.41489, 9.20451, 8.62156, 8.34435, 8.08472, 7.96931, 7.68116, 7.39495, 7.26108, 7.19145, 7.31028, 7.16653, 7.05979, 6.99436, 6.85568, 6.93225, 6.95525, 7.02522, 6.66561, 6.93924]}, "loss-scale": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [31.51239, 2.98952, 3.27663, 2.61225, 2.39588, 1.99758, 1.81287, 1.93167, 1.62175, 1.51416, 1.16291, 1.32388, 1.20328, 1.10814, 1.5007, 2.15295, 1.65903, 1.42013, 2.08526, 1.2754]}, "grad-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [31.51239, 2.98952, 3.27663, 2.61225, 2.39588, 1.99758, 1.81287, 1.93167, 1.62175, 1.51416, 1.16291, 1.32388, 1.20328, 1.10814, 1.5007, 2.15295, 1.65903, 1.42013, 2.08526, 1.2754]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [115745.0, 111070.0, 117081.0, 112381.0, 118700.0, 116957.0, 111399.0, 114013.0, 118460.0, 116959.0, 111499.0, 115613.0, 108489.0, 119947.0, 115772.0, 116922.0, 119841.0, 120380.0, 121396.0, 118455.0]}, "num-zeros vs samples": {"start_step": 0, 
"end_step": 100, "step_interval": 5, "values": [115745.0, 111070.0, 117081.0, 112381.0, 118700.0, 116957.0, 111399.0, 114013.0, 118460.0, 116959.0, 111499.0, 115613.0, 108489.0, 119947.0, 115772.0, 116922.0, 119841.0, 120380.0, 121396.0, 118455.0]}, "params-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [309.46707, 309.48447, 309.52603, 309.57944, 309.64523, 309.72018, 309.80231, 309.8884, 309.97391, 310.05591, 310.13483, 310.20755, 310.27094, 310.32535, 310.37161, 310.40887, 310.43597, 310.45648, 310.47238, 310.48444]}, "params-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [309.46707, 309.48447, 309.52603, 309.57944, 309.64523, 309.72018, 309.80231, 309.8884, 309.97391, 310.05591, 310.13483, 310.20755, 310.27094, 310.32535, 310.37161, 310.40887, 310.43597, 310.45648, 310.47238, 310.48444]}, "iteration-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.7057, 0.68569, 0.68236, 0.69077, 0.68415, 0.67238, 0.68288, 0.67481, 0.6874, 0.67748, 0.6785, 0.67478, 0.83941, 0.6755, 0.67503, 0.67787, 0.67668, 0.67904, 0.67443, 0.67541]}, "lm loss validation": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.86582]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.86582]}, "lm loss validation ppl": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [958.93542]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [958.93542]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..076389c3d6 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 2 + --deterministic-mode: true + --ckpt-format: torch +TEST_TYPE: regular diff --git 
a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..b0d00b8f83 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 2 + --deterministic-mode: true + --ckpt-format: torch +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values.json new file mode 100644 index 0000000000..c59b98b90a --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [11.55278, 0.77358, 0.76856, 0.77172, 0.75887, 0.76061, 0.75836, 0.76125, 0.76192, 0.76187, 0.76171, 0.76045, 0.7599, 0.76535, 0.76121, 0.76796, 0.76998, 0.76511, 0.76167, 0.75816]}, "forward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [6.97639, 0.39525, 0.3898, 0.39437, 0.37749, 0.38195, 0.37908, 0.37821, 0.38433, 0.38023, 0.38359, 0.37973, 0.37768, 0.37754, 0.38336, 0.38173, 0.39026, 0.38845, 0.38337, 0.37691]}, "backward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3.32964, 0.37495, 0.37481, 0.37567, 0.37884, 0.37558, 0.37486, 0.37929, 0.37612, 0.37965, 0.37608, 0.37503, 0.37843, 0.38541, 0.37552, 0.38094, 0.37923, 0.37628, 0.37437, 0.37757]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 4e-05, 4e-05, 4e-05, 
4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05]}, "all-grads-sync-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.89543, 0.00188, 0.00211, 0.00164, 0.00165, 0.00162, 0.00162, 0.00162, 0.00184, 0.00165, 0.00164, 0.00208, 0.00162, 0.00167, 0.0016, 0.00168, 0.00165, 0.00163, 0.00164, 0.00161]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00146, 0.00105, 0.00105, 0.00102, 0.00107, 0.00107, 0.00107, 0.00109, 0.00105, 0.00106, 0.00107, 0.00106, 0.00106, 0.00106, 0.00108, 0.00108, 0.00107, 0.00104, 0.00103, 0.0011]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.50022, 0.00376, 0.00381, 0.00329, 0.00321, 0.00354, 0.00371, 0.00375, 0.00366, 0.00301, 0.00349, 0.00372, 0.00349, 0.00369, 0.00297, 0.00283, 0.00369, 0.00377, 0.00388, 0.00369]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.04986, 0.02302, 0.02299, 0.02588, 0.02338, 0.0231, 0.02293, 0.0231, 0.02309, 0.02329, 0.02328, 0.02332, 0.02304, 0.02327, 0.02287, 0.02321, 0.02315, 0.0234, 0.02312, 0.02327]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0158, 0.00219, 0.00221, 0.00411, 0.0022, 0.0022, 0.00216, 0.0022, 0.00217, 0.00218, 0.00218, 0.00225, 0.00233, 0.00219, 0.00223, 0.00222, 0.00212, 0.0022, 0.00222, 0.00225]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00301, 0.00302, 0.00302, 0.00339, 0.003, 0.00302, 0.00302, 0.00301, 0.00301, 0.00301, 0.003, 0.00301, 0.00302, 0.00304, 0.003, 0.00301, 0.00299, 0.00304, 0.00303, 0.00303]}, "optimizer-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.57167, 0.03386, 0.03382, 0.03847, 0.03353, 0.03358, 0.03363, 0.03394, 0.03377, 0.03326, 0.03368, 0.03412, 0.03363, 0.03407, 0.03281, 0.03316, 0.03373, 0.03419, 0.03396, 0.034]}, "learning-rate": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "batch-size": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.32677, 9.4141, 8.86401, 8.56564, 8.28782, 8.1035, 7.83676, 7.53769, 7.39294, 7.29345, 7.37746, 7.22535, 7.11277, 7.06759, 6.91832, 6.96664, 6.97845, 7.04885, 6.7213, 6.98241]}, "lm loss vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.32677, 9.4141, 8.86401, 8.56564, 8.28782, 8.1035, 7.83676, 7.53769, 7.39294, 7.29345, 7.37746, 7.22535, 7.11277, 7.06759, 6.91832, 6.96664, 6.97845, 7.04885, 6.7213, 6.98241]}, "loss-scale": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.26434, 2.17404, 2.50103, 2.08973, 1.92522, 1.69977, 1.63605, 1.57256, 1.48469, 1.29632, 1.00932, 1.0148, 0.95539, 1.04571, 0.94482, 0.77816, 1.07456, 1.17593, 1.12335, 0.8491]}, "grad-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.26434, 2.17404, 2.50103, 2.08973, 1.92522, 1.69977, 1.63605, 1.57256, 1.48469, 1.29632, 1.00932, 1.0148, 0.95539, 1.04571, 0.94482, 0.77816, 1.07456, 1.17593, 1.12335, 0.8491]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [43306.0, 40955.0, 43967.0, 41614.0, 44764.0, 43923.0, 41108.0, 42464.0, 44664.0, 43899.0, 41152.0, 43230.0, 39719.0, 45367.0, 43334.0, 43903.0, 45349.0, 45688.0, 46166.0, 44691.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [43306.0, 40955.0, 43967.0, 41614.0, 44764.0, 43923.0, 41108.0, 42464.0, 44664.0, 43899.0, 41152.0, 43230.0, 39719.0, 45367.0, 43334.0, 43903.0, 45349.0, 45688.0, 46166.0, 44691.0]}, "params-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [283.80362, 283.8273, 283.86472, 283.9053, 283.95062, 284.00027, 284.05212, 284.1051, 284.15643, 284.20459, 284.25775, 284.30682, 284.34848, 284.38312, 284.41144, 284.43539, 284.45441, 284.46988, 284.48172, 284.49054]}, "params-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [283.80362, 283.8273, 283.86472, 283.9053, 283.95062, 284.00027, 284.05212, 284.1051, 284.15643, 284.20459, 284.25775, 284.30682, 284.34848, 284.38312, 284.41144, 284.43539, 284.45441, 284.46988, 284.48172, 284.49054]}, "iteration-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [13.15856, 0.82951, 0.82427, 0.83168, 0.8147, 0.81581, 0.81386, 0.8171, 0.8176, 0.81664, 0.81719, 0.81685, 0.81547, 0.82136, 0.81551, 0.82315, 0.82591, 0.82132, 0.81777, 0.81414]}, "lm loss validation": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.9202]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.9202]}, "lm loss validation ppl": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [1012.5238]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [1012.5238]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..d1b9e8429e --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + 
--global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..540d4c1b73 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values.json new file mode 100644 index 0000000000..d932464f76 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values.json @@ -0,0 +1,763 @@ +{ + "forward-backward-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 14.18678, + 
0.67885, + 0.68278, + 0.68333, + 0.67855, + 0.68179, + 0.68809, + 0.67808, + 0.67889, + 0.69586, + 0.69577, + 0.67938, + 0.68076, + 0.68551, + 0.69108, + 0.67821, + 0.68422, + 0.68947, + 0.67891, + 0.68614 + ] + }, + "forward-compute-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 8.91183, + 0.31386, + 0.31455, + 0.31529, + 0.31399, + 0.31376, + 0.3168, + 0.31219, + 0.31205, + 0.32539, + 0.32943, + 0.31424, + 0.31569, + 0.32161, + 0.32188, + 0.31166, + 0.31627, + 0.31935, + 0.31029, + 0.32078 + ] + }, + "backward-compute-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 4.25414, + 0.3682, + 0.37658, + 0.37755, + 0.37333, + 0.37381, + 0.37727, + 0.37278, + 0.37206, + 0.37541, + 0.37183, + 0.37214, + 0.37101, + 0.37247, + 0.37485, + 0.36955, + 0.37359, + 0.3825, + 0.37545, + 0.37777 + ] + }, + "layernorm-grads-all-reduce-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.00004, + 0.00003, + 0.00003, + 0.00002, + 0.00002, + 0.00002, + 0.00002, + 0.00003, + 0.00002, + 0.00003, + 0.00002, + 0.00003, + 0.00002, + 0.00002, + 0.00004, + 0.00003, + 0.00002, + 0.00002, + 0.00002, + 0.00002 + ] + }, + "embedding-grads-all-reduce-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.00005, + 0.00004, + 0.00004, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00004, + 0.00004, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00003, + 0.00003, + 0.00003 + ] + }, + "all-grads-sync-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.9061, + 0.00163, + 0.00202, + 0.00163, + 0.00157, + 0.00156, + 0.00183, + 0.0016, + 0.00183, + 0.00157, + 0.00157, + 0.00158, + 0.00168, + 0.00158, + 0.00169, + 0.00156, + 0.00157, + 0.00157, + 0.00156, + 0.00185 + ] + }, + "optimizer-copy-to-main-grad-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.0011, + 0.00104, + 0.00102, + 0.00101, + 0.00097, + 0.00098, + 0.001, + 0.00096, + 0.00096, + 0.00099, + 0.00095, + 0.00097, + 0.00096, + 0.00098, + 0.00097, + 0.00098, + 0.00095, + 0.00099, + 0.00098, + 0.00099 + ] + }, + "optimizer-clip-main-grad-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1.59317, + 0.00265, + 0.00282, + 0.00284, + 0.00289, + 0.00298, + 0.00282, + 0.00294, + 0.00302, + 0.00301, + 0.00304, + 0.00294, + 0.00253, + 0.00296, + 0.00251, + 0.00227, + 0.00282, + 0.00287, + 0.00308, + 0.00276 + ] + }, + "optimizer-count-zeros-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.04375, + 0.02396, + 0.02387, + 0.02381, + 0.02385, + 0.02393, + 0.0241, + 0.02406, + 0.02393, + 0.024, + 0.02396, + 0.024, + 0.0241, + 0.02397, + 0.024, + 0.02378, + 0.0238, + 0.02393, + 0.02395, + 0.02405 + ] + }, + "optimizer-inner-step-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.01715, + 0.00212, + 0.0021, + 0.00212, + 0.00212, + 0.00211, + 0.00218, + 0.00213, + 0.00212, + 0.00214, + 0.00211, + 0.00226, + 0.00211, + 0.00209, + 0.00211, + 0.00218, + 0.00207, + 0.00211, + 0.00213, + 0.00218 + ] + }, + "optimizer-copy-main-to-model-params-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.00281, + 0.00282, + 0.00281, + 0.00283, + 0.00281, + 0.00283, + 0.00289, + 0.00286, + 0.00281, + 0.00284, + 0.00282, + 0.00431, + 0.00295, + 0.00284, + 0.00283, + 0.00283, + 0.18259, + 0.00284, + 0.00283, + 
0.00295 + ] + }, + "optimizer-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1.65881, + 0.03322, + 0.03326, + 0.03323, + 0.03329, + 0.03345, + 0.03361, + 0.03357, + 0.03352, + 0.03364, + 0.03349, + 0.03532, + 0.03332, + 0.03347, + 0.03313, + 0.03267, + 0.21285, + 0.03336, + 0.03358, + 0.03357 + ] + }, + "learning-rate": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.0001, + 0.0001, + 0.00009, + 0.00009, + 0.00008, + 0.00008, + 0.00007, + 0.00007, + 0.00006, + 0.00006, + 0.00005, + 0.00005, + 0.00005, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00002, + 0.00002, + 0.00001 + ] + }, + "learning-rate vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.0001, + 0.0001, + 0.00009, + 0.00009, + 0.00008, + 0.00008, + 0.00007, + 0.00007, + 0.00006, + 0.00006, + 0.00005, + 0.00005, + 0.00005, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00002, + 0.00002, + 0.00001 + ] + }, + "batch-size": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32 + ] + }, + "batch-size vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32 + ] + }, + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.3267, + 9.41409, + 8.86422, + 8.56557, + 8.28779, + 8.10356, + 7.83669, + 7.53761, + 7.39304, + 7.29344, + 7.37755, + 7.22522, + 7.11288, + 7.06761, + 6.91847, + 6.96686, + 6.97827, + 7.04883, + 6.72143, + 6.98255 + ] + }, + "lm loss vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.3267, + 9.41409, + 8.86422, + 8.56557, + 8.28779, + 8.10356, + 7.83669, + 7.53761, + 7.39304, + 7.29344, + 7.37755, + 7.22522, + 7.11288, + 7.06761, + 6.91847, + 6.96686, + 6.97827, + 7.04883, + 6.72143, + 6.98255 + ] + }, + "loss-scale": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ] + }, + "loss-scale vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ] + }, + "grad-norm": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 21.2635, + 2.17416, + 2.50475, + 2.08972, + 1.9252, + 1.69975, + 1.63606, + 1.57261, + 1.48503, + 1.29641, + 1.00944, + 1.01609, + 0.95592, + 1.04635, + 0.94502, + 0.7775, + 1.07117, + 1.16813, + 1.12672, + 0.85024 + ] + }, + "grad-norm vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 21.2635, + 2.17416, + 2.50475, + 2.08972, + 1.9252, + 1.69975, + 1.63606, + 1.57261, + 1.48503, + 1.29641, + 1.00944, + 1.01609, + 0.95592, + 1.04635, + 0.94502, + 0.7775, + 1.07117, + 1.16813, + 1.12672, + 0.85024 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43318, + 40956, + 43957, + 41617, + 44756, + 43946, + 41064, + 42479, + 44668, + 43904, + 41151, + 43235, + 39712, + 45373, + 43360, + 43896, + 45353, + 45682, + 46166, + 44693 + ] + }, + "num-zeros vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43318, + 40956, + 43957, 
+ 41617, + 44756, + 43946, + 41064, + 42479, + 44668, + 43904, + 41151, + 43235, + 39712, + 45373, + 43360, + 43896, + 45353, + 45682, + 46166, + 44693 + ] + }, + "params-norm": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 283.80362, + 283.8273, + 283.86469, + 283.90527, + 283.95059, + 284.00024, + 284.05206, + 284.10507, + 284.15643, + 284.20459, + 284.25775, + 284.30685, + 284.34851, + 284.38309, + 284.41144, + 284.43536, + 284.45441, + 284.46985, + 284.48169, + 284.49057 + ] + }, + "params-norm vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 283.80362, + 283.8273, + 283.86469, + 283.90527, + 283.95059, + 284.00024, + 284.05206, + 284.10507, + 284.15643, + 284.20459, + 284.25775, + 284.30685, + 284.34851, + 284.38309, + 284.41144, + 284.43536, + 284.45441, + 284.46985, + 284.48169, + 284.49057 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 15.87098, + 0.73261, + 0.73669, + 0.73696, + 0.73228, + 0.73561, + 0.74191, + 0.73193, + 0.73279, + 0.75004, + 0.74974, + 0.73772, + 0.73447, + 0.73951, + 0.74553, + 0.73119, + 0.9162, + 0.74318, + 0.73275, + 0.74014 + ] + }, + "lm loss validation": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 6.92026 + ] + }, + "lm loss validation vs samples": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 6.92026 + ] + }, + "lm loss validation ppl": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 1012.58173 + ] + }, + "lm loss validation ppl vs samples": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 1012.58173 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..6aae44ca71 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: local + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + 
--encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..6e9731d4ce --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: local + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: ckpt-resume From 77f62d84c7cc532fbe3c297f90a712110d81d50d Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Tue, 1 Oct 2024 14:52:43 -0700 Subject: [PATCH 36/50] ADLR/megatron-lm!2164 - ci: Download artifacts --- .../jet/generate_jet_trigger_job.py | 1 + .../jet/launch_jet_workload.py | 28 ++++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py index b67d856464..30d13c3730 100644 --- a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py +++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py @@ -97,6 +97,7 @@ def main( "timeout": "7 days", "needs": [{"pipeline": '$PARENT_PIPELINE_ID', "job": "jet-generate"}], "script": [" ".join(script)], + "artifacts": {"paths": ["results/"]}, } with open(output_path, 'w') as outfile: diff --git a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py index 123c322677..3e243c542a 100644 --- a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py +++ b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py @@ -86,6 +86,22 @@ def launch_and_wait_for_completion( return pipeline +def download_job_assets(job: jetclient.JETJob, iteration: int = 0) -> List[str]: + logs = job.get_logs() + if not 
logs: + return [""] + + assets_base_path = BASE_PATH / ".." / ".." / ".." / ".." / "results" / f"iteration={iteration}" + + for restart_idx, log in enumerate(logs): + assets = log.get_assets() + assets_path = assets_base_path / f"restart={restart_idx}" + assets_path.mkdir(parents=True, exist_ok=True) + for log_filename in assets.keys(): + with open(assets_path / log_filename, "w") as fh: + assets[log_filename].download(pathlib.Path(fh.name)) + + def download_job_logs(job: jetclient.JETJob) -> List[str]: logs = job.get_logs() if not logs: @@ -157,6 +173,7 @@ def main( sys.exit(1) n_attempts = 0 + n_iteration = 0 while True and n_attempts < 3: pipeline = launch_and_wait_for_completion( test_case=test_case, @@ -168,12 +185,14 @@ def main( wandb_experiment=wandb_experiment, ) - logs = download_job_logs( - job=[job for job in pipeline.get_jobs() if job.name.startswith("basic")][0] - ) + main_job = [job for job in pipeline.get_jobs() if job.name.startswith("basic")][0] + + logs = download_job_logs(job=main_job) concat_logs = "\n".join(logs) print(f"Logs:\n{concat_logs}") + download_job_assets(job=main_job, iteration=n_iteration) + if test_type != "release": success = pipeline.get_status() == PipelineStatus.SUCCESS sys.exit(int(not success)) # invert for exit 0 @@ -186,9 +205,10 @@ def main( current_iteration, total_iterations = parsed_result if current_iteration == total_iterations: + success = pipeline.get_status() == PipelineStatus.SUCCESS sys.exit(int(not success)) # invert for exit 0 - + n_iteration += 1 sys.exit(1) From cf25e497d17ed6998d48afe13b42bf9f835cffe7 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Wed, 2 Oct 2024 11:30:38 -0700 Subject: [PATCH 37/50] ADLR/megatron-lm!2165 - ci: Bump version --- .gitlab/stages/01.tests.yml | 4 +--- megatron/core/package_info.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.gitlab/stages/01.tests.yml b/.gitlab/stages/01.tests.yml index 3a667cbe02..dc59e026ac 100644 --- a/.gitlab/stages/01.tests.yml +++ b/.gitlab/stages/01.tests.yml @@ -94,7 +94,7 @@ unit_tests: IMAGE: ${CI_MCORE_IMAGE} # - TAG: latest # IMAGE: ${CI_MCORE_DEV_IMAGE} - - TAG: 63be779b4608403f956aa1ef6c9013ab78db3eeb + - TAG: core_r0.9.0 IMAGE: ${CI_MCORE_IMAGE} tags: [8xL40S] variables: @@ -132,8 +132,6 @@ unit_tests: - if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" allow_failure: true when: always - - if: '$TAG != "latest"' - allow_failure: true - when: always unit-tests-results-notify: diff --git a/megatron/core/package_info.py b/megatron/core/package_info.py index bc385ad268..6135dc52c8 100644 --- a/megatron/core/package_info.py +++ b/megatron/core/package_info.py @@ -2,7 +2,7 @@ MAJOR = 0 -MINOR = 9 +MINOR = 10 PATCH = 0 PRE_RELEASE = 'rc0' From a792575243a44cfdca05f36f68f21373d92cb33d Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Wed, 2 Oct 2024 17:47:56 -0700 Subject: [PATCH 38/50] ADLR/megatron-lm!2153 - Add the interface to set TP communication bootstrap backend Co-authored-by: Deepak Narayanan --- megatron/core/model_parallel_config.py | 24 +++++++++++++++++------- megatron/training/arguments.py | 3 +++ megatron/training/initialize.py | 22 ++++++++++++++++------ 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py index caae41cb4a..f2751673e4 100644 --- a/megatron/core/model_parallel_config.py +++ b/megatron/core/model_parallel_config.py @@ -182,8 +182,8 @@ class ModelParallelConfig: 
tp_comm_atomic_ag: bool = False """Deprecated from TransformerEngine v1.6.0. - If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather both - done atomically. Don't care if tp_comm_overlap is False. + If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather + both done atomically. Don't care if tp_comm_overlap is False. """ tp_comm_split_rs: bool = True @@ -213,6 +213,11 @@ class ModelParallelConfig: If true, the AllGather -> Gemm overlap for FC1 layer of MLP gets disabled """ + tp_comm_bootstrap_backend: str = 'nccl' + """ + Set the bootstrapping backend out of 'nccl', 'mpi', and 'gloo' + """ + ################### # Pipeline Parallel ################### @@ -257,7 +262,8 @@ class ModelParallelConfig: wgrad_deferral_limit: int = 0 """This value tunes the number of micro-batches for which the embedding weight gradient compute - needs to be deferred to pipeline flush, this argument is invalid if `defer_embedding_wgrad_compute` is False. + needs to be deferred to pipeline flush, this argument is invalid if + `defer_embedding_wgrad_compute` is False. Defaults to 0, which means all micro-batches are deferred. """ @@ -276,7 +282,9 @@ class ModelParallelConfig: """Tells the number of transformer layers for which activations has to be offloaded.""" _cpu_offloading_context: ContextManager = ( - None # Used for internal use only, not to be set by the user. TODO: Need to move to the 'right' place when possible. + None + # Used for internal use only, not to be set by a user. + # TODO: Need to move to the 'right' place when possible. ) """For internal use only, do not set.""" @@ -297,7 +305,8 @@ class ModelParallelConfig: def __post_init__(self): """Python dataclass method that is used to modify attributes after initialization. - See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. + See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more + details. """ if self.sequence_parallel: if self.tensor_model_parallel_size <= 1: @@ -324,11 +333,12 @@ def __post_init__(self): if self.defer_embedding_wgrad_compute and self.wgrad_deferral_limit < 0: raise ValueError( - "Wgrad deferral limit should be greater than or equal to 0 when this optimization is enabled!" + "Wgrad deferral limit should be greater than or equal to 0 when it is enabled!" ) if self.expert_model_parallel_size > 1 and self.tensor_model_parallel_size > 1: if self.sequence_parallel is False: raise ValueError( - "When using expert parallelism and tensor parallelism, sequence parallelism must be used" + "When using expert parallelism and tensor parallelism, sequence parallelism " + "must be used" ) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 162d719314..e3d876a5f2 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -1160,6 +1160,9 @@ def _add_training_args(parser): group.add_argument('--disable-tp-comm-bulk-wgrad', action='store_false', help='Disables the Reduce-Scatter overlap with bprop weight gradient GEMM.', dest='tp_comm_bulk_wgrad') + group.add_argument('--tp-comm-bootstrap-backend', default='nccl', type=str, + choices=['nccl', 'mpi', 'gloo'], + help='Set the bootstrapping backend of Tensor parallel communications.') group.add_argument('--use-cpu-initialization', action='store_true', default=None, help='If set, initialize weights on the CPU. 
This eliminates init differences based on tensor parallelism.') diff --git a/megatron/training/initialize.py b/megatron/training/initialize.py index 8e4877c8b5..ad68ce8cb7 100644 --- a/megatron/training/initialize.py +++ b/megatron/training/initialize.py @@ -22,6 +22,7 @@ from megatron.core.fusions.fused_bias_dropout import bias_dropout_add_fused_train from megatron.core.fusions.fused_bias_gelu import bias_gelu from megatron.core.fusions.fused_bias_swiglu import bias_swiglu +from megatron.core.utils import get_te_version, is_te_min_version logger = logging.getLogger(__name__) @@ -211,12 +212,21 @@ def _initialize_tp_communicators(): input_shape = [(args.seq_length * args.micro_batch_size) // args.context_parallel_size , args.hidden_size] - #We create a MPI process group, which is needed to bootstrap the pipelined - #tensor-model-parallel communication overlap - torch.distributed.new_group(backend='mpi') - - te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size, - use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs,) + if is_te_min_version("1.9.0"): + # The process group with the target bootstrap backend is created in Transformer Engine. + te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size, + use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs, + bootstrap_backend = args.tp_comm_bootstrap_backend) + else: + if args.tp_comm_bootstrap_backend != 'mpi': + warnings.warn( + f"Transformer Engine v{get_te_version()} supports only MPI bootstrap backend." + ) + # Create a MPI process group to help with TP communication overlap bootstrap. + torch.distributed.new_group(backend='mpi') + + te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size, + use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs) def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks): """Initialize torch.distributed and core model parallel.""" From 74d4b1aaf769aa5f28d419bca67fb88f40e1306e Mon Sep 17 00:00:00 2001 From: Matthieu Le Date: Thu, 3 Oct 2024 15:20:54 -0700 Subject: [PATCH 39/50] ADLR/megatron-lm!2095 - Add support for SigLIP vision encoder to multimodal mcore --- examples/multimodal/config.py | 23 ++++++- examples/multimodal/model.py | 5 +- .../core/models/multimodal/llava_model.py | 20 ++++++- megatron/core/models/vision/clip_vit_model.py | 58 ++++++++++++++---- megatron/training/activations.py | 4 ++ pretrain_vlm.py | 5 +- tests/unit_tests/models/test_llava_model.py | 60 +++++++++++++++++++ 7 files changed, 156 insertions(+), 19 deletions(-) diff --git a/examples/multimodal/config.py b/examples/multimodal/config.py index d4ee17db1b..cf48b131a7 100644 --- a/examples/multimodal/config.py +++ b/examples/multimodal/config.py @@ -3,7 +3,7 @@ import torch -from megatron.training.activations import quick_gelu, squared_relu +from megatron.training.activations import fast_gelu, quick_gelu, squared_relu def get_language_model_config(config): @@ -77,7 +77,26 @@ def get_vision_model_config(config, apply_query_key_layer_scaling): config.gated_linear_unit = False config.activation_func = quick_gelu config.kv_channels = 64 + config.num_query_groups = 16 + config.layernorm_zero_centered_gamma = False + config.apply_query_key_layer_scaling = apply_query_key_layer_scaling + config.bias_activation_fusion = False + config.bias_dropout_fusion = False + config.attention_softmax_in_fp32 = True + config.normalization = 'LayerNorm' + config.apply_rope_fusion = False + elif config.vision_model_type == 
"siglip": + config.num_layers = 27 config.num_attention_heads = 16 + config.add_bias_linear = True + config.add_qkv_bias = True + config.hidden_size = 1152 + config.hidden_dropout = 0.0 + config.attention_dropout = 0.0 + config.ffn_hidden_size = 4304 + config.gated_linear_unit = False + config.activation_func = fast_gelu + config.kv_channels = 72 config.num_query_groups = 16 config.layernorm_zero_centered_gamma = False config.apply_query_key_layer_scaling = apply_query_key_layer_scaling @@ -86,6 +105,8 @@ def get_vision_model_config(config, apply_query_key_layer_scaling): config.attention_softmax_in_fp32 = True config.normalization = 'LayerNorm' config.apply_rope_fusion = False + config.qk_layernorm = False + config.layernorm_epsilon = 1e-6 return config diff --git a/examples/multimodal/model.py b/examples/multimodal/model.py index b21c687525..b4bab73cfb 100644 --- a/examples/multimodal/model.py +++ b/examples/multimodal/model.py @@ -36,7 +36,8 @@ def model_provider( print_rank_0('building a multimodal model ...') num_image_embeddings = get_num_image_embeddings( - args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1 + args.img_h, args.img_w, args.patch_dim, args.vision_model_type, + args.disable_vision_class_token, 1 ) old_seq_length = args.seq_length args.seq_length = args.encoder_seq_length = num_image_embeddings @@ -82,7 +83,7 @@ def model_provider( ) vision_model_type = args.vision_model_type - if vision_model_type == "clip": + if vision_model_type in ["clip", "siglip"]: if use_te: vision_transformer_layer_spec = get_layer_spec_te( is_vit=True diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py index 32527f9dea..074cfaae93 100644 --- a/megatron/core/models/multimodal/llava_model.py +++ b/megatron/core/models/multimodal/llava_model.py @@ -125,6 +125,16 @@ def __init__( class_token_len = 1 if self.add_encoder: + self._drop_vision_class_token = drop_vision_class_token + add_class_token = True + if vision_transformer_config.vision_model_type == "siglip": + class_token_len = 0 + add_class_token = False + error_msg = ( + "Siglip does not support vision class token, " + "set disable-vision-class-token to False." + ) + assert not self._drop_vision_class_token, error_msg self.vision_model = CLIPViTModel( vision_transformer_config, vision_transformer_layer_spec, @@ -132,8 +142,9 @@ def __init__( img_w=img_w, class_token_len=class_token_len, patch_dim=patch_dim, + model_subtype=vision_transformer_config.vision_model_type, + add_class_token=add_class_token, ) - self._drop_vision_class_token = drop_vision_class_token # Map (intermediate) vision model outputs to the language model input dimension. 
self.vision_projection = MultimodalProjector( vision_projection_config, @@ -155,7 +166,12 @@ def __init__( ) self._img_seq_len = get_num_image_embeddings( - img_h, img_w, patch_dim, drop_vision_class_token, class_token_len + img_h, + img_w, + patch_dim, + vision_transformer_config.vision_model_type, + drop_vision_class_token, + class_token_len, ) def shared_embedding_or_output_weight(self): diff --git a/megatron/core/models/vision/clip_vit_model.py b/megatron/core/models/vision/clip_vit_model.py index 110a8687f7..53c3feddee 100644 --- a/megatron/core/models/vision/clip_vit_model.py +++ b/megatron/core/models/vision/clip_vit_model.py @@ -33,12 +33,22 @@ def __init__( transformer_config: TransformerConfig, transformer_layer_spec: ModuleSpec, ln_pre_impl: Union[ModuleSpec, type] = TENorm, + ln_post_impl: Union[ModuleSpec, type] = TENorm, add_class_token: bool = True, class_token_len: int = 1, patch_dim: int = 14, img_h: int = 336, img_w: int = 336, + model_subtype: str = "clip", ) -> None: + + error_msg = f"CLIPViTModel model subtype {model_subtype} is not supported." + assert model_subtype in ["clip", "siglip"], error_msg + + if model_subtype == "siglip": + assert class_token_len == 0, "SigLIP does not support class tokens." + assert not add_class_token, "SigLIP does not support class tokens." + super().__init__(config=transformer_config) if has_config_logger_enabled(transformer_config): @@ -61,12 +71,34 @@ def __init__( self.seq_length = self.num_patches + (self.class_token_len if self.add_class_token else 0) + self.ln_pre = None + self.ln_post = None + if model_subtype == "clip": + self.ln_pre = build_module( + ln_pre_impl, + config=transformer_config, + hidden_size=self.visual_hidden_size, + eps=transformer_config.layernorm_epsilon, + ) + conv_bias = False + padding = 0 + if model_subtype == "siglip": + self.ln_post = build_module( + ln_post_impl, + config=transformer_config, + hidden_size=self.visual_hidden_size, + eps=transformer_config.layernorm_epsilon, + ) + conv_bias = True + padding = "valid" + self.conv1 = torch.nn.Conv2d( in_channels=3, out_channels=self.visual_hidden_size, kernel_size=self.patch_dim, stride=self.patch_dim, - bias=False, + bias=conv_bias, + padding=padding, ) self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda() @@ -79,13 +111,6 @@ def __init__( torch.randn(1, self.class_token_len, self.visual_hidden_size) ) - self.ln_pre = build_module( - ln_pre_impl, - config=transformer_config, - hidden_size=self.visual_hidden_size, - eps=transformer_config.layernorm_epsilon, - ) - self.model_type = ModelType.encoder_or_decoder # Transformer layers. @@ -134,7 +159,8 @@ def forward( assert x.shape[1] == self.seq_length, f"{x.shape[1]} != {self.seq_length}" x = x + self.position_embeddings(self.position_ids) - x = self.ln_pre(x) + if self.ln_pre: + x = self.ln_pre(x) x = x.permute(1, 0, 2) # [b, s, h] -> [s, b, h] # `permute` can make the tensor non-contiguous, breaking pipelining. 
x = x.contiguous() @@ -142,17 +168,23 @@ def forward( x = self.decoder(x, attention_mask) x = x.permute(1, 0, 2) # [s, b, h] -> [b, s, h] x = x.contiguous() - + if self.ln_post: + x = self.ln_post(x) return x -def get_num_image_embeddings(img_h, img_w, patch_dim, disable_vision_class_token, class_token_len): +def get_num_image_embeddings( + img_h, img_w, patch_dim, vision_model_type, disable_vision_class_token, class_token_len +): """Get the number of image embeddings per image tile.""" - add_class_token = not disable_vision_class_token + if vision_model_type == "siglip": + keep_class_token = False + elif vision_model_type == "clip": + keep_class_token = not disable_vision_class_token num_patches_per_dim_h = img_h // patch_dim num_patches_per_dim_w = img_w // patch_dim num_patches = num_patches_per_dim_h * num_patches_per_dim_w - num_image_embeddings_per_tile = num_patches + (class_token_len if add_class_token else 0) + num_image_embeddings_per_tile = num_patches + (class_token_len if keep_class_token else 0) return num_image_embeddings_per_tile diff --git a/megatron/training/activations.py b/megatron/training/activations.py index fee84bddd0..c6ce9f1de1 100644 --- a/megatron/training/activations.py +++ b/megatron/training/activations.py @@ -16,3 +16,7 @@ def squared_relu(x: torch.Tensor) -> torch.Tensor: @jit_fuser def quick_gelu(x: torch.Tensor) -> torch.Tensor: return x * torch.sigmoid(1.702 * x) + +@jit_fuser +def fast_gelu(x: torch.Tensor) -> torch.Tensor: + return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) diff --git a/pretrain_vlm.py b/pretrain_vlm.py index b0b9d21d97..6b1848e96c 100644 --- a/pretrain_vlm.py +++ b/pretrain_vlm.py @@ -46,10 +46,12 @@ def model_provider( model (megatron.core.models.multimodal.llava_model.LLaVAModel): A multimodal model """ args = get_args() + vision_model_type = "clip" num_image_embeddings = get_num_image_embeddings( - args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1 + args.img_h, args.img_w, args.patch_dim, vision_model_type, args.disable_vision_class_token, 1 ) + old_seq_length = args.seq_length # decoder_seq_length denotes the language model sequence length. args.decoder_seq_length = args.seq_length + num_image_embeddings @@ -87,6 +89,7 @@ def model_provider( vision_transformer_config.num_layers = args.encoder_num_layers vision_transformer_config.first_pipeline_num_layers = None vision_transformer_config.last_pipeline_num_layers = None + vision_transformer_config.vision_model_type = vision_model_type vision_projection_type = "mlp" vision_projection_config = deepcopy(language_transformer_config) diff --git a/tests/unit_tests/models/test_llava_model.py b/tests/unit_tests/models/test_llava_model.py index 0110ad4e8b..b3142fb807 100644 --- a/tests/unit_tests/models/test_llava_model.py +++ b/tests/unit_tests/models/test_llava_model.py @@ -42,6 +42,7 @@ def setup_method(self, method): vision_layer_spec = deepcopy(language_layer_spec) vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules) + vision_config.vision_model_type = "clip" self.model = LLaVAModel( language_transformer_config=language_config, language_transformer_layer_spec=language_layer_spec, @@ -377,3 +378,62 @@ def test_freeze(self): for param in self.model.vision_projection.parameters(): assert param.requires_grad + + +class TestLLaVAModelSigLIP: + @pytest.mark.internal # The model is under active development and its methods may change. 
+ def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + + language_config = TransformerConfig( + num_layers=3, hidden_size=128, num_attention_heads=8, use_cpu_initialization=False + ) + vision_config = TransformerConfig( + num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=False + ) + vision_projection_config = TransformerConfig( + num_layers=2, + hidden_size=128, + ffn_hidden_size=72, + num_attention_heads=1, + use_cpu_initialization=False, + ) + + language_layer_spec = get_gpt_layer_with_transformer_engine_spec() + vision_layer_spec = deepcopy(language_layer_spec) + vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules) + + vision_config.vision_model_type = "siglip" + self.model = LLaVAModel( + language_transformer_config=language_config, + language_transformer_layer_spec=language_layer_spec, + language_vocab_size=2048, + language_max_sequence_length=4096, + vision_transformer_config=vision_config, + vision_transformer_layer_spec=vision_layer_spec, + drop_vision_class_token=False, + vision_projection_config=vision_projection_config, + vision_projection_layer_spec=vision_projection_spec, + img_h=336, + img_w=336, + patch_dim=14, + ) + + @pytest.mark.internal + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.internal + def test_constructor(self): + assert isinstance(self.model, LLaVAModel) + + num_weights = sum([p.numel() for p in self.model.parameters()]) + assert num_weights == 1832456 + + @pytest.mark.internal + def test_set_input_tensor(self): + expected_shape = (1, 2, 3, 4) + input_tensor = torch.zeros(expected_shape) + self.model.set_input_tensor(input_tensor) + assert self.model.vision_model.decoder.input_tensor.shape == expected_shape From d819b9ca8b8196c75abacbdc104eb18ee02accb2 Mon Sep 17 00:00:00 2001 From: Lifu Zhang Date: Thu, 3 Oct 2024 17:00:22 -0700 Subject: [PATCH 40/50] ADLR/megatron-lm!2175 - adding cu_seqlens_padded support in MCore Co-authored-by: root Co-authored-by: Lifu Zhang --- megatron/core/extensions/transformer_engine.py | 7 +++++++ megatron/core/packed_seq_params.py | 8 +++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 36781f9cca..db39f8775b 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -566,6 +566,13 @@ def forward( packed_seq_kwargs.pop("max_seqlen_q", None) packed_seq_kwargs.pop("max_seqlen_kv", None) + if get_te_version() < PkgVersion("1.8.0"): + # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted + # in each individual sequence in THD format dataset + # These two arguments did not exist prior to 1.8.0 + packed_seq_kwargs.pop("cu_seqlens_q_padded", None) + packed_seq_kwargs.pop("cu_seqlens_kv_padded", None) + if self.config.apply_rope_fusion and qkv_format == 'bshd': query, key, value = [x.transpose(0, 1).contiguous() for x in (query, key, value)] # In PyTorch, the following two tensors are in fact the same: diff --git a/megatron/core/packed_seq_params.py b/megatron/core/packed_seq_params.py index fe63e13e99..dff0cc5992 100644 --- a/megatron/core/packed_seq_params.py +++ b/megatron/core/packed_seq_params.py @@ -6,9 +6,15 @@ @dataclass class PackedSeqParams: - # parameters to TEDotProductAttention and fused rope kernels for the `thd` (packed) sequence format, + ''' + parameters to 
TEDotProductAttention and fused rope kernels for the + `thd` (packed) sequence format + ''' + qkv_format: str = None cu_seqlens_q: Tensor = None cu_seqlens_kv: Tensor = None + cu_seqlens_q_padded: Tensor = None + cu_seqlens_kv_padded: Tensor = None max_seqlen_q: Tensor = None max_seqlen_kv: Tensor = None From 4ded7ce559bfbcb65ac7cc99870c5cc5c2c24ca8 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy Date: Thu, 3 Oct 2024 21:33:08 -0700 Subject: [PATCH 41/50] ADLR/megatron-lm!2181 - Fixing attention mask dimenions to support TE versions > 1.9 Co-authored-by: Shanmugam Ramasamy --- megatron/core/models/bert/bert_model.py | 81 ++++++---- tests/unit_tests/models/test_bert_model.py | 179 +++++++++++---------- 2 files changed, 147 insertions(+), 113 deletions(-) diff --git a/megatron/core/models/bert/bert_model.py b/megatron/core/models/bert/bert_model.py index 541d05d905..eb08d4cfd6 100644 --- a/megatron/core/models/bert/bert_model.py +++ b/megatron/core/models/bert/bert_model.py @@ -8,7 +8,7 @@ from megatron.core import parallel_state, tensor_parallel from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk -from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec +from megatron.core.models.bert.bert_layer_specs import bert_layer_local_spec from megatron.core.models.bert.bert_lm_head import BertLMHead from megatron.core.models.bert.pooler import Pooler from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding @@ -93,9 +93,7 @@ def __init__( # megatron core pipelining currently depends on model type self.model_type = ModelType.encoder_or_decoder - self.attn_mask_dimensions = self._santiy_check_attention_and_get_attn_mask_dimension( - transformer_layer_spec - ) + self.attn_mask_dimensions = self._sanity_check_attention_and_get_attn_mask_dimension() # Embeddings. if self.pre_process: @@ -154,52 +152,71 @@ def __init__( if self.pre_process or self.post_process: self.setup_embeddings_and_output_layer() - def _santiy_check_attention_and_get_attn_mask_dimension( - self, transformer_layer_spec: ModuleSpec - ) -> str: + # pylint: disable=line-too-long + def _sanity_check_attention_and_get_attn_mask_dimension(self) -> str: """We do some checks and return attention mask dimensions for self attention Transformer engine library underwent a lot of change. So we need to change dimensions of the attention mask depending on the TE version. We also santiy check some arguments. 1. If we use local version of attention dimension of the mask is [b,1,s,s] - 2. If we use transformer engine < 1.7 - (Flash and Fused attention not supported. We use unfused path). - Attn mask dimension is [b,1,s,s] - 2. If we use transformer engine >= 1.7 - (Flash and fused attention supported with attn mask dimension [b,1,1,s]). - Unfused path will use attn mask dimension [b,1,s,s] with attn mask type arbitrary. - Default if you dont set any NVTE_ATTN flag will just use unfused path. + 2. If we use transformer engine > 1.10 we support all 3 backends with padding mask and [b,1,s,s] + 3. If we use transformer engine >= 1.7 but less than 1.10 + a ) Flash and Fused attention uses padding mask with [b,1,1,s] + b ) Unfused attention works with arbitrary mask with [b,1,s,s] + 4. If we use transformer engine < 1.7 + Flash and fused attention is not supported. 
Unfused attention will work with padding mask [b,1,s,s] + + Default if you dont set any NVTE_ATTN flag will it will just use the fused path for transformer engine version >= 1.7 and unfused path for other Args: - transformer_layer_spec (ModuleSpec): _description_ + transformer_layer_spec (ModuleSpec): The transformer layer spec Returns: - str: _description_ + str: A string showing the format of the attn mask dimensions """ - attn_mask_dimensions = "b1ss" - if transformer_layer_spec == bert_layer_with_transformer_engine_spec: - if is_te_min_version("1.7.0"): - if os.getenv('NVTE_FLASH_ATTN') == '0' and os.getenv('NVTE_FUSED_ATTN') == '0': - assert ( - transformer_layer_spec.submodules.self_attention.params['attn_mask_type'] - == AttnMaskType.arbitrary - ), ( - "Both NVTE_FLASH_ATTN and NVTE_FUSED_ATTN env flag set to 0. Either unset " - "both of them or set one of them to 1 to use a more optimized attention " - "kernel. Currently using unfused attention path. If you want to proceed " - "with this path set AttnMaskType in module spec to be arbitrary" + attn_mask_dimensions = None + # For local layer spec we just use b1ss + if self.transformer_layer_spec == bert_layer_local_spec: + attn_mask_dimensions = "b1ss" + else: + attn_mask_type = self.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] + flash_attention_enabled = os.getenv('NVTE_FLASH_ATTN') == '1' + fused_attention_enabled = os.getenv('NVTE_FUSED_ATTN') == '1' + # For TE >= 1.10 (We always use padding mask and use b11s) + if is_te_min_version("1.10.0"): + attn_mask_dimensions = "b11s" + if attn_mask_type != AttnMaskType.padding: + warnings.warn( + f'For TE versions >= 1.10 , flash/fused/unfused support padding mask. Setting attention mask from {attn_mask_type} to padding' ) - else: + self.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] = AttnMaskType.padding + # For 1.7 >= TE < 1.10 flash and fused path use padding mask with b11s and unfused path uses arbitrary mask with b1ss + elif is_te_min_version("1.7.0"): + if flash_attention_enabled or fused_attention_enabled: attn_mask_dimensions = "b11s" + else: + if attn_mask_type != AttnMaskType.arbitrary: + warnings.warn( + f'For TE versions >= 1.7 but < 1.10 , unfused path supports only arbitrary mask. Setting attention mask from {attn_mask_type} to arbitray' + ) + self.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] = AttnMaskType.arbitrary + attn_mask_dimensions = "b1ss" + # For TE < 1.7 we only support unfused attention with b1ss and padding mask else: - assert ( - os.getenv('NVTE_FLASH_ATTN') == '0' and os.getenv('NVTE_FUSED_ATTN') == '0' - ), ( + attn_mask_dimensions = "b1ss" + assert not flash_attention_enabled and not fused_attention_enabled, ( "Flash and fused attention is not supported with transformer engine version " "< 1.7. 
Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer " "engine >= 1.7" ) + return attn_mask_dimensions def bert_extended_attention_mask(self, attention_mask: Tensor) -> Tensor: diff --git a/tests/unit_tests/models/test_bert_model.py b/tests/unit_tests/models/test_bert_model.py index 75fbf914a2..186ce5c34e 100644 --- a/tests/unit_tests/models/test_bert_model.py +++ b/tests/unit_tests/models/test_bert_model.py @@ -8,9 +8,13 @@ from packaging.version import Version as PkgVersion from pytest_mock import mocker -from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec +from megatron.core.models.bert.bert_layer_specs import ( + bert_layer_local_spec, + bert_layer_with_transformer_engine_spec, +) from megatron.core.models.bert.bert_model import BertModel from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import is_te_min_version from tests.unit_tests.test_utilities import Utils @@ -90,118 +94,131 @@ def test_post_process_forward(self): assert logits[0].shape[2] == self.bert_model.vocab_size -class TestBertModelAssertions: +class TestBertModelAttentionDimensions: - @pytest.mark.internal - def test_te_assertions_te_less_than_1_7(self, mocker): - os.environ.pop('NVTE_FLASH_ATTN', None) + def teardown_method(self, method): + Utils.destroy_model_parallel() os.environ.pop('NVTE_FUSED_ATTN', None) - tp = 1 - pp = 1 - Utils.initialize_model_parallel(tp, pp) + os.environ.pop('NVTE_FLASH_ATTN', None) + os.environ.pop('NVTE_UNFUSED_ATTN', None) + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( + self.transformer_config = TransformerConfig( num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True, - perform_initialization=True, - tensor_model_parallel_size=tp, - pipeline_model_parallel_size=pp, pipeline_dtype=torch.bfloat16, ) + # This should convert arbitray mask to padding mask + self.bert_model = BertModel( + config=self.transformer_config, + num_tokentypes=0, + transformer_layer_spec=bert_layer_with_transformer_engine_spec, + vocab_size=100, + max_sequence_length=4, + ) - with pytest.raises(Exception) as exc_info: - mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.4")) - self.bert_model = BertModel( - config=transformer_config, - num_tokentypes=0, - transformer_layer_spec=bert_layer_with_transformer_engine_spec, - vocab_size=100, - max_sequence_length=4, - ) + @pytest.mark.internal + def test_local_spec(self, mocker): + self.bert_model.transformer_layer_spec = bert_layer_local_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + assert ( + attn_mask_dimensions == "b1ss" + ), f"Expected b1ss for attn_mask_dimensions but got {attn_mask_dimensions}" - assert str(exc_info.value) == ( - "Flash and fused attention is not supported with transformer engine version < 1.7. 
" - "Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer engine >= 1.7" - ) + @pytest.mark.internal + def test_transformer_engine_version_1_10(self, mocker): + bert_layer_with_transformer_engine_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] == AttnMaskType.arbitrary + + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.10")) + self.bert_model.transformer_layer_spec = bert_layer_with_transformer_engine_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + attn_mask_type = self.bert_model.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] + assert ( + attn_mask_type == AttnMaskType.padding + ), f"Exepcted attn mask type to be padding, but got {attn_mask_type}" + assert ( + attn_mask_dimensions == "b11s" + ), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}" + + @pytest.mark.internal + def test_transformer_engine_version_1_7_to_1_10_flash_attn(self, mocker): + os.environ['NVTE_FLASH_ATTN'] = '1' + + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8")) + self.bert_model.transformer_layer_spec = bert_layer_with_transformer_engine_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + assert ( + attn_mask_dimensions == "b11s" + ), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}" @pytest.mark.internal - def test_te_assertions_te_equal_to_1_7_exception(self, mocker): + def test_transformer_engine_version_1_7_to_1_10_rng_error(self, mocker): os.environ['NVTE_FLASH_ATTN'] = '0' os.environ['NVTE_FUSED_ATTN'] = '0' - tp = 1 - pp = 1 - Utils.initialize_model_parallel(tp, pp) - model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( - num_layers=2, - hidden_size=12, - num_attention_heads=4, - use_cpu_initialization=True, - perform_initialization=True, - tensor_model_parallel_size=tp, - pipeline_model_parallel_size=pp, - pipeline_dtype=torch.bfloat16, - ) + bert_layer_with_transformer_engine_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] == AttnMaskType.padding + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8")) with pytest.raises(Exception) as exc_info: - mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.7")) self.bert_model = BertModel( - config=transformer_config, + config=self.transformer_config, num_tokentypes=0, transformer_layer_spec=bert_layer_with_transformer_engine_spec, vocab_size=100, max_sequence_length=4, ) - assert str(exc_info.value) == ( - "Both NVTE_FLASH_ATTN and NVTE_FUSED_ATTN env flag set to 0. Either unset both of them or set " - "one of them to 1 to use a more optimized attention kernel. Currently using unfused attention " - "path. 
If you want to proceed with this path set AttnMaskType in module spec to be arbitrary" + "Linear.__init__() got an unexpected keyword argument 'rng_tracker_name' when " + "instantiating TERowParallelLinear when instantiating SelfAttention when " + "instantiating TransformerLayer" ) @pytest.mark.internal - def test_te_assertions_te_equal_to_1_7_no_exception(self, mocker): - os.environ.pop('NVTE_FLASH_ATTN', None) - os.environ.pop('NVTE_FUSED_ATTN', None) - tp = 1 - pp = 1 - Utils.initialize_model_parallel(tp, pp) - model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( - num_layers=2, - hidden_size=12, - num_attention_heads=4, - use_cpu_initialization=True, - perform_initialization=True, - tensor_model_parallel_size=tp, - pipeline_model_parallel_size=pp, - pipeline_dtype=torch.bfloat16, - ) + def test_transformer_engine_version_1_7_to_1_10_unfused_attention(self, mocker): + os.environ['NVTE_FLASH_ATTN'] = '0' + os.environ['NVTE_FUSED_ATTN'] = '0' + bert_layer_with_transformer_engine_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] == AttnMaskType.padding + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8")) + self.bert_model.transformer_layer_spec = bert_layer_with_transformer_engine_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + attn_mask_type = self.bert_model.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] + assert ( + attn_mask_type == AttnMaskType.arbitrary + ), f"Exepcted attn mask type to be arbitrary, but got {attn_mask_type}" + assert ( + attn_mask_dimensions == "b1ss" + ), f"Expected b1ss for attn_mask_dimensions but got {attn_mask_dimensions}" + + Utils.destroy_model_parallel() - if is_te_min_version("1.7"): # If TE version >= 1.7, no exception should be raised + @pytest.mark.internal + def test_transformer_engine_version_less_than_1_7(self, mocker): + os.environ['NVTE_FLASH_ATTN'] = '1' + with pytest.raises(Exception) as exc_info: + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.5")) self.bert_model = BertModel( - config=transformer_config, + config=self.transformer_config, num_tokentypes=0, transformer_layer_spec=bert_layer_with_transformer_engine_spec, vocab_size=100, max_sequence_length=4, ) - else: # If TE version < 1.7, an exception should be raised in other files - with pytest.raises(Exception) as exc_info: - mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.7")) - self.bert_model = BertModel( - config=transformer_config, - num_tokentypes=0, - transformer_layer_spec=bert_layer_with_transformer_engine_spec, - vocab_size=100, - max_sequence_length=4, - ) - assert str(exc_info.value) == ( - "Linear.__init__() got an unexpected keyword argument 'rng_tracker_name' when " - "instantiating TERowParallelLinear when instantiating SelfAttention when " - "instantiating TransformerLayer" - ) - Utils.destroy_model_parallel() + assert str(exc_info.value) == ( + "Flash and fused attention is not supported with transformer engine version " + "< 1.7. 
Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer " + "engine >= 1.7" + ) From c3026975a0c7ad429e0ae9af0a3f822d2c8b61d1 Mon Sep 17 00:00:00 2001 From: James Shen Date: Thu, 3 Oct 2024 21:56:01 -0700 Subject: [PATCH 42/50] ADLR/megatron-lm!2180 - rotary_scaling fix for llama3.1 and 3.2 --- .../export/ptq_and_trtllm_export/README.md | 48 ++++++++++++++----- .../ptq_trtllm_llama2_7b.sh | 6 +-- .../ptq_trtllm_llama3_1_8b.sh | 7 +-- .../ptq_trtllm_llama3_8b.sh | 6 +-- .../ptq_trtllm_minitron_8b.sh | 2 +- .../ptq_trtllm_mistral_12b.sh | 2 +- .../text_generation_ptq.py | 7 +-- megatron/core/models/gpt/gpt_model.py | 3 +- megatron/inference/gpt/model_provider.py | 1 + 9 files changed, 50 insertions(+), 32 deletions(-) diff --git a/examples/export/ptq_and_trtllm_export/README.md b/examples/export/ptq_and_trtllm_export/README.md index e167b60e1c..c5255f7ccf 100644 --- a/examples/export/ptq_and_trtllm_export/README.md +++ b/examples/export/ptq_and_trtllm_export/README.md @@ -74,7 +74,7 @@ cd ../.. Now launch the PTQ + TensorRT-LLM export script, ```sh -bash examples/inference/quantization/ptq_trtllm_minitron_8b ./Minitron-8B-Base None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b ./Minitron-8B-Base None ``` By default, `cnn_dailymail` is used for calibration. The `GPTModel` will have quantizers for simulating the quantization effect. The checkpoint will be saved optionally (with quantizers as additional states) and can @@ -104,12 +104,12 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer nvidia/Minitron-8B-Base +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer nvidia/Minitron-8B-Base ``` ### mistral-12B FP8 Quantization and TensorRT-LLM Deployment @@ -139,7 +139,7 @@ huggingface-cli login Now launch the PTQ + TensorRT-LLM checkpoint export script, ```sh -bash examples/inference/quantization/ptq_trtllm_mistral_12b.sh ./Mistral-NeMo-12B-Base None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh ./Mistral-NeMo-12B-Base None ``` Then build TensorRT engine and run text generation example using the newly built TensorRT engine @@ -149,12 +149,12 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer mistralai/Mistral-Nemo-Base-2407 +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer mistralai/Mistral-Nemo-Base-2407 ``` @@ -165,7 +165,7 @@ python examples/inference/quantization/trtllm_text_generation.py --tokenizer mis > that we support. ```sh -bash examples/inference/quantization/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR} +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR} ``` The script expect `${CHECKPOINT_DIR}` to have the following structure: @@ -184,8 +184,23 @@ The script expect `${CHECKPOINT_DIR}` to have the following structure: In short, other than the converted llama megatron checkpoint, also put the Hugging Face checkpoint inside as the source of the tokenizer. 
+Then build TensorRT engine and run text generation example using the newly built TensorRT engine + +```sh +export trtllm_options=" \ + --checkpoint_dir /tmp/trtllm_ckpt \ + --output_dir /tmp/trtllm_engine \ + --max_input_len 2048 \ + --max_seq_len 512 \ + --max_batch_size 8 " + +trtllm-build ${trtllm_options} + +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Llama-2-7b +``` + ### llama3-8b / llama3.1-8b INT8 SmoothQuant and TensorRT-LLM Deployment -> **NOTE:** For llama3.1, the missing rope_scaling parameter will be fixed in modelopt-0.17 and trtllm-0.12. +> **NOTE:** For llama3.1, the missing rope_scaling parameter will be fixed in modelopt-0.19 and trtllm-0.13. > **NOTE:** There are two ways to acquire the checkpoint. Users can follow > the instruction in `docs/llama2.md` to convert the checkpoint to megatron legacy `GPTModel` format and @@ -199,16 +214,23 @@ If users choose to download the model from NGC, first extract the sharded checkp tar -xvf 8b_pre_trained_bf16.nemo ``` +> **NOTE:** You need a token generated from huggingface.co/settings/tokens and access to meta-llama/Llama-3.1-8B or meta-llama/Llama-3-8B on huggingface + +```sh +pip install -U "huggingface_hub[cli]" +huggingface-cli login +``` + Now launch the PTQ + TensorRT-LLM checkpoint export script for llama-3, ```sh -bash examples/inference/quantization/ptq_trtllm_llama3_8b.sh ./llama-3-8b-nemo_v1.0 None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh ./llama-3-8b-nemo_v1.0 None ``` or llama-3.1 ```sh -bash examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh ./llama-3_1-8b-nemo_v1.0 None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh ./llama-3_1-8b-nemo_v1.0 None ``` Then build TensorRT engine and run text generation example using the newly built TensorRT engine @@ -218,14 +240,14 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3-8B +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3-8B # For llama-3 -python examples/inference/quantization/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B #For llama-3.1 ``` \ No newline at end of file diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh index 8c4777f07a..ebcc448955 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh @@ -66,7 +66,7 @@ options=" \ --tokenizer-model ${TOKENIZER_MODEL} \ --save-interval 1000000 \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} + --load ${CHECKPOINT_LOAD_DIR} \ --fp16" # Precompile CUDA extentions @@ -76,7 +76,5 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} -# This script is using 
mpi4py which will fork multiple processes. -python examples/inference/quantization/trtllm_text_generation.py ${trtllm_options} diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh index d22ae4d472..a6251663f7 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh @@ -63,9 +63,10 @@ options=" \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model meta-llama/Meta-Llama-3.1-8B \ --save-interval 1000000 \ + --use-rope-scaling \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} - --rotary-base 500000 + --load ${CHECKPOINT_LOAD_DIR} \ + --rotary-base 500000 \ --fp16" # Precompile CUDA extentions @@ -75,4 +76,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh index 11ab023fad..f181c8c2dd 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh @@ -64,8 +64,8 @@ options=" \ --tokenizer-model meta-llama/Meta-Llama-3-8B \ --save-interval 1000000 \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} - --rotary-base 500000 + --load ${CHECKPOINT_LOAD_DIR} \ + --rotary-base 500000 \ --fp16" # Precompile CUDA extentions @@ -75,4 +75,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh index 8c7bc0cb82..31ec192fd5 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh @@ -71,4 +71,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh index 17ded50d1e..3eb02d2e1d 100644 --- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh @@ -72,4 +72,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} 
examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/export/ptq_and_trtllm_export/text_generation_ptq.py b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py index 13b327b25a..340c9c90f7 100644 --- a/examples/export/ptq_and_trtllm_export/text_generation_ptq.py +++ b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py @@ -6,12 +6,11 @@ import sys from pathlib import Path -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../"))) import modelopt.torch.quantization as mtq import torch from datasets import load_dataset -from modelopt.torch.utils.distributed import set_data_parallel_group, set_tensor_parallel_group from tqdm import tqdm # [ModelOpt]: changing the default model provider to the ModelOpt version @@ -179,10 +178,6 @@ def hf_dataset_forword_loop_func(model): if args.calib_dataset is not None: ptq_forward_loop_func = hf_dataset_forword_loop_func - # Setting data parallel and tensor parallel group - set_data_parallel_group(mpu.get_data_parallel_group()) - set_tensor_parallel_group(mpu.get_tensor_model_parallel_group()) - if args.export_quant_cfg in QUANT_CFG_CHOICES: mtq_config = QUANT_CFG_CHOICES[args.export_quant_cfg] if "*output_layer*" not in mtq_config["quant_cfg"]: diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 7ee6dde182..bd52f89680 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -91,10 +91,11 @@ def __init__( # TODO: remove this dependency ? self.model_type = ModelType.encoder_or_decoder - # These 2 attributes are needed for TensorRT-LLM export. + # These 4 attributes are needed for TensorRT-LLM export. self.max_position_embeddings = max_sequence_length self.rotary_percent = rotary_percent self.rotary_base = rotary_base + self.rotary_scaling = rope_scaling if self.pre_process: self.embedding = LanguageModelEmbedding( diff --git a/megatron/inference/gpt/model_provider.py b/megatron/inference/gpt/model_provider.py index 2e92a96e9e..0df0168fa5 100644 --- a/megatron/inference/gpt/model_provider.py +++ b/megatron/inference/gpt/model_provider.py @@ -64,6 +64,7 @@ def model_provider(pre_process=True, post_process=True, parallel_output=True) -> "position_embedding_type": args.position_embedding_type, "rotary_percent": args.rotary_percent, "rotary_base": args.rotary_base, + "rope_scaling": args.use_rope_scaling, } model = model_type(**model_kwargs) From 7619780e5c335ac4aeaefdca7f93a042cbbe5f20 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Fri, 4 Oct 2024 07:47:58 -0700 Subject: [PATCH 43/50] ADLR/megatron-lm!2185 - chore: Improve generator for launch scripts --- .../jet/generate_local_jobs.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py b/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py index 4124e1c338..bc9ad22302 100644 --- a/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py +++ b/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py @@ -1,3 +1,10 @@ +"""Generate launch scripts for local execution. + +This script allows to generate pre-filled launch scripts that allow for local execution of Megatron-LM functional tests inside containerized enviroments (i.e. Slurm enroot or Docker). 
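+
+An illustrative invocation (the test-case name is a placeholder; use any workload known to the
+local workload definitions):
+
+    python tests/functional_tests/python_test_utils/jet/generate_local_jobs.py \
+        --test-case <test_case_name> --output-path /opt/megatron-lm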
+ +This script will generate scripts into `$(pwd)/test_cases`. +""" + import pathlib from typing import Optional @@ -22,7 +29,13 @@ def load_script(config_path: str) -> str: @click.option( "--test-case", required=False, type=str, help="Returns a single test-case with matching name." ) -@click.option("--output-path", required=True, type=str, help="Path to write jobs to") +@click.option( + "--output-path", + required=True, + type=str, + help="Directory where the functional test will write its artifacts to (Tensorboard logs)", + default="/opt/megatron-lm", +) def main(model: Optional[str], scope: Optional[str], test_case: Optional[str], output_path: str): workloads = common.load_workloads( container_image='none', scope=scope, model=model, test_case=test_case, container_tag='none' @@ -32,10 +45,10 @@ def main(model: Optional[str], scope: Optional[str], test_case: Optional[str], o if workload.type == "build": continue magic_values = dict(workload.spec) - magic_values["assets_dir"] = "." + magic_values["assets_dir"] = output_path file_path = ( - pathlib.Path(output_path) + pathlib.Path.cwd() / "test_cases" / workload.spec.model / f"{workload.spec.test_case}.sh" From 52699ce1aea78879e0cd6d8b28c050a6cbf64170 Mon Sep 17 00:00:00 2001 From: Huy Vu Date: Fri, 4 Oct 2024 17:43:25 -0700 Subject: [PATCH 44/50] ADLR/megatron-lm!2160 - Adding Inference pipeline for T5 Co-authored-by: Eric Harper Co-authored-by: Huy Vu2 --- .../inference/t5/simple_t5_batch_inference.py | 157 ++++++++++++++ .../core/inference/engines/mcore_engine.py | 53 +++-- megatron/core/inference/inference_request.py | 9 + .../model_inference_wrappers/t5/__init__.py | 1 + .../t5/t5_inference_wrapper.py | 205 ++++++++++++++++++ megatron/core/inference/scheduler.py | 37 ++-- ...oder_decoder_text_generation_controller.py | 35 +++ .../simple_text_generation_controller.py | 112 +++++++--- megatron/training/tokenizer/tokenizer.py | 122 ++++++----- .../inference/engines/test_mcore_engine.py | 27 +++ .../t5/test_t5_inference_wrapper.py | 124 +++++++++++ ...oder_decoder_text_generation_controller.py | 143 ++++++++++++ .../test_simple_text_generation_controller.py | 2 +- 13 files changed, 917 insertions(+), 110 deletions(-) create mode 100644 examples/inference/t5/simple_t5_batch_inference.py create mode 100644 megatron/core/inference/model_inference_wrappers/t5/__init__.py create mode 100644 megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py create mode 100644 megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py create mode 100644 tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py create mode 100644 tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py diff --git a/examples/inference/t5/simple_t5_batch_inference.py b/examples/inference/t5/simple_t5_batch_inference.py new file mode 100644 index 0000000000..3f4557d3c2 --- /dev/null +++ b/examples/inference/t5/simple_t5_batch_inference.py @@ -0,0 +1,157 @@ +import os +import sys +from argparse import Namespace + +import torch + +import pretrain_t5 +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.engines.abstract_engine import AbstractEngine +from megatron.core.inference.engines.mcore_engine import MCoreEngine +from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + 
InferenceWrapperConfig, +) +from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( + T5InferenceWrapper, +) +from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import ( + EncoderDecoderTextGenerationController, +) +from megatron.core.transformer.module import MegatronModule +from pretrain_t5 import model_provider + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) +) + +from typing import List + +from megatron.core import mpu +from megatron.training import get_args, get_model, get_tokenizer +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron + + +def add_text_generate_args(parser): + """Text generation arguments.""" + group = parser.add_argument_group(title='text generation') + + group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') + group.add_argument("--top_k", type=int, default=1, help='Top k sampling.') + group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') + group.add_argument( + "--return-log-probs", + action='store_true', + default=False, + help='Return the log probabilities of the final output tokens', + ) + group.add_argument( + "--num-tokens-to-generate", + type=int, + default=30, + help='Number of tokens to generate for each prompt', + ) + group.add_argument( + "--encoder-prompts", + metavar='N', + type=str, + nargs='+', + help='Encoder input prompts with each prompt within quotes and seperated by space', + ) + group.add_argument( + "--max-batch-size", type=int, default=1, help='Max number of prompts to process at once' + ) + return parser + + +def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine: + """Utility to get the relevant backend for running inference + + This function will automatically chose the TRTLLMBackend when possible, and if not revert to Mcore backend if the user does not specify any backends. TRT LLM Backend is not implmented yet. + + Args: + args (Namespace): The user arguments parsed from command line + model (MegatronModule): The megatron model . + + Returns: + AbstractBackend: The chosen backend + """ + tokenizer = get_tokenizer() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=args.hidden_size, + inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, + fp32_residual_connection=args.fp32_residual_connection, + params_dtype=args.params_dtype, + padded_vocab_size=args.padded_vocab_size, + ) + + inference_wrapped_model = T5InferenceWrapper(model, inference_wrapper_config) + text_generation_controller = EncoderDecoderTextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer + ) + return MCoreEngine( + text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size + ) + + +def main(): + """Main program.""" + + # Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file) + # Micro batch size is not needed to be set by user. 
(It is calculated based on inference-batch-times-seqlen-threshold argument) + initialize_megatron( + extra_args_provider=add_text_generate_args, + args_defaults={ + 'no_load_rng': True, + 'no_load_optim': True, + 'micro_batch_size': 1, + 'exit_on_missing_checkpoint': True, + }, + ) + + # Set up model and load checkpoint + model = get_model(model_provider, wrap_with_ddp=False) + load_checkpoint(model, None, None) + model = model[0] + + args = get_args() + + inference_engine = get_inference_engine(args, model) + + common_inference_params = CommonInferenceParams( + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + return_log_probs=args.return_log_probs, + num_tokens_to_generate=args.num_tokens_to_generate, + ) + + tokenizer = get_tokenizer() + decoder_prompts = [""] * len( + args.encoder_prompts + ) # for T5, the prompt is provided as encoder input, hence decoder_prompts is empty + args.prompts = decoder_prompts + + results: List[InferenceRequest] = inference_engine.generate( + prompts=args.prompts, + add_BOS=True, + encoder_prompts=args.encoder_prompts, + common_inference_params=common_inference_params, + ) + + if torch.distributed.get_rank() == 0: + for idx, result in enumerate(results): + print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ') + result = { + 'id': result.request_id, + 'input_prompt': result.prompt, + 'generated_text': result.generated_text, + 'generated_tokens': result.generated_tokens, + } + print(result) + + +if __name__ == "__main__": + main() diff --git a/megatron/core/inference/engines/mcore_engine.py b/megatron/core/inference/engines/mcore_engine.py index 496a288bae..fe8160228b 100644 --- a/megatron/core/inference/engines/mcore_engine.py +++ b/megatron/core/inference/engines/mcore_engine.py @@ -13,47 +13,66 @@ class MCoreEngine(AbstractEngine): + """The Megatron core backend constructor + + This is the backend that does a simple forward pass on the model. + Supports any model that is callable (Accepts the inputs and outputs the tensor) + + Args: + text_generation_controller (SimpleTextGenerationController): A text generation + controller that will be used to define how to preprocess prompts, generate + outputs and detokenizer the output tokens. + max_batch_size : The maxinum number of requests to process at once + random_seed (int, optional): Use a random seed if you want deterministic + results. Defaults to None. + """ + def __init__( self, text_generation_controller: SimpleTextGenerationController, max_batch_size, random_seed: int = None, ): - """The Megatron core backend constructor - - This is the backend that does a simple forward pass on the model. Supports any model that is callable (Accepts the inputs and outputs the tensor) - - Args: - text_generation_controller (SimpleTextGenerationController): A text generation controller that will be used to define how to preprocess prompts, generate outputs and detokenizer the output tokens. - max_batch_size : The maxinum number of requests to process at once - random_seed (int, optional): Use a random seed if you want deterministic results. Defaults to None. 
- """ - self.text_generation_controller = text_generation_controller self.random_seed = random_seed self.scheduler = Scheduler(max_batch_size=max_batch_size) - def generate(self, prompts: List[str], common_inference_params: CommonInferenceParams) -> dict: + def generate( + self, + prompts: List[str], + add_BOS: bool = False, + encoder_prompts: List[str] = None, + common_inference_params: CommonInferenceParams = None, + ) -> dict: """The megatron core inference backend generate function - This backend returns the output generations as a dictionary. It returns the prompt tokens along with the generated tokens, the prompt plus the generated string and the output log probabilities if requested + This backend returns the output generations as a dictionary. + It returns the prompt tokens along with the generated tokens, the prompt + plus the generated string and the output log probabilities if requested Args: prompts (List[str]): All the prompts as a list of strings + add_BOS (bool): Whether to add BOS token to beginning of prompts + encoder_prompts (List[dict]): All the encoder prompts as a list of strings common_inference_params (CommonInferenceParams): The inference parameters Returns: - List[InferenceRequest]: The output is list of inference requests containing the generated tokens, texts and log probs if required + List[InferenceRequest]: The output is list of inference requests containing the + generated tokens, texts and log probs if required """ # TODO :M core- get rng state tracker if self.random_seed: torch.random.manual_seed(self.random_seed) - for prompt in prompts: - prompt_tokens = self.text_generation_controller.tokenize_prompt(prompt) + for i in range(len(prompts)): + prompt = prompts[i] + encoder_prompt = encoder_prompts[i] if encoder_prompts is not None else None + prompt_tokens = self.text_generation_controller.tokenize_prompt(prompt, add_BOS) + self.scheduler.add_request( prompt=prompt, prompt_tokens=prompt_tokens, + encoder_prompt=encoder_prompt, inference_parameters=common_inference_params, ) @@ -68,7 +87,9 @@ def run_engine(self): Runs the engine until there are no requests in the queue. Args: - dynamic_generation (bool, optional): Set this to True, if you want to enable dynamic batching. Mainly used with an inference server. Defaults to False. + dynamic_generation (bool, optional): Set this to True, if you want + to enable dynamic batching. Mainly used with an inference server. + Defaults to False. 
""" while self.scheduler.have_requests_pending(): active_requests: Dict[int, InferenceRequest] = self.scheduler.active_request_pool.copy() diff --git a/megatron/core/inference/inference_request.py b/megatron/core/inference/inference_request.py index a03834c7e4..4825dfd366 100644 --- a/megatron/core/inference/inference_request.py +++ b/megatron/core/inference/inference_request.py @@ -10,6 +10,8 @@ # class syntax class Status(Enum): + """Enum for status""" + WAITING_IN_QUEUE = 1 ACTIVE_AND_GENERATING_TOKENS = 2 ACTIVE_BUT_NOT_GENERATING_TOKENS = 3 @@ -18,12 +20,19 @@ class Status(Enum): @dataclass class InferenceRequest: + """Class for one inference request + + Containing relevant data for an inference request + + """ + request_id: str prompt: str inference_parameters: CommonInferenceParams prompt_tokens: List[int] arrival_time: float status: Status + encoder_prompt: str = None generated_text: str = None generated_tokens: torch.Tensor = None generated_log_probs: torch.Tensor = None diff --git a/megatron/core/inference/model_inference_wrappers/t5/__init__.py b/megatron/core/inference/model_inference_wrappers/t5/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/t5/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py new file mode 100644 index 0000000000..10e1da4812 --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py @@ -0,0 +1,205 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from argparse import Namespace +from collections import deque +from typing import Any, List, Tuple + +import numpy +import torch + +from megatron.core import tensor_parallel +from megatron.core.datasets.t5_dataset import T5MaskedWordPieceDataset +from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( + AbstractModelInferenceWrapper, +) +from megatron.core.models.T5 import T5Model + + +class T5InferenceWrapper(AbstractModelInferenceWrapper): + """Constructor for the model inference wrapper + + The wrapper prepares the model for inference, provides the required input + data, and runs the forward pass + + Args: + model (T5Model): The T5 model (MCore or legacy) + args (Namespace): The command line arguments that were passed + """ + + def __init__(self, model: T5Model, args: Namespace): + super().__init__(model, args) + + def prep_model_for_inference( + self, prompts_tokens: torch.Tensor, encoder_prompts: List[str] = None, tokenizer: Any = None + ): + """A utility function for preparing model for inference + + This function is called before the forward pass. It puts the model in eval mode, builds + position ids, and creates attention masks so that required slices can be extracted during + the forward pass. 
+ + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] + encoder_prompts (dict): List of string of encoder input prompts + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing text + """ + + super().prep_model_for_inference(prompts_tokens=prompts_tokens) + + encoder_prompts_tokens_list = [ + self.tokenize_encoder_prompt(encoder_prompt, tokenizer) + for encoder_prompt in encoder_prompts + ] + self.batch_encoder_prompts_tokens = self.pad_encoder_prompts_tokens( + encoder_prompts_tokens_list, self.model.max_sequence_length, tokenizer + ) + + # create batch mask for encoder_prompt (self.batch_input_tokens) and + # decoder_input (self.prompts_tokens), similar to megatron/core/datasets/t5_dataset.py + decoder_prompts_tokens = self.prompts_tokens.cpu().numpy() + encoder_prompts_tokens = self.batch_encoder_prompts_tokens.cpu().numpy() + self.batch_mask_encoder = [] + self.batch_mask_decoder = [] + self.batch_mask_encoder_decoder = [] + for i in range(len(self.prompts_tokens)): + self.batch_mask_encoder.append( + T5MaskedWordPieceDataset._make_attention_mask( + encoder_prompts_tokens[i], encoder_prompts_tokens[i] + ) + ) + self.batch_mask_decoder.append( + T5MaskedWordPieceDataset._make_attention_mask( + decoder_prompts_tokens[i], decoder_prompts_tokens[i] + ) + * T5MaskedWordPieceDataset._make_history_mask(decoder_prompts_tokens[i]) + ) + self.batch_mask_encoder_decoder.append( + T5MaskedWordPieceDataset._make_attention_mask( + decoder_prompts_tokens[i], encoder_prompts_tokens[i] + ) + ) + self.batch_mask_encoder = torch.tensor(numpy.array(self.batch_mask_encoder)).cuda() + self.batch_mask_decoder = torch.tensor(numpy.array(self.batch_mask_decoder)).cuda() + self.batch_mask_encoder_decoder = torch.tensor( + numpy.array(self.batch_mask_encoder_decoder) + ).cuda() + self.batch_mask_encoder = self.batch_mask_encoder < 0.5 + self.batch_mask_decoder = self.batch_mask_decoder < 0.5 + self.batch_mask_encoder_decoder = self.batch_mask_encoder_decoder < 0.5 + + def tokenize_encoder_prompt( + self, encoder_prompt: str, tokenizer + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility to tokenize the encoder_prompt + + Args: + encoder_prompt (str): The encoder_prompt + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing string + + Returns: + torch.Tensor: Returns the tokenized prompt + """ + + # if there is the word "" in prompt, replacing it with special_additional_token, + # similar to processing step in megatron/core/datasets/t5_dataset.py + divided_encoder_prompt_list = encoder_prompt.split("") + masks_count = len(divided_encoder_prompt_list) - 1 + sentinels = deque(tokenizer.additional_special_tokens_ids) + + encoder_prompt_tokens = [] + for divided_encoder_prompt in divided_encoder_prompt_list: + divided_encoder_prompt_tokens = tokenizer.tokenize(divided_encoder_prompt) + encoder_prompt_tokens.extend(divided_encoder_prompt_tokens) + if masks_count > 0: + sentinel = sentinels.popleft() + encoder_prompt_tokens.extend([sentinel]) + + return encoder_prompt_tokens + + def pad_encoder_prompts_tokens( + self, encoder_prompts_tokens_list: List[List[int]], max_sequence_length: int, tokenizer + ) -> torch.Tensor: + """Method to pad input prompts + + Given a list of prompts, pad them all to uniform length + + Args: + encoder_prompts_tokens_list (List[List[int]]): A list containing the + encoder_input_tokens + max_sequence_length (int): Maximum of the length of the encoder inputs tokens + tokenizer (_type_): Tokenizer used for tokenizing and 
detokenizing text + + Returns: + torch.Tensor: A torch tensor of shape [bs, max_sequence_length] + """ + + for encoder_prompt_tokens in encoder_prompts_tokens_list: + padding_size = max_sequence_length - len(encoder_prompt_tokens) + encoder_prompt_tokens.extend([tokenizer.pad] * padding_size) + + return torch.tensor(encoder_prompts_tokens_list).cuda() + + def get_batch_for_context_window( + self, context_start_position: int, context_end_position: int + ) -> List: + """Returns the inference data given context window + + This function gets called iteratively in a loop . Given the start and end context + positions , it extracts the appropriate data. + + Args: + context_start_position (int): Start of the context window. During + the first inference step it is mostly 0 + context_end_position (int): End of the context window. During the + last inference step it will mostly be the max generated sequence length. + + Returns: + List: A list of inputs that will be used by your model in the forward step + """ + + # rerun encoder every step + # T5 inference not yet support kv_cache + encoder_tokens2use = self.batch_encoder_prompts_tokens + decoder_tokens2use = self.prompts_tokens[:, :context_end_position] + encoder_mask2use = self.batch_mask_encoder + decoder_mask2use = self.batch_mask_decoder[:, :context_end_position, :context_end_position] + encoder_decoder_mask2use = self.batch_mask_encoder_decoder[:, :context_end_position, :] + data_at_step_idx = [ + encoder_tokens2use, + decoder_tokens2use, + encoder_mask2use, + decoder_mask2use, + encoder_decoder_mask2use, + ] + + return data_at_step_idx + + def forward_pass_without_pipeline_parallel(self, inference_input: List) -> torch.Tensor: + """Utility to carry out simple forward pass for TP or no model parallel models + + Runs a very simple forward pass for model. Used in the case of models without + any parallelism or only tensor parallelism. + + Args: + inference_input (List): A list containg the inputs for the gpt + model [tokens, position ids, attention mask] + + Returns: + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] + """ + [encoder_tokens, decoder_tokens, encoder_mask, decoder_mask, encoder_decoder_mask] = ( + inference_input + ) + tokens = decoder_tokens + + # T5 inference not yet support kv_cache + logits = self.model( + encoder_tokens, + decoder_tokens, + encoder_mask, + decoder_mask, + encoder_decoder_mask, + inference_params=None, + ) + logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) + + return logits diff --git a/megatron/core/inference/scheduler.py b/megatron/core/inference/scheduler.py index abcb325185..00ab81b4ab 100644 --- a/megatron/core/inference/scheduler.py +++ b/megatron/core/inference/scheduler.py @@ -2,7 +2,7 @@ import time import typing from collections import OrderedDict -from typing import Dict, List +from typing import Dict import torch @@ -12,14 +12,16 @@ class Scheduler: - def __init__(self, max_batch_size: int): - """Scheduler for handling requests to inference engine + """Scheduler for handling requests to inference engine - This class is responsible for handing of all the incomign requests + This class is responsible for handing of all the incomign requests - Args: - max_batch_size (int): The max batch size that we can pass to the inference engine at a time. - """ + Args: + max_batch_size (int): The max batch size that we can pass to the + inference engine at a time. 
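+
+    A minimal usage sketch (the prompt, prompt_tokens and params objects here are illustrative):
+
+        scheduler = Scheduler(max_batch_size=4)
+        scheduler.add_request(prompt=prompt, prompt_tokens=prompt_tokens, inference_parameters=params)
+        while scheduler.have_requests_pending():
+            ...  # run one engine step, then call scheduler.update_requests_pools(result_dict)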
+ """ + + def __init__(self, max_batch_size: int): self.max_batch_size = max_batch_size self.active_request_pool: Dict[int, InferenceRequest] = OrderedDict() self.waiting_request_pool: Dict[int, InferenceRequest] = OrderedDict() @@ -30,16 +32,19 @@ def add_request( self, prompt: str, prompt_tokens: torch.Tensor, - inference_parameters: CommonInferenceParams, + encoder_prompt: str = None, + inference_parameters: CommonInferenceParams = None, arrival_time: float = None, ): """Add an incoming request - This method will add the request to either the active pool or the waiting pool depending on the batch size. + This method will add the request to either the active pool or the waiting pool + depending on the batch size. Args: prompt (str): Input prompt string prompt_tokens (torch.Tensor): A torch tensor having the input prompts tokenized + encoder_prompt (str): Encoder input string inference_parameters (CommonInferenceParams): The inference parameters arrival_time (float, optional): The incoming request time. Defaults to None. """ @@ -61,6 +66,7 @@ def add_request( arrival_time=arrival_time, prompt_tokens=prompt_tokens, status=status, + encoder_prompt=encoder_prompt, ) if status == status.ACTIVE_BUT_NOT_GENERATING_TOKENS: @@ -79,7 +85,8 @@ def have_requests_pending(self) -> bool: def add_earliest_waiting_request_to_active_pool(self): """Utility to add the waiting request to active pool - This method will add the earliest request (FIFO) that is in the waiting request pool to the active request pool. + This method will add the earliest request (FIFO) that is in the waiting request + pool to the active request pool. """ assert ( len(self.active_request_pool) < self.max_batch_size @@ -94,11 +101,15 @@ def add_earliest_waiting_request_to_active_pool(self): def update_requests_pools(self, result_dict: typing.OrderedDict[int, InferenceRequest] = None): """Update request pool status - This method will full up the active request pool, if it has less than max batch size elements from the waiting request pool. - If provided with a request dict, it will put the completed requests into the completed request pool and add waiting request into active pool. + This method will full up the active request pool, if it has less than max batch size + elements from the waiting request pool. + If provided with a request dict, it will put the completed requests into the completed + request pool and add waiting request into active pool. Args: - result (typing.OrderedDict[int, InferenceRequest], optional): The result returned by the engine. A dictionary with keys as the request ids, and values as the requests. Defaults to None + result (typing.OrderedDict[int, InferenceRequest], optional): The result returned + by the engine. A dictionary with keys as the request ids, and values as the + requests. Defaults to None """ for result_request_id in list(result_dict.keys()): active_request = self.active_request_pool[result_request_id] diff --git a/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py new file mode 100644 index 0000000000..61beff0211 --- /dev/null +++ b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+from typing import OrderedDict + +import torch + +from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( + SimpleTextGenerationController, +) + + +class EncoderDecoderTextGenerationController(SimpleTextGenerationController): + """The text generation controller for encoder-decoder architecture + + This class ingherits from SimpleTextGenerationController, adding features + relating to encoder input encoder_prompt + + """ + + def prep_model_for_inference( + self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest] + ): + """Preparing batch for inference, using respective wrapper's prep_model_for_inference method + + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] + active_requests (OrderedDict[int, InferenceRequest]): The input active requests + """ + encoder_prompts = list( + map(lambda request: request.encoder_prompt, active_requests.values()) + ) + + self.inference_wrapped_model.prep_model_for_inference( + prompts_tokens=prompts_tokens, encoder_prompts=encoder_prompts, tokenizer=self.tokenizer + ) diff --git a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py index e4db83f6b3..0667af8373 100644 --- a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py @@ -14,15 +14,18 @@ class SimpleTextGenerationController: - def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): - """The basic text generation controller + """The basic text generation controller - This class is responsible for tokenizing the input , running the inference, sampling and also detokenizing the output + This class is responsible for tokenizing the input , running the inference, sampling + and also detokenizing the output - Args: - inference_wrapped_model (AbstractModelInferenceWrapper): A model that is wrapped using the specs given in the abstract_model_inference_wrapper.py - tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts - """ + Args: + inference_wrapped_model (AbstractModelInferenceWrapper): A model that + is wrapped using the specs given in the abstract_model_inference_wrapper.py + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts + """ + + def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): self.inference_wrapped_model = inference_wrapped_model self.tokenizer = tokenizer @@ -31,7 +34,9 @@ def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, token parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() ) - def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: + def tokenize_prompt( + self, prompt: str, add_BOS: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor]: """Utility to tokenize the input prompts Args: @@ -40,13 +45,19 @@ def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: Returns: torch.Tensor: Returns the tokenized prompt """ - return self.tokenizer.tokenize(prompt) + prompt_tokens = self.tokenizer.tokenize(prompt) + + if add_BOS: + prompt_tokens = [self.tokenizer.bos] + prompt_tokens + + return prompt_tokens def detokenize_generations(self, 
prompt_tokens_with_generated_tokens: torch.Tensor) -> str: """Detokenize the output generations Args: - prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt tokens plus the generated tokens + prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt + tokens plus the generated tokens Returns: str: The detokenized output @@ -62,11 +73,15 @@ def sample_from_logits( ) -> torch.Tensor: """Samples the logits to generate outputs - Given the logits of the last token, this function samples it according to the parameters defined in common_inference_params and returns the samples + Given the logits of the last token, this function samples it + according to the parameters defined in common_inference_params + and returns the samples Args: - last_token_logits (torch.Tensor): The last token logits. A tensor of size [batch_size, vocab_size] - common_inference_params (CommonInferenceParams): The paramters to use for inference + last_token_logits (torch.Tensor): The last token logits. A tensor of + size [batch_size, vocab_size] + common_inference_params (CommonInferenceParams): The paramters to use + for inference vocab_size (int): Obtained from the tokenizer. Defaults to None Returns: @@ -141,23 +156,35 @@ def update_generation_status( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Checks which prompts have reached an end condition - We check which prompts have reached an end condition and set the corresponding flags of the is_generation_done_tensor to True. The generated sequence lengths increase as we keep generating, until that prompts hits an end condition. The generation_started tensor determines which prompts have started generating. + We check which prompts have reached an end condition and set the corresponding + flags of the is_generation_done_tensor to True. The generated sequence lengths + increase as we keep generating, until that prompts hits an end condition. The + generation_started tensor determines which prompts have started generating. Args: - updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest generated tokens. A tensor of shape [batch_size, max_seq_len] (i.e max_seq_len = max_prompt_len + tokens_to_generate) - generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True indicates the prompt at that index has started generating tokens. - current_context_end_position (int): An integer indicating which position to extract from the prompts tokens to get the latest generated tokens. - is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. True indicates the prompt at that index has reached end condition. - generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. Each value represents the generated sequence lengths for that prompt. + updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest + generated tokens. A tensor of shape [batch_size, max_seq_len] + (i.e max_seq_len = max_prompt_len + tokens_to_generate) + generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True + indicates the prompt at that index has started generating tokens. + current_context_end_position (int): An integer indicating which position to + extract from the prompts tokens to get the latest generated tokens. + is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. + True indicates the prompt at that index has reached end condition. + generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. 
+ Each value represents the generated sequence lengths for that prompt. Returns: - Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean is_generation_done_tensor and the generated_sequence_lengths after updating it + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean + is_generation_done_tensor and the generated_sequence_lengths after updating it """ latest_samples = updated_prompts_tokens[:, current_context_end_position] - # Make sure we are checking eod criterion only for prompts that have started generating (i.e) We only look at the generated tokenns and not the input tokens. + # Make sure we are checking eod criterion only for prompts that have started generating + # (i.e) We only look at the generated tokenns and not the input tokens. reached_eod = (latest_samples == self.tokenizer.eod) & generation_started is_generation_done_tensor = is_generation_done_tensor | reached_eod - # We increment generated sequence lengths when that prompt has not hit the EOD and generation has started + # We increment generated sequence lengths when that prompt has not hit the + # EOD and generation has started generated_sequence_lengths += ~is_generation_done_tensor & generation_started return is_generation_done_tensor, generated_sequence_lengths @@ -178,7 +205,9 @@ def pad_input_prompt_tokens( num_tokens_togenerate (int): The number of tokens to generate for each prompt Returns: - torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, with extra indices for each tensor padded with mask id. + torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) + max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, + with extra indices for each tensor padded with mask id. """ max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate @@ -193,13 +222,16 @@ def generate_output_tokens_dynamic_batch( ) -> OrderedDict[int, InferenceRequest]: """Utility to generate the output tokens and probabilities for the prompts - This utility generates the output tokens for a dynamic batch. It will run one forward step at a time, and pass control back to the engine, which will update the request pool and call this method again. + This utility generates the output tokens for a dynamic batch. It will run one forward step + at a time, and pass control back to the engine, which will update the request pool and call + this method again. Args: active_requests (OrderedDict[int, InferenceRequest]): The input active requests. Returns: - OrderedDict[int, InferenceRequest]: The result for each of the incoming requests after running one forward step. + OrderedDict[int, InferenceRequest]: The result for each of the incoming requests + after running one forward step. """ raise Exception("Not implemented yet") @@ -208,7 +240,9 @@ def generate_all_output_tokens_static_batch( ) -> OrderedDict[int, InferenceRequest]: """Utility to generate the all the output tokens and probabilities for the prompts . - This utility generates the output tokens for a static batch. It runs the forward steps till all prompts complete generation, updates the status of these requests to completed, adds the generated result and returns these requests + This utility generates the output tokens for a static batch. 
It runs the forward steps till + all prompts complete generation, updates the status of these requests to completed, adds + the generated result and returns these requests Args: active_requests (OrderedDict[int, InferenceRequest]): The input active requests. @@ -252,8 +286,9 @@ def generate_all_output_tokens_static_batch( generated_sequence_lengths = torch.zeros(batch_size).cuda() with torch.no_grad(): - self.inference_wrapped_model.prep_model_for_inference( - prompts_tokens=batch_prompt_tokens + + self.prep_model_for_inference( + prompts_tokens=batch_prompt_tokens, active_requests=active_requests ) context_start_position = 0 @@ -275,14 +310,17 @@ def generate_all_output_tokens_static_batch( tensor=logits, ) - # Indicates which of the input prompts have started generating tokens. A 1D boolean tensor with [batch_size] elements (i.e) The shortest prompts will start generating first and so on + # Indicates which of the input prompts have started generating tokens. + # A 1D boolean tensor with [batch_size] elements (i.e) The shortest + # prompts will start generating first and so on generation_started = prompt_lengths_in_batch <= context_end_position last_token_logits = logits[:, -1, :] sampled_logits = self.sample_from_logits( last_token_logits, common_inference_params, self.tokenizer.vocab_size ) - # Substitute the sampled logits only for only the prompts that have started generating tokens + # Substitute the sampled logits only for only the prompts that + # have started generating tokens batch_prompt_tokens[generation_started, context_end_position] = sampled_logits[ generation_started ] @@ -302,7 +340,8 @@ def generate_all_output_tokens_static_batch( context_start_position = context_end_position - # Check end of generation status for each tensor and update generated sequence lengths + # Check end of generation status for each tensor + # and update generated sequence lengths (is_generation_done_tensor, generated_sequence_lengths) = ( self.update_generation_status( updated_prompts_tokens=batch_prompt_tokens, @@ -348,3 +387,14 @@ def generate_all_output_tokens_static_batch( request.generated_text = self.detokenize_generations(required_result_tokens) return active_requests + + def prep_model_for_inference( + self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest] + ): + """Preparing batch for inference, using respective wrapper's prep_model_for_inference method + + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] + active_requests (OrderedDict[int, InferenceRequest]): The input active requests + """ + self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=prompts_tokens) diff --git a/megatron/training/tokenizer/tokenizer.py b/megatron/training/tokenizer/tokenizer.py index 226ae1e799..af0d493f87 100644 --- a/megatron/training/tokenizer/tokenizer.py +++ b/megatron/training/tokenizer/tokenizer.py @@ -2,15 +2,14 @@ """Megatron tokenizers.""" -import math -from abc import ABC, abstractmethod import base64 import json +import math +import types +from abc import ABC, abstractmethod from pathlib import Path from typing import Dict, List, Optional -import types - from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer from .bert_tokenization import FullTokenizer as FullBertTokenizer @@ -20,27 +19,28 @@ def build_tokenizer(args, **kwargs): """Initialize tokenizer.""" if args.rank == 0: - print('> building {} tokenizer ...'.format(args.tokenizer_type), - flush=True) + print('> building {} tokenizer 
...'.format(args.tokenizer_type), flush=True) # Select and instantiate the tokenizer. if args.tokenizer_type == 'BertWordPieceLowerCase': assert args.vocab_file is not None - tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, - lower_case=True, - vocab_extra_ids=args.vocab_extra_ids) + tokenizer = _BertWordPieceTokenizer( + vocab_file=args.vocab_file, lower_case=True, vocab_extra_ids=args.vocab_extra_ids + ) elif args.tokenizer_type == 'BertWordPieceCase': assert args.vocab_file is not None - tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, - lower_case=False, - vocab_extra_ids=args.vocab_extra_ids) + tokenizer = _BertWordPieceTokenizer( + vocab_file=args.vocab_file, lower_case=False, vocab_extra_ids=args.vocab_extra_ids + ) elif args.tokenizer_type == 'GPT2BPETokenizer': assert args.vocab_file is not None assert args.merge_file is not None tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) elif args.tokenizer_type == 'SentencePieceTokenizer': assert args.tokenizer_model is not None - tokenizer = _SentencePieceTokenizer(args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids) + tokenizer = _SentencePieceTokenizer( + args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids + ) elif args.tokenizer_type == 'GPTSentencePieceTokenizer': assert args.tokenizer_model is not None tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) @@ -65,13 +65,11 @@ def build_tokenizer(args, **kwargs): assert args.vocab_size is not None tokenizer = _NullTokenizer(args.vocab_size) else: - raise NotImplementedError('{} tokenizer is not ' - 'implemented.'.format(args.tokenizer_type)) + raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type)) # Add vocab size (if not already set from a checkpoint). 
if getattr(args, "padded_vocab_size", None) is None: - args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, - args) + args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args) return tokenizer @@ -81,13 +79,14 @@ def _vocab_size_with_padding(orig_vocab_size, args, logging_enabled=True): still having GPU friendly size.""" after = orig_vocab_size - multiple = args.make_vocab_size_divisible_by * \ - args.tensor_model_parallel_size + multiple = args.make_vocab_size_divisible_by * args.tensor_model_parallel_size after = int(math.ceil(after / multiple) * multiple) if args.rank == 0 and logging_enabled: - print(' > padded vocab (size: {}) with {} dummy tokens ' - '(new size: {})'.format( - orig_vocab_size, after - orig_vocab_size, after), flush=True) + print( + ' > padded vocab (size: {}) with {} dummy tokens ' + '(new size: {})'.format(orig_vocab_size, after - orig_vocab_size, after), + flush=True, + ) return after @@ -97,10 +96,14 @@ def __init__(self, pretrained_model_name_or_path, **kwargs): try: import transformers except ImportError: - raise EnvironmentError(f"The transformers library must be installed to use huggingface_tokenizer_provider") + raise EnvironmentError( + f"The transformers library must be installed to use huggingface_tokenizer_provider" + ) # TODO(bnorick): download tokenizer once to lustre and use force offline to make sure all tasks read it from there - self._tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) + self._tokenizer = transformers.AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs + ) self._vocab = self._tokenizer.get_vocab() self._inv_vocab = {token_id: token for token, token_id in self._vocab.items()} @@ -146,8 +149,7 @@ def __init__(self, vocab_file, lower_case=True, vocab_extra_ids=0): self._additional_special_tokens = [] # (dsachan) Add BOS and EOS tokens - SPECIAL_TOKENS = {'eos_token': '[EOS]', - 'bos_token': '[BOS]'} + SPECIAL_TOKENS = {'eos_token': '[EOS]', 'bos_token': '[BOS]'} self._bos_token = '[BOS]' self.add_token(self._bos_token) self._bos_token_id = self.vocab.get(self._bos_token) @@ -160,7 +162,8 @@ def __init__(self, vocab_file, lower_case=True, vocab_extra_ids=0): # These can be used as sentinel tokens in T5 model inputs additional_special_tokens = [] additional_special_tokens.extend( - ["".format(i) for i in range(vocab_extra_ids)]) + ["".format(i) for i in range(vocab_extra_ids)] + ) self.add_additional_special_tokens(additional_special_tokens) def add_token(self, token): @@ -195,6 +198,10 @@ def decode(self, ids): tokens = self.tokenizer.convert_ids_to_tokens(ids) return self.tokenizer.convert_tokens_to_string(tokens) + def detokenize(self, token_ids): + """Copy of decode() method for inference pipeline compatibility""" + return self.decode(token_ids) + def decode_token_ids(self, token_ids): tokens = self.tokenizer.convert_ids_to_tokens(token_ids) exclude_list = ['[PAD]', '[CLS]'] @@ -227,32 +234,37 @@ def mask(self): @property def bos(self): - """ Id of the beginning of sentence token in the vocabulary.""" + """Id of the beginning of sentence token in the vocabulary.""" return self._bos_token_id @property def eos(self): - """ Id of the end of sentence token in the vocabulary.""" + """Id of the end of sentence token in the vocabulary.""" return self._eos_token_id + @property + def eod(self): + """Copy of eod property for inference pipeline compatibility""" + return self.eos + 
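+
+    # The detokenize() and eod aliases above match the interface the MCore inference pipeline
+    # expects, e.g. the text generation controller compares generated ids against tokenizer.eod
+    # to decide when generation is done.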
@property def bos_token(self): - """ Beginning of sentence token id """ + """Beginning of sentence token id""" return self._bos_token @property def eos_token(self): - """ End of sentence token id """ + """End of sentence token id""" return self._eos_token @property def additional_special_tokens(self): - """ All the additional special tokens you may want to use (list of strings).""" + """All the additional special tokens you may want to use (list of strings).""" return self._additional_special_tokens @property def additional_special_tokens_ids(self): - """ Ids of all the additional special tokens in the vocabulary (list of integers).""" + """Ids of all the additional special tokens in the vocabulary (list of integers).""" return [self.vocab.get(token) for token in self._additional_special_tokens] @additional_special_tokens.setter @@ -266,8 +278,9 @@ class _GPT2BPETokenizer(MegatronTokenizer): def __init__(self, vocab_file, merge_file): super().__init__(vocab_file, merge_file) - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - special_tokens=[], max_len=None) + self.tokenizer = GPT2Tokenizer( + vocab_file, merge_file, errors='replace', special_tokens=[], max_len=None + ) self.eod_id = self.tokenizer.encoder['<|endoftext|>'] @property @@ -300,6 +313,7 @@ def __init__(self, model_file, vocab_extra_ids=0): super().__init__(model_file, vocab_extra_ids=vocab_extra_ids) import sentencepiece + self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_file) self._initalize(vocab_extra_ids) @@ -462,7 +476,7 @@ def additional_special_tokens_ids(self): class _GPTSentencePieceTokenizer(_SentencePieceTokenizer): """SentencePieceTokenizer-Megatron wrapper""" - def __init__(self, model_file,): + def __init__(self, model_file): super().__init__(model_file, vocab_extra_ids=0) def _initalize(self, vocab_extra_ids): @@ -502,7 +516,7 @@ def additional_special_tokens_ids(self): class _Llama2Tokenizer(_SentencePieceTokenizer): """SentencePieceTokenizer-Megatron wrapper""" - def __init__(self, model_file,): + def __init__(self, model_file): super().__init__(model_file, vocab_extra_ids=0) def _initalize(self, vocab_extra_ids): @@ -549,10 +563,7 @@ def additional_special_tokens_ids(self): return None -def reload_mergeable_ranks( - path: str, - max_vocab: Optional[int] = None, -) -> Dict[bytes, int]: +def reload_mergeable_ranks(path: str, max_vocab: Optional[int] = None) -> Dict[bytes, int]: """ Reload our tokenizer JSON file and convert it to Tiktoken format. """ @@ -585,9 +596,12 @@ def reload_mergeable_ranks( return ranks -PATTERN_TIKTOKEN = r"[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+" +PATTERN_TIKTOKEN = ( + r"[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+" +) PATTERN_TIKTOKEN_V2 = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + class CustomTikTokenizer(MegatronTokenizer): def __init__( self, @@ -602,9 +616,10 @@ def __init__( pattern=pattern, vocab_size=vocab_size, num_special_tokens=num_special_tokens, - special_tokens=special_tokens + special_tokens=special_tokens, ) import tiktoken + from .. import print_rank_0 # To prevent circular import. 
         if vocab_size is None:
@@ -614,20 +629,30 @@ def __init__(
         SPECIAL_TOKENS = ["<unk>", "<s>", "</s>"]
         if special_tokens is None:
             special_tokens = SPECIAL_TOKENS.copy()
-        assert len(special_tokens) == len(set(special_tokens)), f"Special tokens should be unique: {special_tokens}"
+        assert len(special_tokens) == len(
+            set(special_tokens)
+        ), f"Special tokens should be unique: {special_tokens}"
         assert len(special_tokens) <= num_special_tokens < self._vocab_size
-        assert set(SPECIAL_TOKENS) <= set(special_tokens), f"Custom special tokens should include {SPECIAL_TOKENS}"
+        assert set(SPECIAL_TOKENS) <= set(
+            special_tokens
+        ), f"Custom special tokens should include {SPECIAL_TOKENS}"
 
-        special_filler = ["<SPECIAL_{id}>".format(id=i) for i in range(len(special_tokens), num_special_tokens)]
+        special_filler = [
+            "<SPECIAL_{id}>".format(id=i) for i in range(len(special_tokens), num_special_tokens)
+        ]
         if special_filler:
             print_rank_0(f"Adding special tokens {special_filler[0]}, ..., {special_filler[-1]}")
         special_tokens = special_tokens + special_filler
         assert len(set(special_tokens)) == len(special_tokens) == num_special_tokens, special_tokens
         inner_vocab_size = self._vocab_size - num_special_tokens
 
-        token_to_id_without_special_tokens = reload_mergeable_ranks(path, max_vocab=inner_vocab_size)
+        token_to_id_without_special_tokens = reload_mergeable_ranks(
+            path, max_vocab=inner_vocab_size
+        )
         # Create space for special tokens.
-        token_to_id_without_special_tokens = {t: i + num_special_tokens for t, i in token_to_id_without_special_tokens.items()}
+        token_to_id_without_special_tokens = {
+            t: i + num_special_tokens for t, i in token_to_id_without_special_tokens.items()
+        }
 
         special_tokens = {t: i for i, t in enumerate(special_tokens)}
         self._unk_id = special_tokens["<unk>"]
@@ -650,7 +675,6 @@ def __init__(
         self._id_to_token = {v: k for k, v in self._token_to_id.items()}
         assert set(range(self._vocab_size)) == set(self._id_to_token.keys())
 
-
     @property
     def bos(self) -> int:
         return self._bos_id
diff --git a/tests/unit_tests/inference/engines/test_mcore_engine.py b/tests/unit_tests/inference/engines/test_mcore_engine.py
index 161284ceeb..835aeed22d 100644
--- a/tests/unit_tests/inference/engines/test_mcore_engine.py
+++ b/tests/unit_tests/inference/engines/test_mcore_engine.py
@@ -93,3 +93,30 @@ def test_generate(self):
             ), f"Status should be completed but its {result.status}"
             assert result.generated_length > 0, f"Generated length should be greater than zero"
             assert result.generated_text is not None, f'Generated text should not be None'
+
+    def test_generate_empty_prompt(self):
+        self.mock_tokenizer.vocab_size = self.vocab_size
+        self.mock_tokenizer.eod = self.vocab_size - 1
+        self.mock_tokenizer.bos = self.vocab_size - 2
+        # Generating random length integer prompts
+        self.mock_tokenizer.tokenize.return_value = [
+            random.randint(0, self.vocab_size - 1) for _ in range(random.randint(5, 10))
+        ]
+        # Generates some random string
+        self.mock_tokenizer.detokenize.return_value = ''.join(
+            random.choices(string.ascii_letters, k=random.randint(4, 10))
+        )
+
+        prompts = ["" for i in range(self.batch_size)]
+        results: List[InferenceRequest] = self.mcore_engine.generate(
+            prompts,
+            add_BOS=True,
+            common_inference_params=CommonInferenceParams(num_tokens_to_generate=10),
+        )
+
+        for result in results:
+            assert (
+                result.status == Status.COMPLETED
+            ), f"Status should be completed but its {result.status}"
+            assert result.generated_length > 0, f"Generated length should be greater than zero"
+            assert result.generated_text is not None, f'Generated text
should not be None' diff --git a/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py new file mode 100644 index 0000000000..b9ece5c395 --- /dev/null +++ b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py @@ -0,0 +1,124 @@ +from argparse import Namespace +from copy import deepcopy +from unittest import mock + +import numpy as np +import torch + +from megatron.core import parallel_state +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( + T5InferenceWrapper, +) +from megatron.core.models.T5.t5_model import T5Model +from megatron.core.models.T5.t5_spec import ( + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_transformer_engine_block_spec, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestT5InferenceWrapper: + + def setup_model(self, tensor_parallel_size, pipeline_parallel_size): + Utils.initialize_model_parallel( + tensor_model_parallel_size=tensor_parallel_size, + pipeline_model_parallel_size=pipeline_parallel_size, + ) + model_parallel_cuda_manual_seed(123) + self.vocab_size = 100 + self.batch_size = 8 + self.encoder_sequence_length = 32 + self.decoder_sequence_length = 16 + hidden_size = 768 + + transformer_config = TransformerConfig( + num_layers=12, + hidden_size=hidden_size, + num_attention_heads=12, + tensor_model_parallel_size=tensor_parallel_size, + pipeline_model_parallel_size=pipeline_parallel_size, + ) + + encoder_config = deepcopy(transformer_config) + encoder_config.num_layers = transformer_config.num_layers + + encoder_layers_per_pipeline = ( + encoder_config.num_layers // encoder_config.pipeline_model_parallel_size + ) + decoder_layers_per_pipeline = ( + transformer_config.num_layers // transformer_config.pipeline_model_parallel_size + ) + en_block_spec = get_t5_encoder_with_transformer_engine_block_spec( + encoder_layers_per_pipeline + ) + de_block_spec = get_t5_decoder_with_transformer_engine_block_spec( + decoder_layers_per_pipeline + ) + + t5_model = T5Model( + config=transformer_config, + encoder_config=encoder_config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + vocab_size=self.vocab_size, + max_sequence_length=self.encoder_sequence_length, + parallel_output=True, + pre_process=True, + post_process=True, + add_encoder=True, + add_decoder=True, + ).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=hidden_size, + inference_batch_times_seqlen_threshold=20, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) + + self.inference_wrapped_model = T5InferenceWrapper(t5_model, inference_wrapper_config) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_inference_only_tensor_parallel(self): + self.setup_model(tensor_parallel_size=4, pipeline_parallel_size=1) + + batch_prompt_tokens = ( + torch.randint( + low=0, high=self.vocab_size, size=(self.batch_size, self.decoder_sequence_length) + ) + .int() + .cuda() + ) + batch_encoder_prompts = ["sample prompt encoders"] * self.batch_size + mock_tokenizer = mock.Mock() + 
mock_tokenizer.pad = self.vocab_size - 1 + mock_tokenizer.additional_special_tokens_ids = list(range(100)) + mock_tokenizer.tokenize.return_value = np.random.randint( + self.vocab_size, size=self.encoder_sequence_length + ).tolist() + + self.inference_wrapped_model.prep_model_for_inference( + prompts_tokens=batch_prompt_tokens, + encoder_prompts=batch_encoder_prompts, + tokenizer=mock_tokenizer, + ) + + inference_input = self.inference_wrapped_model.get_batch_for_context_window( + 0, self.decoder_sequence_length + ) + + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + + assert logits.shape == ( + self.batch_size, + self.decoder_sequence_length, + self.vocab_size, + ), f"Shape mismatch . Expected {(self.batch_size, self.decoder_sequence_length, self.vocab_size)}, but got {logits.shape}" diff --git a/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py new file mode 100644 index 0000000000..14c9a88852 --- /dev/null +++ b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py @@ -0,0 +1,143 @@ +import random +import string +import time +from collections import OrderedDict +from copy import deepcopy +from typing import Dict +from unittest import mock + +import numpy as np +import pytest +import torch + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( + T5InferenceWrapper, +) +from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import ( + EncoderDecoderTextGenerationController, +) +from megatron.core.models.T5.t5_model import T5Model +from megatron.core.models.T5.t5_spec import ( + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_transformer_engine_block_spec, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestEncoderDecoderTextGenerationController: + + def setup_method(self, method): + Utils.initialize_model_parallel( + tensor_model_parallel_size=4, pipeline_model_parallel_size=1 + ) + model_parallel_cuda_manual_seed(123) + self.vocab_size = 100 + self.batch_size = 8 + self.encoder_sequence_length = 32 + self.decoder_sequence_length = 16 + hidden_size = 768 + + transformer_config = TransformerConfig( + num_layers=12, + hidden_size=hidden_size, + num_attention_heads=12, + tensor_model_parallel_size=4, + pipeline_model_parallel_size=1, + ) + + encoder_config = deepcopy(transformer_config) + encoder_config.num_layers = transformer_config.num_layers + + encoder_layers_per_pipeline = ( + encoder_config.num_layers // encoder_config.pipeline_model_parallel_size + ) + decoder_layers_per_pipeline = ( + transformer_config.num_layers // transformer_config.pipeline_model_parallel_size + ) + en_block_spec = get_t5_encoder_with_transformer_engine_block_spec( + encoder_layers_per_pipeline + ) + de_block_spec = get_t5_decoder_with_transformer_engine_block_spec( + decoder_layers_per_pipeline + ) + + t5_model = 
T5Model( + config=transformer_config, + encoder_config=encoder_config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + vocab_size=self.vocab_size, + max_sequence_length=self.encoder_sequence_length, + parallel_output=True, + pre_process=True, + post_process=True, + add_encoder=True, + add_decoder=True, + ).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=hidden_size, + inference_batch_times_seqlen_threshold=20, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) + + inference_wrapped_model = T5InferenceWrapper(t5_model, inference_wrapper_config) + + self.mock_tokenizer = mock.Mock() + + self.text_generation_controller = EncoderDecoderTextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=self.mock_tokenizer + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_generate_all_output_tokens_static_batch(self): + self.mock_tokenizer.vocab_size = self.vocab_size + self.mock_tokenizer.eod = self.vocab_size - 1 + self.mock_tokenizer.pad = self.vocab_size - 2 + self.mock_tokenizer.additional_special_tokens_ids = list(range(100)) + self.mock_tokenizer.detokenize.return_value = ''.join( + random.choices(string.ascii_letters, k=random.randint(4, 10)) + ) + self.mock_tokenizer.tokenize.return_value = np.random.randint( + self.vocab_size, size=(self.encoder_sequence_length - 5) + ).tolist() + + active_requests: Dict[int, InferenceRequest] = OrderedDict() + for i in range(self.batch_size): + prompt = "decoder_sample" + prompt_tokens = np.random.randint( + self.vocab_size, size=self.decoder_sequence_length + ).tolist() + encoder_prompt = "encoder_sample" + inference_request = InferenceRequest( + request_id=i, + prompt=prompt, + encoder_prompt=encoder_prompt, + inference_parameters=CommonInferenceParams(num_tokens_to_generate=10), + arrival_time=time.time(), + prompt_tokens=prompt_tokens, + status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS, + ) + active_requests[i] = inference_request + + requests = self.text_generation_controller.generate_all_output_tokens_static_batch( + active_requests + ) + + for request_id, request in requests.items(): + assert ( + request.status == Status.COMPLETED + ), f"Status should be completed but its {request.status}" + assert request.generated_length > 0, f"Generated length should be greater than zero" + assert request.generated_text is not None, "Generated text should not be None" diff --git a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py index a9f15faf80..df7109e021 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py @@ -26,7 +26,7 @@ from tests.unit_tests.test_utilities import Utils -class TestTextGenerationController: +class TestSimpleTextGenerationController: def setup_method(self, method): Utils.initialize_model_parallel( From 691b323f5243208e4e64040e8c2144c5f6cbec19 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sat, 5 Oct 2024 10:48:37 -0700 Subject: [PATCH 45/50] ADLR/megatron-lm!2182 - ci: Group runs by model --- tests/functional_tests/jet_recipes/bert.yaml | 2 +- .../jet/generate_jet_trigger_job.py | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 9 
deletions(-) diff --git a/tests/functional_tests/jet_recipes/bert.yaml b/tests/functional_tests/jet_recipes/bert.yaml index 717664a69e..088436e8ea 100644 --- a/tests/functional_tests/jet_recipes/bert.yaml +++ b/tests/functional_tests/jet_recipes/bert.yaml @@ -32,7 +32,7 @@ products: time_limit: [12000] test_case: - bert_mr_mcore_tp2_pp2_dgx_a100_1N8G - - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G + # - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G - bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G - bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G - bert_mr_tp1_pp4_vp2_dgx_a100_1N8G diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py index 30d13c3730..c7338d3181 100644 --- a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py +++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py @@ -37,13 +37,18 @@ def main( run_name: Optional[str] = None, wandb_experiment: Optional[str] = None, ): - - gitlab_pipeline = {"stages": ["functional_tests"], "default": {"interruptible": True}} - - for test_case in common.load_workloads(scope=scope, container_tag=container_tag): - if test_case.type == "build": - continue - + test_cases = [ + test_case + for test_case in common.load_workloads(scope=scope, container_tag=container_tag) + if test_case.type != "build" + ] + + gitlab_pipeline = { + "stages": list(set([test_case.spec.model for test_case in test_cases])), + "default": {"interruptible": True}, + } + + for test_case in test_cases: if test_case.spec.platforms == "dgx_a100": cluster = a100_cluster elif test_case.spec.platforms == "dgx_h100": @@ -87,7 +92,7 @@ def main( ) gitlab_pipeline[test_case.spec.test_case] = { - "stage": "functional_tests", + "stage": f"{test_case.spec.model}", "image": f"{container_image}:{container_tag}", "tags": ["mcore-docker-node-jet"], "rules": [ From ce67659494dbcb4578d19837e40689e2e1daa2ec Mon Sep 17 00:00:00 2001 From: Peter Dykas Date: Sat, 5 Oct 2024 16:21:15 -0700 Subject: [PATCH 46/50] ADLR/megatron-lm!1862 - Cpu init te Co-authored-by: William Dykas Co-authored-by: root Co-authored-by: William Dykas --- .../core/extensions/transformer_engine.py | 103 +++++++++++++++++- megatron/core/tensor_parallel/layers.py | 9 +- .../tensor_parallel/test_initialization.py | 84 +++++++++++++- 3 files changed, 188 insertions(+), 8 deletions(-) diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index db39f8775b..0dbd1a58f2 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -9,6 +9,7 @@ import transformer_engine as te from packaging.version import Version as PkgVersion from torch import Tensor +from torch.nn.parameter import Parameter from megatron.core import ModelParallelConfig, parallel_state from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding @@ -18,8 +19,14 @@ get_context_parallel_group, get_tensor_and_expert_parallel_world_size, get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, ) from megatron.core.tensor_parallel import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name +from megatron.core.tensor_parallel.layers import ( + _initialize_affine_weight_cpu, + set_tensor_model_parallel_attributes, +) from megatron.core.tensor_parallel.utils import divide from megatron.core.transformer.enums import 
AttnMaskType from megatron.core.transformer.transformer_config import TransformerConfig @@ -305,7 +312,11 @@ def __init__( get_rng_state_tracker=( get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None ), - init_method=condition_init_method(config, init_method), + init_method=( + condition_init_method(config, init_method) + if not config.use_cpu_initialization + else lambda w: None + ), bias=bias, return_bias=self.te_return_bias, parallel_mode="column", @@ -314,6 +325,33 @@ def __init__( **extra_kwargs, ) + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + + if config.use_cpu_initialization: + output_size_per_partition = divide(output_size, world_size) + _ = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + output_size_per_partition, + 0, + init_method, + stride=1, + return_master_weight=False, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + if bias: + self.bias = Parameter( + torch.empty(output_size_per_partition, dtype=config.params_dtype) + ) + set_tensor_model_parallel_attributes(self.bias, True, 0, 1) + with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', True) + def forward(self, x): """Forward.""" _is_first_microbatch = ( @@ -365,7 +403,11 @@ def __init__( output_size=output_size, parallel_mode="column", config=config, - init_method=condition_init_method(config, init_method), + init_method=( + condition_init_method(config, init_method) + if not config.use_cpu_initialization + else lambda w: None + ), bias=bias, skip_bias_add=skip_bias_add, is_expert=is_expert, @@ -373,6 +415,32 @@ def __init__( tp_comm_buffer_name=tp_comm_buffer_name, ) + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + if config.use_cpu_initialization: + output_size_per_partition = divide(output_size, world_size) + _ = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + output_size_per_partition, + 0, + init_method, + stride=1, + return_master_weight=False, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + if bias: + self.bias = Parameter( + torch.empty(output_size_per_partition, dtype=config.params_dtype) + ) + set_tensor_model_parallel_attributes(self.bias, True, 0, 1) + with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', True) + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): """Sharding along axis 0, bias sharded""" state_dict = self.state_dict(prefix='', keep_vars=True) @@ -410,13 +478,42 @@ def __init__( output_size=output_size, parallel_mode="row", config=config, - init_method=condition_init_method(config, init_method), + init_method=( + condition_init_method(config, init_method) + if not config.use_cpu_initialization + else lambda w: None + ), bias=bias, skip_bias_add=skip_bias_add, skip_weight_param_allocation=False, # We don't currently use this for row parallel layers # pylint: disable=line-too-long is_expert=is_expert, tp_comm_buffer_name=tp_comm_buffer_name, ) + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + if config.use_cpu_initialization: + input_size_per_partition = divide(input_size, world_size) + self.master_weight = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + input_size_per_partition, + 1, + init_method, + stride=1, + return_master_weight=False, + params_dtype=config.params_dtype, + rank=rank, + 
world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + if bias: + self.bias = Parameter(torch.empty(output_size, dtype=config.params_dtype)) + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', True) + setattr(self.bias, 'sequence_parallel', config.sequence_parallel) def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): """Sharding along axis 1, bias not sharded""" diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index 61d9c7c34d..903b4ed873 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -120,21 +120,22 @@ def _initialize_affine_weight_cpu( params_dtype=torch.float32, rank=None, world_size=None, + skip_set_tensor_parallel_attributes=False, ): """Initialize affine weight for model parallel. Build the master weight on all processes and scatter the relevant chunk.""" - set_tensor_model_parallel_attributes( - tensor=weight, is_parallel=True, dim=partition_dim, stride=stride - ) + if not skip_set_tensor_parallel_attributes: + set_tensor_model_parallel_attributes( + tensor=weight, is_parallel=True, dim=partition_dim, stride=stride + ) # Initialize master weight master_weight = torch.empty(output_size, input_size, dtype=torch.float, requires_grad=False) init_method(master_weight) master_weight = master_weight.to(dtype=params_dtype) - # Split and copy per_partition_per_stride_size = divide(per_partition_size, stride) weight_list = torch.split(master_weight, per_partition_per_stride_size, dim=partition_dim) diff --git a/tests/unit_tests/tensor_parallel/test_initialization.py b/tests/unit_tests/tensor_parallel/test_initialization.py index 9fcc38c259..039ad071a7 100644 --- a/tests/unit_tests/tensor_parallel/test_initialization.py +++ b/tests/unit_tests/tensor_parallel/test_initialization.py @@ -4,13 +4,16 @@ import torch import megatron.core.parallel_state as ps -from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec from megatron.core.tensor_parallel.layers import ( ColumnParallelLinear, RowParallelLinear, VocabParallelEmbedding, ) from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TERowParallelLinear, +) from megatron.core.transformer.transformer_config import TransformerConfig from tests.unit_tests.test_utilities import Utils @@ -21,6 +24,9 @@ class Test: num_layers=1, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True ) + def teardown_method(self, method): + Utils.destroy_model_parallel() + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_embedding_init(self): @@ -117,3 +123,79 @@ def test_col_init(self): rank = ps.get_tensor_model_parallel_rank() assert tp4.shape[0] * 4 == tp1.shape[0] assert torch.equal(tp1[rank * 4 : (rank + 1) * 4], tp4) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(100) + def test_te_col_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + tp1 = TEColumnParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + config=self.transformer_config, + skip_bias_add=False, + gather_output=False, + is_expert=False, + ).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 
1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. + tp4 = TEColumnParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + config=self.transformer_config, + skip_bias_add=False, + gather_output=False, + is_expert=False, + ).weight + + if torch.distributed.get_rank() == 0: + assert tp4.shape[0] * 4 == tp1.shape[0] + assert torch.allclose(tp1[:4], tp4) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(100) + def test_te_row_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + tp1 = TERowParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + input_is_parallel=True, + config=self.transformer_config, + skip_bias_add=False, + is_expert=False, + ).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. + tp4 = TERowParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + input_is_parallel=True, + config=self.transformer_config, + skip_bias_add=False, + is_expert=False, + ).weight + + if torch.distributed.get_rank() == 0: + assert tp4.shape[1] * 4 == tp1.shape[1] + assert torch.allclose(tp1[:, :4], tp4) From 73ef7159abbe4d7b8a0d0570178b3cc4b68d17e2 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Sat, 5 Oct 2024 16:37:52 -0700 Subject: [PATCH 47/50] ADLR/megatron-lm!2186 - ci: Run script after export --- .../shell_test_utils/_run_training.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/functional_tests/shell_test_utils/_run_training.sh b/tests/functional_tests/shell_test_utils/_run_training.sh index d43a3af77f..9266b4a108 100644 --- a/tests/functional_tests/shell_test_utils/_run_training.sh +++ b/tests/functional_tests/shell_test_utils/_run_training.sh @@ -41,12 +41,6 @@ done cat $TRAINING_PARAMS_PATH | envsubst >$TRAINING_PARAMS_PATH.tmp mv $TRAINING_PARAMS_PATH.tmp $TRAINING_PARAMS_PATH -# Run before script -SCRIPT=$(cat $TRAINING_PARAMS_PATH | yq '.BEFORE_SCRIPT') -if [[ "$SCRIPT" != null ]]; then - eval "$SCRIPT" -fi; - # Pull env vars to export ENV_VARS=$(yq '... comments="" | .ENV_VARS | to_entries | .[] | [.key + "=" + .value] | join(" ")' $TRAINING_PARAMS_PATH) for ARGUMENT in $ENV_VARS; do @@ -59,6 +53,12 @@ for ARGUMENT in $ENV_VARS; do echo "$KEY=$VALUE" done +# Run before script +SCRIPT=$(cat $TRAINING_PARAMS_PATH | yq '.BEFORE_SCRIPT') +if [[ "$SCRIPT" != null ]]; then + eval "$SCRIPT" +fi; + # Exit earlier to leave time for properly saving checkpoint if [[ $(echo "$TRAINING_SCRIPT_PATH" | tr '[:upper:]' '[:lower:]') == *nemo* ]]; then PARAMS="" From 329c1d7a765d4d9886594a88b9b2fa4972bf2b93 Mon Sep 17 00:00:00 2001 From: "Ray Wang (HW-Comp DevTech-CN05)" Date: Mon, 7 Oct 2024 14:22:02 -0700 Subject: [PATCH 48/50] ADLR/megatron-lm!2089 - Fix upcycling issues. 
--- .../core/transformer/moe/upcycling_utils.py | 36 ++++++++++++++++++- megatron/training/training.py | 9 +++-- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/megatron/core/transformer/moe/upcycling_utils.py b/megatron/core/transformer/moe/upcycling_utils.py index 66fe86aee5..b905fc99be 100644 --- a/megatron/core/transformer/moe/upcycling_utils.py +++ b/megatron/core/transformer/moe/upcycling_utils.py @@ -56,7 +56,40 @@ def _covert_to_moe_state_dict(state_dict, moe_model): router_key = mlp_weight_key.replace('mlp.linear_fc1.weight', 'mlp.router.weight') new_state_dict[router_key] = moe_state_dict[router_key].data.data.clone() - if mlp.config.moe_grouped_gemm: + use_te_grouped_gemm = 'decoder.layers.0.mlp.experts.linear_fc1.weight0' in moe_state_dict + + if mlp.config.moe_grouped_gemm and use_te_grouped_gemm: + for mlp_weight_key in mlp_fc1_weight_keys: + weight_tensor = new_state_dict.pop(mlp_weight_key) + for expert_i in range(mlp.num_local_experts): + new_key = mlp_weight_key.replace( + 'mlp.linear_fc1.weight', f'mlp.experts.linear_fc1.weight{expert_i}' + ) + new_state_dict[new_key] = weight_tensor.clone() + + for mlp_weight_key in mlp_fc2_weight_keys: + weight_tensor = new_state_dict.pop(mlp_weight_key) + for expert_i in range(mlp.num_local_experts): + new_key = mlp_weight_key.replace( + 'mlp.linear_fc2.weight', f'mlp.experts.linear_fc2.weight{expert_i}' + ) + new_state_dict[new_key] = weight_tensor.clone() + + for extra_state_key in mlp_fc1_extra_state_keys: + new_state_dict.pop(extra_state_key) + new_key = extra_state_key.replace( + 'mlp.linear_fc1._extra_state', 'mlp.experts.linear_fc1._extra_state' + ) + new_state_dict[new_key] = None + + for extra_state_key in mlp_fc2_extra_state_keys: + new_state_dict.pop(extra_state_key) + new_key = extra_state_key.replace( + 'mlp.linear_fc2._extra_state', 'mlp.experts.linear_fc2._extra_state' + ) + new_state_dict[new_key] = None + + elif mlp.config.moe_grouped_gemm: for mlp_weight_key in mlp_fc1_weight_keys: weight_tensor = new_state_dict.pop(mlp_weight_key) shape = weight_tensor.shape @@ -76,6 +109,7 @@ def _covert_to_moe_state_dict(state_dict, moe_model): ) new_key = mlp_weight_key.replace('mlp.linear_fc2.weight', 'mlp.experts.weight2') new_state_dict[new_key] = weight_tensor + else: def covert_to_experts(keys): diff --git a/megatron/training/training.py b/megatron/training/training.py index fbe4ecf079..d5ee16be5f 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -28,6 +28,7 @@ ) from megatron.training.checkpointing import load_checkpoint from megatron.training.checkpointing import save_checkpoint +from megatron.training.checkpointing import checkpoint_exists from megatron.legacy.model import Float16Module from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.distributed import DistributedDataParallel as DDP @@ -642,7 +643,8 @@ def setup_model_and_optimizer(model_provider_func, opt_param_scheduler = get_optimizer_param_scheduler(optimizer) if args.moe_use_upcycling: - assert not os.path.exists( + torch.distributed.barrier() + assert not checkpoint_exists( args.save ), ("The upcycling destination directory already exists. " "Please check if --moe-use-upcycling is mistakenly enabled. " @@ -650,15 +652,18 @@ def setup_model_and_optimizer(model_provider_func, "All subsequent runs should remove this flag. 
") num_experts = args.num_experts args.num_experts = None + expert_model_parallel_size = args.expert_model_parallel_size + args.expert_model_parallel_size = 1 dense_model_for_upcycling = get_model(model_provider_func, model_type) args.num_experts = num_experts + args.expert_model_parallel_size = expert_model_parallel_size _, args.num_floating_point_operations_so_far = upcycling_utils.load_and_upcycle_model( load_checkpoint, unwrapped_model, dense_model_for_upcycling, load_kwargs = {'model': dense_model_for_upcycling, 'optimizer': None, 'opt_param_scheduler': None} ) - args.iteration = 0 + args.iteration = 1 save_checkpoint(args.iteration, model, None, None, args.num_floating_point_operations_so_far) torch.distributed.barrier() del dense_model_for_upcycling From 99f63e80cbef507276a4932ec20129a560d1380a Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 7 Oct 2024 14:22:05 -0700 Subject: [PATCH 49/50] ADLR/megatron-lm!2189 - tests: Fix ENV export --- tests/functional_tests/shell_test_utils/_run_training.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/functional_tests/shell_test_utils/_run_training.sh b/tests/functional_tests/shell_test_utils/_run_training.sh index 9266b4a108..12dd359c65 100644 --- a/tests/functional_tests/shell_test_utils/_run_training.sh +++ b/tests/functional_tests/shell_test_utils/_run_training.sh @@ -43,7 +43,7 @@ mv $TRAINING_PARAMS_PATH.tmp $TRAINING_PARAMS_PATH # Pull env vars to export ENV_VARS=$(yq '... comments="" | .ENV_VARS | to_entries | .[] | [.key + "=" + .value] | join(" ")' $TRAINING_PARAMS_PATH) -for ARGUMENT in $ENV_VARS; do +while IFS= read -r ARGUMENT; do KEY=$(echo $ARGUMENT | cut -f1 -d=) KEY_LENGTH=${#KEY} @@ -51,7 +51,7 @@ for ARGUMENT in $ENV_VARS; do export "$KEY"="$VALUE" echo "$KEY=$VALUE" -done +done <<< "$ENV_VARS" # Run before script SCRIPT=$(cat $TRAINING_PARAMS_PATH | yq '.BEFORE_SCRIPT') From fd20cda33b7b99d22daaf3af6f1ed37e7df12406 Mon Sep 17 00:00:00 2001 From: Rachit Garg Date: Tue, 8 Oct 2024 10:48:14 -0700 Subject: [PATCH 50/50] rebase to 24.09 --- .gitlab-ci.yml | 50 +- .gitlab/stages/00.pre.yml | 19 +- .gitlab/stages/01.tests.yml | 119 ++- .gitlab/stages/02.functional-tests.yml | 119 +-- .gitlab/stages/03.convergence-tests.yml | 86 -- .../stages/{04.publish.yml => 03.publish.yml} | 0 Dockerfile.ci | 120 +-- Dockerfile.ci.dev | 62 ++ docs/llama_mistral.md | 106 +++ examples/export/README.md | 10 + .../ptq_and_trtllm_export}/README.md | 48 +- .../ptq_trtllm_llama2_7b.sh | 6 +- .../ptq_trtllm_llama3_1_8b.sh | 7 +- .../ptq_trtllm_llama3_8b.sh | 6 +- .../ptq_trtllm_minitron_8b.sh | 2 +- .../ptq_trtllm_mistral_12b.sh | 2 +- .../text_generation_ptq.py | 7 +- .../trtllm_text_generation.py | 0 examples/export/trtllm_export/README.md | 161 ++++ .../gpt_distributed_gpu_export.py | 117 +++ .../gpt_single_device_cpu_export.py | 118 +++ .../llama_mistral/huggingface_reference.py | 1 + .../run_text_generation_llama3.1.sh | 56 ++ .../inference/t5/simple_t5_batch_inference.py | 157 ++++ examples/multimodal/config.py | 48 +- examples/multimodal/dataloader_provider.py | 10 +- examples/multimodal/evaluate_textvqa.py | 20 +- examples/multimodal/evaluate_vqav2.py | 12 +- examples/multimodal/model.py | 150 +++ examples/multimodal/multimodal_args.py | 43 + examples/multimodal/pretrain_mistral_clip.sh | 2 - examples/multimodal/run_text_generation.py | 855 ++++++++++++------ examples/multimodal/sft_mistral_clip.sh | 2 - .../text_generation_mistral_clip.sh | 7 - examples/multimodal/train.py | 214 ++--- 
.../dist_checkpointing/strategies/torch.py | 7 +- megatron/core/export/__init__.py | 1 + megatron/core/export/data_type.py | 5 + megatron/core/export/export_config.py | 19 + megatron/core/export/model_type.py | 7 + megatron/core/export/trtllm/__init__.py | 1 + .../export/trtllm/engine_builder/__init__.py | 1 + .../engine_builder/trtllm_engine_builder.py | 148 +++ .../trtllm/model_to_trllm_mapping/__init__.py | 1 + .../default_conversion_dict.py | 18 + .../model_to_trllm_mapping/falcon_model.py | 26 + .../model_to_trllm_mapping/gemma_model.py | 21 + .../model_to_trllm_mapping/gpt_model.py | 28 + .../model_to_trllm_mapping/gpt_next_model.py | 24 + .../model_to_trllm_mapping/llama_model.py | 22 + .../model_to_trllm_mapping/starcoder_model.py | 30 + .../core/export/trtllm/trt_model_config.py | 15 + megatron/core/export/trtllm/trt_model_type.py | 13 + megatron/core/export/trtllm/trtllm_helper.py | 461 ++++++++++ megatron/core/export/trtllm/trtllm_layers.py | 157 ++++ .../trtllm_weights_converter/__init__.py | 1 + ...tributed_trtllm_model_weights_converter.py | 258 ++++++ ...e_device_trtllm_model_weights_converter.py | 437 +++++++++ .../core/extensions/transformer_engine.py | 198 +++- .../core/inference/engines/mcore_engine.py | 53 +- megatron/core/inference/inference_request.py | 9 + .../model_inference_wrappers/t5/__init__.py | 1 + .../t5/t5_inference_wrapper.py | 205 +++++ megatron/core/inference/scheduler.py | 37 +- ...oder_decoder_text_generation_controller.py | 35 + .../simple_text_generation_controller.py | 112 ++- megatron/core/model_parallel_config.py | 24 +- megatron/core/models/T5/t5_spec.py | 6 +- megatron/core/models/bert/bert_model.py | 93 +- .../core/models/common/embeddings/__init__.py | 5 + .../models/common/embeddings/rope_utils.py | 191 ++++ .../common/embeddings/rotary_pos_embedding.py | 200 ++-- .../embeddings/yarn_rotary_pos_embedding.py | 169 ++++ megatron/core/models/gpt/gpt_layer_specs.py | 147 ++- megatron/core/models/gpt/gpt_model.py | 19 +- .../core/models/multimodal/llava_model.py | 34 +- megatron/core/models/retro/config.py | 7 +- megatron/core/models/vision/clip_vit_model.py | 58 +- megatron/core/num_microbatches_calculator.py | 19 +- megatron/core/package_info.py | 2 +- megatron/core/packed_seq_params.py | 8 +- megatron/core/requirements.txt | 3 +- megatron/core/tensor_parallel/layers.py | 39 +- megatron/core/tensor_parallel/random.py | 4 +- megatron/core/transformer/__init__.py | 2 +- megatron/core/transformer/attention.py | 10 +- megatron/core/transformer/cuda_graphs.py | 17 +- .../core/transformer/dot_product_attention.py | 11 +- megatron/core/transformer/moe/moe_utils.py | 6 +- megatron/core/transformer/moe/router.py | 11 +- .../core/transformer/moe/token_dispatcher.py | 33 +- .../core/transformer/moe/upcycling_utils.py | 36 +- .../transformer/multi_latent_attention.py | 375 ++++++++ .../core/transformer/transformer_block.py | 9 +- .../core/transformer/transformer_config.py | 65 +- megatron/core/utils.py | 29 + megatron/inference/gpt/model_provider.py | 1 + megatron/legacy/model/rms_norm.py | 3 +- megatron/legacy/model/transformer.py | 20 +- megatron/training/activations.py | 4 + megatron/training/arguments.py | 37 +- megatron/training/checkpointing.py | 2 +- megatron/training/initialize.py | 24 +- megatron/training/tokenizer/tokenizer.py | 122 ++- megatron/training/training.py | 39 +- megatron/training/yaml_arguments.py | 7 +- pretrain_gpt.py | 7 +- pretrain_vlm.py | 25 +- .../jet_recipes/_build-mcore.yaml | 11 + .../jet_recipes/_build-nemo.yaml | 10 + 
.../jet_recipes/_build-pyt.yaml | 23 - tests/functional_tests/jet_recipes/bert.yaml | 4 +- .../jet_recipes/gpt-nemo.yaml | 4 +- tests/functional_tests/jet_recipes/gpt.yaml | 2 +- .../jet_recipes/multimodal-llava.yaml | 6 +- tests/functional_tests/jet_recipes/t5.yaml | 8 +- .../python_test_utils/jet/common.py | 140 +++ .../jet/generate_jet_trigger_job.py | 113 +++ .../jet/generate_local_jobs.py | 62 ++ .../jet/launch_jet_workload.py | 216 +++++ .../shell_test_utils/_run_training.sh | 16 +- .../shell_test_utils/notify.sh | 70 +- .../shell_test_utils/notify_unit_tests.sh | 186 ++++ .../shell_test_utils/run_ci_test.sh | 4 +- .../shell_test_utils/run_ci_test_locally.sh | 124 --- .../gpt/gpt3_15b_8t_release/model_config.yaml | 4 +- .../gpt3_15b_8t_release_sm/model_config.yaml | 2 +- .../model_config.yaml | 3 +- .../model_config.yaml | 2 +- .../model_config.yaml | 2 +- .../model_config.yaml | 3 +- .../golden_values.json | 2 +- .../golden_values.json | 2 +- .../golden_values.json | 2 +- .../golden_values.json | 1 + .../model_config.yaml | 55 ++ .../model_config.yaml | 55 ++ .../golden_values.json | 1 + .../model_config.yaml | 55 ++ .../model_config.yaml | 55 ++ .../golden_values.json | 763 ++++++++++++++++ .../model_config.yaml | 55 ++ .../model_config.yaml | 55 ++ .../t5/t5_release/model_config.yaml | 2 +- .../models/test_sequential_mlp.py | 10 +- .../unit_tests/dist_checkpointing/test_fp8.py | 1 + .../dist_checkpointing/test_nonpersistent.py | 1 + tests/unit_tests/export/trtllm/__init__.py | 0 .../test_trtllm_distributed_gpu_converter.py | 100 ++ .../export/trtllm/test_trtllm_helper.py | 73 ++ .../export/trtllm/test_trtllm_layers.py | 111 +++ .../test_trtllm_single_device_converter.py | 169 ++++ .../inference/engines/test_mcore_engine.py | 27 + .../t5/test_t5_inference_wrapper.py | 124 +++ ...oder_decoder_text_generation_controller.py | 143 +++ .../test_simple_text_generation_controller.py | 2 +- tests/unit_tests/models/test_bert_model.py | 182 ++-- tests/unit_tests/models/test_llava_model.py | 162 +++- .../tensor_parallel/test_initialization.py | 84 +- .../transformer/moe/test_grouped_mlp.py | 8 +- .../transformer/moe/test_sequential_mlp.py | 12 +- .../test_multi_latent_attention.py | 160 ++++ .../transformer/test_spec_customization.py | 6 +- tools/checkpoint/loader_llama_mistral.py | 9 +- tools/checkpoint/saver_mcore.py | 8 +- tools/run_text_generation_server.py | 5 +- 166 files changed, 8884 insertions(+), 1647 deletions(-) delete mode 100644 .gitlab/stages/03.convergence-tests.yml rename .gitlab/stages/{04.publish.yml => 03.publish.yml} (100%) create mode 100644 Dockerfile.ci.dev create mode 100644 examples/export/README.md rename examples/{inference/quantization => export/ptq_and_trtllm_export}/README.md (82%) rename examples/{inference/quantization => export/ptq_and_trtllm_export}/ptq_trtllm_llama2_7b.sh (88%) rename examples/{inference/quantization => export/ptq_and_trtllm_export}/ptq_trtllm_llama3_1_8b.sh (91%) rename examples/{inference/quantization => export/ptq_and_trtllm_export}/ptq_trtllm_llama3_8b.sh (92%) rename examples/{inference/quantization => export/ptq_and_trtllm_export}/ptq_trtllm_minitron_8b.sh (94%) rename examples/{inference/quantization => export/ptq_and_trtllm_export}/ptq_trtllm_mistral_12b.sh (94%) rename examples/{inference/quantization => export/ptq_and_trtllm_export}/text_generation_ptq.py (96%) rename examples/{inference/quantization => export/ptq_and_trtllm_export}/trtllm_text_generation.py (100%) create mode 100644 examples/export/trtllm_export/README.md create 
mode 100644 examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py create mode 100644 examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py create mode 100755 examples/inference/llama_mistral/run_text_generation_llama3.1.sh create mode 100644 examples/inference/t5/simple_t5_batch_inference.py create mode 100644 examples/multimodal/model.py create mode 100644 examples/multimodal/multimodal_args.py create mode 100644 megatron/core/export/__init__.py create mode 100644 megatron/core/export/data_type.py create mode 100644 megatron/core/export/export_config.py create mode 100644 megatron/core/export/model_type.py create mode 100644 megatron/core/export/trtllm/__init__.py create mode 100644 megatron/core/export/trtllm/engine_builder/__init__.py create mode 100644 megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py create mode 100644 megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py create mode 100644 megatron/core/export/trtllm/trt_model_config.py create mode 100644 megatron/core/export/trtllm/trt_model_type.py create mode 100644 megatron/core/export/trtllm/trtllm_helper.py create mode 100644 megatron/core/export/trtllm/trtllm_layers.py create mode 100644 megatron/core/export/trtllm/trtllm_weights_converter/__init__.py create mode 100644 megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py create mode 100644 megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py create mode 100644 megatron/core/inference/model_inference_wrappers/t5/__init__.py create mode 100644 megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py create mode 100644 megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py create mode 100644 megatron/core/models/common/embeddings/rope_utils.py create mode 100644 megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py create mode 100644 megatron/core/transformer/multi_latent_attention.py create mode 100644 tests/functional_tests/jet_recipes/_build-mcore.yaml create mode 100644 tests/functional_tests/jet_recipes/_build-nemo.yaml delete mode 100644 tests/functional_tests/jet_recipes/_build-pyt.yaml create mode 100644 tests/functional_tests/python_test_utils/jet/common.py create mode 100644 tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py create mode 100644 tests/functional_tests/python_test_utils/jet/generate_local_jobs.py create mode 100644 tests/functional_tests/python_test_utils/jet/launch_jet_workload.py create mode 100644 tests/functional_tests/shell_test_utils/notify_unit_tests.sh delete mode 100644 tests/functional_tests/shell_test_utils/run_ci_test_locally.sh create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values.json create mode 100644 
tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values.json create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml create mode 100644 tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml create mode 100644 tests/unit_tests/export/trtllm/__init__.py create mode 100644 tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py create mode 100644 tests/unit_tests/export/trtllm/test_trtllm_helper.py create mode 100644 tests/unit_tests/export/trtllm/test_trtllm_layers.py create mode 100644 tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py create mode 100644 tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py create mode 100644 tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py create mode 100644 tests/unit_tests/transformer/test_multi_latent_attention.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e2f7725fb1..c99b97f697 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -13,22 +13,28 @@ workflow: FUNCTIONAL_TEST: "no" - if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - FUNCTIONAL_TEST: "yes" - FUNCTIONAL_TEST_SCOPE: mr UNIT_TEST_REPEAT: 5 UNIT_TEST_TIMEOUT: 50 + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: mr + FUNCTIONAL_TEST_CLUSTER_A100: "" + FUNCTIONAL_TEST_CLUSTER_H100: "" - if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - FUNCTIONAL_TEST: "yes" - FUNCTIONAL_TEST_SCOPE: nightly UNIT_TEST_REPEAT: 5 UNIT_TEST_TIMEOUT: 50 + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: nightly + FUNCTIONAL_TEST_CLUSTER_A100: "" + FUNCTIONAL_TEST_CLUSTER_H100: "" - if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: - FUNCTIONAL_TEST: "yes" - FUNCTIONAL_TEST_SCOPE: weekly UNIT_TEST_REPEAT: 5 UNIT_TEST_TIMEOUT: 50 + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: weekly + FUNCTIONAL_TEST_CLUSTER_A100: "" + FUNCTIONAL_TEST_CLUSTER_H100: "" - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" variables: FUNCTIONAL_TEST: "no" @@ -58,29 +64,23 @@ variables: - "mr" - "nightly" - "weekly" + - "pre-release" + - "release" description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)" - FUNCTIONAL_TEST_CLUSTER: + FUNCTIONAL_TEST_CLUSTER_A100: value: "dgxa100_dracooci" options: - "dgxa100_dracooci" - "dgxa100_dracooci-ord" - - "dgxh100_eos" - description: '"dgxa100_dracooci" for OCI-IAD, "dgxh100_eos" for EOS' - CONVERGENCE_TEST: - value: "no" + description: 'Cluster for A100 workloads' + FUNCTIONAL_TEST_CLUSTER_H100: + value: "dgxh100_eos" options: - - "yes" - - "no" - description: To run a convergence test - CONVERGENCE_TEST_SCOPE: - value: 
"release" - options: - - "release" - - "pre-release" - description: "Test suite to run (only for CONVERGENCE_TEST=yes)" - CONVERGENCE_TEST_RUN_NAME: - value: "pre-release-$$CI_PIPELINE_ID" - description: "Run directory of convergence test" + - "dgxh100_coreweave" + - "dgxh100_eos" + description: 'Cluster for H100 workloads' + FUNCTIONAL_TEST_NAME: + description: "Name of functional test run (only for pre-release and release)" PUBLISH: value: "no" options: @@ -96,6 +96,7 @@ variables: # CI wide variables CI_MCORE_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci + CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci LINTING_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_linting UNIT_TEST_TIMEOUT: 15 @@ -105,5 +106,4 @@ include: - .gitlab/stages/00.pre.yml - .gitlab/stages/01.tests.yml - .gitlab/stages/02.functional-tests.yml - - .gitlab/stages/03.convergence-tests.yml - - .gitlab/stages/04.publish.yml + - .gitlab/stages/03.publish.yml diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index e0b5c579c1..a91436be87 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -76,9 +76,10 @@ clean_docker_node: matrix: - node: 8xL40S - node: mcore-docker-node-small + - node: mcore-docker-node-jet script: - export DOCKER_HOST='unix:///var/run/docker.sock' - - docker system prune -a --filter "until=48h" -f || true + - docker system prune -a --filter "until=36h" -f || true maybe_cherry_pick_commit: rules: @@ -101,8 +102,13 @@ maybe_cherry_pick_commit: - git config --global user.email "mcore-bot@nvidia.com" - git config --global user.name "Mcore Bot" - | - LABELS=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}" | jq '.labels | join(",")' | tr -d '"') - + MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}") + + LABELS=$(echo -E $MR | jq '.labels | join(",")' | tr -d '"') + AUTHOR_ID=$(echo -E $MR | jq '.author.id' | tr -d '"') + AUTHOR_NAME=$(echo -E $MR | jq '.author.username' | tr -d '"') + TITLE=$(echo -E $MR | jq '.title' | tr -d '"') + MILESTONE_ID=$(echo -E $MR | jq '.milestone.id' | tr -d '"') TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'core_[^,]*') if [[ $TARGET_BRANCHES == "" ]]; then @@ -134,8 +140,11 @@ maybe_cherry_pick_commit: --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \ -d "source_branch=cherry-pick-$MR_ID-$RELEASE_BRANCH" \ -d "target_branch=$RELEASE_BRANCH" \ - -d "title=Cherry-pick $MR_ID into $RELEASE_BRANCH" \ - -d "labels=cherry-pick" + -d "title=Cherry pick \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\`" \ + -d "labels=cherry-pick" \ + -d "reviewer_ids=$AUTHOR_ID" \ + -d "milestone_id=$MILESTONE_ID" \ + -d "description=[🤖]: Hi @$AUTHOR_NAME 👋,

we've cherry-picked \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\` for you! 🚀

Please review and approve this cherry pick by your convenience\!" else URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/merge_requests/$MR_ID diff --git a/.gitlab/stages/01.tests.yml b/.gitlab/stages/01.tests.yml index b3cefc0fde..dc59e026ac 100644 --- a/.gitlab/stages/01.tests.yml +++ b/.gitlab/stages/01.tests.yml @@ -21,6 +21,10 @@ build_image: FILE: Dockerfile.ci BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3 TAG: mcore-docker-node-large + - IMAGE: CI_MCORE_DEV_IMAGE + FILE: Dockerfile.ci.dev + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3 + TAG: mcore-docker-node-large - IMAGE: CI_NEMO_IMAGE FILE: Dockerfile.ci BASE_IMAGE: nvcr.io/nvidian/nemo:nightly @@ -35,48 +39,44 @@ build_image: variables: STAGE: main script: + - apk add bash - | - set -x - env - eval "IMAGE=\$$IMAGE" - - docker system prune -a --filter "until=24h" -f || true - - if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then - ADDITIONAL_PARAMS="--pull" - fi - - docker pull ${IMAGE}:${CI_PIPELINE_ID} || true - docker pull ${IMAGE}:${CI_MERGE_REQUEST_IID:-noop} || true - docker pull ${IMAGE}:buildcache || true - - docker build \ - --secret id=JET_INDEX_URLS \ - --target $STAGE \ - -f $FILE \ - -t ${IMAGE}:${CI_PIPELINE_ID} \ - -t ${IMAGE}:${CI_MERGE_REQUEST_IID:-noop} \ - --build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \ - --cache-to type=inline \ - --cache-from type=registry,ref=${IMAGE}:buildcache \ - --cache-from type=registry,ref=${IMAGE}:${CI_PIPELINE_ID} \ - --cache-from type=registry,ref=${IMAGE}:${CI_MERGE_REQUEST_IID:-noop} \ - --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \ - ${ADDITIONAL_PARAMS} . - - docker push ${IMAGE}:${CI_PIPELINE_ID} - docker push ${IMAGE}:${CI_MERGE_REQUEST_IID:-noop} - - if [[ "$CI_COMMIT_BRANCH" == "ci-nightly-a100" ]]; then - docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:nightly - docker push ${IMAGE}:nightly - fi + bash -c ' + set -x + env + eval "IMAGE=\$$IMAGE" + + docker system prune -a --filter "until=24h" -f || true + + docker buildx create --name container --driver=docker-container + + ADDITIONAL_PARAMS=() + + if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then + ADDITIONAL_PARAMS+=("--pull") + ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main") + fi - if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then - docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:buildcache - docker push ${IMAGE}:buildcache - fi + if [[ "$CI_COMMIT_BRANCH" == "ci-nightly-a100" ]]; then + ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly") + fi + DOCKER_BUILDKIT=1 docker build \ + --secret id=JET_INDEX_URLS \ + --target $STAGE \ + -f $FILE \ + -t ${IMAGE}:${CI_PIPELINE_ID} \ + --builder=container \ + --build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \ + --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \ + --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \ + --cache-from type=registry,ref=${IMAGE}-buildcache:main \ + --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \ + --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \ + --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \ + --push \ + ${ADDITIONAL_PARAMS[@]} . + ' retry: max: 2 @@ -85,13 +85,17 @@ unit_tests: # the current code. This is a form of backwards compatibility testing # and helps in providing stable interfaces. 
extends: [.test_mr_rules] - image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} + image: ${IMAGE}:${CI_PIPELINE_ID} needs: [build_image] timeout: 180m parallel: matrix: - TAG: latest - - TAG: 8fc755388a03bae05cb740857008b8916e01a63c + IMAGE: ${CI_MCORE_IMAGE} + # - TAG: latest + # IMAGE: ${CI_MCORE_DEV_IMAGE} + - TAG: core_r0.9.0 + IMAGE: ${CI_MCORE_IMAGE} tags: [8xL40S] variables: GIT_STRATEGY: clone @@ -112,11 +116,14 @@ unit_tests: for i in $(seq $UNIT_TEST_REPEAT); do SEED=$((RANDOM % 9000 + 1000)); - SKIPPED=() + ARGS=() if [[ $TAG != latest ]]; then - SKIPPED+=(-m "not internal") + ARGS+=(-m "not internal") + fi + if [[ $IMAGE == ${CI_MCORE_DEV_IMAGE} ]]; then + ARGS+=(-m "experimental") fi - timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${SKIPPED[@]}" tests/unit_tests + timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests done artifacts: paths: @@ -125,10 +132,30 @@ unit_tests: - if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" allow_failure: true when: always - - if: '$TAG != "latest"' - allow_failure: true - when: always +unit-tests-results-notify: + extends: [.test_mr_rules] + image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} + needs: [unit_tests] + tags: + - mcore-docker-node-small + script: + - env + - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} + - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + - export GITLAB_ENDPOINT + - export DATE=$(date +"%Y-%m-%d") + - bash tests/functional_tests/shell_test_utils/notify_unit_tests.sh ${CI_PIPELINE_ID} + artifacts: + when: always + paths: + - scripts + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "ci-unit-test-extended" + when: always + - when: never + docs_build_test: extends: [.test_mr_rules] image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml index 0c30857409..531527b8b4 100644 --- a/.gitlab/stages/02.functional-tests.yml +++ b/.gitlab/stages/02.functional-tests.yml @@ -16,91 +16,68 @@ include: ref: main file: downstreams.yml -jet-configure: - image: - name: mikefarah/yq:4.35.2 - entrypoint: [""] - extends: [.jet_common, .jet-configure] +jet-build: + extends: [build_image, .jet_common] + variables: + STAGE: jet + +jet-generate: + needs: [jet-build] + extends: [.jet_common] + image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} tags: [mcore-docker-node-small] - script: + before_script: + - git rm -r tests/functional_tests/local_recipes || true + - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes + - ls tests/functional_tests/local_recipes + script: - set -x - | - if [[ "$CI_PIPELINE_SOURCE" == "merge_request_event" && "$CI_MERGE_REQUEST_LABELS" == "*H100*" ]]; then - FUNCTIONAL_TEST_CLUSTER=$DEFAULT_H100_CLUSTER - fi + A100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_A100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER) + H100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_H100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER) - | - JET_CUSTOM_FILTER="type == 'basic'" - - if [[ $FUNCTIONAL_TEST_CLUSTER == dgxh100_eos ]]; then - 
JET_CI_BRANCH=mcore/eos - PLATFORM=dgx_h100 - elif [[ $FUNCTIONAL_TEST_CLUSTER == dgxa100_dracooci ]]; then - JET_CI_BRANCH=mcore/draco-oci - PLATFORM=dgx_a100 - elif [[ $FUNCTIONAL_TEST_CLUSTER == dgxa100_dracooci-ord ]]; then - JET_CI_BRANCH=mcore/draco-oci-ord - PLATFORM=dgx_a100 - fi - - # Add platform - JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and '$PLATFORM' in spec.platforms" - - # Add scope - JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and '$FUNCTIONAL_TEST_SCOPE' in spec.scope" - - if [[ "$JET_CUSTOM_FILTER" == "type == 'basic'" ]]; then - JET_CUSTOM_FILTER="False" + if [[ "$FUNCTIONAL_TEST_SCOPE" == "release" || "$FUNCTIONAL_TEST_SCOPE" == "pre-release" ]]; then + RELEASE_ARGS=( + "--run-name" + $FUNCTIONAL_TEST_NAME + "--wandb-experiment" + $(echo $FUNCTIONAL_TEST_NAME | tr '/' '-') + ) + else + RELEASE_ARGS=() fi - echo "JET_CI_BRANCH=$JET_CI_BRANCH" | tee -a jet.env - echo "JET_CUSTOM_FILTER=$JET_CUSTOM_FILTER" | tee -a jet.env - - | - IMAGE=${CI_MCORE_IMAGE}:${CI_PIPELINE_ID} yq '. |= - ( - select(.spec.name == "mcore-pyt") - | .spec.source.image = env(IMAGE) - ) - ' -i tests/functional_tests/jet_recipes/_build-pyt.yaml - - IMAGE=${CI_NEMO_IMAGE}:${CI_PIPELINE_ID} yq '. |= - ( - select(.spec.name == "mcore-nemo") - | .spec.source.image = env(IMAGE) - ) - ' -i tests/functional_tests/jet_recipes/_build-pyt.yaml + export PYTHONPATH=$(pwd) + python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \ + --scope $FUNCTIONAL_TEST_SCOPE \ + --a100-cluster $A100_CLUSTER \ + --h100-cluster $H100_CLUSTER \ + --container-tag ${CI_PIPELINE_ID} \ + --container-image ${CI_MCORE_IMAGE} \ + --container-image-dev ${CI_MCORE_DEV_IMAGE} \ + --output-path "jet-trigger-job.yaml" \ + ${RELEASE_ARGS[@]} artifacts: - reports: - dotenv: jet.env paths: - - tests/functional_tests/jet_recipes - retry: - max: 2 - when: job_execution_timeout - -jet-build: - extends: [build_image, .jet_common] - variables: - STAGE: jet + - jet-trigger-job.yaml + - tests/functional_tests/local_recipes jet-trigger: - extends: [.jet_common, .jet-trigger] - needs: [jet-configure, jet-build] + stage: functional_tests + needs: [jet-generate] + extends: [.jet_common] trigger: - project: dl/jet/ci - branch: $JET_CI_BRANCH + include: + - artifact: jet-trigger-job.yaml + job: jet-generate strategy: depend variables: - JET_WORKLOADS_FILTER: '$JET_CUSTOM_FILTER' - JET_CUSTOM_CONFIG: | - retrier: - enabled: true - max_retries: 2 - retry_on: ['1.2', '1.2.*'] # All infra related issues - waiting_time: 60 - environment: jet-auto-retrier - builds: - jet_flavour: # An empty mapping will disable building the JET flavor + RO_API_TOKEN: $PAT + CONTAINER_TAG: $CI_PIPELINE_ID + CI_MCORE_IMAGE: $CI_MCORE_IMAGE + GITLAB_ENDPOINT: $GITLAB_ENDPOINT + PARENT_PIPELINE_ID: $CI_PIPELINE_ID inherit: variables: true diff --git a/.gitlab/stages/03.convergence-tests.yml b/.gitlab/stages/03.convergence-tests.yml deleted file mode 100644 index 5c7bd6a7a3..0000000000 --- a/.gitlab/stages/03.convergence-tests.yml +++ /dev/null @@ -1,86 +0,0 @@ -.common_release: - stage: convergence_tests - needs: [build_image] - timeout: 7d - before_script: - - git rm -r tests/functional_tests/local_recipes || true - - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes - - ls tests/functional_tests/local_recipes - - INSTALL_DIR=$(pwd)/local - - rm -rf "$INSTALL_DIR" - - mkdir -p "$INSTALL_DIR" - - wget 
"https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-$(uname --machine).sh" -O "$INSTALL_DIR/miniconda.sh" - - bash "$INSTALL_DIR/miniconda.sh" -b -u -p "$INSTALL_DIR" - - rm -rf "$INSTALL_DIR/miniconda.sh" - - source $INSTALL_DIR/bin/activate - - pip install jet-api --upgrade $JET_INDEX_URLS - variables: - GIT_STRATEGY: clone - GIT_SUBMODULE_STRATEGY: none - script: - - | - env - set -x - - export IMAGE_TAG=${CI_PIPELINE_ID} - export WANDB_API_KEY - CONVERGENCE_TEST_RUN_NAME=$(eval echo $CONVERGENCE_TEST_RUN_NAME) - - if [[ -z $CONVERGENCE_TEST_RUN_NAME ]]; then - echo Please assign a CONVERGENCE_TEST_RUN_NAME - fi - - export RUN_NAME=$CONVERGENCE_TEST_RUN_NAME/$MODEL/$VARIANT - export WANDB_EXPERIMENT=$CONVERGENCE_TEST_RUN_NAME_$MODEL_$VARIANT - - bash ./tests/functional_tests/local_recipes/$MODEL/$VARIANT.sh - artifacts: - paths: - - ./golden_values.json - retry: - max: 2 - -release-test: - rules: - - if: $CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "release" - extends: [.common_release] - tags: - - ${TAG} - parallel: - matrix: - - MODEL: bert - VARIANT: bert_release - TAG: mcore-ssh-node-B - - MODEL: gpt - VARIANT: gpt3_15b_8t_release - TAG: mcore-ssh-node-B - - MODEL: mixtral - VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release - TAG: mcore-ssh-node-B - - MODEL: mixtral - VARIANT: mixtral_8x7b_tp1pp4ep8vpp8_release - TAG: mcore-ssh-agent-C - - MODEL: mixtral - VARIANT: mixtral_8x22b_tp2pp8ep8vpp1_release - TAG: mcore-ssh-agent-C - - MODEL: t5 - VARIANT: t5_release - TAG: mcore-ssh-agent-C - -pre-release-test: - rules: - - if: $CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "pre-release" - extends: [.common_release] - tags: - - ${TAG} - parallel: - matrix: - - MODEL: bert - VARIANT: bert_release - TAG: mcore-ssh-node-B - - MODEL: gpt - VARIANT: gpt3_15b_8t_release_sm - TAG: mcore-ssh-node-B - - MODEL: mixtral - VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release_sm - TAG: mcore-ssh-node-B diff --git a/.gitlab/stages/04.publish.yml b/.gitlab/stages/03.publish.yml similarity index 100% rename from .gitlab/stages/04.publish.yml rename to .gitlab/stages/03.publish.yml diff --git a/Dockerfile.ci b/Dockerfile.ci index dfcc7381f7..fa13c48fd4 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -1,88 +1,62 @@ -# syntax=docker/dockerfile:experimental +# syntax=docker/dockerfile:1.3-labs ARG FROM_IMAGE_NAME -FROM $FROM_IMAGE_NAME as main -ENV DEBIAN_FRONTEND=noninteractive - -RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ - /etc/apt/apt.conf.d/docker-clean - -RUN apt-get update && \ - apt-get install -y --no-install-recommends gettext && \ - apt-get clean - -RUN wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \ -chmod a+x /usr/local/bin/yq - -##### For Mamba begin ##### -RUN pip uninstall -y triton && \ - pip install triton==2.1.0 +FROM $FROM_IMAGE_NAME as build_causal_conv1d +WORKDIR /opt +RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1 -# The causal-conv1d and mamba-ssm packages below are built from scratch here -# (which takes significant time) because there are no wheels available on PyPI -# for these relatively newer versions of the packages that are compatible with -# the older NGC-variant PyTorch version (e.g. version 2.2.0.dev231106) that we -# are using (in the NGC base container). Generally, if the package is not -# compatible with the PyTorch version, then it will generate a Python import -# error. 
The package authors tend to only release wheels for new versions of -# these pacakges which are compatible with the versions of regular PyTorch and -# NGC-variant PyTorch that are newer at the time of release. So, to use newer -# versions of these packages with relatively older versions of the NGC PyTorch -# container, we tend to have to build the packages from scratch. +FROM $FROM_IMAGE_NAME as build_grouped_gemm +WORKDIR /opt +RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2 -RUN cd /tmp && \ - pip uninstall -y causal-conv1d && \ - git clone https://github.com/Dao-AILab/causal-conv1d.git && \ - cd causal-conv1d && \ - git checkout v1.2.2.post1 && \ - CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install . && \ - cd .. && \ - rm -rf causal-conv1d +FROM $FROM_IMAGE_NAME as build_mamba_ssm +WORKDIR /opt +RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.0.3 -RUN cd /tmp && \ - pip uninstall -y mamba-ssm && \ - git clone https://github.com/state-spaces/mamba.git && \ - cd mamba && \ - git checkout v2.0.3 && \ - MAMBA_FORCE_BUILD=TRUE pip install . && \ - cd .. && \ - rm -rf mamba -##### For Mamba end ##### - -##### For JET-API start ##### -RUN apt-get update && \ - apt-get install -y python3-venv && \ - apt-get clean -y && \ - python -m venv /opt/jet -##### For JET-API end ##### - -RUN pip3 install --no-cache-dir \ - einops \ - flask-restful \ - nltk \ - pytest \ - pytest-cov \ - pytest_mock \ - pytest-random-order \ - sentencepiece \ - wrapt \ - git+https://github.com/fanshiqing/grouped_gemm@v1.1.2 \ - zarr \ - tensorstore==0.1.45 \ - wandb - -COPY . /workspace/megatron-lm - -COPY . /workspace/megatron-lm -RUN cp -r /workspace/megatron-lm /opt && \ - pip install /opt/megatron-lm +FROM $FROM_IMAGE_NAME as main +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get install -y --no-install-recommends gettext python3-venv && \ + apt-get clean && \ + python -m venv /opt/jet && \ + wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \ + chmod a+x /usr/local/bin/yq + +COPY --from=build_causal_conv1d /opt/causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_grouped_gemm /opt/grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_mamba_ssm /opt/mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl ./ + +RUN pip3 install --no-cache-dir --upgrade-strategy only-if-needed -v \ +einops \ +flask-restful \ +nltk \ +pytest \ +pytest-cov \ +pytest_mock \ +pytest-random-order \ +sentencepiece \ +wrapt \ +zarr \ +wandb \ +triton==2.1.0 \ +causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl \ +mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl \ +grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl \ +tensorstore==0.1.45 && \ +rm *.whl + +# Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker +COPY . 
/opt/megatron-lm +RUN pip install /opt/megatron-lm +ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH" ##### For NVIDIANS only ##### FROM main as jet ARG CACHEBUST=0 RUN --mount=type=secret,id=JET_INDEX_URLS \ JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ + pip install jet-client --upgrade $JET_INDEX_URLS && \ /opt/jet/bin/pip install jet-api --upgrade $JET_INDEX_URLS ENV PATH="$PATH:/opt/jet/bin" ### \ No newline at end of file diff --git a/Dockerfile.ci.dev b/Dockerfile.ci.dev new file mode 100644 index 0000000000..fa13c48fd4 --- /dev/null +++ b/Dockerfile.ci.dev @@ -0,0 +1,62 @@ +# syntax=docker/dockerfile:1.3-labs + +ARG FROM_IMAGE_NAME +FROM $FROM_IMAGE_NAME as build_causal_conv1d +WORKDIR /opt +RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1 + +FROM $FROM_IMAGE_NAME as build_grouped_gemm +WORKDIR /opt +RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2 + +FROM $FROM_IMAGE_NAME as build_mamba_ssm +WORKDIR /opt +RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.0.3 + +FROM $FROM_IMAGE_NAME as main +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends gettext python3-venv && \ + apt-get clean && \ + python -m venv /opt/jet && \ + wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \ + chmod a+x /usr/local/bin/yq + +COPY --from=build_causal_conv1d /opt/causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_grouped_gemm /opt/grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_mamba_ssm /opt/mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl ./ + +RUN pip3 install --no-cache-dir --upgrade-strategy only-if-needed -v \ +einops \ +flask-restful \ +nltk \ +pytest \ +pytest-cov \ +pytest_mock \ +pytest-random-order \ +sentencepiece \ +wrapt \ +zarr \ +wandb \ +triton==2.1.0 \ +causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl \ +mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl \ +grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl \ +tensorstore==0.1.45 && \ +rm *.whl + +# Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker +COPY . /opt/megatron-lm +RUN pip install /opt/megatron-lm +ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH" + +##### For NVIDIANS only ##### +FROM main as jet +ARG CACHEBUST=0 +RUN --mount=type=secret,id=JET_INDEX_URLS \ + JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ + pip install jet-client --upgrade $JET_INDEX_URLS && \ + /opt/jet/bin/pip install jet-api --upgrade $JET_INDEX_URLS +ENV PATH="$PATH:/opt/jet/bin" +### \ No newline at end of file diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md index 01e55c4a23..11601fd44f 100644 --- a/docs/llama_mistral.md +++ b/docs/llama_mistral.md @@ -282,6 +282,104 @@ If loading for either inference or finetuning, use the following arguments: --bf16 \ ``` +# Llama-3.1 + +Llama-3 checkpoints can be loaded into Megatron for inference and for finetuning. Loading these checkpoints consists of several steps: + +1. Get access to download the checkpoints (weights and tokenizer). +2. Convert the checkpoints from Huggingface format to Megatron format. +3. (Optional) Validate converted checkpoints +4. Setup arguments for launching the model. + +The following sections detail these steps. 
+ +## Contents + * [Download Huggingface checkpoints](#download-huggingface-checkpoints) + * [Convert checkpoint format](#convert-checkpoint-format) + * [Huggingface format](#huggingface-format) + * [Validate checkpoint](#optional-validate-checkpoint) + * [Launch model](#launch-model) + +## Download Huggingface checkpoints + +Users must first apply for access to download the Llama-3 checkpoints from [Huggingface](https://huggingface.co/meta-llama). + +## Convert checkpoint format + +We recommend passing `--dtype bf16` for training or finetuning. Inference can be done in bfloat16 or float16. + +### Huggingface format + +The HF checkpoints can be converted to Megatron format by using Megatron's own Llama-3 checkpoint converter for HF format (see script `tools/checkpoint/loader_llama_mistral.py`). One important argument that must be set correctly is the tensor parallel size (`TP`) for each model. The following table shows these values: + +| Model size | Tensor parallel size (`TP`) | +| ---------- | --------------------------- | +| 8B | 1 | +| 70B | 8 | + +Using these values for `TP`, along with the path to the Llama-3 tokenizer model (automatically downloaded with original checkpoint download; see `${TOKENIZER_MODEL}` below), run the following command from the root of your Megatron source code to convert from HF format to Megatron format: + +``` +$>: python tools/checkpoint/convert.py \ + > --bf16 \ + > --model-type GPT \ + > --loader llama_mistral \ + > --saver mcore \ + > --target-tensor-parallel-size ${TP} \ + > --checkpoint-type hf + > --load-dir ${HF_FORMAT_DIR} \ + > --save-dir ${MEGATRON_FORMAT_DIR} \ + > --tokenizer-model ${TOKENIZER_MODEL} + > --model-size llama3-8B \ +``` + +Valid values for `--model-size` are `llama3.1-8B` and `llama3.1-70B` (for pretrained-only models), and `llama3.1-8Bf` and `llama3.1-70Bf` (for chat-finetuned models). + +After this conversion, we are ready to load the checkpoints into a Megatron GPT model. + +## (Optional) Validate checkpoints + +A Megatron-LM text generation server for Llama3.1 can be launched using the script `examples/llama_mistral/run_text_generation_llama3.1.sh `. + +Once running, query the server with `curl 'http://:5000/api' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' -d '{"prompts":[""], "tokens_to_generate":100, "top_k":1}'`. + +A reference generation for comparison can be obtained from the Huggingface transformers library by running `python examples/llama_mistral/huggingface_reference.py --model_path --prompt `. 
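For scripted checks, the same request can be sent from Python. A minimal sketch (the host, port, and prompt below are placeholders to adapt to your setup):

```python
# Minimal sketch of querying the text generation server from Python instead of curl.
# Assumes the server launched above is reachable at localhost:5000.
import requests

response = requests.put(
    "http://localhost:5000/api",
    headers={"Content-Type": "application/json; charset=UTF-8"},
    json={"prompts": ["Hello, my name is"], "tokens_to_generate": 100, "top_k": 1},
)
print(response.text)
```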
+ +## Launch model + +If loading for either inference or finetuning, use the following arguments: + +``` +--tensor-model-parallel-size ${TP} \ +--pipeline-model-parallel-size 1 \ +--seq-length 8192 \ +--max-position-embeddings 131072 \ +--tokenizer-type HuggingFaceTokenizer \ +--tokenizer-model ${TOKENIZER_MODEL} \ +--load ${CHECKPOINT_DIR} \ +--exit-on-missing-checkpoint \ +--use-checkpoint-args \ +--no-load-optim \ +--no-load-rng \ +--untie-embeddings-and-output-weights \ +--normalization RMSNorm \ +--position-embedding-type rope \ +--no-masked-softmax-fusion \ +--attention-softmax-in-fp32 \ +--disable-bias-linear \ +--transformer-impl transformer_engine \ +--group-query-attention 8 \ +--attention-dropout 0.0 \ +--hidden-dropout 0.0 \ +--rotary-base 500000 \ +--rotary-percent 1.0 \ +--use-rope-scaling \ +--ffn-hidden-size 14336 \ +--num-attention-heads 32 \ +--swiglu \ +--bf16 \ +``` + # Mistral-7b Megatron currently supports loading the v0.3 release of Mistral-7b (which does not use sliding window attention and offers a larger 32768 vocabulary) for inference and finetuning. Loading these checkpoints consists of several steps: @@ -372,3 +470,11 @@ If loading for either inference or finetuning, use the following arguments: *Note: Experimental* Many models such as Yi-34B use the Llama architecture and may be converted from HuggingFace to Megatron using the commands in [Llama3](#llama-3). + +# Known numerical differences + +It is not expected that the megatron and Huggingface implementations of llama3.x and mistral models will produce numerically identical results. There are multiple points where small numerical differences are expected. This is a non-exhaustive list: + +1. TransformerEngine (TE) uses the model params_dtype inside RMSNorm whereas the Huggingface implementation uses fp32. See for details: https://github.com/NVIDIA/TransformerEngine/issues/1132 +2. Huggingface `transformers` implements the q, k and v projections in self-attention as separate GEMMs whereas mcore combines them into a single GEMM for efficiency. This leads to small numerical differences. + diff --git a/examples/export/README.md b/examples/export/README.md new file mode 100644 index 0000000000..ddb8216f94 --- /dev/null +++ b/examples/export/README.md @@ -0,0 +1,10 @@ +# Megatron Core Export + +This module is used to export megatron core models to different inference frameworks. +Currently we support TRTLLM export . In the future we will be adding support for VLLM etc. + +## PTQ AND EXPORT +Follow the instructions in [ptq_and_trtllm_export](./ptq_and_trtllm_export) to do post training quantization, followed by an export to TRTLLM format. + +# TRTLLM EXPORT +Follow the instructions in [trtllm_export](./trtllm_export/) to do export to TRTLLM checkpoint format alone. \ No newline at end of file diff --git a/examples/inference/quantization/README.md b/examples/export/ptq_and_trtllm_export/README.md similarity index 82% rename from examples/inference/quantization/README.md rename to examples/export/ptq_and_trtllm_export/README.md index e167b60e1c..c5255f7ccf 100644 --- a/examples/inference/quantization/README.md +++ b/examples/export/ptq_and_trtllm_export/README.md @@ -74,7 +74,7 @@ cd ../.. Now launch the PTQ + TensorRT-LLM export script, ```sh -bash examples/inference/quantization/ptq_trtllm_minitron_8b ./Minitron-8B-Base None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b ./Minitron-8B-Base None ``` By default, `cnn_dailymail` is used for calibration. 
The `GPTModel` will have quantizers for simulating the quantization effect. The checkpoint will be saved optionally (with quantizers as additional states) and can @@ -104,12 +104,12 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer nvidia/Minitron-8B-Base +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer nvidia/Minitron-8B-Base ``` ### mistral-12B FP8 Quantization and TensorRT-LLM Deployment @@ -139,7 +139,7 @@ huggingface-cli login Now launch the PTQ + TensorRT-LLM checkpoint export script, ```sh -bash examples/inference/quantization/ptq_trtllm_mistral_12b.sh ./Mistral-NeMo-12B-Base None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh ./Mistral-NeMo-12B-Base None ``` Then build TensorRT engine and run text generation example using the newly built TensorRT engine @@ -149,12 +149,12 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer mistralai/Mistral-Nemo-Base-2407 +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer mistralai/Mistral-Nemo-Base-2407 ``` @@ -165,7 +165,7 @@ python examples/inference/quantization/trtllm_text_generation.py --tokenizer mis > that we support. ```sh -bash examples/inference/quantization/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR} +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR} ``` The script expect `${CHECKPOINT_DIR}` to have the following structure: @@ -184,8 +184,23 @@ The script expect `${CHECKPOINT_DIR}` to have the following structure: In short, other than the converted llama megatron checkpoint, also put the Hugging Face checkpoint inside as the source of the tokenizer. +Then build TensorRT engine and run text generation example using the newly built TensorRT engine + +```sh +export trtllm_options=" \ + --checkpoint_dir /tmp/trtllm_ckpt \ + --output_dir /tmp/trtllm_engine \ + --max_input_len 2048 \ + --max_seq_len 512 \ + --max_batch_size 8 " + +trtllm-build ${trtllm_options} + +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Llama-2-7b +``` + ### llama3-8b / llama3.1-8b INT8 SmoothQuant and TensorRT-LLM Deployment -> **NOTE:** For llama3.1, the missing rope_scaling parameter will be fixed in modelopt-0.17 and trtllm-0.12. +> **NOTE:** For llama3.1, the missing rope_scaling parameter will be fixed in modelopt-0.19 and trtllm-0.13. > **NOTE:** There are two ways to acquire the checkpoint. 
Users can follow > the instruction in `docs/llama2.md` to convert the checkpoint to megatron legacy `GPTModel` format and @@ -199,16 +214,23 @@ If users choose to download the model from NGC, first extract the sharded checkp tar -xvf 8b_pre_trained_bf16.nemo ``` +> **NOTE:** You need a token generated from huggingface.co/settings/tokens and access to meta-llama/Llama-3.1-8B or meta-llama/Llama-3-8B on huggingface + +```sh +pip install -U "huggingface_hub[cli]" +huggingface-cli login +``` + Now launch the PTQ + TensorRT-LLM checkpoint export script for llama-3, ```sh -bash examples/inference/quantization/ptq_trtllm_llama3_8b.sh ./llama-3-8b-nemo_v1.0 None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh ./llama-3-8b-nemo_v1.0 None ``` or llama-3.1 ```sh -bash examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh ./llama-3_1-8b-nemo_v1.0 None +bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh ./llama-3_1-8b-nemo_v1.0 None ``` Then build TensorRT engine and run text generation example using the newly built TensorRT engine @@ -218,14 +240,14 @@ export trtllm_options=" \ --checkpoint_dir /tmp/trtllm_ckpt \ --output_dir /tmp/trtllm_engine \ --max_input_len 2048 \ - --max_output_len 512 \ + --max_seq_len 512 \ --max_batch_size 8 " trtllm-build ${trtllm_options} -python examples/inference/quantization/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3-8B +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3-8B # For llama-3 -python examples/inference/quantization/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B +python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B #For llama-3.1 ``` \ No newline at end of file diff --git a/examples/inference/quantization/ptq_trtllm_llama2_7b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh similarity index 88% rename from examples/inference/quantization/ptq_trtllm_llama2_7b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh index 8c4777f07a..ebcc448955 100644 --- a/examples/inference/quantization/ptq_trtllm_llama2_7b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh @@ -66,7 +66,7 @@ options=" \ --tokenizer-model ${TOKENIZER_MODEL} \ --save-interval 1000000 \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} + --load ${CHECKPOINT_LOAD_DIR} \ --fp16" # Precompile CUDA extentions @@ -76,7 +76,5 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} -# This script is using mpi4py which will fork multiple processes. 
-python examples/inference/quantization/trtllm_text_generation.py ${trtllm_options} diff --git a/examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh similarity index 91% rename from examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh index d22ae4d472..a6251663f7 100644 --- a/examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh @@ -63,9 +63,10 @@ options=" \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model meta-llama/Meta-Llama-3.1-8B \ --save-interval 1000000 \ + --use-rope-scaling \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} - --rotary-base 500000 + --load ${CHECKPOINT_LOAD_DIR} \ + --rotary-base 500000 \ --fp16" # Precompile CUDA extentions @@ -75,4 +76,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/inference/quantization/ptq_trtllm_llama3_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh similarity index 92% rename from examples/inference/quantization/ptq_trtllm_llama3_8b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh index 11ab023fad..f181c8c2dd 100644 --- a/examples/inference/quantization/ptq_trtllm_llama3_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh @@ -64,8 +64,8 @@ options=" \ --tokenizer-model meta-llama/Meta-Llama-3-8B \ --save-interval 1000000 \ --use-dist-ckpt \ - --load ${CHECKPOINT_LOAD_DIR} - --rotary-base 500000 + --load ${CHECKPOINT_LOAD_DIR} \ + --rotary-base 500000 \ --fp16" # Precompile CUDA extentions @@ -75,4 +75,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/inference/quantization/ptq_trtllm_minitron_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh similarity index 94% rename from examples/inference/quantization/ptq_trtllm_minitron_8b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh index 8c7bc0cb82..31ec192fd5 100644 --- a/examples/inference/quantization/ptq_trtllm_minitron_8b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh @@ -71,4 +71,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/inference/quantization/ptq_trtllm_mistral_12b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh similarity index 94% rename from 
examples/inference/quantization/ptq_trtllm_mistral_12b.sh rename to examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh index 17ded50d1e..3eb02d2e1d 100644 --- a/examples/inference/quantization/ptq_trtllm_mistral_12b.sh +++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh @@ -72,4 +72,4 @@ python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options} +torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options} diff --git a/examples/inference/quantization/text_generation_ptq.py b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py similarity index 96% rename from examples/inference/quantization/text_generation_ptq.py rename to examples/export/ptq_and_trtllm_export/text_generation_ptq.py index 13b327b25a..340c9c90f7 100644 --- a/examples/inference/quantization/text_generation_ptq.py +++ b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py @@ -6,12 +6,11 @@ import sys from pathlib import Path -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../"))) import modelopt.torch.quantization as mtq import torch from datasets import load_dataset -from modelopt.torch.utils.distributed import set_data_parallel_group, set_tensor_parallel_group from tqdm import tqdm # [ModelOpt]: changing the default model provider to the ModelOpt version @@ -179,10 +178,6 @@ def hf_dataset_forword_loop_func(model): if args.calib_dataset is not None: ptq_forward_loop_func = hf_dataset_forword_loop_func - # Setting data parallel and tensor parallel group - set_data_parallel_group(mpu.get_data_parallel_group()) - set_tensor_parallel_group(mpu.get_tensor_model_parallel_group()) - if args.export_quant_cfg in QUANT_CFG_CHOICES: mtq_config = QUANT_CFG_CHOICES[args.export_quant_cfg] if "*output_layer*" not in mtq_config["quant_cfg"]: diff --git a/examples/inference/quantization/trtllm_text_generation.py b/examples/export/ptq_and_trtllm_export/trtllm_text_generation.py similarity index 100% rename from examples/inference/quantization/trtllm_text_generation.py rename to examples/export/ptq_and_trtllm_export/trtllm_text_generation.py diff --git a/examples/export/trtllm_export/README.md b/examples/export/trtllm_export/README.md new file mode 100644 index 0000000000..52cad78583 --- /dev/null +++ b/examples/export/trtllm_export/README.md @@ -0,0 +1,161 @@ +# Megatron Core To TRTLLM Export Documentation +This guide will walk you through how you can use the megatron core export for exporting models to trtllm format + +### Contents +- [Megatron Core To TRTLLM Export Documentation](#megatron-core-to-trtllm-export-documentation) +- [Contents](#contents) + - [1. Quick Start](#1-quick-start) + - [1.1 Understanding The Code](#11-understanding-the-code) + - [1.2 Running The Code](#12-running-the-code) + - [2. GPU Export](#2-gpu-export) + - [3. Future work](#4-future-work) + +#### 1. Quick Start +This will walk you through the flow of converting an mcore gpt model to trtllm format using single device mode. 
The file can be found at [gpt_single_device_cpu_export.py](./single_device_export/gpt_single_device_cpu_export.py). + +NOTE: For faster performance, if your entire model fits into GPU memory, transfer the model state dict to the GPU first and then call the get_trtllm_pretrained_config_and_model_weights function. + +
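A minimal sketch of that note, assuming the whole model fits in GPU memory (`gpt_model` is the model constructed in the steps below):

```python
# Hypothetical sketch of the NOTE above: move the state dict to the GPU before conversion.
model_state_dict = {
    key: val.cuda()     # move each tensor to the current CUDA device
    for key, val in gpt_model.state_dict().items()
    if val is not None  # _extra_state entries can be None and are skipped
}
# This GPU-resident dict can then be passed as `model_state_dict` to
# trtllm_helper.get_trtllm_pretrained_config_and_model_weights(...).
```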
+ +##### 1.1 Understanding The Code +***STEP 1 - We initialize model parallel and other default arguments*** +We initialize tp and pp to 1 so that we can get the full model state dict on cpu. +```python + initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1) +``` + +***STEP 2 - We load the model using the model_provider_function*** +NOTE: We create a simple gpt model. + +```python + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, # Needs to be at least 32 times num_attn_heads + num_attention_heads=2, + use_cpu_initialization=True, + pipeline_dtype=torch.float32, + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=100, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + # Optionally you can also load a model using this code + # sharded_state_dict=gpt_model.sharded_state_dict(prefix='') + # checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + # gpt_model.load_state_dict(checkpoint) + +``` + +***STEP 3 - Instantiate the TRTLLM Helper*** +We instantiate the [TRTLLM Helper](../../../megatron/core/export/trtllm/trtllm_helper.py). For the GPT model we instantiate trtllm_helper as shown below. +```python + if hasattr(gpt_model, "rotary_pos_emb"): + seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor + + trtllm_helper = TRTLLMHelper( + transformer_config=gpt_model.config, + model_type=ModelType.gpt, + position_embedding_type = gpt_model.position_embedding_type, + max_position_embeddings = gpt_model.max_position_embeddings, + rotary_percentage = gpt_model.rotary_percent, + rotary_base = gpt_model.rotary_base, + moe_tp_mode = 2, + multi_query_mode = False, + activation = "gelu", + seq_len_interpolation_factor = seq_len_interpolation_factor, + share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights + ) +``` + +***STEP 4 - Get the TRTLLM Weights and configs*** +To convert model weights to trtllm weights and configs, we use the [single_device_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py). We pass the model state dict and the export config as inputs. In this example we use an inference TP size of 2 for the export. + +```python + model_state_dict={} + for key, val in gpt_model.state_dict().items(): + # val is None for _extra_state layers. We filter those out. + if val is not None: + model_state_dict[key] = val + + export_config = ExportConfig(inference_tp_size = 2) + weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict= model_state_dict, + dtype = DataType.bfloat16, + export_config=export_config + ) +``` + +***STEP 5 - Build the TRTLLM Engine*** +The following code is used to build the TRTLLM engine. 
+ +```python + for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list): + trtllm_helper.build_and_save_engine( + max_input_len=256, + max_output_len=256, + max_batch_size=8, + engine_dir='/opt/megatron-lm/engine', + trtllm_model_weights=trtllm_model_weights, + trtllm_model_config=trtllm_model_config, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank=64, + lora_target_modules=None, + max_prompt_embedding_table_size=0, + paged_kv_cache=True, + remove_input_padding=True, + paged_context_fmha=False, + use_refit=False, + max_num_tokens=None, + max_seq_len=512, + opt_num_tokens=None, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=False, + gpt_attention_plugin="auto", + gemm_plugin="auto", + ) +``` +
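Note that because step 4 exported with `ExportConfig(inference_tp_size = 2)`, `get_trtllm_pretrained_config_and_model_weights` returns lists of weights and configs (presumably one pair per inference TP rank), which is why the build step above iterates over `zip(weight_list, config_list)` and builds one engine per pair.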
+ +##### 1.2 Running The Code +An example run script is shown below. + +``` +# In a workstation +MLM_PATH=/path/to/megatron-lm +CONTAINER_IMAGE=gitlab-master.nvidia.com:5005/dl/joc/nemo-ci/trtllm_0.12/train:pipe.17669124-x86 + +docker run -it --gpus=all --ipc=host -v $MLM_PATH/:/opt/megatron-lm $CONTAINER_IMAGE bash + +# Inside the container run the following. + +cd /opt/megatron-lm/ + +CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1 examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py +``` + +
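Note: the `CONTAINER_IMAGE` above points to an NVIDIA-internal registry and is only an example; any container with TensorRT-LLM and the Megatron-LM dependencies installed should work. Also, since the repository is mounted at `/opt/megatron-lm` and the script uses `engine_dir='/opt/megatron-lm/engine'`, the built engine files end up under `$MLM_PATH/engine` on the host.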
+ +#### 2. GPU Export +You can use [gpt_distributed_gpu_export.py](./distributed_export/gpt_distributed_gpu_export.py) to run a more optimized, on-device distributed version of trtllm export. Internally this uses the [distributed_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py) to convert model weights on device. +In the single device version you collect all the model weights on the CPU/GPU, convert them to trtllm format, and then store the engine back on disk. In the GPU version each rank loads its own state dict on its GPU, converts it on the device itself, and stores the engine on disk. + +To run the GPU version: + +``` +CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc-per-node 2 examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py +``` + +
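For reference, the key difference is the conversion call. In the distributed script it looks roughly like the excerpt below (not a standalone snippet):

```python
# Excerpt (lightly reformatted) from gpt_distributed_gpu_export.py added by this change:
# each rank converts its own shard on its GPU. `gpt_model`, `trtllm_helper`, `DataType`,
# and `_VOCAB_SIZE` are defined in that script; the argument values are the example's.
trtllm_model_weights, trtllm_model_config = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
    model_state_dict=gpt_model.state_dict(),
    dtype=DataType.bfloat16,
    on_device_distributed_conversion=True,
    vocab_size=_VOCAB_SIZE,
    gpus_per_node=2,
)
```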
+ +#### 3. Future work +The following are planned for the future releases . +* Pipeline parallellism for export (Work in progress) +* GPU Export for more models (Work in progress for some models) +* Refit functionality +* VLLM Support \ No newline at end of file diff --git a/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py b/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py new file mode 100644 index 0000000000..57d44f9f62 --- /dev/null +++ b/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py @@ -0,0 +1,117 @@ +import os +import torch +from megatron.core import parallel_state +from megatron.core import dist_checkpointing +from megatron.core.export.model_type import ModelType +from megatron.core.export.data_type import DataType +from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec + + +_SEQUENCE_LENGTH = 64 +_VOCAB_SIZE = 256 + +def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1): + parallel_state.destroy_model_parallel() + + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron core distributed training initialization + parallel_state.initialize_model_parallel(tensor_model_parallel_size = tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size) + +def model_provider(): + """Build the model.""" + + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, + num_attention_heads=2, + use_cpu_initialization=True, + pipeline_dtype=torch.float32 + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=_VOCAB_SIZE, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + return gpt_model + +def load_distributed_checkpoint(checkpoint_path, gpt_model): + sharded_state_dict=gpt_model.sharded_state_dict(prefix='') + checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + gpt_model.load_state_dict(checkpoint) + return gpt_model + +if __name__ == "__main__": + initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1) + model_parallel_cuda_manual_seed(123) + + gpt_model = model_provider() + device = torch.device("cuda") + gpt_model.to(device) + + # Optionally you can also load a gpt model from ckpt_path using this code below + # gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path) + + seq_len_interpolation_factor = None + if hasattr(gpt_model, "rotary_pos_emb"): + seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor + + trtllm_helper = TRTLLMHelper( + transformer_config=gpt_model.config, + model_type=ModelType.gpt, + position_embedding_type = gpt_model.position_embedding_type, + max_position_embeddings = gpt_model.max_position_embeddings, + rotary_percentage = gpt_model.rotary_percent, + rotary_base = gpt_model.rotary_base, + moe_tp_mode = 2, + multi_query_mode = False, + activation = "gelu", + seq_len_interpolation_factor = 
seq_len_interpolation_factor, + share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights + ) + + + trtllm_model_weights, trtllm_model_config = trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict= gpt_model.state_dict(), + dtype = DataType.bfloat16, + on_device_distributed_conversion=True, + vocab_size=_VOCAB_SIZE, + gpus_per_node=2, + ) + + trtllm_helper.build_and_save_engine( + max_input_len=256, + max_output_len=256, + max_batch_size=8, + engine_dir='/opt/megatron-lm/engine', + trtllm_model_weights=trtllm_model_weights[0], + trtllm_model_config=trtllm_model_config[0], + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank=64, + lora_target_modules=None, + max_prompt_embedding_table_size=0, + paged_kv_cache=True, + remove_input_padding=True, + paged_context_fmha=False, + use_refit=False, + max_num_tokens=None, + max_seq_len=512, + opt_num_tokens=None, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=False, + gpt_attention_plugin="auto", + gemm_plugin="auto", + ) diff --git a/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py b/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py new file mode 100644 index 0000000000..587e7cfdd3 --- /dev/null +++ b/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py @@ -0,0 +1,118 @@ +import os +import torch +from megatron.core import parallel_state +from megatron.core import dist_checkpointing +from megatron.core.export.model_type import ModelType +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec + + +_SEQUENCE_LENGTH = 64 + + +def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1): + parallel_state.destroy_model_parallel() + + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron core distributed training initialization + parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size) + +def model_provider(): + """Build the model.""" + + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, # Needs to be atleast 32 times num_attn_heads + num_attention_heads=2, + use_cpu_initialization=True, + pipeline_dtype=torch.float32, + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=100, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + return gpt_model + +def load_distributed_checkpoint(checkpoint_path, gpt_model): + sharded_state_dict=gpt_model.sharded_state_dict(prefix='') + checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path) + gpt_model.load_state_dict(checkpoint) + return gpt_model + +if __name__ == "__main__": + # Need to use TP1 PP1 for export on single device + initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1) + 
model_parallel_cuda_manual_seed(123) + + gpt_model = model_provider() + + # Optionally you can also load a gpt model from ckpt_path using this code below + # gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path) + + seq_len_interpolation_factor = None + if hasattr(gpt_model, "rotary_pos_emb"): + seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor + + trtllm_helper = TRTLLMHelper( + transformer_config=gpt_model.config, + model_type=ModelType.gpt, + position_embedding_type = gpt_model.position_embedding_type, + max_position_embeddings = gpt_model.max_position_embeddings, + rotary_percentage = gpt_model.rotary_percent, + rotary_base = gpt_model.rotary_base, + moe_tp_mode = 2, + multi_query_mode = False, + activation = "gelu", + seq_len_interpolation_factor = seq_len_interpolation_factor, + share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights + ) + + + export_config = ExportConfig(inference_tp_size = 2) + # NOTE : For faster performance, if your entire model will fit in gpu memory, transfer model state dict to GPU and then call this api + weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict= gpt_model.state_dict(), + dtype = DataType.bfloat16, + export_config=export_config + ) + + for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list): + trtllm_helper.build_and_save_engine( + max_input_len=256, + max_output_len=256, + max_batch_size=8, + engine_dir='/opt/megatron-lm/engine', + trtllm_model_weights=trtllm_model_weights, + trtllm_model_config=trtllm_model_config, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank=64, + lora_target_modules=None, + max_prompt_embedding_table_size=0, + paged_kv_cache=True, + remove_input_padding=True, + paged_context_fmha=False, + use_refit=False, + max_num_tokens=None, + max_seq_len=512, + opt_num_tokens=None, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=False, + gpt_attention_plugin="auto", + gemm_plugin="auto", + ) \ No newline at end of file diff --git a/examples/inference/llama_mistral/huggingface_reference.py b/examples/inference/llama_mistral/huggingface_reference.py index 7b583612a5..9d8f4465f6 100644 --- a/examples/inference/llama_mistral/huggingface_reference.py +++ b/examples/inference/llama_mistral/huggingface_reference.py @@ -20,5 +20,6 @@ for key in inputs: inputs[key] = inputs[key].cuda() # top_k, top_p and do_sample are set for greedy argmax based sampling + outputs = model.generate(**inputs, max_length=100, do_sample=False, top_p=0, top_k=0, temperature=1.0) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) \ No newline at end of file diff --git a/examples/inference/llama_mistral/run_text_generation_llama3.1.sh b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh new file mode 100755 index 0000000000..06584f0917 --- /dev/null +++ b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# This example will start serving the Llama3.1-8B model +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_APPLY_QK_LAYER_SCALING=0 + +DISTRIBUTED_ARGS="--nproc_per_node 1 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr 0.0.0.0 \ + --master_port 6000" + +# Ensure CHECKPOINT and TOKENIZER_MODEL are provided +if [ -z "$1" ] || [ -z "$2" ]; then + echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments." 
+ echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model" + exit 1 +fi + +# Assign command-line arguments to variables +CHECKPOINT=$1 +TOKENIZER_MODEL=$2 + +pip install flask-restful + +torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ + --use-checkpoint-args \ + --disable-bias-linear \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --transformer-impl transformer_engine \ + --normalization RMSNorm \ + --group-query-attention \ + --num-query-groups 8 \ + --no-masked-softmax-fusion \ + --attention-softmax-in-fp32 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --rotary-percent 1.0 \ + --rotary-base 500000 \ + --use-rope-scaling \ + --use-rotary-position-embeddings \ + --swiglu \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 32 \ + --hidden-size 4096 \ + --ffn-hidden-size 14336 \ + --load ${CHECKPOINT} \ + --num-attention-heads 32 \ + --max-position-embeddings 131072 \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length 8192 diff --git a/examples/inference/t5/simple_t5_batch_inference.py b/examples/inference/t5/simple_t5_batch_inference.py new file mode 100644 index 0000000000..3f4557d3c2 --- /dev/null +++ b/examples/inference/t5/simple_t5_batch_inference.py @@ -0,0 +1,157 @@ +import os +import sys +from argparse import Namespace + +import torch + +import pretrain_t5 +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.engines.abstract_engine import AbstractEngine +from megatron.core.inference.engines.mcore_engine import MCoreEngine +from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( + T5InferenceWrapper, +) +from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import ( + EncoderDecoderTextGenerationController, +) +from megatron.core.transformer.module import MegatronModule +from pretrain_t5 import model_provider + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) +) + +from typing import List + +from megatron.core import mpu +from megatron.training import get_args, get_model, get_tokenizer +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron + + +def add_text_generate_args(parser): + """Text generation arguments.""" + group = parser.add_argument_group(title='text generation') + + group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') + group.add_argument("--top_k", type=int, default=1, help='Top k sampling.') + group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') + group.add_argument( + "--return-log-probs", + action='store_true', + default=False, + help='Return the log probabilities of the final output tokens', + ) + group.add_argument( + "--num-tokens-to-generate", + type=int, + default=30, + help='Number of tokens to generate for each prompt', + ) + group.add_argument( + "--encoder-prompts", + metavar='N', + type=str, + nargs='+', + help='Encoder input prompts with each prompt within quotes and seperated by space', + ) + group.add_argument( + "--max-batch-size", type=int, default=1, help='Max 
number of prompts to process at once' + ) + return parser + + +def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine: + """Utility to get the relevant backend for running inference + + This function will automatically chose the TRTLLMBackend when possible, and if not revert to Mcore backend if the user does not specify any backends. TRT LLM Backend is not implmented yet. + + Args: + args (Namespace): The user arguments parsed from command line + model (MegatronModule): The megatron model . + + Returns: + AbstractBackend: The chosen backend + """ + tokenizer = get_tokenizer() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=args.hidden_size, + inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, + fp32_residual_connection=args.fp32_residual_connection, + params_dtype=args.params_dtype, + padded_vocab_size=args.padded_vocab_size, + ) + + inference_wrapped_model = T5InferenceWrapper(model, inference_wrapper_config) + text_generation_controller = EncoderDecoderTextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer + ) + return MCoreEngine( + text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size + ) + + +def main(): + """Main program.""" + + # Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file) + # Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument) + initialize_megatron( + extra_args_provider=add_text_generate_args, + args_defaults={ + 'no_load_rng': True, + 'no_load_optim': True, + 'micro_batch_size': 1, + 'exit_on_missing_checkpoint': True, + }, + ) + + # Set up model and load checkpoint + model = get_model(model_provider, wrap_with_ddp=False) + load_checkpoint(model, None, None) + model = model[0] + + args = get_args() + + inference_engine = get_inference_engine(args, model) + + common_inference_params = CommonInferenceParams( + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + return_log_probs=args.return_log_probs, + num_tokens_to_generate=args.num_tokens_to_generate, + ) + + tokenizer = get_tokenizer() + decoder_prompts = [""] * len( + args.encoder_prompts + ) # for T5, the prompt is provided as encoder input, hence decoder_prompts is empty + args.prompts = decoder_prompts + + results: List[InferenceRequest] = inference_engine.generate( + prompts=args.prompts, + add_BOS=True, + encoder_prompts=args.encoder_prompts, + common_inference_params=common_inference_params, + ) + + if torch.distributed.get_rank() == 0: + for idx, result in enumerate(results): + print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ') + result = { + 'id': result.request_id, + 'input_prompt': result.prompt, + 'generated_text': result.generated_text, + 'generated_tokens': result.generated_tokens, + } + print(result) + + +if __name__ == "__main__": + main() diff --git a/examples/multimodal/config.py b/examples/multimodal/config.py index f8c3714eb3..cf48b131a7 100644 --- a/examples/multimodal/config.py +++ b/examples/multimodal/config.py @@ -1,7 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+from dataclasses import dataclass + import torch -from megatron.training.activations import quick_gelu, squared_relu +from megatron.training.activations import fast_gelu, quick_gelu, squared_relu def get_language_model_config(config): @@ -75,7 +77,26 @@ def get_vision_model_config(config, apply_query_key_layer_scaling): config.gated_linear_unit = False config.activation_func = quick_gelu config.kv_channels = 64 + config.num_query_groups = 16 + config.layernorm_zero_centered_gamma = False + config.apply_query_key_layer_scaling = apply_query_key_layer_scaling + config.bias_activation_fusion = False + config.bias_dropout_fusion = False + config.attention_softmax_in_fp32 = True + config.normalization = 'LayerNorm' + config.apply_rope_fusion = False + elif config.vision_model_type == "siglip": + config.num_layers = 27 config.num_attention_heads = 16 + config.add_bias_linear = True + config.add_qkv_bias = True + config.hidden_size = 1152 + config.hidden_dropout = 0.0 + config.attention_dropout = 0.0 + config.ffn_hidden_size = 4304 + config.gated_linear_unit = False + config.activation_func = fast_gelu + config.kv_channels = 72 config.num_query_groups = 16 config.layernorm_zero_centered_gamma = False config.apply_query_key_layer_scaling = apply_query_key_layer_scaling @@ -84,6 +105,8 @@ def get_vision_model_config(config, apply_query_key_layer_scaling): config.attention_softmax_in_fp32 = True config.normalization = 'LayerNorm' config.apply_rope_fusion = False + config.qk_layernorm = False + config.layernorm_epsilon = 1e-6 return config @@ -107,3 +130,26 @@ def get_vision_projection_config(config, hidden_size): config.activation_func = torch.nn.functional.gelu return config + + +@dataclass +class EvaluationConfig: + """Evaluation related configuration.""" + task: str + + temperature: float = 1.0 + top_p: float = 0.0 + top_k: int = 0 + + out_seq_length: int = 32 + + output_path: str = "" + + input_image_path: str = "" + gt_path: str = "" + + num_partitions: int = 1 + partition_id: int = 0 + num_samples_per_partition: int = 0 + + prompt_format: str = "mistral" diff --git a/examples/multimodal/dataloader_provider.py b/examples/multimodal/dataloader_provider.py index 33bcf1bf1f..4bd1b29e51 100644 --- a/examples/multimodal/dataloader_provider.py +++ b/examples/multimodal/dataloader_provider.py @@ -4,7 +4,7 @@ import torch from dataset_helpers import TaskEncoder, print_error_handler -from megatron.core import mpu +from megatron.core import parallel_state from megatron.energon import ( LimitDataset, RepeatDataset, @@ -71,9 +71,9 @@ def train_valid_test_dataloaders_provider(train_val_test_num_samples): worker_debug_path = None worker_log_level = 0 - rank = mpu.get_data_parallel_rank() - world_size = mpu.get_data_parallel_world_size() - data_parallel_group = mpu.get_data_parallel_group() + rank = parallel_state.get_data_parallel_rank() + world_size = parallel_state.get_data_parallel_world_size() + data_parallel_group = parallel_state.get_data_parallel_group() worker_config = WorkerConfig( rank=rank, @@ -88,7 +88,7 @@ def train_valid_test_dataloaders_provider(train_val_test_num_samples): train_dataloader = get_savable_loader(train_ds, worker_config=worker_config) if args.load is not None: if getattr(args, "dataloader_save", None): - dp_rank = mpu.get_data_parallel_rank() + dp_rank = parallel_state.get_data_parallel_rank() data_save_name = get_checkpoint_name( args.dataloader_save, args.iteration, diff --git a/examples/multimodal/evaluate_textvqa.py b/examples/multimodal/evaluate_textvqa.py index 
b80974a893..7d0a059f4d 100644 --- a/examples/multimodal/evaluate_textvqa.py +++ b/examples/multimodal/evaluate_textvqa.py @@ -1,16 +1,23 @@ import argparse import glob import json +import os from evaluate_vqav2 import compute_vqa_accuracy def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" - output_file_path = input_path + "-TextVQA-merged.json" + # Single input file. + if os.path.exists(input_path): + input_file_paths = [input_path] + output_file_path = input_path.replace(".jsonl", "-merged.json") + # Directory of partitioned input files. + else: + pattern = input_path + "-TextVQA-[0-9].*jsonl" + input_file_paths = glob.glob(pattern) - pattern = input_path + "-TextVQA-[0-9].*jsonl" - input_file_paths = glob.glob(pattern) + output_file_path = input_path + "-TextVQA-merged.json" results = [] @@ -35,7 +42,8 @@ def merge_input_files(input_path): def textvqa_eval(input_path): """Run TextVQA evaluation.""" result_file_path = merge_input_files(input_path) - compute_vqa_accuracy(result_file_path) + avg_acc = compute_vqa_accuracy(result_file_path) + return avg_acc if __name__ == "__main__": @@ -43,4 +51,6 @@ def textvqa_eval(input_path): parser.add_argument('--input-path', type=str, help="Path to input file(s)") args = parser.parse_args() - textvqa_eval(args.input_path) + avg_acc = textvqa_eval(args.input_path) + + print(f"===== TextVQA Accuracy {avg_acc:.2f}% =====") diff --git a/examples/multimodal/evaluate_vqav2.py b/examples/multimodal/evaluate_vqav2.py index 5d9dfe7844..cf10a0549d 100644 --- a/examples/multimodal/evaluate_vqav2.py +++ b/examples/multimodal/evaluate_vqav2.py @@ -55,7 +55,7 @@ def compute_vqa_accuracy(result_file, use_chartqa_metric=False): # "We consider an answer to be correct if it is within 5% of the gold answer. # For non-numeric answers, we still need an exact match to consider an answer to be correct." if use_chartqa_metric: - acc = 0. + acc = 0.0 assert len(gt) == 1, "expected exactly one groundtruth answer." gt = gt[0] @@ -74,13 +74,15 @@ def compute_vqa_accuracy(result_file, use_chartqa_metric=False): all_acc.append(acc) acc_avg = sum(all_acc) / len(all_acc) * 100 - print(f"===== Accuracy {acc_avg:.2f}% =====") + + return acc_avg def vqav2_eval(input_path): """Run VQAv2 evaluation.""" result_file = merge_input_files(input_path) - compute_vqa_accuracy(result_file) + avg_acc = compute_vqa_accuracy(result_file) + return avg_acc if __name__ == "__main__": @@ -88,4 +90,6 @@ def vqav2_eval(input_path): parser.add_argument('--input-path', type=str, help="Path to input file(s)") args = parser.parse_args() - vqav2_eval(args.input_path) + avg_acc = vqav2_eval(args.input_path) + + print(f"===== VQAv2 Accuracy {avg_acc:.2f}% =====") diff --git a/examples/multimodal/model.py b/examples/multimodal/model.py new file mode 100644 index 0000000000..b4bab73cfb --- /dev/null +++ b/examples/multimodal/model.py @@ -0,0 +1,150 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
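The ChartQA branch of `compute_vqa_accuracy` above implements the relaxed-accuracy rule quoted in its comment. A standalone sketch of that rule, not the literal implementation in `evaluate_vqav2.py`:

```python
# Relaxed accuracy as described in the comment above: numeric predictions may
# deviate by up to 5% from the gold answer; non-numeric answers still need an
# exact match. Illustrative sketch only.
def relaxed_match(prediction: str, gold: str, tolerance: float = 0.05) -> bool:
    try:
        pred_val, gold_val = float(prediction), float(gold)
    except ValueError:
        return prediction.strip().lower() == gold.strip().lower()
    if gold_val == 0.0:
        return pred_val == gold_val
    return abs(pred_val - gold_val) / abs(gold_val) <= tolerance
```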
+import warnings +from copy import deepcopy + +import torch +from config import get_language_model_config, get_vision_model_config, get_vision_projection_config +from layer_specs import get_layer_spec, get_layer_spec_te, get_mlp_module_spec + +from megatron.core.models.multimodal.llava_model import LLaVAModel +from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings +from megatron.training import get_args, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args + + +def model_provider( + pre_process=True, post_process=True, add_encoder=True, add_decoder=True, parallel_output=True +) -> LLaVAModel: + """Builds the model. + + Args: + pre_process (bool): Include the embedding layer in the gpt decoder (used with pipeline parallelism). Defaults to True. + post_process (bool): Include an output layer and a layernorm in the gpt decoder (used with pipeline parallelism). Defaults to True. + add_encoder (bool): Construct the encoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the encoder + will live on only a subset of the pipeline stages (specifically, only the first stage). + add_decoder (bool): Construct the decoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the decoder + will live on only a subset of the pipeline stages (specifically, every stage after the first one). + parallel_output (bool): Enable parallel model output. + + Returns: + model: A multimodal model. + """ + args = get_args() + + use_te = args.use_te + + print_rank_0('building a multimodal model ...') + + num_image_embeddings = get_num_image_embeddings( + args.img_h, args.img_w, args.patch_dim, args.vision_model_type, + args.disable_vision_class_token, 1 + ) + old_seq_length = args.seq_length + args.seq_length = args.encoder_seq_length = num_image_embeddings + if torch.distributed.get_rank() == 0 and old_seq_length != args.seq_length: + warnings.warn( + f"Changed seq_length and encoder_seq_length (vision model sequence length) from {old_seq_length} to num_image_tokens ({num_image_embeddings})" + ) + + max_num_image_embeddings = (args.max_num_tiles + int(args.use_thumbnail)) * num_image_embeddings + + assert ( + args.decoder_seq_length is not None + ), "Please provide --decoder-seq-length to set the language model sequence length" + assert ( + args.decoder_seq_length > max_num_image_embeddings + ), "Language model sequence length must be greater than the maximum number of image embeddings" + if args.decoder_seq_length > args.max_position_embeddings: + args.max_position_embeddings = args.decoder_seq_length + warnings.warn( + f"Expanded max_position_embeddings to {args.max_position_embeddings} to accommodate the maximum language model sequence length" + ) + + base_config = core_transformer_config_from_args(get_args()) + base_config.language_model_type = args.language_model_type + base_config.vision_model_type = args.vision_model_type + base_config.calculate_per_token_loss = True + + language_config = deepcopy(base_config) + language_config = get_language_model_config(language_config) + + if use_te: + language_transformer_layer_spec = get_layer_spec_te( + is_vit=False + ) # TENorm detects LayerNorm/RMS automatically. 
+ else: + language_transformer_layer_spec = get_layer_spec( + is_vit=False, normalization=language_config.normalization + ) + + vision_config = deepcopy(base_config) + vision_config = get_vision_model_config( + vision_config, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling + ) + + vision_model_type = args.vision_model_type + if vision_model_type in ["clip", "siglip"]: + if use_te: + vision_transformer_layer_spec = get_layer_spec_te( + is_vit=True + ) # TENorm detects LayerNorm/RMS automatically. + else: + vision_transformer_layer_spec = get_layer_spec( + is_vit=True, normalization=vision_config.normalization + ) + else: + raise RuntimeError("unsupported vision model type", vision_model_type) + + vision_projection_config = deepcopy(base_config) + vision_projection_config = get_vision_projection_config( + vision_projection_config, language_config.hidden_size + ) + + if args.encoder_pipeline_model_parallel_size > 0: + assert ( + args.encoder_pipeline_model_parallel_size == 1 + ), "vision model and projection can only live on 1 pipeline stage." + vision_config.pipeline_model_parallel_size = args.encoder_pipeline_model_parallel_size + vision_projection_config.pipeline_model_parallel_size = ( + args.encoder_pipeline_model_parallel_size + ) + if args.encoder_tensor_model_parallel_size > 0: + vision_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size + vision_projection_config.tensor_model_parallel_size = ( + args.encoder_tensor_model_parallel_size + ) + + vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules + + model = LLaVAModel( + language_transformer_config=language_config, + language_transformer_layer_spec=language_transformer_layer_spec, + language_vocab_size=args.padded_vocab_size, + language_max_sequence_length=args.decoder_seq_length, + vision_transformer_config=vision_config, + vision_transformer_layer_spec=vision_transformer_layer_spec, + drop_vision_class_token=args.disable_vision_class_token, + vision_projection_config=vision_projection_config, + vision_projection_layer_spec=vision_projection_layer_spec, + vision_projection_type="mlp", + allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint, + parallel_output=parallel_output, + language_position_embedding_type=args.position_embedding_type, + language_rotary_percent=args.rotary_percent, + pre_process=pre_process, + post_process=post_process, + add_encoder=add_encoder, + add_decoder=add_decoder, + img_h=args.img_h, + img_w=args.img_w, + patch_dim=args.patch_dim, + language_rotary_base=args.rotary_base, + ) + + model.freeze( + freeze_language_model=args.freeze_LM, + freeze_vision_model=args.freeze_ViT, + freeze_vision_projection=False, + ) + + return model diff --git a/examples/multimodal/multimodal_args.py b/examples/multimodal/multimodal_args.py new file mode 100644 index 0000000000..a7cb4235e3 --- /dev/null +++ b/examples/multimodal/multimodal_args.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
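The sequence-length checks in `model_provider` above are easier to follow with concrete numbers. A worked example, assuming a CLIP-style 336x336 input with 14x14 patches (the dimensions are illustrative; the formula is the one used in the assertions above):

```python
# Illustrative numbers: a 336x336 image with 14x14 patches gives
# (336 // 14) ** 2 = 576 embeddings per tile (plus one class token unless
# --disable-vision-class-token is set).
num_image_embeddings = (336 // 14) ** 2                     # 576 -> args.seq_length
max_num_tiles, use_thumbnail = 4, True
max_num_image_embeddings = (max_num_tiles + int(use_thumbnail)) * num_image_embeddings
# 5 * 576 = 2880, so --decoder-seq-length must exceed 2880, and
# max_position_embeddings is expanded to the decoder sequence length if needed.
```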
+ + +def add_multimodal_extra_args(parser): + """Extra arguments.""" + group = parser.add_argument_group(title='multimodal arguments') + group.add_argument('--dataset-config', type=str, default=None) + group.add_argument("--prompt-path", type=str, default=None) + group.add_argument('--freeze-LM', action='store_true', default=False) + group.add_argument('--freeze-ViT', action='store_true', default=False) + group.add_argument('--language-model-type', type=str, required=True) + group.add_argument('--vision-model-type', type=str, default="clip") + group.add_argument("--disable-vision-class-token", action="store_true", default=False) + group.add_argument( + "--allow-missing-vision-projection-checkpoint", action="store_true", default=False + ) + group.add_argument("--use-te", action="store_true", default=False) + group.add_argument( + "--dataloader-save", type=str, default=None, help="Energon dataloader state save path" + ) + group.add_argument( + "--use-tiling", action="store_true", default=False, help="Use input image tiling" + ) + group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles") + group.add_argument( + "--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile" + ) + group.add_argument( + "--dataloader-seq-length", + type=int, + help="Make dataloader to produce sequences of specific length.", + ) + group.add_argument( + "--num-frames", + type=int, + default=1, + help="Number of frames to regularly sample from the video as input to the model.", + ) + group.add_argument( + "--online-evaluation-config", type=str, help="Config file for online evaluation." + ) + + return parser diff --git a/examples/multimodal/pretrain_mistral_clip.sh b/examples/multimodal/pretrain_mistral_clip.sh index da72c335c0..b06dbfe53c 100755 --- a/examples/multimodal/pretrain_mistral_clip.sh +++ b/examples/multimodal/pretrain_mistral_clip.sh @@ -32,7 +32,6 @@ fi CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml" -DATA_VALID="${SOURCE}/examples/multimodal/pretrain_dataset.yaml" DEBUG=0 if [[ $DEBUG -eq 1 ]]; then @@ -96,7 +95,6 @@ OPTIONS=" \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ --data-path ${DATA_TRAIN} \ - --valid-path ${DATA_VALID} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ --save-interval 1000 \ --save ${FINETUNE_DIR} \ diff --git a/examples/multimodal/run_text_generation.py b/examples/multimodal/run_text_generation.py index 391f3071d0..6cf5fd6232 100644 --- a/examples/multimodal/run_text_generation.py +++ b/examples/multimodal/run_text_generation.py @@ -1,13 +1,14 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. """Generate text using a vision language model.""" import glob +import itertools import json import logging import os +import re import sys from collections import defaultdict from functools import partial -import itertools # Add megatron to the path. 
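The `--freeze-LM` and `--freeze-ViT` flags above map directly onto the `freeze` call at the end of `model_provider` in `model.py`. A short illustration of the setup where only the vision projection stays trainable (this mirrors the call in the patch; the flag values are an example):

```python
# Passing --freeze-LM and --freeze-ViT results in this call, leaving only the
# vision projection MLP trainable. Example configuration, not new behavior.
model.freeze(
    freeze_language_model=True,    # --freeze-LM
    freeze_vision_model=True,      # --freeze-ViT
    freeze_vision_projection=False,
)
```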
sys.path.append( @@ -17,7 +18,8 @@ import datasets import numpy as np import torch -from torchvision.io import read_video +import yaml +from config import EvaluationConfig from dataset_helpers import tokenizer_image_token from image_processing import get_visual_transform from MMMU.mmmu.utils.data_utils import ( @@ -27,10 +29,14 @@ process_single_sample, ) from MMMU.mmmu.utils.eval_utils import parse_multi_choice_response +from model import model_provider +from multimodal_args import add_multimodal_extra_args from PIL import Image -from train import add_multimodal_extra_args, get_num_image_embeddings, model_provider +from torchvision.io import read_video +from megatron.core import parallel_state from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN_INDEX +from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings from megatron.inference.text_generation.api import generate_and_post_process from megatron.inference.text_generation.forward_step import ForwardStep from megatron.training import get_args, get_model, get_tokenizer, print_rank_0 @@ -48,14 +54,12 @@ def add_text_generation_args(parser): group.add_argument( "--out-seq-length", type=int, default=1024, help='Length of the output generated text.' ) - group.add_argument("--output-path", type=str, required=True, help='Output file path') - group.add_argument('--input-image-path', type=str, required=True, help="Input image directory") - group.add_argument('--input-metadata-path', type=str, help="Input metadata path") + group.add_argument("--output-path", type=str, help='Output file path') + group.add_argument('--input-image-path', type=str, help="Input image directory") group.add_argument( '--num-partitions', type=int, default=0, help="Number of partitions for inputs." ) group.add_argument('--partition-id', type=int, default=0, help="Partition index") - group.add_argument("--drop-vision-class-token", action="store_true", default=False) group.add_argument("--gt-path", type=str, help="Optional ground truth file") group.add_argument( "--task", @@ -69,10 +73,11 @@ def add_text_generation_args(parser): group.add_argument( "--prompt-format", type=str, - required=True, + default="mistral", choices=["llama3", "mistral"], help="Prompting format to use", ) + group.add_argument("--config-path", type=str, help="Config file to use.") # Add common multimodal arguments needed for e.g. building the model. parser = add_multimodal_extra_args(parser) @@ -85,61 +90,30 @@ def _get_partition_bounds( ): if num_samples_per_partition == 0: samples_per_partition = [ - int(x) for x in np.linspace(0, total_num_samples, num_partitions+1)] - return samples_per_partition[partition_id], samples_per_partition[partition_id+1] + int(x) for x in np.linspace(0, total_num_samples, num_partitions + 1) + ] + return samples_per_partition[partition_id], samples_per_partition[partition_id + 1] return num_samples_per_partition * partition_id, num_samples_per_partition * (partition_id + 1) -def get_evaluation_dataset( - task, - input_image_path, - gt_path, - img_h, - img_w, - use_tiling, - max_num_tiles, - use_thumbnail, - num_samples_per_partition, - num_partitions, - partition_id, - num_frames, -): - """Build evaluation dataset.""" - images = [] - tile_counts = [] - questions, answers = [], [] - samples, sample_ids = [], [] - - if task == "TextVQA": - samples = json.load(open(gt_path, encoding='utf-8'))['data'] - - # Optionally, process only a subset of the input files. 
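`_get_partition_bounds` above splits the evaluation set with `np.linspace` when no fixed per-partition sample count is given. A small worked example:

```python
import numpy as np

# 10 samples split across 3 partitions: np.linspace(0, 10, 4) -> [0., 3.33, 6.67, 10.],
# truncated to ints -> boundaries [0, 3, 6, 10].
num_samples, num_partitions = 10, 3
bounds = [int(x) for x in np.linspace(0, num_samples, num_partitions + 1)]
# partition 0 -> samples[0:3], partition 1 -> samples[3:6], partition 2 -> samples[6:10]
```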
- if num_partitions > 0: - lb, ub = _get_partition_bounds( - len(samples), num_samples_per_partition, num_partitions, partition_id - ) - samples = samples[lb:ub] - - for i in range(len(samples)): - sample = samples[i] - - img_file = "{}/{}.jpg".format(input_image_path, sample["image_id"]) - if not os.path.exists(img_file): - img_file = img_file.replace('.jpg', '.png') - - img = Image.open(img_file) - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) - - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) - - questions.append(sample["question"]) - answers.append(sample["answers"]) - sample_ids.append(sample["question_id"]) - elif task == "VQAv2": +class VQADataset(torch.utils.data.Dataset): + def __init__( + self, + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + keys, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ): samples = json.load(open(gt_path, encoding='utf-8')) + if "data" in samples: + samples = samples["data"] # Optionally, process only a subset of the input files. if num_partitions > 0: @@ -148,50 +122,72 @@ def get_evaluation_dataset( ) samples = samples[lb:ub] - for i in range(len(samples)): - sample = samples[i] + self._keys = keys + self._samples = samples + self._input_image_path = input_image_path + self._img_h = img_h + self._img_w = img_w + self._use_tiling = use_tiling + self._max_num_tiles = max_num_tiles + self._use_thumbnail = use_thumbnail - img_file = "{}/{}".format(input_image_path, sample["image"]) + def __len__(self): + return len(self._samples) - img = Image.open(img_file) - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) + def __getitem__(self, idx): + sample = self._samples[idx] - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) + img_file = "{}/{}".format(self._input_image_path, sample[self._keys["image_id"]]) + if not os.path.exists(img_file): + img_file += ".jpg" - questions.append(sample["question"]) - answers.append(sample["answer"]) - sample_ids.append(sample["question_id"]) - elif task == "ChartQA": - samples = json.load(open(gt_path, encoding='utf-8')) + if not os.path.exists(img_file): + img_file = img_file.replace('.jpg', '.png') - # Optionally, process only a subset of the input files. - if num_partitions > 0: - lb, ub = _get_partition_bounds( - len(samples), num_samples_per_partition, num_partitions, partition_id - ) - samples = samples[lb:ub] + img = Image.open(img_file) + imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) + tile_count = torch.tensor([len(imgs)], dtype=torch.int) - for i in range(len(samples)): - sample = samples[i] + sample_id = idx + if "sample_id" in self._keys: + sample_id = sample[self._keys["sample_id"]] - img_file = "{}/{}".format(input_image_path, sample["imgname"]) + metadata = "" # Not used. 
- img = Image.open(img_file) - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) + return ( + torch.stack(imgs), + tile_count, + sample_id, + sample[self._keys["question"]], + sample[self._keys["answer"]], + metadata, + ) - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) - questions.append(sample["query"]) - answers.append(sample["label"]) - sample_ids.append(i) - elif task == "captioning": +class CaptioningDataset(torch.utils.data.Dataset): + def __init__( + self, + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ): image_files = sorted(glob.glob(input_image_path + "/*")) + # Optionally, process only a subset of the input files. if num_partitions > 0: lb, ub = _get_partition_bounds( @@ -204,20 +200,54 @@ def get_evaluation_dataset( for gt in gts["annotations"]: answers[gt["image_id"]].append(gt['caption']) - # Run image preprocessing. - for i in range(len(image_files)): - image_file = image_files[i] - img = Image.open(image_file) - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False - ) + self._image_files = image_files + self._answers = answers + self._img_h = img_h + self._img_w = img_w + self._use_tiling = use_tiling + self._max_num_tiles = max_num_tiles + self._use_thumbnail = use_thumbnail + + def __len__(self): + return len(self._image_files) + + def __getitem__(self, idx): + img_file = self._image_files[idx] + image_id = int(img_file.split("_")[-1].split(".")[0]) + + img = Image.open(img_file) + imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) + tile_count = torch.tensor([len(imgs)], dtype=torch.int) - image_id = int(image_file.split("_")[-1].split(".")[0]) - sample_ids.append(image_id) - elif task == 'MMMU': + question = "" # Fixed for all samples. + metadata = "" # Not used. + + return torch.stack(imgs), tile_count, image_id, question, self._answers[image_id], metadata + + +class MMMUDataset(torch.utils.data.Dataset): + def __init__( + self, + input_image_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + single_image, + ): # The following downloads the MMMU dataset from HuggingFace and uses the API from the MMMU github repo to run MMMU evaluation. all_mmmu_datasets = [] @@ -225,9 +255,22 @@ def get_evaluation_dataset( assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE." for subject in CAT_SHORT2LONG.values(): - subject_dataset = datasets.load_dataset( - "MMMU/MMMU", subject, split=datasets.Split.VALIDATION, cache_dir=hf_datasets_cache - ) + # Use a local copy of the dataset if exists (can be faster) or the HF one. 
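`CaptioningDataset.__getitem__` above recovers the numeric image id from the file name. A tiny example, assuming COCO-style file names (the name below is illustrative):

```python
# Mirrors the parsing in CaptioningDataset.__getitem__; the file name is an
# assumed COCO-style example.
img_file = "COCO_val2014_000000391895.jpg"
image_id = int(img_file.split("_")[-1].split(".")[0])
# image_id == 391895, the key used to look up ground-truth captions.
```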
+ if os.path.exists(input_image_path): + subject_dataset = datasets.load_dataset( + os.path.join(input_image_path, subject), + split=datasets.Split.VALIDATION, + cache_dir=hf_datasets_cache, + verification_mode="no_checks", + ) + else: + subject_dataset = datasets.load_dataset( + "MMMU/MMMU", + subject, + split=datasets.Split.VALIDATION, + cache_dir=hf_datasets_cache, + ) + all_mmmu_datasets.append(subject_dataset) dataset = datasets.concatenate_datasets(all_mmmu_datasets) @@ -235,14 +278,11 @@ def get_evaluation_dataset( dataset = [s for s in dataset if s['id'].startswith("val")] # Optionally, process only a subset of the input files. - start_idx = 0 - end_idx = len(dataset) if num_partitions > 0: - start_idx, end_idx = _get_partition_bounds( + lb, ub = _get_partition_bounds( len(dataset), num_samples_per_partition, num_partitions, partition_id ) - - end_idx = min(len(dataset), end_idx) + dataset = dataset[lb:ub] # Using the LLaVA config from the MMMU repo. config = load_yaml("examples/multimodal/MMMU/mmmu/configs/llava1.5.yaml") @@ -251,30 +291,119 @@ def get_evaluation_dataset( assert len(v) == 1, "only one value supported." config[k] = v[0] - for idx in range(start_idx, end_idx): - sample = dataset[idx] + self._config = config + + self._dataset = dataset + + self._img_h = img_h + self._img_w = img_w + self._use_tiling = use_tiling + self._max_num_tiles = max_num_tiles + self._use_thumbnail = use_thumbnail + self._single_image = single_image + + def __len__(self): + return len(self._dataset) + + def __getitem__(self, idx): + sample = self._dataset[idx] + + # Use the single image approach from the MMMU repo. + if self._single_image: sample = process_single_sample(sample) - sample = construct_prompt(sample, config) + sample = construct_prompt(sample, self._config) img = sample["image"] - imgs = get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False + sample_imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, ) + sample_num_tiles = [len(sample_imgs)] + else: + sample = construct_prompt(sample, self._config) + + sample_imgs = [] + sample_num_tiles = [] + + img_indices = re.findall(r"" + + img = sample[img_key] + assert img is not None, f"{img_str} is in prompt but not in sample images" + + # Note: Only replace the current image tag. + sample["final_input_prompt"] = sample["final_input_prompt"].replace( + img_str, "", 1 + ) + + imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + adjusted_max_num_tiles, + self._use_thumbnail, + augment=False, + ) # List of tiles. + + sample_imgs.extend(imgs) + sample_num_tiles.append(len(imgs)) + + # Sanity check. + for i in range(1, 8): + assert ( + f"" not in sample["final_input_prompt"] + ), "prompt contains unhandled image tags" + + # MMMU specific metadata. 
+ metadata = {"question_type": sample["question_type"]} + if sample["question_type"] == "multiple-choice": + metadata["index2ans"] = sample["index2ans"] + metadata["all_choices"] = sample["all_choices"] + + prompt = sample['final_input_prompt'] + if self._single_image: + for i in range(8): + prompt = prompt.replace(f"", "") + prompt = f"\n{prompt}" - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) + tile_count = torch.tensor(sample_num_tiles, dtype=torch.int) - sample_ids.append(sample['id']) + return ( + torch.stack(sample_imgs), + tile_count, + sample["id"], + prompt, + sample["answer"], + metadata, + ) - # TODO: Support multiple input images and the original image position. Note: is added back in the prompt construction below. - prompt = sample['final_input_prompt'] - for i in range(8): - prompt = prompt.replace(f"", "") - questions.append(prompt) - answers.append(sample['answer']) - samples.append(sample) - elif task == "VideoMME": +class VideoMMMEDataset(torch.utils.data.Dataset): + def __init__( + self, + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + num_frames, + ): ground_truth_original = json.load(open(gt_path)) ground_truth = [] for gt in ground_truth_original: @@ -286,135 +415,295 @@ def get_evaluation_dataset( continue gt["video_path"] = video_path ground_truth.append(gt) - + ground_truth = sorted(ground_truth, key=lambda gt: gt["video_path"]) print_rank_0(f"Found {len(ground_truth)} videos to process.") if num_partitions > 0: start_idx, end_idx = _get_partition_bounds( - len(ground_truth), num_samples_per_partition, - num_partitions, partition_id + len(ground_truth), num_samples_per_partition, num_partitions, partition_id ) ground_truth = ground_truth[start_idx:end_idx] - # Run image preprocessing. - for idx, gt in enumerate(ground_truth): - print_rank_0(f"Processing input video: {idx} / {len(ground_truth)}") - video, _, _ = read_video( - gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec') - video = video.numpy() - selected_frames = torch.linspace( - 0, video.shape[0] - 1, num_frames).long() - video_frames = video[selected_frames] - if num_frames == 1: - video_frames = video_frames[None] - - imgs = list(itertools.chain.from_iterable( + self._ground_truth = ground_truth + self._img_h = img_h + self._img_w = img_w + self._use_tiling = use_tiling + self._max_num_tiles = max_num_tiles + self._use_thumbnail = use_thumbnail + self._num_frames = num_frames + + def __len__(self): + return len(self._ground_truth) + + def __getitem__(self, idx): + gt = self._ground_truth[idx] + + video, _, _ = read_video(gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec') + video = video.numpy() + selected_frames = torch.linspace(0, video.shape[0] - 1, self._num_frames).long() + video_frames = video[selected_frames] + if self._num_frames == 1: + video_frames = video_frames[None] + + imgs = list( + itertools.chain.from_iterable( get_visual_transform( - img, img_h, img_w, use_tiling, max_num_tiles, - use_thumbnail, augment=False) for img in video_frames)) - - for question in gt["questions"]: - # Very hacky, but we essentially re-create gt holding only the - # question of interest. This is the make this generation script - # compatible with the Video MME evaluation script. 
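`VideoMMMEDataset` above samples frames uniformly with `torch.linspace` before tiling each frame. A worked example with illustrative numbers:

```python
import torch

# A 300-frame video reduced to 8 frames (illustrative values).
total_frames, num_frames = 300, 8
selected_frames = torch.linspace(0, total_frames - 1, num_frames).long()
# tensor([  0,  42,  85, 128, 170, 213, 256, 299]): evenly spaced indices,
# truncated to integers, as in __getitem__ above.
```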
- question_dict = { - "video_id": gt["video_id"], - "duration_category": gt["duration_category"], - "video_category": gt["video_category"], - "video_subcategory": gt["video_subcategory"], - "url": gt["url"], - "questions": [question] - } - images.append(imgs) - tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int)) - questions.append(question_dict) - sample_ids.append(question["question_id"]) + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) + for img in video_frames + ) + ) + + for question in gt["questions"]: + # Very hacky, but we essentially re-create gt holding only the + # question of interest. This is the make this generation script + # compatible with the Video MME evaluation script. + question_dict = { + "video_id": gt["video_id"], + "duration_category": gt["duration_category"], + "video_category": gt["video_category"], + "video_subcategory": gt["video_subcategory"], + "url": gt["url"], + "questions": [question], + } + + num_tiles = torch.tensor([len(imgs)], dtype=torch.int) + + answer = "" + metadata = "" + + return ( + torch.stack(imgs), + num_tiles, + question["question_id"], + question_dict, + answer, + metadata, + ) + + +def get_evaluation_dataloader( + task, + input_image_path, + gt_path, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + num_samples_per_partition, + num_partitions, + partition_id, + num_frames, + num_workers, +): + """Build evaluation dataset.""" + if task == "TextVQA": + keys = { + "image_id": "image_id", + "sample_id": "question_id", + "question": "question", + "answer": "answers", + } + + dataset = VQADataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + keys, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ) + elif task == "VQAv2": + keys = { + "image_id": "image", + "sample_id": "question_id", + "question": "question", + "answer": "answer", + } + + dataset = VQADataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + keys, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ) + elif task == "ChartQA": + keys = {"image_id": "imgname", "question": "query", "answer": "label"} + + dataset = VQADataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + keys, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ) + elif task == "captioning": + dataset = CaptioningDataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + ) + elif task == 'MMMU': + # Note: single_image=True uses only one image like in the MMMU repo example. + # single_image=False uses all images in the sample. 
+ dataset = MMMUDataset( + input_image_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + single_image=True, + ) + elif task == "VideoMME": + dataset = VideoMMMEDataset( + input_image_path, + gt_path, + num_samples_per_partition, + num_partitions, + partition_id, + img_h, + img_w, + use_tiling, + max_num_tiles, + use_thumbnail, + num_frames, + ) else: - raise NotImplementedError("unsupported task") + raise NotImplementedError(f"unsupported task {task}") - return images, tile_counts, samples, sample_ids, questions, answers + dp_rank = parallel_state.get_data_parallel_rank() + dp_world_size = parallel_state.get_data_parallel_world_size() + sampler = torch.utils.data.DistributedSampler( + dataset, shuffle=False, num_replicas=dp_world_size, rank=dp_rank + ) + # TODO: Batched inference is not supported yet. + dataloader = torch.utils.data.DataLoader( + dataset, batch_size=None, num_workers=num_workers, sampler=sampler, pin_memory=True + ) + + return dataloader -def generate_samples(model): + +def generate_samples(model, config: EvaluationConfig): """Text generation using a trained vision language model.""" args = get_args() - images, tile_counts, samples, sample_ids, questions, answers = get_evaluation_dataset( - args.task, - args.input_image_path, - args.gt_path, + + rank = torch.distributed.get_rank() + + dataloader = get_evaluation_dataloader( + config.task, + config.input_image_path, + config.gt_path, args.img_h, args.img_w, args.use_tiling, args.max_num_tiles, args.use_thumbnail, - args.num_samples_per_partition, - args.num_partitions, - args.partition_id, - args.num_frames + config.num_samples_per_partition, + config.num_partitions, + config.partition_id, + args.num_frames, + args.num_workers, ) + num_img_embeddings_per_tile = get_num_image_embeddings( - args.img_h, args.img_w, args.patch_dim, - args.disable_vision_class_token, 1) - num_samples = len(sample_ids) - idx = 0 - while idx < num_samples: - imgs = torch.stack(images[idx]).cuda() - num_tiles = tile_counts[idx].cuda() - sample_id = sample_ids[idx] + args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1 + ) + + for idx, (imgs, num_tiles, sample_id, question, answers, metadata) in enumerate(dataloader): + imgs = imgs.to("cuda") + num_tiles = num_tiles.to("cuda") - prompt = get_prompt(args.task, questions, idx, args.prompt_format) + prompt = get_prompt(config.task, question, config.prompt_format) - forward_step = partial( - VLMForwardStep, num_img_embeddings_per_tile, imgs, num_tiles) + forward_step = partial(VLMForwardStep, num_img_embeddings_per_tile, imgs, num_tiles) - if torch.distributed.get_rank() == 0: + if rank == 0: resp_sentences, _, _, _ = generate_and_post_process( model, forward_step=forward_step, prompts=[prompt], - tokens_to_generate=args.out_seq_length, - top_k_sampling=args.top_k, - top_p_sampling=args.top_p, + tokens_to_generate=config.out_seq_length, + top_k_sampling=config.top_k, + top_p_sampling=config.top_p, add_BOS=False, - temperature=args.temperature, + temperature=config.temperature, random_seed=args.seed, detokenize_segments=False, ) for prompt, generation in zip([prompt], resp_sentences): + if isinstance(sample_id, torch.Tensor): + sample_id = sample_id.item() + output = {"sample_id": sample_id, "prompt": prompt} output_name = "" - if args.task == "captioning": + if config.task == "captioning": output_name = "caption" - elif args.task in ("TextVQA", "VQAv2", "ChartQA"): + elif config.task in 
("TextVQA", "VQAv2", "ChartQA"): output_name = "answer" - elif args.task in ("MMMU"): + elif config.task in ("MMMU"): output_name = "text" - elif args.task == "VideoMME": + elif config.task == "VideoMME": output_name = "response" - output = questions[idx] + output = question - generated = get_generated(prompt, args.prompt_format, generation) - if args.task == "VideoMME": + generated = get_generated(generation, config.prompt_format) + if config.task == "VideoMME": output["questions"][0][output_name] = generated else: output[output_name] = generated - if args.task == "captioning": - output["ground_truth"] = answers[sample_id] - elif args.task in ("TextVQA", "VQAv2"): - output["gt_answer"] = [ans for ans in answers[idx]] - elif args.task == "ChartQA": - output["gt_answer"] = [answers[idx]] - elif args.task == "MMMU": - sample = samples[idx] - + if config.task == "captioning": + output["ground_truth"] = answers + elif config.task in ("TextVQA", "VQAv2"): + output["gt_answer"] = [ans for ans in answers] + elif config.task == "ChartQA": + output["gt_answer"] = [answers] + elif config.task == "MMMU": prediction = generated - if sample["question_type"] == "multiple-choice": + if metadata["question_type"] == "multiple-choice": prediction = parse_multi_choice_response( - generated, sample["all_choices"], sample["index2ans"] + generated, metadata["all_choices"], metadata["index2ans"] ) output["prediction"] = prediction @@ -429,27 +718,69 @@ def generate_samples(model): idx += 1 -def generate_and_write_samples(model): - """Generate text and write to an output file.""" +def get_evaluation_config(): + """Get evaluation config from a config file or command-line arguments.""" args = get_args() + if args.config_path: + with open(args.config_path, "r") as f: + config_dict = yaml.safe_load(f) - for output in generate_samples(model): - if torch.distributed.get_rank() == 0: - with open(args.output_path, 'a') as f: - f.write(json.dumps(output) + "\n") + config = EvaluationConfig(**config_dict) + else: + config = EvaluationConfig( + task=args.task, + temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + out_seq_length=args.out_seq_length, + output_path=args.output_path, + input_image_path=args.input_image_path, + gt_path=args.gt_path, + num_partitions=args.num_partitions, + partition_id=args.partition_id, + num_samples_per_partition=args.num_samples_per_partition, + prompt_format=args.prompt_format, + ) + + # Default output path if not defined... 
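`get_evaluation_config` above accepts either the generation CLI flags or a YAML file whose keys mirror the `EvaluationConfig` dataclass in `config.py`. A hypothetical config for a TextVQA run (the paths are placeholders):

```python
# Hypothetical contents of the file passed via --config-path; keys mirror
# EvaluationConfig and the paths are placeholders.
import yaml

from config import EvaluationConfig

config_yaml = """
task: TextVQA
temperature: 1.0
top_k: 1
top_p: 0.0
out_seq_length: 32
input_image_path: /data/textvqa/train_images
gt_path: /data/textvqa/val_annotations.json
prompt_format: mistral
"""

config = EvaluationConfig(**yaml.safe_load(config_yaml))
```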
+ if not config.output_path: + os.makedirs("generated", exist_ok=True) + config.output_path = "generated/" + args.language_model_type + + return config + + +def generate_and_write_samples(model, config): + """Generate text and write to an output file.""" + rank = torch.distributed.get_rank() + + if rank == 0: + output_file = open(config.output_path, "w") + print(f"output path: {output_file.name}") + + for output in generate_samples(model, config): + if rank == 0: + output_file.write(json.dumps(output) + "\n") + output_file.flush() class VLMForwardStep(ForwardStep): """Inference forward step for a multimodal model.""" - def __init__(self, num_img_embeddings_per_tile, images, num_tiles, model, - max_batch_size, max_sequence_length): + def __init__( + self, + num_img_embeddings_per_tile, + images, + num_tiles, + model, + max_batch_size, + max_sequence_length, + ): """Create multimodal forward step.""" total_num_tiles = torch.sum(num_tiles).item() - num_img_embeddings = num_img_embeddings_per_tile * total_num_tiles + num_img_embeddings = num_img_embeddings_per_tile * total_num_tiles - super().__init__( - model, max_batch_size, max_sequence_length + num_img_embeddings) + super().__init__(model, max_batch_size, max_sequence_length + num_img_embeddings) self._images = images self._num_tiles = num_tiles @@ -461,6 +792,7 @@ def _forward(self, tokens, position_ids, attention_mask): attention_mask=None, inference_params=self.inference_params, num_image_tiles=self._num_tiles, + runtime_gather_output=True, ) def __call__(self, tokens, position_ids, attention_mask): @@ -468,101 +800,90 @@ def __call__(self, tokens, position_ids, attention_mask): # On the first inference iteration, we compute image tokens. # Update the sequence length offset by the number of image tokens. - num_images = (tokens == -200).sum().item() + num_image_tokens = (tokens == -200).sum().item() num_tokens = tokens.size(1) - if num_tokens > 1 and num_images > 0: + if num_tokens > 1 and num_image_tokens > 0: self.inference_params.sequence_len_offset += ( - self.inference_params.key_value_memory_dict["image_tokens_count"] - num_images + self.inference_params.key_value_memory_dict["image_tokens_count"] - num_image_tokens ) return logits -def get_prompt(task, questions, idx, prompt_format): +def get_prompt(task, question, prompt_format): """Get a prompt for the evaluation task.""" if task == "captioning": if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\nProvide a one-sentence caption for provided image.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" elif prompt_format == "mistral": - prompt = "Give a short and clear explanation of the subsequent image.\n" + prompt = ( + "[INST] Give a short and clear explanation of the subsequent image. [/INST]" + ) elif task == "TextVQA": - question = questions[idx] - if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format( question ) elif prompt_format == "mistral": - prompt = "\n{}\nAnswer the question using a single word or phrase.".format( + prompt = "[INST] \n{}\nAnswer the question using a single word or phrase. 
[/INST]".format( question ) elif task == "VQAv2": - question = questions[idx] - if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format( question ) elif prompt_format == "mistral": - prompt = "\n{}\nAnswer the question using a single word or phrase.".format( + prompt = "[INST] \n{}\nAnswer the question using a single word or phrase. [/INST]".format( question ) elif task == "ChartQA": - question = questions[idx] - if prompt_format == "llama3": prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format( - questions + question ) elif prompt_format == "mistral": - prompt = "\n{}\nAnswer the question using a single word or phrase.".format( + prompt = "[INST] \n{}\nAnswer the question using a single word or phrase. [/INST]".format( question ) elif task == "MMMU": - question = questions[idx] - if prompt_format == "llama3": - prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|>{}<|start_header_id|>user<|end_header_id|>\n\n\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - prompt = prompt.format("", question) + prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + prompt = prompt.format(question) elif prompt_format == "mistral": - prompt = "\n{}\nAnswer the question using a single word or phrase.".format( - question - ) + prompt = "[INST] {} [/INST]".format(question) elif task == "VideoMME": - question = ( + q = ( "Select the best answer to the following multiple-choice " "question based on the video. 
Respond with only the letter " - "(A, B, C, or D) of the correct option.\n") - question += (questions[idx]["questions"][0]["question"] + "\n") - question += (questions[idx]["questions"][0]["choices"][0] + "\n") - question += (questions[idx]["questions"][0]["choices"][1] + "\n") - question += (questions[idx]["questions"][0]["choices"][2] + "\n") - question += (questions[idx]["questions"][0]["choices"][3] + "\n") + "(A, B, C, or D) of the correct option.\n" + ) + q += question["questions"][0]["question"] + "\n" + q += question["questions"][0]["choices"][0] + "\n" + q += question["questions"][0]["choices"][1] + "\n" + q += question["questions"][0]["choices"][2] + "\n" + q += question["questions"][0]["choices"][3] + "\n" if prompt_format == "llama3": - prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|>{}<|start_header_id|>user<|end_header_id|>\n\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - prompt = prompt.format("", question) + prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + prompt = prompt.format(q) elif prompt_format == "mistral": - prompt = "\n{}".format( - question - ) + prompt = "[INST] \n{} [/INST]".format(q) return prompt -def get_generated(prompt, prompt_format, prompt_and_generation): +def get_generated(prompt_and_generation, prompt_format): """Strip prompt and other unnecessary text from generation.""" - start = len(prompt.replace("", "")) if prompt_format == "llama3": - start += len("<|begin_of_text|>") - start += 1 + generated = prompt_and_generation.split( + "<|start_header_id|>assistant<|end_header_id|>\n\n" + )[-1] + generated = generated.split("<|eot_id|>")[0] elif prompt_format == "mistral": - start += len(" ") + generated = prompt_and_generation.split("[/INST]")[-1] + generated = generated.split("")[0] - generated = prompt_and_generation[start:] - generated = generated.replace(" ", "") - generated = generated.split("<|eot_id|>")[0] - generated = generated.split("")[0] generated = generated.strip() generated = generated.split("\n\n")[0] generated = generated.split("\n")[0] @@ -577,15 +898,16 @@ def _decorate_tokenize(f): # When tokenizing, replace with the image token index (-200) def wrapper(prompt): tokens = tokenizer_image_token(args, prompt, f) + return tokens return wrapper def _decorate_detokenize(f): - # When detokenizing, replace image token index (-200) with a dummy value. + # When detokenizing, skip image token index. 
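The rewritten `get_generated` above strips the prompt by splitting on chat-template markers instead of relying on prompt length. A small example of the llama3 branch (the model output text is made up):

```python
# Example of the llama3 branch of get_generated(); the caption is made up.
raw = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
    "Provide a one-sentence caption for provided image."
    "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    "A dog catching a frisbee in the park.<|eot_id|>"
)
generated = raw.split("<|start_header_id|>assistant<|end_header_id|>\n\n")[-1]
generated = generated.split("<|eot_id|>")[0].strip()
# generated == "A dog catching a frisbee in the park."
```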
def wrapper(tokens): tokens = np.array(tokens) - tokens[tokens == IMAGE_TOKEN_INDEX] = 0 + tokens = tokens[tokens != IMAGE_TOKEN_INDEX] tokens = tokens.tolist() return f(tokens) @@ -617,9 +939,12 @@ def wrapped_model_provider(pre_process, post_process): _ = load_checkpoint(model, None, None) model = model[0] + model.eval() - generate_and_write_samples(model) + config = get_evaluation_config() + + generate_and_write_samples(model, config) if __name__ == "__main__": diff --git a/examples/multimodal/sft_mistral_clip.sh b/examples/multimodal/sft_mistral_clip.sh index 93a0a91366..46fc996055 100755 --- a/examples/multimodal/sft_mistral_clip.sh +++ b/examples/multimodal/sft_mistral_clip.sh @@ -37,7 +37,6 @@ fi CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml" -DATA_VALID="${SOURCE}/examples/multimodal/sft_dataset.yaml" DEBUG=0 if [[ $DEBUG -eq 1 ]]; then @@ -101,7 +100,6 @@ OPTIONS=" \ --tokenizer-type HuggingFaceTokenizer \ --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ --data-path ${DATA_TRAIN} \ - --valid-path ${DATA_VALID} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ --save-interval 500 \ --save ${FINETUNE_DIR} \ diff --git a/examples/multimodal/text_generation_mistral_clip.sh b/examples/multimodal/text_generation_mistral_clip.sh index 30d1b06ab4..b78969ab59 100755 --- a/examples/multimodal/text_generation_mistral_clip.sh +++ b/examples/multimodal/text_generation_mistral_clip.sh @@ -4,7 +4,6 @@ export NCCL_IB_SL=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 export NVTE_APPLY_QK_LAYER_SCALING=0 -INPUT_METADATA_PATH="placeholder" GROUNDTRUTH_PATH="placeholder" NUM_FRAMES=1 @@ -15,11 +14,6 @@ while [[ $# -gt 0 ]]; do shift shift ;; - --input-metadata-path) - INPUT_METADATA_PATH="$2" - shift - shift - ;; --num-frames) NUM_FRAMES="$2" shift @@ -112,7 +106,6 @@ do --no-load-rng \ --no-load-optim \ --input-image-path ${INPUT_IMAGE_PATH} \ - --input-metadata-path ${INPUT_METADATA_PATH} \ --num-partitions ${NUM_PARTITIONS} \ --partition-id ${PARTITION_ID} \ --output-path ${OUTPUT_PATH}-${TASK}-${PARTITION_ID}.jsonl \ diff --git a/examples/multimodal/train.py b/examples/multimodal/train.py index e1cad7814e..386cdc03d0 100644 --- a/examples/multimodal/train.py +++ b/examples/multimodal/train.py @@ -1,131 +1,29 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
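`patch_tokenizer` above wraps detokenization so the image placeholder id is dropped rather than mapped to a dummy token. A minimal sketch of the filtering step (the surrounding token ids are made up):

```python
import numpy as np

IMAGE_TOKEN_INDEX = -200  # image placeholder id used by the LLaVA model

tokens = np.array([1, -200, 278, 4799])  # surrounding ids are made up
tokens = tokens[tokens != IMAGE_TOKEN_INDEX].tolist()
# [1, 278, 4799] is what reaches the real detokenize function.
```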
"""Pretrain or SFT multimodal.""" -from copy import deepcopy -from functools import partial +import json import os import sys -import warnings +from functools import partial import torch +import yaml sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))) -from megatron.training import get_args, get_timers, get_tokenizer, print_rank_0 -from megatron.training.arguments import core_transformer_config_from_args +from config import EvaluationConfig +from dataloader_provider import train_valid_test_dataloaders_provider +from evaluate_textvqa import textvqa_eval +from model import model_provider +from multimodal_args import add_multimodal_extra_args +from run_text_generation import generate_samples, patch_tokenizer + from megatron.core import mpu, tensor_parallel from megatron.core.enums import ModelType -from megatron.core.parallel_state import get_tensor_model_parallel_rank -from config import get_language_model_config, get_vision_model_config, get_vision_projection_config -from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings from megatron.core.models.multimodal.llava_model import LLaVAModel -from layer_specs import get_layer_spec, get_mlp_module_spec, get_layer_spec_te -from megatron.training import pretrain -from dataloader_provider import train_valid_test_dataloaders_provider - -def model_provider( - pre_process=True, post_process=True, add_encoder=True, add_decoder=True, - parallel_output=True) -> LLaVAModel: - """Builds the model. - - Args: - pre_process (bool): Include the embedding layer in the gpt decoder (used with pipeline parallelism). Defaults to True. - post_process (bool): Include an output layer and a layernorm in the gpt decoder (used with pipeline parallelism). Defaults to True. - add_encoder (bool): Construct the encoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the encoder - will live on only a subset of the pipeline stages (specifically, only the first stage). - add_decoder (bool): Construct the decoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the decoder - will live on only a subset of the pipeline stages (specifically, every stage after the first one). - parallel_output (bool): Enable parallel model output. - - Returns: - model: A multimodal model. 
- """ - args = get_args() - - use_te = args.use_te - - print_rank_0('building a multimodal model ...') - - num_image_embeddings = get_num_image_embeddings(args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1) - old_seq_length = args.seq_length - args.seq_length = args.encoder_seq_length = num_image_embeddings - if torch.distributed.get_rank() == 0 and old_seq_length != args.seq_length: - warnings.warn(f"Changed seq_length and encoder_seq_length (vision model sequence length) from {old_seq_length} to num_image_tokens ({num_image_embeddings})") - - max_num_image_embeddings = (args.max_num_tiles + int(args.use_thumbnail)) * num_image_embeddings - - assert args.decoder_seq_length is not None, "Please provide --decoder-seq-length to set the language model sequence length" - assert args.decoder_seq_length > max_num_image_embeddings, "Language model sequence length must be greater than the maximum number of image embeddings" - if args.decoder_seq_length > args.max_position_embeddings: - args.max_position_embeddings = args.decoder_seq_length - warnings.warn(f"Expanded max_position_embeddings to {args.max_position_embeddings} to accommodate the maximum language model sequence length") - - base_config = core_transformer_config_from_args(get_args()) - base_config.language_model_type = args.language_model_type - base_config.vision_model_type = args.vision_model_type - base_config.calculate_per_token_loss = True - - language_config = deepcopy(base_config) - language_config = get_language_model_config(language_config) - - if use_te: - language_transformer_layer_spec = get_layer_spec_te(is_vit=False) # TENorm detects LayerNorm/RMS automatically. - else: - language_transformer_layer_spec = get_layer_spec(is_vit=False, normalization=language_config.normalization) - - vision_config = deepcopy(base_config) - vision_config = get_vision_model_config(vision_config, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling) - - vision_model_type = args.vision_model_type - if vision_model_type == "clip": - if use_te: - vision_transformer_layer_spec = get_layer_spec_te(is_vit=True) # TENorm detects LayerNorm/RMS automatically. - else: - vision_transformer_layer_spec = get_layer_spec(is_vit=True, normalization=vision_config.normalization) - else: - raise RuntimeError("unsupported vision model type", vision_model_type) - - vision_projection_config = deepcopy(base_config) - vision_projection_config = get_vision_projection_config(vision_projection_config, language_config.hidden_size) - - if args.encoder_pipeline_model_parallel_size > 0: - assert args.encoder_pipeline_model_parallel_size == 1, "vision model and projection can only live on 1 pipeline stage." 
- vision_config.pipeline_model_parallel_size = args.encoder_pipeline_model_parallel_size - vision_projection_config.pipeline_model_parallel_size = args.encoder_pipeline_model_parallel_size - if args.encoder_tensor_model_parallel_size > 0: - vision_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size - vision_projection_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size - - vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules - - model = LLaVAModel( - language_transformer_config=language_config, - language_transformer_layer_spec=language_transformer_layer_spec, - language_vocab_size=args.padded_vocab_size, - language_max_sequence_length=args.decoder_seq_length, - vision_transformer_config=vision_config, - vision_transformer_layer_spec=vision_transformer_layer_spec, - drop_vision_class_token=args.disable_vision_class_token, - vision_projection_config=vision_projection_config, - vision_projection_layer_spec=vision_projection_layer_spec, - vision_projection_type="mlp", - allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint, - parallel_output=parallel_output, - language_position_embedding_type=args.position_embedding_type, - language_rotary_percent=args.rotary_percent, - pre_process=pre_process, - post_process=post_process, - add_encoder=add_encoder, - add_decoder=add_decoder, - img_h=args.img_h, - img_w=args.img_w, - patch_dim=args.patch_dim, - language_rotary_base=args.rotary_base, - ) - - model.freeze(freeze_language_model=args.freeze_LM, freeze_vision_model=args.freeze_ViT, freeze_vision_projection=False) - - return model +from megatron.core.parallel_state import get_tensor_model_parallel_rank +from megatron.training import get_args, get_timers, get_tokenizer, pretrain +from megatron.training.utils import is_last_rank def get_batch(data_iterator): @@ -314,32 +212,6 @@ def forward_step(data_iterator, model: LLaVAModel): return output_tensor, partial(loss_func, loss_mask) -def add_multimodal_extra_args(parser): - """Extra arguments.""" - group = parser.add_argument_group(title='multimodal arguments') - group.add_argument('--valid-path', nargs='*', default=None, - help='Path to the training dataset. 
Accepted format:' - '1) a single data path, 2) multiple datasets in the' - 'form: dataset1-weight dataset1-path dataset2-weight ' - 'dataset2-path ...') - group.add_argument('--dataset-config', type=str, default=None) - group.add_argument("--prompt-path", type=str, default=None) - group.add_argument('--freeze-LM', action='store_true', default=False) - group.add_argument('--freeze-ViT', action='store_true', default=False) - group.add_argument('--language-model-type', type=str, required=True) - group.add_argument('--vision-model-type', type=str, default="clip") - group.add_argument("--disable-vision-class-token", action="store_true", default=False) - group.add_argument("--allow-missing-vision-projection-checkpoint", action="store_true", default=False) - group.add_argument("--use-te", action="store_true", default=False) - group.add_argument("--dataloader-save", type=str, default=None, help="Energon dataloader state save path") - group.add_argument("--use-tiling", action="store_true", default=False, help="Use input image tiling") - group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles") - group.add_argument("--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile") - group.add_argument("--dataloader-seq-length", type=int, help="Make dataloader to produce sequences of specific length.") - group.add_argument("--num-frames", type=int, default=1, help="Number of frames to regularly sample from the video as input to the model.") - - return parser - def llava_embedding_ranks(pp_ranks): """LLava's embedding ranks consist of the decoder's first and last ranks (ie, the ViT has no embeddings). @@ -375,6 +247,64 @@ def llava_position_embedding_ranks(pp_ranks): return [pp_ranks[epp]] + +def run_online_eval(model): + """Run an evaluation benchmark during training.""" + args = get_args() + + # Online evaluation config is not defined. Do nothing. + if not args.online_evaluation_config: + return [] + + with open(args.online_evaluation_config, "r") as f: + config_dict = yaml.safe_load(f) + + config = EvaluationConfig(**config_dict) + + patch_tokenizer(args) + + # The inference code assumes the first rank is the leader. + # Tensorboard writer is on the last rank. + # We must write to a storage space that all ranks see. + output_dir = os.path.join(args.save, "online_eval") + os.makedirs(output_dir, exist_ok=True) + config.output_path = os.path.join(output_dir, f"{config.task}.jsonl") + + if torch.distributed.get_rank() == 0: + output_file = open(config.output_path, "w") + + with torch.no_grad(): + for output in generate_samples(model[0].module, config): + if torch.distributed.get_rank() == 0: + output_file.write(json.dumps(output) + "\n") + + if torch.distributed.get_rank() == 0: + output_file.close() + + # Make sure the first rank is done writing so that the last rank can run eval. 
+ torch.distributed.barrier() + + if not is_last_rank(): + return [] + + if config.task.lower() == "textvqa": + avg_acc = textvqa_eval(config.output_path) + + return [{"textvqa accuracy": avg_acc}] + else: + raise NotImplementedError(f"online evaluation of {config.task} not implemented yet") + + +def write_online_eval_to_tensorboard(data, iteration, writer): + """Write online evaluation data to Tensorboard.""" + if not writer: + return + + for item in data: + for k, v in item.items(): + writer.add_scalar(k, v, iteration) + + if __name__ == "__main__": train_valid_test_dataloaders_provider.is_distributed = True @@ -385,6 +315,8 @@ def llava_position_embedding_ranks(pp_ranks): forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, extra_args_provider=add_multimodal_extra_args, + process_non_loss_data_func=write_online_eval_to_tensorboard, get_embedding_ranks=llava_embedding_ranks, get_position_embedding_ranks=llava_position_embedding_ranks, + non_loss_data_func=run_online_eval ) diff --git a/megatron/core/dist_checkpointing/strategies/torch.py b/megatron/core/dist_checkpointing/strategies/torch.py index 96f2c316c5..077d94eb77 100644 --- a/megatron/core/dist_checkpointing/strategies/torch.py +++ b/megatron/core/dist_checkpointing/strategies/torch.py @@ -10,7 +10,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast import torch -from pkg_resources import packaging +from packaging.version import Version as PkgVersion from torch.distributed import checkpoint from torch.distributed._shard.metadata import ShardMetadata from torch.distributed._shard.sharded_tensor import Shard, ShardedTensorMetadata, TensorProperties @@ -448,8 +448,9 @@ def __init__( nd_flattened_global_shapes: Optional[Dict[str, Tuple[int, ...]]] = None, **kwargs, ) -> None: - # `dedup_replicated_tensors` was deprecated in 2.3 - avoids tons of warnings during saving - if packaging.version.Version(torch.__version__) <= packaging.version.Version("2.2"): + # `dedup_replicated_tensors` was deprecated in 2.3; this check avoids warnings + # during saving. + if PkgVersion(torch.__version__) <= PkgVersion("2.2"): kwargs['dedup_replicated_tensors'] = dedup_replicated_tensors super().__init__(*args, **kwargs) self.nd_flattened_global_shapes = nd_flattened_global_shapes or {} diff --git a/megatron/core/export/__init__.py b/megatron/core/export/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/data_type.py b/megatron/core/export/data_type.py new file mode 100644 index 0000000000..38fbdea8f6 --- /dev/null +++ b/megatron/core/export/data_type.py @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from enum import Enum + +DataType = Enum('DataType', ["bfloat16", "float16", "float32"]) diff --git a/megatron/core/export/export_config.py b/megatron/core/export/export_config.py new file mode 100644 index 0000000000..2cc1e208be --- /dev/null +++ b/megatron/core/export/export_config.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
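# Illustrative sketch, not part of this patch: the functional Enum form used in
# data_type.py above yields members that can be looked up by name, and the exporter
# code later in this patch relies on `.name` to obtain the TensorRT-LLM dtype string.
#
#     from megatron.core.export.data_type import DataType
#     dt = DataType["bfloat16"]
#     assert dt.name == "bfloat16" and dt.value == 1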
+ +from dataclasses import dataclass + + +@dataclass +class ExportConfig: + """Base configuration for Megatron Core Export + + These parameters control the export setting for trtllm + """ + + inference_tp_size: int = 1 + + inference_pp_size: int = 1 + + use_parallel_embedding: bool = False + + use_embedding_sharing: bool = False diff --git a/megatron/core/export/model_type.py b/megatron/core/export/model_type.py new file mode 100644 index 0000000000..6a33d6440e --- /dev/null +++ b/megatron/core/export/model_type.py @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from enum import Enum + +ModelType = Enum( + 'ModelType', ["gpt", "gptnext", "llama", "falcon", "starcoder", "mixtral", "gemma"] +) diff --git a/megatron/core/export/trtllm/__init__.py b/megatron/core/export/trtllm/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/trtllm/engine_builder/__init__.py b/megatron/core/export/trtllm/engine_builder/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/engine_builder/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py b/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py new file mode 100644 index 0000000000..e729fec410 --- /dev/null +++ b/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py @@ -0,0 +1,148 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import tensorrt_llm +from tensorrt_llm._common import check_max_num_tokens +from tensorrt_llm.builder import BuildConfig +from tensorrt_llm.commands.build import build as build_trtllm +from tensorrt_llm.logger import logger +from tensorrt_llm.lora_manager import LoraConfig +from tensorrt_llm.models.modeling_utils import optimize_model, preprocess_weights +from tensorrt_llm.plugin import PluginConfig + + +class TRTLLMEngineBuilder: + """A utility class to build TRTLLM engine""" + + @staticmethod + def build_and_save_engine( + engine_dir: str, + trtllm_model_weights: dict, + trtllm_model_config, + max_input_len: int = 1024, + max_output_len: int = 1024, + max_batch_size: int = 4, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank: int = 64, + lora_target_modules=None, + max_prompt_embedding_table_size: int = 0, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, + paged_context_fmha: bool = False, + use_refit: bool = False, + max_num_tokens: int = None, + max_seq_len: int = None, + opt_num_tokens: int = None, + max_beam_width: int = 1, + tokens_per_block: int = 128, + multiple_profiles: bool = False, + gpt_attention_plugin: str = "auto", + gemm_plugin: str = "auto", + ): + """Method to build the TRTLLM Engine + + This method uses the TRTLLMEngineBuilder to build and save the engine to engine dir + + Args: + engine_dir (str): The file path to save the engine + trtllm_model_weights (dict): The TRTLLM converted model weights dict + trtllm_model_config : The TRTLLM Config + max_input_len (int, optional): Max input length. Defaults to 1024. + max_output_len (int, optional): Max output length. Defaults to 1024. + max_batch_size (int, optional): Max batch size. Defaults to 4. + model_type (ModelType, optional): ModelType enum. Defaults to ModelType.gpt. 
+ lora_ckpt_list (_type_, optional): Lora checkpoint list. Defaults to None. + use_lora_plugin (_type_, optional): Use lora plugin. Defaults to None. + max_lora_rank (int, optional): Max lora rank. Defaults to 64. + lora_target_modules (_type_, optional): Lora target modules. Defaults to None. + max_prompt_embedding_table_size (int, optional): Defaults to 0. + paged_kv_cache (bool, optional): Use Paged KV cache. Defaults to True. + remove_input_padding (bool, optional): Remove input padding. Defaults to True. + paged_context_fmha (bool, optional): Paged context fmha. Defaults to False. + use_refit (bool, optional): Use refit. Defaults to False. + max_num_tokens (int, optional): Max num of tokens. Defaults to None. + max_seq_len (int, optional): Max seq length. Defaults to None. + opt_num_tokens (int, optional): Opt number of tokens. Defaults to None. + max_beam_width (int, optional): Max beam width. Defaults to 1. + tokens_per_block (int, optional): Nmber of tokens per block. Defaults to 128. + multiple_profiles (bool, optional): Use multiple profiles. Defaults to False. + gpt_attention_plugin (str, optional): Gpt attention plugin to use. Defaults to "auto". + gemm_plugin (str, optional): Gemma plugin to use. Defaults to "auto". + """ + architecture = ( + "LLaMAForCausalLM" + if trtllm_model_config.architecture == "LlamaForCausalLM" + else trtllm_model_config.architecture + ) + try: + model_cls = getattr(tensorrt_llm.models, architecture) + except: + raise AttributeError(f"Could not find TRTLLM model for architecture: {architecture}!") + + logger.set_level("info") + plugin_config = PluginConfig() + plugin_config.gpt_attention_plugin = gpt_attention_plugin + plugin_config.gemm_plugin = gemm_plugin + if paged_kv_cache: + plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block) + else: + plugin_config.paged_kv_cache = False + plugin_config.remove_input_padding = remove_input_padding + plugin_config.use_paged_context_fmha = paged_context_fmha + plugin_config.multiple_profiles = multiple_profiles + + if max_seq_len is None: + max_seq_len = max_input_len + max_output_len + + max_num_tokens, opt_num_tokens = check_max_num_tokens( + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, + max_seq_len=max_seq_len, + max_batch_size=max_batch_size, + max_input_len=max_input_len, + max_beam_width=max_beam_width, + remove_input_padding=remove_input_padding, + enable_context_fmha=plugin_config.context_fmha, + tokens_per_block=tokens_per_block, + multiple_profiles=multiple_profiles, + ) + + build_dict = { + 'max_input_len': max_input_len, + 'max_output_len': max_output_len, + 'max_batch_size': max_batch_size, + 'max_beam_width': max_beam_width, + 'max_seq_len': max_seq_len, + 'max_num_tokens': max_num_tokens, + 'opt_num_tokens': opt_num_tokens, + 'max_prompt_embedding_table_size': max_prompt_embedding_table_size, + 'gather_context_logits': False, + 'gather_generation_logits': False, + 'strongly_typed': False, + 'builder_opt': None, + 'use_refit': use_refit, + 'multiple_profiles': multiple_profiles, + } + build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config) + + if use_lora_plugin is not None: + # build_config.plugin_config.set_lora_plugin(use_lora_plugin) + # build_config.plugin_config._lora_plugin = use_lora_plugin + lora_config = LoraConfig( + lora_dir=lora_ckpt_list, + lora_ckpt_source='nemo', # TODO : NEED TO SEE HOW TO HANDLE THIS FOR MCORE + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + ) + build_config.lora_config = 
lora_config + + model = model_cls.from_config(trtllm_model_config) + model = optimize_model( + model, + use_parallel_embedding=trtllm_model_config.use_parallel_embedding, + share_embedding_table=trtllm_model_config.share_embedding_table, + ) + preprocess_weights(trtllm_model_weights, trtllm_model_config) + model.load(trtllm_model_weights) + engine = build_trtllm(model, build_config) + engine.save(engine_dir) diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py b/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py b/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py new file mode 100644 index 0000000000..cad9315034 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.model_type import ModelType +from megatron.core.export.trtllm.model_to_trllm_mapping.falcon_model import FALCON_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.gemma_model import GEMMA_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_model import GPT_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_next_model import GPT_NEXT_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.llama_model import LLAMA_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.starcoder_model import STARCODER_DICT + +DEFAULT_CONVERSION_DICT = { + ModelType.llama: LLAMA_DICT, + ModelType.falcon: FALCON_DICT, + ModelType.gemma: GEMMA_DICT, + ModelType.starcoder: STARCODER_DICT, + ModelType.gpt: GPT_DICT, + ModelType.gptnext: GPT_NEXT_DICT, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py new file mode 100644 index 0000000000..d1469d02ba --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
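# Illustrative sketch, not part of this patch: DEFAULT_CONVERSION_DICT above is keyed by
# ModelType, and TRTLLMHelper (defined later in this patch) overlays any user-supplied
# trtllm_conversion_dict on top of the selected per-model dict. A hypothetical overlay for
# a checkpoint that uses a non-standard layer name could look like:
#
#     from megatron.core.export.model_type import ModelType
#     from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import (
#         DEFAULT_CONVERSION_DICT,
#     )
#     from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers
#
#     overlay = {'decoder.layers.custom_norm.weight': TRTLLMLayers.input_layernorm_weight}
#     conversion_dict = {**DEFAULT_CONVERSION_DICT[ModelType.llama], **overlay}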
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +FALCON_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + 'embedding.position_embeddings.weight': TRTLLMLayers.position_embedding, + # ATTENTION + 'decoder.layers.input_layernorm.weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.input_layernorm.bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + # MLP + 'decoder.layers.pre_mlp_layernorm.weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.pre_mlp_layernorm.bias': TRTLLMLayers.post_layernorm_bias, + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py new file mode 100644 index 0000000000..47a0211706 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py @@ -0,0 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +GEMMA_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + # ATTENTION + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + # MLP + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py new file mode 100644 index 0000000000..eda27600c6 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
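# Illustrative note, not part of this patch: the keys in these conversion dicts deliberately
# omit layer numbers. During conversion (see trtllm_layers.py later in this patch) a key such
# as 'decoder.layers.3.self_attention.linear_qkv.weight' is first stripped to
# 'decoder.layers.self_attention.linear_qkv.weight', looked up in the dict, and the layer
# number is then re-inserted into the TRTLLM name, giving
# 'transformer.layers.3.attention.qkv.weight'.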
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +GPT_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + 'embedding.position_embeddings.weight': TRTLLMLayers.position_embedding, + # ATTENTION + 'decoder.layers.input_layernorm.weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.input_layernorm.bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.bias': TRTLLMLayers.attention_qkv_bias, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + 'decoder.layers.self_attention.linear_proj.bias': TRTLLMLayers.attention_dense_bias, + # MLP + 'decoder.layers.pre_mlp_layernorm.weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.pre_mlp_layernorm.bias': TRTLLMLayers.post_layernorm_bias, + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc1.bias': TRTLLMLayers.mlp_fc_bias, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc2.bias': TRTLLMLayers.mlp_projection_bias, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py new file mode 100644 index 0000000000..ac5f84ef1b --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +GPT_NEXT_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + # ATTENTION + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + # MLP + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_bias': TRTLLMLayers.post_layernorm_bias, + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py new file mode 100644 index 0000000000..5fd2067081 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +LLAMA_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + 'embedding.position_embeddings.weight': TRTLLMLayers.position_embedding, + # ATTENTION + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + # MLP + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py new file mode 100644 index 0000000000..dce61d26c5 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +STARCODER_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + # ATTENTION + 'decoder.layers.input_layernorm.weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.input_layernorm.bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.bias': TRTLLMLayers.attention_qkv_bias, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + 'decoder.layers.self_attention.linear_proj.bias': TRTLLMLayers.attention_dense_bias, + # MLP + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc1.bias': TRTLLMLayers.mlp_fc_bias, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc2.bias': TRTLLMLayers.mlp_projection_bias, + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_bias': TRTLLMLayers.post_layernorm_bias, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/trt_model_config.py b/megatron/core/export/trtllm/trt_model_config.py new file mode 100644 index 0000000000..2ed09398c2 --- /dev/null +++ b/megatron/core/export/trtllm/trt_model_config.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
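# Illustrative note, not part of this patch: STARCODER_DICT above lists both the standalone
# 'input_layernorm.*' keys and the fused 'linear_qkv.layer_norm_*' keys, so the same mapping
# should cover checkpoints from either the local layer spec or the Transformer Engine spec,
# where the layernorm weights are folded into the linear layers.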
+ +import tensorrt_llm + +from megatron.core.export.model_type import ModelType + +TRT_MODEL_CONFIG = { + ModelType.gpt: tensorrt_llm.models.gpt.config.GPTConfig, + ModelType.gptnext: tensorrt_llm.models.gpt.config.GPTConfig, + ModelType.starcoder: tensorrt_llm.models.gpt.config.GPTConfig, + ModelType.mixtral: tensorrt_llm.models.llama.config.LLaMAConfig, + ModelType.llama: tensorrt_llm.models.llama.config.LLaMAConfig, + ModelType.gemma: tensorrt_llm.models.GemmaConfig, + ModelType.falcon: tensorrt_llm.models.falcon.config.FalconConfig, +} diff --git a/megatron/core/export/trtllm/trt_model_type.py b/megatron/core/export/trtllm/trt_model_type.py new file mode 100644 index 0000000000..f45ff1786e --- /dev/null +++ b/megatron/core/export/trtllm/trt_model_type.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.model_type import ModelType + +TRT_MODEL_TYPE_STRING = { + ModelType.gpt: 'GPTForCausalLM', + ModelType.gptnext: 'GPTForCausalLM', + ModelType.starcoder: 'GPTForCausalLM', + ModelType.mixtral: 'LlamaForCausalLM', + ModelType.llama: 'LlamaForCausalLM', + ModelType.gemma: 'GemmaForCausalLM', + ModelType.falcon: 'FalconForCausalLM', +} diff --git a/megatron/core/export/trtllm/trtllm_helper.py b/megatron/core/export/trtllm/trtllm_helper.py new file mode 100644 index 0000000000..d8bef18b33 --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_helper.py @@ -0,0 +1,461 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import tensorrt_llm +from tensorrt_llm.functional import non_gated_version +from tensorrt_llm.layers import MoeConfig + +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.model_type import ModelType +from megatron.core.export.trtllm.engine_builder.trtllm_engine_builder import TRTLLMEngineBuilder +from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import ( + DEFAULT_CONVERSION_DICT, +) +from megatron.core.export.trtllm.trt_model_config import TRT_MODEL_CONFIG +from megatron.core.export.trtllm.trt_model_type import TRT_MODEL_TYPE_STRING + +# pylint: disable=line-too-long +from megatron.core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter import ( + DistributedTRTLLMModelWeightsConverter, +) +from megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter import ( + SingleDeviceTRTLLMModelWeightsConverter, +) +from megatron.core.transformer.transformer_config import TransformerConfig + + +class TRTLLMHelper: + """TRTLLM Helper class to convert export and build TRTLLM model.""" + + def __init__( + self, + transformer_config: TransformerConfig, + model_type: ModelType, + trtllm_conversion_dict: dict = {}, + position_embedding_type: str = 'learned_absolute', + max_position_embeddings: int = None, + rotary_percentage: int = 1.0, + rotary_base: int = 10000, + moe_tp_mode: int = 2, + multi_query_mode: bool = False, + activation: str = "gelu", + seq_len_interpolation_factor: float = None, + moe_renorm_mode=None, + share_embeddings_and_output_weights=False, + ): + """Constructor for the TRTLLMHelper + + There are two public API's supported by this helper. + a) get_trtllm_pretrained_config_and_model_weights + b) build_and_save_engine + + Args: + transformer_config (TransformerConfig): The transformer config + model_type (ModelType): The type of the input model. 
Enum (megatron.core.export.model_type.ModelType) + conversion_dict (dict, optional): A conversion dictionary that will map your model layer names to trtllm equivalent layer names. Sample dictionaries are given megatron/core/export/model_mapping. NOTE: Ingore layer numbers in the model layer names. (e.g) decoder.layers.0.attention_qkv.weight will be decoder.layers.attention_qkv.weight in the mapping dictionary. Defaults to {}. + position_embedding_type (str, optional): The position embedding type. Defaults to None. + max_position_embeddings (int, optional): Max posistion embeddings value. Defaults to None. + rotary_percentage (int, optional): The rotary percentage if using rope embedding. Defaults to 1.0. + rotary_base (int, optional): The rotary base (theta value) if using rope embeddings. Defaults to 10000. + moe_tp_mode (int, optional): TRTLLM Config. Defaults to 2. + multi_query_mode (bool, optional): Defaults to False. + activation (str, optional): Defaults to "gelu". + seq_len_interpolation_factor (float, optional): The sequence length interpolation factor if using rope embeddings. Defaults to None. + moe_renorm_mode (optional) : Renormalization mode if using mixture of experts. Defaults to None. + share_embeddings_and_output_weights (bool, optional): True if input and output layers share weights. Defaults to False. + """ + + self.transformer_config = transformer_config + self.model_type = model_type + self.trtllm_conversion_dict = DEFAULT_CONVERSION_DICT[model_type] + self.trtllm_conversion_dict.update(trtllm_conversion_dict) + assert position_embedding_type in [ + 'learned_absolute', + 'rope', + ], f"Position embedding type should be one of learned_absolute, rope. You entered {position_embedding_type}" + self.position_embedding_type = position_embedding_type + self.max_position_embeddings = max_position_embeddings + self.rotary_percentage = rotary_percentage + self.rotary_base = rotary_base + self.moe_tp_mode = moe_tp_mode + self.multi_query_mode = multi_query_mode + self.activation = activation + self.seq_len_interpolation_factor = seq_len_interpolation_factor + self.moe_renorm_mode = moe_renorm_mode + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + + def _get_trtllm_config( + self, + export_config: ExportConfig, + world_size: int, + gpus_per_node: int, + vocab_size_padded: int, + dtype: DataType, + ): + """Get TRTLLM Config + + Returns appropriate TRTLLM PretrainedConfig used by TRTLLM for building engine + + Args: + export_config (ExportConfig): The export config that defines inference tp , pp size etc. 
+ world_size (int): The number of gpus (Mostly TP * PP) + gpus_per_node (int): Num gpus per node + vocab_size_padded (int): Padded vocab size + dtype (DataType): The datatype or model precision + + Returns: + GPTConfig or the LLamaConfig or the PretrainedConfig constructed from your model config + """ + hidden_act = self.activation + hidden_act = ( + hidden_act.split("-")[-1] + if self.transformer_config.num_moe_experts + else non_gated_version(hidden_act) + ) + + config = { + 'architecture': TRT_MODEL_TYPE_STRING[self.model_type], + 'dtype': dtype.name, + 'num_hidden_layers': self.transformer_config.num_layers, + 'num_attention_heads': self.transformer_config.num_attention_heads, + 'num_key_value_heads': ( + self.transformer_config.num_query_groups + if self.transformer_config.num_query_groups + else self.transformer_config.num_attention_heads + ), + 'head_size': self.transformer_config.kv_channels, + 'hidden_size': self.transformer_config.hidden_size, + 'intermediate_size': self.transformer_config.ffn_hidden_size, + 'norm_epsilon': self.transformer_config.layernorm_epsilon, + 'vocab_size': vocab_size_padded, + 'position_embedding_type': ( + "rope_gpt_neox" if self.position_embedding_type == "rope" else "learned_absolute" + ), + 'max_position_embeddings': self.max_position_embeddings, + 'hidden_act': hidden_act, + 'use_parallel_embedding': export_config.use_parallel_embedding, + 'embedding_sharding_dim': 0, + 'share_embedding_table': export_config.use_embedding_sharing, + 'quantization': {'quant_algo': None, 'kv_cache_quant_algo': None}, + 'bias': self.transformer_config.add_bias_linear, + 'apply_query_key_layer_scaling': False, + 'rotary_pct': self.rotary_percentage, + 'rotary_base': self.rotary_base, + 'moe_num_experts': ( + 0 + if self.transformer_config.moe_router_topk == 0 + else (self.transformer_config.num_moe_experts or 1) + ), + 'moe_top_k': self.transformer_config.moe_router_topk, + 'moe_normalization_mode': self.moe_renorm_mode + or MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE, + 'moe_tp_mode': self.moe_tp_mode, + 'logits_dtype': 'float32', + 'world_size': world_size, + 'tp_size': export_config.inference_tp_size, + 'pp_size': export_config.inference_pp_size, + 'gpus_per_node': gpus_per_node, + } + + if self.model_type == ModelType.falcon: + config["new_decoder_architecture"] = ( + False if self.transformer_config.num_layers == 32 else True + ) + config["parallel_attention"] = True + + if self.seq_len_interpolation_factor is not None: + config["rotary_scaling"] = { + "type": "linear", + "factor": float(self.seq_len_interpolation_factor), + } + + config_cls = TRT_MODEL_CONFIG[self.model_type] + return config_cls(**config) + + # pylint: disable=line-too-long + def get_trtllm_pretrained_config_and_model_weights( + self, + model_state_dict, + dtype: DataType, + export_config: ExportConfig = None, + on_device_distributed_conversion: bool = False, + vocab_size: int = None, + gpus_per_node: int = None, + state_dict_split_by_layer_numbers: bool = True, + ): + """Get TRTLLM Config and Converted Model Weights + + This function returns the trtllm model weights as a list. + There are two modes for conversion. The default is to use a single device cpu/gpu for conversion. + NOTE: For faster performance, if your entire model will fit in memory, pre transfer the model state dict to cuda device and then call this function. + For on device conversion it returns weights which will be used on the device itself. 
+ Same thing happens with the pretrained config + + Args: + model_state_dict (dict, optional): The input model state dictionary (Entire model state loaded on CPU). Used only when on device conversion is set to False. Defaults to None. + False, or the model state dict of each GPU in the case of on_device conversion) + export_config (ExportConfig): The export config used to define inference tp size, pp size etc. Used only for on device conversion. + dtype (DataType): The data type of model precision + on_device_distributed_conversion (bool, optional): Convert on gpus in distributed setting. This assumes that the model state dict is sharded according to required inference model parallelism and that each gpu gets its part of the model state dict . Defaults to False. + vocab_size (int, optional): The vocabulary size. Defaults to None. + gpus_per_node (int, optional): The number of gpus per node. Used for on device conversion. + state_dict_split_by_layer_numbers (bool, optional): Are the model layers split by layer numbers in state dict. For example : mlp.fc1.weight can be represented like mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim]} or it can be like mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight ... for all layers. If you use represenation 2 set this to True. Defaults to True + + Returns: + Two lists . First list of trtllm converted model weights(Either on device, or a list of weights for each gpu) and the trtllm_model_configs. + """ + if on_device_distributed_conversion: + assert (vocab_size is not None, "Need to pass in vocab_size for on device") + assert ( + self.model_type in [ModelType.gpt, ModelType.gptnext, ModelType.llama], + "On device conversion only supported for model types gptnext and llama", + ) + assert ( + export_config is None, + "Export config is inferred based on the parallel state. If you want to set inference tp 2, then load the model with this TP2 setting and just pass in the model state dict. ", + ) + assert ( + gpus_per_node is not None + ), "Need to pass in gpus_per_node for on device conversion" + trtllm_model_weights_on_device, trtllm_model_config = ( + self._get_trtllm_pretrained_config_and_model_weights_in_distributed_setting( + model_state_dict, dtype, vocab_size, gpus_per_node + ) + ) + return [trtllm_model_weights_on_device], [trtllm_model_config] + + else: + assert not ( + self.share_embeddings_and_output_weights and not export_config.use_embedding_sharing + ), "Found share_embeddings_and_output_weights is True in the model. So set export_config.use_embedding_sharing to True" + assert ( + vocab_size is None + ), "Vocab size is inferred from the input layer for cpu conversion. So leave it as None" + trtllm_model_weights_list, trtllm_model_config_list = ( + self._get_trtllm_pretrained_config_and_model_weights_list_on_single_device( + export_config, + model_state_dict, + dtype, + gpus_per_node, + state_dict_split_by_layer_numbers, + ) + ) + + return trtllm_model_weights_list, trtllm_model_config_list + + def _get_trtllm_pretrained_config_and_model_weights_in_distributed_setting( + self, model_state_dict: dict, dtype: DataType, vocab_size: int, gpus_per_node: int + ): + """Get the TRTLLM Pretrained config and model weights list in a distributed setting + + This function assumes the model state dict is distributed according to model parallelism . + Each device gets its own model state dict + + Args: + export_config (ExportConfig): The export config to set inference tp, pp size etc. 
+ model_state_dict (dict): The model state dictionary (All collected on cpu) + dtype (DataType): The data type or model precision + vocab_size (int): Tokenizer vocab size + gpus_per_node (int): The number of gpus per node + + Returns: + Two lists . List of trtllm converted model weights and trtllm model configs (One for each gpu). + """ + + distributed_trtllm_model_weights_converter = DistributedTRTLLMModelWeightsConverter( + transformer_config=self.transformer_config, + dtype=dtype, + multi_query_mode=self.multi_query_mode, + activation=self.activation, + ) + distributed_trtllm_model_weights_converter.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=self.trtllm_conversion_dict, + tokenizer_vocab_size=vocab_size, + ) + + export_config = ExportConfig( + inference_pp_size=distributed_trtllm_model_weights_converter.inference_pp_size, + inference_tp_size=distributed_trtllm_model_weights_converter.inference_tp_size, + use_parallel_embedding=True, + use_embedding_sharing=self.share_embeddings_and_output_weights, + ) + + world_size = export_config.inference_tp_size * export_config.inference_pp_size + + trtllm_model_config = self._get_trtllm_config( + export_config=export_config, + world_size=world_size, + gpus_per_node=gpus_per_node, + vocab_size_padded=vocab_size, + dtype=dtype, + ) + + model_parallel_rank = ( + distributed_trtllm_model_weights_converter.pp_rank + * distributed_trtllm_model_weights_converter.inference_tp_size + + distributed_trtllm_model_weights_converter.tp_rank + ) + + trtllm_model_config.mapping = tensorrt_llm.Mapping( + world_size=world_size, + rank=model_parallel_rank, + tp_size=export_config.inference_tp_size, + pp_size=export_config.inference_pp_size, + ) + + return distributed_trtllm_model_weights_converter.trtllm_model_weights, trtllm_model_config + + def _get_trtllm_pretrained_config_and_model_weights_list_on_single_device( + self, + export_config: ExportConfig, + model_state_dict: dict, + dtype: DataType, + gpus_per_node=None, + state_dict_split_by_layer_numbers=True, + ): + """Get the TRTLLM Pretrained config and model weights list (one per gpu rank) on single device (CPU/GPU) + + This function assumes the entire model state dict is present in CPU or on one GPU + + Args: + export_config (ExportConfig): The export config to set inference tp, pp size etc. + model_state_dict (dict): The model state dictionary (All collected on cpu) + dtype (DataType): The data type or model precision + gpus_per_node (int, optional): Number of gpus per node + state_dict_split_by_layer_numbers (bool, optional): Are the model layers split by layer numbers in state dict. For example : mlp.fc1.weight can be represented like mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim]} or it can be like mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight ... for all layers. If you use represenation 2 set this to True. Defaults to True + + Returns: + Two lists . List of trtllm converted model weights and trtllm model configs (One for each gpu). 
+ """ + trtllm_model_configs_list = [] + trtllm_model_weights_list = [] + + single_device_trtllm_model_weights_converter = SingleDeviceTRTLLMModelWeightsConverter( + export_config=export_config, + transformer_config=self.transformer_config, + dtype=dtype, + activation=self.activation, + multi_query_mode=self.multi_query_mode, + ) + # Convert the input model state dict to trtllm model weights dictionary + single_device_trtllm_model_weights_converter.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=self.trtllm_conversion_dict, + state_dict_split_by_layer_numbers=state_dict_split_by_layer_numbers, + ) + + vocab_size_padded = single_device_trtllm_model_weights_converter.get_padded_vocab_size() + world_size = export_config.inference_tp_size * export_config.inference_pp_size + gpus_per_node = gpus_per_node or export_config.inference_tp_size + + for gpu_rank in range(world_size): + mapping = tensorrt_llm.Mapping( + world_size=world_size, + rank=gpu_rank, + tp_size=export_config.inference_tp_size, + pp_size=export_config.inference_pp_size, + ) + + # Important to create a new instance everytime so that the list elements have differnt rank values in the mapping object + trtllm_model_config = self._get_trtllm_config( + export_config=export_config, + world_size=world_size, + gpus_per_node=gpus_per_node, + vocab_size_padded=vocab_size_padded, + dtype=dtype, + ) + trtllm_model_config.mapping = mapping + trtllm_model_configs_list.append(trtllm_model_config) + + # Get the model weights for each rank and append it to the trtllm_model_weights_list + trtllm_model_weights_per_gpu = ( + single_device_trtllm_model_weights_converter.get_local_model_weights_per_gpu( + mapping, trtllm_model_config + ) + ) + trtllm_model_weights_list.append(trtllm_model_weights_per_gpu) + + return trtllm_model_weights_list, trtllm_model_configs_list + + def build_and_save_engine( + self, + engine_dir: str, + trtllm_model_weights: dict, + trtllm_model_config, + max_input_len: int = 1024, + max_output_len: int = 1024, + max_batch_size: int = 4, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank: int = 64, + lora_target_modules=None, + max_prompt_embedding_table_size: int = 0, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, + paged_context_fmha: bool = False, + use_refit: bool = False, + max_num_tokens: int = None, + max_seq_len: int = None, + opt_num_tokens: int = None, + max_beam_width: int = 1, + tokens_per_block: int = 128, + multiple_profiles: bool = False, + gpt_attention_plugin: str = "auto", + gemm_plugin: str = "auto", + ): + """Method to build the TRTLLM Engine + + This method uses the TRTLLMEngineBuilder to build and save the engine to engine dir + + Args: + engine_dir (str): The file path to save the engine + trtllm_model_weights (dict): The TRTLLM converted model weights dict + trtllm_model_config : The TRTLLM Config + max_input_len (int, optional): Max input length. Defaults to 1024. + max_output_len (int, optional): Max output length. Defaults to 1024. + max_batch_size (int, optional): Max batch size. Defaults to 4. + lora_ckpt_list (_type_, optional): Lora checkpoint list. Defaults to None. + use_lora_plugin (_type_, optional): Use lora plugin. Defaults to None. + max_lora_rank (int, optional): Max lora rank. Defaults to 64. + lora_target_modules (_type_, optional): Lora target modules. Defaults to None. + max_prompt_embedding_table_size (int, optional): Max size of prompt embedding table. Defaults to 0. + paged_kv_cache (bool, optional): Use Paged KV cache. 
Defaults to True. + remove_input_padding (bool, optional): Remove input padding. Defaults to True. + paged_context_fmha (bool, optional): Paged context fmha. Defaults to False. + use_refit (bool, optional): Use refit. Defaults to False. + max_num_tokens (int, optional): Max num of tokens. Defaults to None. + max_seq_len (int, optional): Max seq length. Defaults to None. + opt_num_tokens (int, optional): Opt number of tokens. Defaults to None. + max_beam_width (int, optional): Max beam width. Defaults to 1. + tokens_per_block (int, optional): Nmber of tokens per block. Defaults to 128. + multiple_profiles (bool, optional): Use multiple profiles. Defaults to False. + gpt_attention_plugin (str, optional): Gpt attention plugin to use. Defaults to "auto". + gemm_plugin (str, optional): Gemma plugin to use. Defaults to "auto". + """ + + TRTLLMEngineBuilder.build_and_save_engine( + engine_dir, + trtllm_model_weights, + trtllm_model_config, + max_input_len, + max_output_len, + max_batch_size, + lora_ckpt_list, + use_lora_plugin, + max_lora_rank, + lora_target_modules, + max_prompt_embedding_table_size, + paged_kv_cache, + remove_input_padding, + paged_context_fmha, + use_refit, + max_num_tokens, + max_seq_len, + opt_num_tokens, + max_beam_width, + tokens_per_block, + multiple_profiles, + gpt_attention_plugin, + gemm_plugin, + ) diff --git a/megatron/core/export/trtllm/trtllm_layers.py b/megatron/core/export/trtllm/trtllm_layers.py new file mode 100644 index 0000000000..0cf805dcb6 --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_layers.py @@ -0,0 +1,157 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import re +from enum import Enum +from typing import Tuple + + +class TRTLLMLayers(Enum): + """TRTLLM Layer names + + This Enum will be used to map input model layer names to TRTLLM Layer names + """ + + # ONE TIME LAYERS (NOT ASSOCIATED TO TRANSFORMER BLOCK) + # Input layers + position_embedding = 'transformer.position_embedding.weight' + vocab_embedding = 'transformer.vocab_embedding.weight' + lm_head = 'lm_head.weight' + + # Output layers + final_layernorm_weight = 'transformer.ln_f.weight' + final_layernorm_bias = 'transformer.ln_f.bias' + + # TRANSFORMER LAYERS + # Attention block related layers + input_layernorm_weight = 'transformer.layers.input_layernorm.weight' + input_layernorm_bias = 'transformer.layers.input_layernorm.bias' + attention_qkv_weight = 'transformer.layers.attention.qkv.weight' + attention_qkv_bias = 'transformer.layers.attention.qkv.bias' + attention_dense_weight = 'transformer.layers.attention.dense.weight' + attention_dense_bias = 'transformer.layers.attention.dense.bias' + + # mlp layers + mlp_fc_weight = 'transformer.layers.mlp.fc.weight' + mlp_fc_bias = 'transformer.layers.mlp.fc.bias' + post_layernorm_weight = 'transformer.layers.post_layernorm.weight' + post_layernorm_bias = 'transformer.layers.post_layernorm.bias' + mlp_projection_weight = 'transformer.layers.mlp.proj.weight' + mlp_projection_bias = 'transformer.layers.mlp.proj.bias' + + # mixture of expert layers + mlp_router_weight = 'transformer.layers.mlp.router.weight' + mlp_fc_weight_mixture_of_experts = 'transformer.layers.mlp.fc.weight.expert' + mlp_projection_weight_mixture_of_experts = 'transformer.layers.mlp.proj.weight.expert' + + @staticmethod + def return_layer_name_and_number(layer_name: str) -> Tuple[str, int]: + """Helper function to return layer name and number + Given an input layer e.g decoder.layers.2.self_attention.linear_qkv.weight, + this function returns 
decoder.layers.self_attention.linear_qkv.weight and layernumber 2. + In case no layer number is present, it returns None for the layer number + Args: + layer_name (dict): The input layer name + + Returns: + Tuple[str, int]: The layer name , layer number (layer number could be None) + """ + # Use regular expression to find the number specifically after 'layers.' + match = re.search(r'(?<=layers\.)\d+(?=\.)', layer_name) + if match: + # Extract the number and remove it from the layer name + number = match.group(0) + layer_name_without_number = re.sub(r'\.{}\.'.format(number), '.', layer_name) + return layer_name_without_number, int(number) + else: + # Return the original name if no number is found + return layer_name, None + + # pylint: disable=line-too-long + @staticmethod + def rename_input_layer_names_to_trtllm_layer_names( + model_state_dict: dict, + trtllm_conversion_dict: dict, + state_dict_split_by_layer_numbers: bool = True, + ) -> dict: + """Helper function to rename model layer names to TRTLLM Layer names + + We go through each layer (keys) in the model state dict, + and map it to the equivalent TRTLLMLayer name (megatron/core/export/trtllm/trtllm). + If we have a layer number associated with layer, we extract it out, + map the original layer name to equivalent trtllm layer name and add layer number back. + CPU Conversion will pass in model state dict without layer numbers + (i.e decoder.layers.mlp.linear_fc1.weight of shape [num_layers, hidden_dim, 4 * hidden_dim]) . + GPU conversion will pass model state dict with each layer seperated + (i.e decoder.layers.2.mlp.linear_fc1.weight of shape [hidden_dim, 4 * hidden_dim]). + + Args: + model_state_dict (dict): The original model state dict + trtllm_conversion_dict (dict): The conversion dictionary mapping input model layer names to trtllm layer names + state_dict_split_by_layer_numbers (bool, optional): Are the model layers split by layer numbers in state dict. For example : mlp.fc1.weight can be represented like mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim]} or it can be like mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight ... for all layers. If you use represenation 2 set this to True. Defaults to True + + Raises: + ValueError: In case the keys dont match to trtllm keys or if all model layers are not mapped to equivalent trtllm keys + + Returns: + dict: The model state dict with the key (i.e original model layer name) replaced by trtllm layer names + """ + for original_model_layer_name in list(model_state_dict.keys()): + if "_extra_state" in original_model_layer_name: + del model_state_dict[original_model_layer_name] + continue + + original_layer_name_without_number, layer_number = ( + TRTLLMLayers.return_layer_name_and_number(original_model_layer_name) + ) + if 'layers' in original_layer_name_without_number and state_dict_split_by_layer_numbers: + assert ( + layer_number is not None + ), f"Layer number is None for {original_model_layer_name} and state_dict_split_by_layer_numbers is set to True. Consider setting it False" + + if original_layer_name_without_number not in trtllm_conversion_dict: + raise ValueError( + f'Unable to rename key {original_layer_name_without_number}. Provide an appropriate mapping in the trtllm_conversion_dict when you initialize TRTLLMHelper' + ) + + trtllm_layer = trtllm_conversion_dict[original_layer_name_without_number] + assert isinstance( + trtllm_layer, TRTLLMLayers + ), f"{trtllm_layer} is not supported for conversion. 
Please use one of the TRTLLMLayerNames we provided in megatron/core/export/trtllm/trtllm_layer_names" + + value = model_state_dict.pop(original_model_layer_name) + + if layer_number is not None: + trtllm_layer_name_with_number = re.sub( + r'(?<=layers\.)', f'{layer_number}.', trtllm_layer.value + ) + model_state_dict[trtllm_layer_name_with_number] = value + else: + model_state_dict[trtllm_layer.value] = value + + return model_state_dict + + +# These layers are not associated within the transformer block. +# So they dont have a layer number (i.e independant of number of layers in the model) +NON_TRANSFORMER_LAYERS_NAMES = [ + TRTLLMLayers.vocab_embedding.value, + TRTLLMLayers.position_embedding.value, + TRTLLMLayers.lm_head.value, + TRTLLMLayers.final_layernorm_weight.value, + TRTLLMLayers.final_layernorm_bias.value, +] + + +def get_layer_name_without_prefix(layer: TRTLLMLayers) -> str: + """Get TRTLayer name without prefix + + Given a layer e.g TRTLLMLayers.attention_qkv_weight it returns 'attention.qkv.weight' + + Args: + layer (TRTLLMLayers): The TRTLLMLayer + + Returns: + str: The TRTLLMLayers suffix (i.e Removing transformer.layers. fromt he layer name) + """ + layer_name_without_prefix = layer.value.replace("transformer.layers.", "") + return layer_name_without_prefix diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py b/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py new file mode 100644 index 0000000000..035e23a16c --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py @@ -0,0 +1,258 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import torch +from tqdm import tqdm + +from megatron.core import parallel_state +from megatron.core.export.data_type import DataType +from megatron.core.export.trtllm.trtllm_layers import NON_TRANSFORMER_LAYERS_NAMES, TRTLLMLayers +from megatron.core.export.trtllm.trtllm_layers import get_layer_name_without_prefix as suffix +from megatron.core.tensor_parallel.utils import VocabUtility +from megatron.core.transformer.transformer_config import TransformerConfig + + +def str_dtype_to_torch(dtype: DataType): + """Get torch datatype from input datatype""" + from tensorrt_llm._utils import str_dtype_to_torch + + return str_dtype_to_torch(dtype.name) + + +# pylint: disable=line-too-long +class DistributedTRTLLMModelWeightsConverter: + """The TRTLLM Converter class used for GPU (on device) conversion + + This class is used to convert models sharded and on gpus. (It assumes that the model is already sharded appropriate to how you want to export it). (i.e) If you want to export to tp2pp2, then load the model in tp2pp2 setting and pass in their respective state dictionaries + """ + + def __init__( + self, + transformer_config: TransformerConfig, + dtype: DataType, + multi_query_mode: bool = False, + activation: str = "gelu", + ): + """Constructor for the TRTLLMModelWeightsConverterGPU class + + This class is responsible to convert the model weights to TRTLLM equivalent weights. 
+ + Args: + transformer_config (TransformerConfig): The transformer config + dtype (DataType): The data type or model precision + multi_query_mode (bool, optional): Defaults to False. + activation (str, optional): Defaults to "gelu". + """ + self.transformer_config = transformer_config + self.trtllm_model_weights = {} + self.storage_type = str_dtype_to_torch(dtype) + self.activation = activation + num_kv_heads = self.transformer_config.num_query_groups + if num_kv_heads == 0: + if multi_query_mode: + num_kv_heads = 1 + else: + num_kv_heads = self.transformer_config.num_attention_heads + self.num_kv_heads = num_kv_heads + + self.inference_pp_size = parallel_state.get_pipeline_model_parallel_world_size() + self.inference_tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.pp_rank = parallel_state.get_pipeline_model_parallel_rank() + self.tp_group = parallel_state.get_tensor_model_parallel_group() + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + assert ( + vp_size is None or vp_size == 1 + ), "Virtual parallelism is not supported in GPU Converter. Gather the VP chunks and use PP config." + + def _add_to_trtllm_model_weights(self, val: torch.Tensor, layer_name: str): + assert torch.is_tensor(val), f"Expected a tensor for {layer_name} but got {type(val)}" + val = val.to(self.storage_type) + val = val.detach().contiguous() + if val.ndim >= 2: + val = torch.transpose(val.reshape(val.shape[0], -1), 0, 1) + if layer_name not in self.trtllm_model_weights: + self.trtllm_model_weights[layer_name] = torch.empty( + val.size(), dtype=val.dtype, layout=val.layout, device="cpu", pin_memory=True + ) + self.trtllm_model_weights[layer_name] = val + + def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): + """Convert Transformer layers to TRTLLM weights + + Transformer layers referes to layers within the transformber block. They have a layer number associated with them. 
Depending on the layer we either directly save it to trtllm_model_weights, or split it across some dimension and save the splits + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer (TRTLLMLayerNames): The TRTLLM Layer that we want to change + """ + if val.ndim == 2: + val = val.T + + if ( + layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_router_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_weight)) + ): + # Same as layernorm1p in NeMo + if ( + self.transformer_config.layernorm_zero_centered_gamma + and self.transformer_config.normalization == "LayerNorm" + and 'layernorm.weight' in layer_name + ): + val = val + 1.0 + + self._add_to_trtllm_model_weights(val=val, layer_name=layer_name) + + elif layer_name.endswith(suffix(TRTLLMLayers.mlp_fc_weight)) or layer_name.endswith( + suffix(TRTLLMLayers.mlp_fc_bias) + ): + + split_gated_activation = self.activation in [ + "swiglu", + "geglu", + "fast-swiglu", + "fast-geglu", + ] + if split_gated_activation: + vals, gates = [[n] for n in torch.chunk(val, 2, axis=-1)] + gate_layer_name = layer_name.replace("fc", "gate") + self._add_to_trtllm_model_weights(val=gates[0], layer_name=gate_layer_name) + val = vals[0] + + self._add_to_trtllm_model_weights(val=val, layer_name=layer_name) + + elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_bias)): + qkv_hidden_dim = val.shape[0] + size_per_head = ( + qkv_hidden_dim + // (self.transformer_config.num_attention_heads + 2 * self.num_kv_heads) + * self.inference_tp_size + ) + q_num = self.transformer_config.num_attention_heads // self.num_kv_heads + + # We first concat all sub weights per tp rank together. 
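# Illustrative worked example, not part of this patch (sizes are assumptions): with
# num_attention_heads=8, num_kv_heads=2, inference_tp_size=2 and 64 channels per head,
# the per-rank qkv bias holds (8 + 2*2)/2 * 64 = 384 values; size_per_head recovers
# 384 // (8 + 2*2) * 2 = 64 and q_num = 8 // 2 = 4, so the reshape below views the bias
# as (num_kv_heads/tp, q_num + 2, size_per_head) = (1, 6, 64) before regrouping it into
# contiguous Q, K and V blocks.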
+ val = val.reshape(self.num_kv_heads // self.inference_tp_size, q_num + 2, size_per_head) + qkv = torch.split(val, [q_num, 1, 1], dim=1) + split_vals = torch.concatenate( + [qkv[0].reshape(-1), qkv[1].reshape(-1), qkv[2].reshape(-1)], dim=0 + ) + self._add_to_trtllm_model_weights(val=split_vals, layer_name=layer_name) + + # TODO : Should add a atten layer dimension "qkvqkv, qqkkvv etc to see how to reshape here" + elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_weight)): + hidden_dim = val.shape[0] + size_per_head = self.transformer_config.kv_channels + if size_per_head is None: + size_per_head = hidden_dim // self.transformer_config.num_attention_heads + q_num = self.transformer_config.num_attention_heads // self.num_kv_heads + + val = val.reshape( + hidden_dim, self.num_kv_heads // self.inference_tp_size, q_num + 2, size_per_head + ) + qkv = torch.split(val, [q_num, 1, 1], dim=2) + split_vals = torch.concatenate( + [ + qkv[0].reshape(hidden_dim, -1), + qkv[1].reshape(hidden_dim, -1), + qkv[2].reshape(hidden_dim, -1), + ], + dim=1, + ) + self._add_to_trtllm_model_weights(val=split_vals, layer_name=layer_name) + + else: + raise ValueError(f"{layer_name} cannot be handled by GPU converter") + + def _convert_non_transformer_layer(self, model_state_dict: dict, layer_name: str): + """Convert Non Transformer layers to TRTLLM weights + + Non transformer layers referes to layers that occur only once in the model (e.g Embedding , final output layer etc. ) They dont have any layer number associated with them. We remove this layer from the original state dict and cast it to storage type and convert to numpy and add it to trtllm_model_weights + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer (TRTLLMLayerNames): The TRTLLM Layer that we want to change + """ + if layer_name in model_state_dict: + val = model_state_dict.pop(layer_name) + self._add_to_trtllm_model_weights(val=val, layer_name=layer_name) + + # ----------------Convert Embeddings---------------- + def _get_remove_vocab_padding(self, layer_name, model_state_dict, tokenizer_vocab_size): + val = model_state_dict.get(layer_name, None) + if val is None: + return None + + if self.inference_tp_size > 1: # Gather padded tensor chunks + vocab_size_padded = val.shape[0] * self.inference_tp_size + vocab_start_index, vocab_end_index = VocabUtility.vocab_range_from_global_vocab_size( + vocab_size_padded, self.tp_rank, self.inference_tp_size + ) + dim_size = list(val.size()) + dim_size[0] = vocab_size_padded + gathered_val = torch.zeros( + dim_size, dtype=val.dtype, device=torch.cuda.current_device() + ) + gathered_val[vocab_start_index:vocab_end_index] = val + torch.distributed.all_reduce(gathered_val, group=self.tp_group) + val = gathered_val + unpadded = val[:tokenizer_vocab_size] + if self.inference_tp_size > 1: # Split gathered val for val parallel embedding + vocab_start_index, vocab_end_index = VocabUtility.vocab_range_from_global_vocab_size( + tokenizer_vocab_size, self.tp_rank, self.inference_tp_size + ) + unpadded = unpadded[vocab_start_index:vocab_end_index] + return unpadded.T # TRTLLM expects (vocab_size, hidden_size) so need extra transpose + + @torch.no_grad() + def convert( + self, model_state_dict: dict, trtllm_conversion_dict: dict, tokenizer_vocab_size: int + ): + """Convert model weights to trtllm model weights + + This method goes through each layer in the model state dict and converts to equivalent trtllm model weights. 
It also handles splitting across TP dimension , expert split etc. + + Args: + model_state_dict (dict): The full model state dict (all on CPU) + trtllm_conversion_dict (dict): The conversion dictionary used to convert model layer names to trtllm layer names + tokenizer_vocab_size (int): The vocab size of the tokenizer + """ + + # First step is to convert input model layer names to equivalent trtllm layer names + model_state_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=model_state_dict, trtllm_conversion_dict=trtllm_conversion_dict + ) + + # Convert the non transformer layers + for layer_name in NON_TRANSFORMER_LAYERS_NAMES: + if ( + layer_name in TRTLLMLayers.vocab_embedding.value + or layer_name in TRTLLMLayers.lm_head.value + ): + # For embedding layers alone we do some pre processing + embed_val = self._get_remove_vocab_padding( + layer_name, model_state_dict, tokenizer_vocab_size + ) + model_state_dict[layer_name] = embed_val + # TODO : Check if this handling of position embedding is right. + if layer_name == TRTLLMLayers.position_embedding.value: + position_embedding = model_state_dict[layer_name] + req_position_embedding = position_embedding.chunk(self.inference_tp_size)[ + self.tp_rank + ] + model_state_dict[layer_name] = req_position_embedding.T + self._convert_non_transformer_layer( + model_state_dict=model_state_dict, layer_name=layer_name + ) + + for layer_name, value in tqdm( + model_state_dict.items(), desc="Converting to TRTLLM Weights" + ): + self._convert_transformer_layer(layer_name, value) diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py new file mode 100644 index 0000000000..c7a98972d2 --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py @@ -0,0 +1,437 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
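+#
+# Single-device ("CPU") converter: maps a Megatron Core state dict to the TensorRT-LLM weight
+# layout and pre-splits tensors per tensor-parallel / expert rank (see the class docstring below).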
+ +import re + +import torch +from tqdm import tqdm + +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.trtllm.trtllm_layers import NON_TRANSFORMER_LAYERS_NAMES, TRTLLMLayers +from megatron.core.export.trtllm.trtllm_layers import get_layer_name_without_prefix as suffix +from megatron.core.transformer.transformer_config import TransformerConfig + + +# pylint: disable=line-too-long +# TODO: Writing TRT imports this way so that it can be mocked in the test_trtllm_cpu_converter.py unit test +# TODO: Figure out how to patch it directly from the trtllm library +def pad_vocab_size(vocab_size: int, tp_size: int): + """Pad vocab size based on inference size""" + from tensorrt_llm._utils import pad_vocab_size + + return pad_vocab_size(vocab_size, tp_size) + + +def str_dtype_to_torch(dtype: DataType): + """Get torch datatype from input datatype""" + from tensorrt_llm._utils import str_dtype_to_torch + + return str_dtype_to_torch(dtype.name) + + +class SingleDeviceTRTLLMModelWeightsConverter: + """Class to convert Model weights to TRTLLM weights on CPU""" + + def __init__( + self, + export_config: ExportConfig, + transformer_config: TransformerConfig, + dtype: DataType, + multi_query_mode: bool = False, + activation: str = "gelu", + ): + """Constructor for the TRTLLMModelWeightsConverterCPU class + + This class is responsible to convert the model weights to TRTLLM equivalent weights and also split them for each GPU rank and return as a list. + + Args: + export_config (ExportConfig): The export config with inference tp size, pp size etc. + transformer_config (TransformerConfig): The transformer config + dtype (DataType): The data type or model precision + multi_query_mode (bool, optional): Defaults to False. + activation (str, optional): Defaults to "gelu". + """ + self.export_config = export_config + self.transformer_config = transformer_config + self.trtllm_model_weights = {} + self.storage_type = str_dtype_to_torch(dtype) + self.activation = activation + num_kv_heads = self.transformer_config.num_query_groups + if num_kv_heads == 0: + if multi_query_mode: + num_kv_heads = 1 + else: + num_kv_heads = self.transformer_config.num_attention_heads + self.num_kv_heads = num_kv_heads + + def _convert_non_transformer_layer(self, model_state_dict: dict, layer_name: str): + """Convert Non Transformer layers to TRTLLM weights + + Non transformer layers referes to layers that occur only once in the model (e.g Embedding , final output layer etc. ) They dont have any layer number associated with them. We remove this layer from the original state dict and cast it to storage type and convert to numpy and add it to trtllm_model_weights + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer_name (str): The TRTLLM Layer name that we want to convert + """ + if layer_name in model_state_dict: + val = model_state_dict.pop(layer_name) + val = val.to(self.storage_type).detach().contiguous() + self.trtllm_model_weights[layer_name] = val + + def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): + """Convert Transformer layers to TRTLLM weights + + Transformer layers referes to layers within the transformber block. They have a layer number associated with them. 
Depending on the layer we either directly save it to trtllm_model_weights, or split it across some dimension and save the splits + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer (TRTLLMLayerNames): The TRTLLM Layer that we want to change + """ + + def _add_to_trtllm_model_weights(val: torch.Tensor, layer_name: str, split_type=None): + """Add the input weight to trtllm_model_weights + + Depending on split (Expert split/Tensor split/None) we split the input data and add accordingly + + Args: + val (torch.Tensor): The model weight to be added + layer_name (str): The TRTLLMlayername as a string + split_type (str, optional): The split type. Defaults to None. + """ + if split_type == 'expert_split': + for split_num, split_val in enumerate(val): + self.trtllm_model_weights[f'{layer_name}.{split_num}.bin'] = ( + split_val.to(self.storage_type).detach().contiguous() + ) + elif split_type == 'tensor_split': + for split_num, split_val in enumerate(val): + if split_val.ndim >= 2: + split_val = torch.transpose(split_val.reshape(split_val.shape[0], -1), 1, 0) + + self.trtllm_model_weights[f'{layer_name}.{split_num}.bin'] = ( + split_val.to(self.storage_type).detach().contiguous() + ) + else: + if val.ndim >= 2: + val = torch.transpose(val.reshape(val.shape[0], -1), 1, 0) + self.trtllm_model_weights[layer_name] = ( + val.to(self.storage_type).detach().contiguous() + ) + + if val.ndim == 2: + val = val.T + + if ( + layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_router_weight)) + ): + # Same as layernorm1p in NeMo + if ( + self.transformer_config.layernorm_zero_centered_gamma + and self.transformer_config.normalization == "LayerNorm" + and 'layernorm.weight' in layer_name + ): + val = val + 1.0 + + _add_to_trtllm_model_weights(val=val, layer_name=layer_name, split_type=None) + + elif layer_name.endswith( + suffix(TRTLLMLayers.attention_dense_weight) + ) or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_weight)): + split_vals = torch.chunk(val, self.export_config.inference_tp_size, axis=0) + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='tensor_split' + ) + + elif layer_name.endswith(suffix(TRTLLMLayers.mlp_fc_weight)) or layer_name.endswith( + suffix(TRTLLMLayers.mlp_fc_bias) + ): + split_gated_activation = self.activation in [ + "swiglu", + "geglu", + "fast-swiglu", + "fast-geglu", + ] + if split_gated_activation: + val, gate = torch.chunk(val, 2, axis=-1) + gate_layer_name = layer_name.replace("fc", "gate") + split_vals = torch.chunk(gate, self.export_config.inference_tp_size, axis=-1) + _add_to_trtllm_model_weights( + val=split_vals, layer_name=gate_layer_name, split_type='tensor_split' + ) + + split_vals = torch.chunk(val, self.export_config.inference_tp_size, axis=-1) + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='tensor_split' + ) + + elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_bias)): + qkv_hidden_dim = val.shape[0] + size_per_head = qkv_hidden_dim // ( + 
self.transformer_config.num_attention_heads + 2 * self.num_kv_heads + ) + q_num = self.transformer_config.num_attention_heads // self.num_kv_heads + + # We first concat all sub weights per tp rank together. + val = val.reshape(self.num_kv_heads, q_num + 2, size_per_head) + + qkv = torch.split(val, [q_num, 1, 1], dim=1) + q_split = torch.chunk(qkv[0], self.export_config.inference_tp_size, axis=0) + k_split = torch.chunk(qkv[1], self.export_config.inference_tp_size, axis=0) + v_split = torch.chunk(qkv[2], self.export_config.inference_tp_size, axis=0) + + # Concatenate Q, K, and V together + split_vals = [ + torch.concatenate( + [q_split[i].reshape(-1), k_split[i].reshape(-1), v_split[i].reshape(-1)], dim=0 + ) + for i in range(self.export_config.inference_tp_size) + ] + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='tensor_split' + ) + + # TODO : Should add a atten layer dimension "qkvqkv, qqkkvv etc to see how to reshape here" + elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_weight)): + hidden_dim = val.shape[0] + size_per_head = self.transformer_config.kv_channels + if size_per_head is None: + size_per_head = hidden_dim // self.transformer_config.num_attention_heads + q_num = self.transformer_config.num_attention_heads // self.num_kv_heads + + # When the merge factor exceeds 1, the 'vals' list will have multiple entries. + # Depending on the format, 'vals' can look like either [QQQQ..KV, QQQQ..KV, ...](for GQA) or [QKV, QKV, ...](for MHA). + # We first concat all sub weights per tp rank together. + val = val.reshape(hidden_dim, self.num_kv_heads, q_num + 2, size_per_head) + + # Split the QKV to separate variables. + qkv = torch.split(val, [q_num, 1, 1], dim=2) + + query_groups_shape = qkv[0].shape + if len(query_groups_shape) > 1: + if (query_groups_shape[1] % self.export_config.inference_tp_size) != 0: + raise Exception( + "Number of query groups of the models is {0}. 
Please select a tensor parallelism size "
+                        "that splits the number of query groups into an equal number of query matrices on "
+                        "each GPU.".format(query_groups_shape[1])
+                    )
+
+            q_split = torch.chunk(qkv[0], self.export_config.inference_tp_size, axis=1)
+            k_split = torch.chunk(qkv[1], self.export_config.inference_tp_size, axis=1)
+            v_split = torch.chunk(qkv[2], self.export_config.inference_tp_size, axis=1)
+
+            # Concatenate Q, K, and V together
+            split_vals = [
+                torch.concatenate(
+                    [
+                        q_split[i].reshape(hidden_dim, -1),
+                        k_split[i].reshape(hidden_dim, -1),
+                        v_split[i].reshape(hidden_dim, -1),
+                    ],
+                    dim=1,
+                )
+                for i in range(self.export_config.inference_tp_size)
+            ]
+            _add_to_trtllm_model_weights(
+                val=split_vals, layer_name=layer_name, split_type='tensor_split'
+            )
+
+        elif layer_name.endswith(suffix(TRTLLMLayers.mlp_fc_weight_mixture_of_experts)):
+            w1, w3 = torch.chunk(val, 2, axis=1)
+            # w1 splits
+            split_w1s = torch.chunk(w1, self.export_config.inference_tp_size, axis=1)
+            # w3 splits
+            split_w3s = torch.chunk(w3, self.export_config.inference_tp_size, axis=1)
+
+            split_vals = [torch.concatenate(item, dim=1) for item in zip(split_w3s, split_w1s)]
+            layer_name = layer_name.replace(".expert", "")  # Remove suffix .expert from key
+            _add_to_trtllm_model_weights(
+                val=split_vals, layer_name=layer_name, split_type='expert_split'
+            )
+
+        elif layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_weight_mixture_of_experts)):
+            split_vals = torch.chunk(val, self.export_config.inference_tp_size, axis=-1)
+            layer_name = layer_name.replace(".expert", "")  # Remove suffix .expert from key
+            _add_to_trtllm_model_weights(
+                val=split_vals, layer_name=layer_name, split_type='expert_split'
+            )
+        else:
+            raise ValueError(f"{layer_name} cannot be handled by converter")
+
+    @torch.no_grad()
+    def convert(
+        self, model_state_dict: dict, trtllm_conversion_dict, state_dict_split_by_layer_numbers=True
+    ):
+        """Convert model weights to trtllm model weights
+
+        This method goes through each layer in the model state dict and converts it to the equivalent trtllm model weights. It also handles splitting across the TP dimension, expert splits, etc.
+
+        Args:
+            model_state_dict (dict): The full model state dict (all on CPU)
+            trtllm_conversion_dict (dict): The conversion dictionary used to convert model layer names to trtllm layer names
+            state_dict_split_by_layer_numbers (bool, optional): Whether the model layers are split by layer number in the state dict. For example, mlp.fc1.weight can be represented as mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim], or as mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight and so on for all layers. If you use representation 2, set this to True. 
Defaults to True + """ + + # First step is to convert input model layer names to equivalent trtllm layer names + model_state_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=model_state_dict, + trtllm_conversion_dict=trtllm_conversion_dict, + state_dict_split_by_layer_numbers=state_dict_split_by_layer_numbers, + ) + + # Convert the non transformer layers + for layer_name in NON_TRANSFORMER_LAYERS_NAMES: + # For vocab embedding layer alone we pad the weights to be divisible by inference tp size + if ( + layer_name == TRTLLMLayers.vocab_embedding.value + and self.export_config.use_parallel_embedding + ): + val = model_state_dict[TRTLLMLayers.vocab_embedding.value] + vocab_size = val.shape[0] + if vocab_size % self.export_config.inference_tp_size != 0: + vocab_size_padded = pad_vocab_size( + vocab_size, self.export_config.inference_tp_size + ) + pad_width = vocab_size_padded - vocab_size + val = torch.nn.functional.pad(val, (0, 0, 0, pad_width), value=0) + model_state_dict[layer_name] = val + + self._convert_non_transformer_layer( + model_state_dict=model_state_dict, layer_name=layer_name + ) + + transformer_layers_dict = {} + # Convert the transformer layers + if state_dict_split_by_layer_numbers: + # Already model dict is split by layer numbers + transformer_layers_dict = model_state_dict + else: + # Here we split the model state dict into individual layers + for layer_name in list(model_state_dict.keys()): + value = model_state_dict.pop(layer_name) + for layer_number in range(self.transformer_config.num_layers): + # e.g transformer.layers.mlp.fc.bias => transformer.layers.2.mlp.fc.bias + layer_name_with_layer_number = re.sub( + r'(?<=layers\.)', f'{layer_number}.', layer_name + ) + transformer_layers_dict[layer_name_with_layer_number] = value[layer_number] + + for layer_name, value in tqdm( + transformer_layers_dict.items(), desc="Converting to TRTLLM Weights" + ): + self._convert_transformer_layer(layer_name, value) + + def get_padded_vocab_size(self) -> int: + """Return the paded vocab size + + We extract the lm head and vocab embedding and use that to determine padded_vocab_size + + Returns: + int: Padded vocab size + """ + lm_head_weight = self.trtllm_model_weights.get(TRTLLMLayers.lm_head.value, None) + vocab_size = self.trtllm_model_weights[TRTLLMLayers.vocab_embedding.value].shape[0] + vocab_size_padded = ( + vocab_size + if lm_head_weight is None + else pad_vocab_size(vocab_size, self.export_config.inference_tp_size) + ) + return vocab_size_padded + + def get_local_model_weights_per_gpu(self, mapping, trtllm_model_config: dict): + """Get the trtllm model weights split per gpu + + Given the trtllm mapping information (tp, pp rank etc) we split the model weights in a list, with each element of the list corresponding to the weights of each gpu rank + + Args: + mapping : The trtllm mapping information + trtllm_model_config (dict): The trtllm model config + """ + + def _split(torch_tensor, tp_size, idx, dim=0): + """Splits the np tensor v on dim and return the idx's slice.""" + if tp_size == 1: + return torch_tensor + if len(torch_tensor.shape) == 1: + return torch.chunk(torch_tensor, tp_size)[idx].contiguous() + else: + return torch.chunk(torch_tensor, tp_size, axis=dim)[idx].contiguous() + + pp_layer_range = mapping.pp_layers(self.transformer_config.num_layers) + + trtllm_model_weights_per_gpu = {} + for layer_name, value in self.trtllm_model_weights.items(): + if layer_name in NON_TRANSFORMER_LAYERS_NAMES: + continue + + # Happens in the case of 
TP split or expert split + if layer_name.endswith(".bin"): + if layer_name.endswith(f"{mapping.tp_rank}.bin"): + layer_name = layer_name.replace(f".{mapping.tp_rank}.bin", "") + else: + continue + + layer_num = int(layer_name.split(".")[2]) + if layer_num in pp_layer_range: + layer_name = layer_name.replace( + f"layers.{layer_num}", f"layers.{layer_num - pp_layer_range[0]}" + ) + else: + continue + if ( + hasattr(trtllm_model_config, 'new_decoder_architecture') + and trtllm_model_config.new_decoder_architecture + and "post_layernorm" in layer_name + ): + layer_name = layer_name.replace("post_layernorm", "mlp_layernorm") + + trtllm_model_weights_per_gpu[layer_name] = value + + if mapping.is_first_pp_rank(): + embedding_weight = ( + _split( + self.trtllm_model_weights[TRTLLMLayers.vocab_embedding.value], + mapping.tp_size, + mapping.tp_rank, + ) + if self.export_config.use_parallel_embedding + else self.trtllm_model_weights[TRTLLMLayers.vocab_embedding.value] + ) + + trtllm_model_weights_per_gpu[TRTLLMLayers.vocab_embedding.value] = embedding_weight + + pos_embedding_weight = self.trtllm_model_weights.get( + TRTLLMLayers.position_embedding.value + ) + if pos_embedding_weight is not None: + if self.export_config.use_parallel_embedding: + pos_embedding_weight = _split( + pos_embedding_weight, mapping.tp_size, mapping.tp_rank + ) + + trtllm_model_weights_per_gpu[TRTLLMLayers.position_embedding.value] = ( + pos_embedding_weight + ) + + if mapping.is_last_pp_rank(): + lm_head_weight = self.trtllm_model_weights.get(TRTLLMLayers.lm_head.value, None) + if lm_head_weight is not None: + trtllm_model_weights_per_gpu[TRTLLMLayers.lm_head.value] = _split( + lm_head_weight, mapping.tp_size, mapping.tp_rank + ) + + trtllm_model_weights_per_gpu[TRTLLMLayers.final_layernorm_weight.value] = ( + self.trtllm_model_weights[TRTLLMLayers.final_layernorm_weight.value] + ) + + ln_f_bias = self.trtllm_model_weights.get(TRTLLMLayers.final_layernorm_bias.value) + if ln_f_bias is not None: + trtllm_model_weights_per_gpu[TRTLLMLayers.final_layernorm_bias.value] = ln_f_bias + + return trtllm_model_weights_per_gpu diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 751bcedb13..0dbd1a58f2 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -3,13 +3,13 @@ import dataclasses import os import warnings -from importlib.metadata import version from typing import Callable import torch import transformer_engine as te -from pkg_resources import packaging +from packaging.version import Version as PkgVersion from torch import Tensor +from torch.nn.parameter import Parameter from megatron.core import ModelParallelConfig, parallel_state from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding @@ -19,33 +19,25 @@ get_context_parallel_group, get_tensor_and_expert_parallel_world_size, get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, ) from megatron.core.tensor_parallel import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name +from megatron.core.tensor_parallel.layers import ( + _initialize_affine_weight_cpu, + set_tensor_model_parallel_attributes, +) from megatron.core.tensor_parallel.utils import divide from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint - - -def 
get_te_version(): - """Get TE version from __version__; if not available use pip's. Use caching.""" - - def get_te_version_str(): - if hasattr(te, '__version__'): - return str(te.__version__) - else: - return version("transformer-engine") - - return packaging.version.Version(get_te_version_str()) - - -_te_version = get_te_version() +from megatron.core.utils import get_te_version, is_te_min_version def _get_extra_te_kwargs(config: TransformerConfig): extra_transformer_engine_kwargs = {"params_dtype": config.params_dtype} - if _te_version >= packaging.version.Version("0.12.0"): + if is_te_min_version("0.12.0"): if config.use_cpu_initialization: extra_transformer_engine_kwargs["device"] = 'cpu' else: @@ -131,9 +123,9 @@ def __init__( extra_kwargs = _get_extra_te_kwargs(config) - if _te_version >= packaging.version.Version("0.8.0"): + if is_te_min_version("0.8.0"): if self.config.tp_comm_overlap: - if _te_version > packaging.version.Version("1.5.0"): + if is_te_min_version("1.5.0"): # Use old overlap flags if they were supplied instead extra_kwargs["ub_overlap_ag"] = ( self.config.tp_comm_overlap_ag @@ -160,7 +152,7 @@ def __init__( extra_kwargs["ub_atomic_gemm_ag"] = False extra_kwargs["ub_split_rs"] = False extra_kwargs["ub_atomic_gemm_rs"] = False - if _te_version > packaging.version.Version("1.0.0"): + if is_te_min_version("1.0.0", check_equality=False): assert ( tp_comm_buffer_name is not None ), "Buffer name should be set to configure communication overlap settings" @@ -171,7 +163,7 @@ def __init__( rng_tracker_name = get_expert_parallel_rng_tracker_name() else: rng_tracker_name = None - if _te_version >= packaging.version.Version("1.7.0"): + if is_te_min_version("1.7.0"): extra_kwargs["rng_tracker_name"] = rng_tracker_name # Disable communications in TE when using SP or EP by making TE agnostic of model parallel. @@ -268,25 +260,26 @@ def __init__( extra_kwargs = _get_extra_te_kwargs(config) # Only Transformer-Engine version >= 0.11.0 supports `RMSNorm` - if _te_version >= packaging.version.Version("0.11.0"): + if is_te_min_version("0.11.0"): extra_kwargs["normalization"] = self.config.normalization elif self.config.normalization != "LayerNorm": + te_version = get_te_version() raise ValueError( - f"Transformer Engine v{_te_version} does not support {self.config.normalization}." + f"Transformer Engine v{te_version} does not support {self.config.normalization}." 
) - if _te_version >= packaging.version.Version("0.8.0"): + if is_te_min_version("0.8.0"): if self.config.tp_comm_overlap: extra_kwargs["ub_bulk_wgrad"] = self.config.tp_comm_bulk_wgrad extra_kwargs["ub_bulk_dgrad"] = self.config.tp_comm_bulk_dgrad - if _te_version > packaging.version.Version("1.5.0"): + if is_te_min_version("1.5.0", check_equality=False): # Use old overlap flags if they were supplied instead extra_kwargs["ub_overlap_ag"] = ( self.config.tp_comm_overlap_ag if hasattr(self.config, "tp_comm_overlap_ag") else self.config.tp_comm_split_ag or self.config.tp_comm_atomic_ag ) - if _te_version > packaging.version.Version("1.6.0.dev0"): + if is_te_min_version("1.6.0.dev0", check_equality=False): extra_kwargs["ub_overlap_rs_dgrad"] = ( self.config.tp_comm_overlap_rs_dgrad if hasattr(self.config, "tp_comm_overlap_rs_dgrad") @@ -302,7 +295,7 @@ def __init__( else: extra_kwargs["ub_atomic_gemm_ag"] = self.config.tp_comm_atomic_ag extra_kwargs["ub_split_ag"] = self.config.tp_comm_split_ag - if _te_version > packaging.version.Version("1.0.0"): + if is_te_min_version("1.0.0", check_equality=False): assert ( tp_comm_buffer_name is not None ), "Buffer name should be set to configure communication overlap settings" @@ -319,7 +312,11 @@ def __init__( get_rng_state_tracker=( get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None ), - init_method=condition_init_method(config, init_method), + init_method=( + condition_init_method(config, init_method) + if not config.use_cpu_initialization + else lambda w: None + ), bias=bias, return_bias=self.te_return_bias, parallel_mode="column", @@ -328,6 +325,33 @@ def __init__( **extra_kwargs, ) + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + + if config.use_cpu_initialization: + output_size_per_partition = divide(output_size, world_size) + _ = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + output_size_per_partition, + 0, + init_method, + stride=1, + return_master_weight=False, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + if bias: + self.bias = Parameter( + torch.empty(output_size_per_partition, dtype=config.params_dtype) + ) + set_tensor_model_parallel_attributes(self.bias, True, 0, 1) + with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', True) + def forward(self, x): """Forward.""" _is_first_microbatch = ( @@ -379,7 +403,11 @@ def __init__( output_size=output_size, parallel_mode="column", config=config, - init_method=condition_init_method(config, init_method), + init_method=( + condition_init_method(config, init_method) + if not config.use_cpu_initialization + else lambda w: None + ), bias=bias, skip_bias_add=skip_bias_add, is_expert=is_expert, @@ -387,6 +415,32 @@ def __init__( tp_comm_buffer_name=tp_comm_buffer_name, ) + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + if config.use_cpu_initialization: + output_size_per_partition = divide(output_size, world_size) + _ = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + output_size_per_partition, + 0, + init_method, + stride=1, + return_master_weight=False, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + if bias: + self.bias = Parameter( + torch.empty(output_size_per_partition, dtype=config.params_dtype) + ) + set_tensor_model_parallel_attributes(self.bias, True, 0, 1) + with torch.no_grad(): + self.bias.zero_() + 
setattr(self.bias, 'allreduce', True) + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): """Sharding along axis 0, bias sharded""" state_dict = self.state_dict(prefix='', keep_vars=True) @@ -424,13 +478,42 @@ def __init__( output_size=output_size, parallel_mode="row", config=config, - init_method=condition_init_method(config, init_method), + init_method=( + condition_init_method(config, init_method) + if not config.use_cpu_initialization + else lambda w: None + ), bias=bias, skip_bias_add=skip_bias_add, skip_weight_param_allocation=False, # We don't currently use this for row parallel layers # pylint: disable=line-too-long is_expert=is_expert, tp_comm_buffer_name=tp_comm_buffer_name, ) + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + if config.use_cpu_initialization: + input_size_per_partition = divide(input_size, world_size) + self.master_weight = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + input_size_per_partition, + 1, + init_method, + stride=1, + return_master_weight=False, + params_dtype=config.params_dtype, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + if bias: + self.bias = Parameter(torch.empty(output_size, dtype=config.params_dtype)) + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', True) + setattr(self.bias, 'sequence_parallel', config.sequence_parallel) def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): """Sharding along axis 1, bias not sharded""" @@ -459,6 +542,9 @@ def __init__( attn_mask_type: AttnMaskType, attention_type: str, attention_dropout: float = None, + softmax_scale: float = None, + k_channels: int = None, + v_channels: int = None, ): self.config = config self.te_forward_mask_type = False @@ -475,25 +561,25 @@ def __init__( ) extra_kwargs = {} - if _te_version >= packaging.version.Version("0.11.0"): + if is_te_min_version("0.11.0"): extra_kwargs["num_gqa_groups"] = self.config.num_query_groups elif self.config.num_query_groups != self.config.num_attention_heads: raise ValueError( - f"Transformer Engine v{_te_version} does not support Grouped Query Attention, " + f"Transformer Engine v{get_te_version()} does not support Grouped Query Attention, " f"use a newer version of Transformer Engine. 
" f"(num_query_groups ({self.config.num_query_groups}) != " f"num_attention_heads ({self.config.num_attention_heads}))" ) - if _te_version >= packaging.version.Version("0.10.0"): + if is_te_min_version("0.10.0"): extra_kwargs["attention_type"] = attention_type # older version don't need attention_type - if _te_version > packaging.version.Version("0.12.0"): + if is_te_min_version("0.12.0", check_equality=False): self.te_forward_mask_type = True # Only Transformer-Engine version >= 1.0.0 supports context parallelism - if _te_version >= packaging.version.Version("1.0.0"): + if is_te_min_version("1.0.0"): if getattr(TEDotProductAttention, "cp_stream") is None: TEDotProductAttention.cp_stream = torch.cuda.Stream() extra_kwargs["cp_group"] = get_context_parallel_group(check_initialized=False) @@ -516,15 +602,26 @@ def __init__( if config.window_size is not None: # Check version - assert _te_version >= packaging.version.Version("1.2.0"), ( - f"Transformer-Engine version ({str(_te_version)}) must be >= 1.2.0 to support" + assert is_te_min_version("1.2.0"), ( + f"Transformer-Engine v{get_te_version()} must be >= 1.2.0 to support" "sliding window attention." ) extra_kwargs['window_size'] = config.window_size + if is_te_min_version("1.10.0"): + # TE 1.10.0 introduces the ability to set the different k and v channels + kv_channels = ( + (k_channels, v_channels) + if k_channels is not None and v_channels is not None + else self.config.kv_channels + ) + extra_kwargs['softmax_scale'] = softmax_scale + else: + kv_channels = self.config.kv_channels + super().__init__( num_attention_heads=self.config.num_attention_heads, - kv_channels=self.config.kv_channels, + kv_channels=kv_channels, attention_dropout=( self.config.attention_dropout if attention_dropout is None else attention_dropout ), @@ -554,18 +651,25 @@ def forward( ) # overwrite self.qkv_format depending on self.config.apply_rope_fusion, which can be set # after init - if self.config.apply_rope_fusion and _te_version > packaging.version.Version("0.13.0"): + if self.config.apply_rope_fusion and is_te_min_version("0.13.0", check_equality=False): self.qkv_format = 'bshd' qkv_format = packed_seq_kwargs.get('qkv_format', self.qkv_format) - if _te_version < packaging.version.Version("1.3.0"): + if get_te_version() < PkgVersion("1.3.0"): # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H # copies (#555) # These two arguments did not exist prior to 1.3.0 packed_seq_kwargs.pop("max_seqlen_q", None) packed_seq_kwargs.pop("max_seqlen_kv", None) + if get_te_version() < PkgVersion("1.8.0"): + # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted + # in each individual sequence in THD format dataset + # These two arguments did not exist prior to 1.8.0 + packed_seq_kwargs.pop("cu_seqlens_q_padded", None) + packed_seq_kwargs.pop("cu_seqlens_kv_padded", None) + if self.config.apply_rope_fusion and qkv_format == 'bshd': query, key, value = [x.transpose(0, 1).contiguous() for x in (query, key, value)] # In PyTorch, the following two tensors are in fact the same: @@ -578,7 +682,7 @@ def forward( value = value.as_strided(value.shape, key.stride()) if self.te_forward_mask_type: - if qkv_format == 'thd' and _te_version >= packaging.version.Version("1.7.0"): + if qkv_format == 'thd' and is_te_min_version("1.7.0"): # thd format uses flash attention with cuDNN kernel which requires is_padding=True, # so the only acceptable mask types are `padding_causal` and `padding`. 
These do not # necessarily indicate there are padded tokens in the sequence. @@ -603,7 +707,7 @@ def forward( return core_attn_out -if _te_version >= packaging.version.Version("1.9.0.dev0"): +if is_te_min_version("1.9.0.dev0"): class TEGroupedLinear(te.pytorch.GroupedLinear): """ @@ -865,10 +969,10 @@ def __init__( override_linear_precision: tuple = (False, False, False), ): extra_kwargs = _get_extra_te_kwargs(config) - if _te_version >= packaging.version.Version("1.6.0.dev0"): + if is_te_min_version("1.6.0.dev0"): extra_kwargs["fp8_dpa"] = config.fp8_dot_product_attention extra_kwargs["fp8_mha"] = config.fp8_multi_head_attention - if _te_version < packaging.version.Version("1.8.0"): + if get_te_version() < PkgVersion("1.8.0"): extra_kwargs["interval"] = config.fp8_interval elif config.fp8_interval != 1: warnings.warn("fp8_interval is deprecated and ignored from Transformer-Engine v1.8.0.") @@ -921,7 +1025,7 @@ def te_checkpoint( """Checkpointing with Transformer-Engine.""" from transformer_engine.pytorch.distributed import checkpoint - if _te_version >= packaging.version.Version("1.5.0"): + if is_te_min_version("1.5.0"): return checkpoint( forward_func, hidden_states, @@ -967,7 +1071,7 @@ def get_cpu_offload_context( enabled, num_layers, model_layers, activation_offloading, weight_offloading ): """Get CPU offload context and sync function.""" - if _te_version >= packaging.version.Version("1.10.0.dev0"): + if is_te_min_version("1.10.0.dev0"): context, sync_func = _get_cpu_offload_context( enabled, num_layers, model_layers, activation_offloading, weight_offloading ) diff --git a/megatron/core/inference/engines/mcore_engine.py b/megatron/core/inference/engines/mcore_engine.py index 496a288bae..fe8160228b 100644 --- a/megatron/core/inference/engines/mcore_engine.py +++ b/megatron/core/inference/engines/mcore_engine.py @@ -13,47 +13,66 @@ class MCoreEngine(AbstractEngine): + """The Megatron core backend constructor + + This is the backend that does a simple forward pass on the model. + Supports any model that is callable (Accepts the inputs and outputs the tensor) + + Args: + text_generation_controller (SimpleTextGenerationController): A text generation + controller that will be used to define how to preprocess prompts, generate + outputs and detokenizer the output tokens. + max_batch_size : The maxinum number of requests to process at once + random_seed (int, optional): Use a random seed if you want deterministic + results. Defaults to None. + """ + def __init__( self, text_generation_controller: SimpleTextGenerationController, max_batch_size, random_seed: int = None, ): - """The Megatron core backend constructor - - This is the backend that does a simple forward pass on the model. Supports any model that is callable (Accepts the inputs and outputs the tensor) - - Args: - text_generation_controller (SimpleTextGenerationController): A text generation controller that will be used to define how to preprocess prompts, generate outputs and detokenizer the output tokens. - max_batch_size : The maxinum number of requests to process at once - random_seed (int, optional): Use a random seed if you want deterministic results. Defaults to None. 
- """ - self.text_generation_controller = text_generation_controller self.random_seed = random_seed self.scheduler = Scheduler(max_batch_size=max_batch_size) - def generate(self, prompts: List[str], common_inference_params: CommonInferenceParams) -> dict: + def generate( + self, + prompts: List[str], + add_BOS: bool = False, + encoder_prompts: List[str] = None, + common_inference_params: CommonInferenceParams = None, + ) -> dict: """The megatron core inference backend generate function - This backend returns the output generations as a dictionary. It returns the prompt tokens along with the generated tokens, the prompt plus the generated string and the output log probabilities if requested + This backend returns the output generations as a dictionary. + It returns the prompt tokens along with the generated tokens, the prompt + plus the generated string and the output log probabilities if requested Args: prompts (List[str]): All the prompts as a list of strings + add_BOS (bool): Whether to add BOS token to beginning of prompts + encoder_prompts (List[dict]): All the encoder prompts as a list of strings common_inference_params (CommonInferenceParams): The inference parameters Returns: - List[InferenceRequest]: The output is list of inference requests containing the generated tokens, texts and log probs if required + List[InferenceRequest]: The output is list of inference requests containing the + generated tokens, texts and log probs if required """ # TODO :M core- get rng state tracker if self.random_seed: torch.random.manual_seed(self.random_seed) - for prompt in prompts: - prompt_tokens = self.text_generation_controller.tokenize_prompt(prompt) + for i in range(len(prompts)): + prompt = prompts[i] + encoder_prompt = encoder_prompts[i] if encoder_prompts is not None else None + prompt_tokens = self.text_generation_controller.tokenize_prompt(prompt, add_BOS) + self.scheduler.add_request( prompt=prompt, prompt_tokens=prompt_tokens, + encoder_prompt=encoder_prompt, inference_parameters=common_inference_params, ) @@ -68,7 +87,9 @@ def run_engine(self): Runs the engine until there are no requests in the queue. Args: - dynamic_generation (bool, optional): Set this to True, if you want to enable dynamic batching. Mainly used with an inference server. Defaults to False. + dynamic_generation (bool, optional): Set this to True, if you want + to enable dynamic batching. Mainly used with an inference server. + Defaults to False. 
""" while self.scheduler.have_requests_pending(): active_requests: Dict[int, InferenceRequest] = self.scheduler.active_request_pool.copy() diff --git a/megatron/core/inference/inference_request.py b/megatron/core/inference/inference_request.py index a03834c7e4..4825dfd366 100644 --- a/megatron/core/inference/inference_request.py +++ b/megatron/core/inference/inference_request.py @@ -10,6 +10,8 @@ # class syntax class Status(Enum): + """Enum for status""" + WAITING_IN_QUEUE = 1 ACTIVE_AND_GENERATING_TOKENS = 2 ACTIVE_BUT_NOT_GENERATING_TOKENS = 3 @@ -18,12 +20,19 @@ class Status(Enum): @dataclass class InferenceRequest: + """Class for one inference request + + Containing relevant data for an inference request + + """ + request_id: str prompt: str inference_parameters: CommonInferenceParams prompt_tokens: List[int] arrival_time: float status: Status + encoder_prompt: str = None generated_text: str = None generated_tokens: torch.Tensor = None generated_log_probs: torch.Tensor = None diff --git a/megatron/core/inference/model_inference_wrappers/t5/__init__.py b/megatron/core/inference/model_inference_wrappers/t5/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/t5/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py new file mode 100644 index 0000000000..10e1da4812 --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py @@ -0,0 +1,205 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from argparse import Namespace +from collections import deque +from typing import Any, List, Tuple + +import numpy +import torch + +from megatron.core import tensor_parallel +from megatron.core.datasets.t5_dataset import T5MaskedWordPieceDataset +from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( + AbstractModelInferenceWrapper, +) +from megatron.core.models.T5 import T5Model + + +class T5InferenceWrapper(AbstractModelInferenceWrapper): + """Constructor for the model inference wrapper + + The wrapper prepares the model for inference, provides the required input + data, and runs the forward pass + + Args: + model (T5Model): The T5 model (MCore or legacy) + args (Namespace): The command line arguments that were passed + """ + + def __init__(self, model: T5Model, args: Namespace): + super().__init__(model, args) + + def prep_model_for_inference( + self, prompts_tokens: torch.Tensor, encoder_prompts: List[str] = None, tokenizer: Any = None + ): + """A utility function for preparing model for inference + + This function is called before the forward pass. It puts the model in eval mode, builds + position ids, and creates attention masks so that required slices can be extracted during + the forward pass. 
+ + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] + encoder_prompts (dict): List of string of encoder input prompts + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing text + """ + + super().prep_model_for_inference(prompts_tokens=prompts_tokens) + + encoder_prompts_tokens_list = [ + self.tokenize_encoder_prompt(encoder_prompt, tokenizer) + for encoder_prompt in encoder_prompts + ] + self.batch_encoder_prompts_tokens = self.pad_encoder_prompts_tokens( + encoder_prompts_tokens_list, self.model.max_sequence_length, tokenizer + ) + + # create batch mask for encoder_prompt (self.batch_input_tokens) and + # decoder_input (self.prompts_tokens), similar to megatron/core/datasets/t5_dataset.py + decoder_prompts_tokens = self.prompts_tokens.cpu().numpy() + encoder_prompts_tokens = self.batch_encoder_prompts_tokens.cpu().numpy() + self.batch_mask_encoder = [] + self.batch_mask_decoder = [] + self.batch_mask_encoder_decoder = [] + for i in range(len(self.prompts_tokens)): + self.batch_mask_encoder.append( + T5MaskedWordPieceDataset._make_attention_mask( + encoder_prompts_tokens[i], encoder_prompts_tokens[i] + ) + ) + self.batch_mask_decoder.append( + T5MaskedWordPieceDataset._make_attention_mask( + decoder_prompts_tokens[i], decoder_prompts_tokens[i] + ) + * T5MaskedWordPieceDataset._make_history_mask(decoder_prompts_tokens[i]) + ) + self.batch_mask_encoder_decoder.append( + T5MaskedWordPieceDataset._make_attention_mask( + decoder_prompts_tokens[i], encoder_prompts_tokens[i] + ) + ) + self.batch_mask_encoder = torch.tensor(numpy.array(self.batch_mask_encoder)).cuda() + self.batch_mask_decoder = torch.tensor(numpy.array(self.batch_mask_decoder)).cuda() + self.batch_mask_encoder_decoder = torch.tensor( + numpy.array(self.batch_mask_encoder_decoder) + ).cuda() + self.batch_mask_encoder = self.batch_mask_encoder < 0.5 + self.batch_mask_decoder = self.batch_mask_decoder < 0.5 + self.batch_mask_encoder_decoder = self.batch_mask_encoder_decoder < 0.5 + + def tokenize_encoder_prompt( + self, encoder_prompt: str, tokenizer + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility to tokenize the encoder_prompt + + Args: + encoder_prompt (str): The encoder_prompt + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing string + + Returns: + torch.Tensor: Returns the tokenized prompt + """ + + # if there is the word "" in prompt, replacing it with special_additional_token, + # similar to processing step in megatron/core/datasets/t5_dataset.py + divided_encoder_prompt_list = encoder_prompt.split("") + masks_count = len(divided_encoder_prompt_list) - 1 + sentinels = deque(tokenizer.additional_special_tokens_ids) + + encoder_prompt_tokens = [] + for divided_encoder_prompt in divided_encoder_prompt_list: + divided_encoder_prompt_tokens = tokenizer.tokenize(divided_encoder_prompt) + encoder_prompt_tokens.extend(divided_encoder_prompt_tokens) + if masks_count > 0: + sentinel = sentinels.popleft() + encoder_prompt_tokens.extend([sentinel]) + + return encoder_prompt_tokens + + def pad_encoder_prompts_tokens( + self, encoder_prompts_tokens_list: List[List[int]], max_sequence_length: int, tokenizer + ) -> torch.Tensor: + """Method to pad input prompts + + Given a list of prompts, pad them all to uniform length + + Args: + encoder_prompts_tokens_list (List[List[int]]): A list containing the + encoder_input_tokens + max_sequence_length (int): Maximum of the length of the encoder inputs tokens + tokenizer (_type_): Tokenizer used for tokenizing and 
detokenizing text + + Returns: + torch.Tensor: A torch tensor of shape [bs, max_sequence_length] + """ + + for encoder_prompt_tokens in encoder_prompts_tokens_list: + padding_size = max_sequence_length - len(encoder_prompt_tokens) + encoder_prompt_tokens.extend([tokenizer.pad] * padding_size) + + return torch.tensor(encoder_prompts_tokens_list).cuda() + + def get_batch_for_context_window( + self, context_start_position: int, context_end_position: int + ) -> List: + """Returns the inference data given context window + + This function gets called iteratively in a loop . Given the start and end context + positions , it extracts the appropriate data. + + Args: + context_start_position (int): Start of the context window. During + the first inference step it is mostly 0 + context_end_position (int): End of the context window. During the + last inference step it will mostly be the max generated sequence length. + + Returns: + List: A list of inputs that will be used by your model in the forward step + """ + + # rerun encoder every step + # T5 inference not yet support kv_cache + encoder_tokens2use = self.batch_encoder_prompts_tokens + decoder_tokens2use = self.prompts_tokens[:, :context_end_position] + encoder_mask2use = self.batch_mask_encoder + decoder_mask2use = self.batch_mask_decoder[:, :context_end_position, :context_end_position] + encoder_decoder_mask2use = self.batch_mask_encoder_decoder[:, :context_end_position, :] + data_at_step_idx = [ + encoder_tokens2use, + decoder_tokens2use, + encoder_mask2use, + decoder_mask2use, + encoder_decoder_mask2use, + ] + + return data_at_step_idx + + def forward_pass_without_pipeline_parallel(self, inference_input: List) -> torch.Tensor: + """Utility to carry out simple forward pass for TP or no model parallel models + + Runs a very simple forward pass for model. Used in the case of models without + any parallelism or only tensor parallelism. + + Args: + inference_input (List): A list containg the inputs for the gpt + model [tokens, position ids, attention mask] + + Returns: + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] + """ + [encoder_tokens, decoder_tokens, encoder_mask, decoder_mask, encoder_decoder_mask] = ( + inference_input + ) + tokens = decoder_tokens + + # T5 inference not yet support kv_cache + logits = self.model( + encoder_tokens, + decoder_tokens, + encoder_mask, + decoder_mask, + encoder_decoder_mask, + inference_params=None, + ) + logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) + + return logits diff --git a/megatron/core/inference/scheduler.py b/megatron/core/inference/scheduler.py index abcb325185..00ab81b4ab 100644 --- a/megatron/core/inference/scheduler.py +++ b/megatron/core/inference/scheduler.py @@ -2,7 +2,7 @@ import time import typing from collections import OrderedDict -from typing import Dict, List +from typing import Dict import torch @@ -12,14 +12,16 @@ class Scheduler: - def __init__(self, max_batch_size: int): - """Scheduler for handling requests to inference engine + """Scheduler for handling requests to inference engine - This class is responsible for handing of all the incomign requests + This class is responsible for handing of all the incomign requests - Args: - max_batch_size (int): The max batch size that we can pass to the inference engine at a time. - """ + Args: + max_batch_size (int): The max batch size that we can pass to the + inference engine at a time. 
+ """ + + def __init__(self, max_batch_size: int): self.max_batch_size = max_batch_size self.active_request_pool: Dict[int, InferenceRequest] = OrderedDict() self.waiting_request_pool: Dict[int, InferenceRequest] = OrderedDict() @@ -30,16 +32,19 @@ def add_request( self, prompt: str, prompt_tokens: torch.Tensor, - inference_parameters: CommonInferenceParams, + encoder_prompt: str = None, + inference_parameters: CommonInferenceParams = None, arrival_time: float = None, ): """Add an incoming request - This method will add the request to either the active pool or the waiting pool depending on the batch size. + This method will add the request to either the active pool or the waiting pool + depending on the batch size. Args: prompt (str): Input prompt string prompt_tokens (torch.Tensor): A torch tensor having the input prompts tokenized + encoder_prompt (str): Encoder input string inference_parameters (CommonInferenceParams): The inference parameters arrival_time (float, optional): The incoming request time. Defaults to None. """ @@ -61,6 +66,7 @@ def add_request( arrival_time=arrival_time, prompt_tokens=prompt_tokens, status=status, + encoder_prompt=encoder_prompt, ) if status == status.ACTIVE_BUT_NOT_GENERATING_TOKENS: @@ -79,7 +85,8 @@ def have_requests_pending(self) -> bool: def add_earliest_waiting_request_to_active_pool(self): """Utility to add the waiting request to active pool - This method will add the earliest request (FIFO) that is in the waiting request pool to the active request pool. + This method will add the earliest request (FIFO) that is in the waiting request + pool to the active request pool. """ assert ( len(self.active_request_pool) < self.max_batch_size @@ -94,11 +101,15 @@ def add_earliest_waiting_request_to_active_pool(self): def update_requests_pools(self, result_dict: typing.OrderedDict[int, InferenceRequest] = None): """Update request pool status - This method will full up the active request pool, if it has less than max batch size elements from the waiting request pool. - If provided with a request dict, it will put the completed requests into the completed request pool and add waiting request into active pool. + This method will full up the active request pool, if it has less than max batch size + elements from the waiting request pool. + If provided with a request dict, it will put the completed requests into the completed + request pool and add waiting request into active pool. Args: - result (typing.OrderedDict[int, InferenceRequest], optional): The result returned by the engine. A dictionary with keys as the request ids, and values as the requests. Defaults to None + result (typing.OrderedDict[int, InferenceRequest], optional): The result returned + by the engine. A dictionary with keys as the request ids, and values as the + requests. Defaults to None """ for result_request_id in list(result_dict.keys()): active_request = self.active_request_pool[result_request_id] diff --git a/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py new file mode 100644 index 0000000000..61beff0211 --- /dev/null +++ b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+from typing import OrderedDict + +import torch + +from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( + SimpleTextGenerationController, +) + + +class EncoderDecoderTextGenerationController(SimpleTextGenerationController): + """The text generation controller for encoder-decoder architecture + + This class ingherits from SimpleTextGenerationController, adding features + relating to encoder input encoder_prompt + + """ + + def prep_model_for_inference( + self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest] + ): + """Preparing batch for inference, using respective wrapper's prep_model_for_inference method + + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] + active_requests (OrderedDict[int, InferenceRequest]): The input active requests + """ + encoder_prompts = list( + map(lambda request: request.encoder_prompt, active_requests.values()) + ) + + self.inference_wrapped_model.prep_model_for_inference( + prompts_tokens=prompts_tokens, encoder_prompts=encoder_prompts, tokenizer=self.tokenizer + ) diff --git a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py index e4db83f6b3..0667af8373 100644 --- a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py @@ -14,15 +14,18 @@ class SimpleTextGenerationController: - def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): - """The basic text generation controller + """The basic text generation controller - This class is responsible for tokenizing the input , running the inference, sampling and also detokenizing the output + This class is responsible for tokenizing the input , running the inference, sampling + and also detokenizing the output - Args: - inference_wrapped_model (AbstractModelInferenceWrapper): A model that is wrapped using the specs given in the abstract_model_inference_wrapper.py - tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts - """ + Args: + inference_wrapped_model (AbstractModelInferenceWrapper): A model that + is wrapped using the specs given in the abstract_model_inference_wrapper.py + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts + """ + + def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): self.inference_wrapped_model = inference_wrapped_model self.tokenizer = tokenizer @@ -31,7 +34,9 @@ def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, token parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() ) - def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: + def tokenize_prompt( + self, prompt: str, add_BOS: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor]: """Utility to tokenize the input prompts Args: @@ -40,13 +45,19 @@ def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: Returns: torch.Tensor: Returns the tokenized prompt """ - return self.tokenizer.tokenize(prompt) + prompt_tokens = self.tokenizer.tokenize(prompt) + + if add_BOS: + prompt_tokens = [self.tokenizer.bos] + prompt_tokens + + return prompt_tokens def detokenize_generations(self, 
prompt_tokens_with_generated_tokens: torch.Tensor) -> str: """Detokenize the output generations Args: - prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt tokens plus the generated tokens + prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt + tokens plus the generated tokens Returns: str: The detokenized output @@ -62,11 +73,15 @@ def sample_from_logits( ) -> torch.Tensor: """Samples the logits to generate outputs - Given the logits of the last token, this function samples it according to the parameters defined in common_inference_params and returns the samples + Given the logits of the last token, this function samples it + according to the parameters defined in common_inference_params + and returns the samples Args: - last_token_logits (torch.Tensor): The last token logits. A tensor of size [batch_size, vocab_size] - common_inference_params (CommonInferenceParams): The paramters to use for inference + last_token_logits (torch.Tensor): The last token logits. A tensor of + size [batch_size, vocab_size] + common_inference_params (CommonInferenceParams): The paramters to use + for inference vocab_size (int): Obtained from the tokenizer. Defaults to None Returns: @@ -141,23 +156,35 @@ def update_generation_status( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Checks which prompts have reached an end condition - We check which prompts have reached an end condition and set the corresponding flags of the is_generation_done_tensor to True. The generated sequence lengths increase as we keep generating, until that prompts hits an end condition. The generation_started tensor determines which prompts have started generating. + We check which prompts have reached an end condition and set the corresponding + flags of the is_generation_done_tensor to True. The generated sequence lengths + increase as we keep generating, until that prompts hits an end condition. The + generation_started tensor determines which prompts have started generating. Args: - updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest generated tokens. A tensor of shape [batch_size, max_seq_len] (i.e max_seq_len = max_prompt_len + tokens_to_generate) - generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True indicates the prompt at that index has started generating tokens. - current_context_end_position (int): An integer indicating which position to extract from the prompts tokens to get the latest generated tokens. - is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. True indicates the prompt at that index has reached end condition. - generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. Each value represents the generated sequence lengths for that prompt. + updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest + generated tokens. A tensor of shape [batch_size, max_seq_len] + (i.e max_seq_len = max_prompt_len + tokens_to_generate) + generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True + indicates the prompt at that index has started generating tokens. + current_context_end_position (int): An integer indicating which position to + extract from the prompts tokens to get the latest generated tokens. + is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. + True indicates the prompt at that index has reached end condition. + generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. 
+ Each value represents the generated sequence lengths for that prompt. Returns: - Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean is_generation_done_tensor and the generated_sequence_lengths after updating it + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean + is_generation_done_tensor and the generated_sequence_lengths after updating it """ latest_samples = updated_prompts_tokens[:, current_context_end_position] - # Make sure we are checking eod criterion only for prompts that have started generating (i.e) We only look at the generated tokenns and not the input tokens. + # Make sure we are checking eod criterion only for prompts that have started generating + # (i.e) We only look at the generated tokenns and not the input tokens. reached_eod = (latest_samples == self.tokenizer.eod) & generation_started is_generation_done_tensor = is_generation_done_tensor | reached_eod - # We increment generated sequence lengths when that prompt has not hit the EOD and generation has started + # We increment generated sequence lengths when that prompt has not hit the + # EOD and generation has started generated_sequence_lengths += ~is_generation_done_tensor & generation_started return is_generation_done_tensor, generated_sequence_lengths @@ -178,7 +205,9 @@ def pad_input_prompt_tokens( num_tokens_togenerate (int): The number of tokens to generate for each prompt Returns: - torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, with extra indices for each tensor padded with mask id. + torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) + max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, + with extra indices for each tensor padded with mask id. """ max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate @@ -193,13 +222,16 @@ def generate_output_tokens_dynamic_batch( ) -> OrderedDict[int, InferenceRequest]: """Utility to generate the output tokens and probabilities for the prompts - This utility generates the output tokens for a dynamic batch. It will run one forward step at a time, and pass control back to the engine, which will update the request pool and call this method again. + This utility generates the output tokens for a dynamic batch. It will run one forward step + at a time, and pass control back to the engine, which will update the request pool and call + this method again. Args: active_requests (OrderedDict[int, InferenceRequest]): The input active requests. Returns: - OrderedDict[int, InferenceRequest]: The result for each of the incoming requests after running one forward step. + OrderedDict[int, InferenceRequest]: The result for each of the incoming requests + after running one forward step. """ raise Exception("Not implemented yet") @@ -208,7 +240,9 @@ def generate_all_output_tokens_static_batch( ) -> OrderedDict[int, InferenceRequest]: """Utility to generate the all the output tokens and probabilities for the prompts . - This utility generates the output tokens for a static batch. It runs the forward steps till all prompts complete generation, updates the status of these requests to completed, adds the generated result and returns these requests + This utility generates the output tokens for a static batch. 
It runs the forward steps till + all prompts complete generation, updates the status of these requests to completed, adds + the generated result and returns these requests Args: active_requests (OrderedDict[int, InferenceRequest]): The input active requests. @@ -252,8 +286,9 @@ def generate_all_output_tokens_static_batch( generated_sequence_lengths = torch.zeros(batch_size).cuda() with torch.no_grad(): - self.inference_wrapped_model.prep_model_for_inference( - prompts_tokens=batch_prompt_tokens + + self.prep_model_for_inference( + prompts_tokens=batch_prompt_tokens, active_requests=active_requests ) context_start_position = 0 @@ -275,14 +310,17 @@ def generate_all_output_tokens_static_batch( tensor=logits, ) - # Indicates which of the input prompts have started generating tokens. A 1D boolean tensor with [batch_size] elements (i.e) The shortest prompts will start generating first and so on + # Indicates which of the input prompts have started generating tokens. + # A 1D boolean tensor with [batch_size] elements (i.e) The shortest + # prompts will start generating first and so on generation_started = prompt_lengths_in_batch <= context_end_position last_token_logits = logits[:, -1, :] sampled_logits = self.sample_from_logits( last_token_logits, common_inference_params, self.tokenizer.vocab_size ) - # Substitute the sampled logits only for only the prompts that have started generating tokens + # Substitute the sampled logits only for only the prompts that + # have started generating tokens batch_prompt_tokens[generation_started, context_end_position] = sampled_logits[ generation_started ] @@ -302,7 +340,8 @@ def generate_all_output_tokens_static_batch( context_start_position = context_end_position - # Check end of generation status for each tensor and update generated sequence lengths + # Check end of generation status for each tensor + # and update generated sequence lengths (is_generation_done_tensor, generated_sequence_lengths) = ( self.update_generation_status( updated_prompts_tokens=batch_prompt_tokens, @@ -348,3 +387,14 @@ def generate_all_output_tokens_static_batch( request.generated_text = self.detokenize_generations(required_result_tokens) return active_requests + + def prep_model_for_inference( + self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest] + ): + """Preparing batch for inference, using respective wrapper's prep_model_for_inference method + + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] + active_requests (OrderedDict[int, InferenceRequest]): The input active requests + """ + self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=prompts_tokens) diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py index caae41cb4a..f2751673e4 100644 --- a/megatron/core/model_parallel_config.py +++ b/megatron/core/model_parallel_config.py @@ -182,8 +182,8 @@ class ModelParallelConfig: tp_comm_atomic_ag: bool = False """Deprecated from TransformerEngine v1.6.0. - If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather both - done atomically. Don't care if tp_comm_overlap is False. + If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather + both done atomically. Don't care if tp_comm_overlap is False. 
""" tp_comm_split_rs: bool = True @@ -213,6 +213,11 @@ class ModelParallelConfig: If true, the AllGather -> Gemm overlap for FC1 layer of MLP gets disabled """ + tp_comm_bootstrap_backend: str = 'nccl' + """ + Set the bootstrapping backend out of 'nccl', 'mpi', and 'gloo' + """ + ################### # Pipeline Parallel ################### @@ -257,7 +262,8 @@ class ModelParallelConfig: wgrad_deferral_limit: int = 0 """This value tunes the number of micro-batches for which the embedding weight gradient compute - needs to be deferred to pipeline flush, this argument is invalid if `defer_embedding_wgrad_compute` is False. + needs to be deferred to pipeline flush, this argument is invalid if + `defer_embedding_wgrad_compute` is False. Defaults to 0, which means all micro-batches are deferred. """ @@ -276,7 +282,9 @@ class ModelParallelConfig: """Tells the number of transformer layers for which activations has to be offloaded.""" _cpu_offloading_context: ContextManager = ( - None # Used for internal use only, not to be set by the user. TODO: Need to move to the 'right' place when possible. + None + # Used for internal use only, not to be set by a user. + # TODO: Need to move to the 'right' place when possible. ) """For internal use only, do not set.""" @@ -297,7 +305,8 @@ class ModelParallelConfig: def __post_init__(self): """Python dataclass method that is used to modify attributes after initialization. - See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. + See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more + details. """ if self.sequence_parallel: if self.tensor_model_parallel_size <= 1: @@ -324,11 +333,12 @@ def __post_init__(self): if self.defer_embedding_wgrad_compute and self.wgrad_deferral_limit < 0: raise ValueError( - "Wgrad deferral limit should be greater than or equal to 0 when this optimization is enabled!" + "Wgrad deferral limit should be greater than or equal to 0 when it is enabled!" 
) if self.expert_model_parallel_size > 1 and self.tensor_model_parallel_size > 1: if self.sequence_parallel is False: raise ValueError( - "When using expert parallelism and tensor parallelism, sequence parallelism must be used" + "When using expert parallelism and tensor parallelism, sequence parallelism " + "must be used" ) diff --git a/megatron/core/models/T5/t5_spec.py b/megatron/core/models/T5/t5_spec.py index 42da1889a9..ecdcdbc260 100644 --- a/megatron/core/models/T5/t5_spec.py +++ b/megatron/core/models/T5/t5_spec.py @@ -52,7 +52,7 @@ def encoder_model_with_transformer_engine_default_spec() -> ModuleSpec: submodules=TransformerLayerSubmodules( self_attention=ModuleSpec( module=SelfAttention, - params={"attn_mask_type": AttnMaskType.padding}, + params={"attn_mask_type": AttnMaskType.arbitrary}, submodules=SelfAttentionSubmodules( linear_qkv=TELayerNormColumnParallelLinear, core_attention=TEDotProductAttention, @@ -94,6 +94,7 @@ def decoder_model_with_transformer_engine_default_spec() -> ModuleSpec: pre_cross_attn_layernorm=TENorm, cross_attention=ModuleSpec( module=CrossAttention, + params={"attn_mask_type": AttnMaskType.arbitrary}, submodules=CrossAttentionSubmodules( linear_q=TEColumnParallelLinear, linear_kv=TEColumnParallelLinear, @@ -122,7 +123,7 @@ def encoder_model_with_local_spec() -> ModuleSpec: input_layernorm=LNImpl, self_attention=ModuleSpec( module=SelfAttention, - params={"attn_mask_type": AttnMaskType.padding}, + params={"attn_mask_type": AttnMaskType.arbitrary}, submodules=SelfAttentionSubmodules( linear_qkv=ColumnParallelLinear, core_attention=DotProductAttention, @@ -170,6 +171,7 @@ def decoder_model_with_local_spec() -> ModuleSpec: pre_cross_attn_layernorm=LNImpl, cross_attention=ModuleSpec( module=CrossAttention, + params={"attn_mask_type": AttnMaskType.arbitrary}, submodules=CrossAttentionSubmodules( linear_q=ColumnParallelLinear, linear_kv=ColumnParallelLinear, diff --git a/megatron/core/models/bert/bert_model.py b/megatron/core/models/bert/bert_model.py index d9d1be449c..eb08d4cfd6 100644 --- a/megatron/core/models/bert/bert_model.py +++ b/megatron/core/models/bert/bert_model.py @@ -1,15 +1,14 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
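The `__post_init__` checks in `ModelParallelConfig` above enforce cross-field constraints: sequence parallelism requires tensor parallelism, the wgrad deferral limit must be non-negative when deferral is enabled, and combining expert with tensor parallelism requires sequence parallelism. Below is a minimal, self-contained sketch of that validation pattern; `ParallelSettings` is an illustrative stand-in, not a Megatron-Core class, although its field names mirror the config above.

from dataclasses import dataclass


@dataclass
class ParallelSettings:
    # Illustrative stand-in for the parallelism-related fields validated above.
    tensor_model_parallel_size: int = 1
    expert_model_parallel_size: int = 1
    sequence_parallel: bool = False
    defer_embedding_wgrad_compute: bool = False
    wgrad_deferral_limit: int = 0

    def __post_init__(self):
        # Sequence parallelism only makes sense on top of tensor parallelism.
        if self.sequence_parallel and self.tensor_model_parallel_size <= 1:
            raise ValueError("sequence_parallel requires tensor_model_parallel_size > 1")
        # A negative deferral limit is meaningless when deferral is enabled.
        if self.defer_embedding_wgrad_compute and self.wgrad_deferral_limit < 0:
            raise ValueError("wgrad_deferral_limit must be >= 0 when deferral is enabled")
        # Expert parallelism combined with tensor parallelism needs sequence parallelism.
        if (
            self.expert_model_parallel_size > 1
            and self.tensor_model_parallel_size > 1
            and not self.sequence_parallel
        ):
            raise ValueError(
                "sequence parallelism must be enabled when using expert and tensor parallelism"
            )


# Example: this raises because sequence_parallel is off while EP and TP are both > 1.
# ParallelSettings(tensor_model_parallel_size=2, expert_model_parallel_size=2)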
import os -from importlib.metadata import version +import warnings from typing import Literal, Optional import torch -from pkg_resources import packaging from torch import Tensor from megatron.core import parallel_state, tensor_parallel from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk -from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec +from megatron.core.models.bert.bert_layer_specs import bert_layer_local_spec from megatron.core.models.bert.bert_lm_head import BertLMHead from megatron.core.models.bert.pooler import Pooler from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding @@ -20,11 +19,14 @@ from megatron.core.transformer.transformer_block import TransformerBlock from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import get_linear_layer +from megatron.core.utils import get_te_version as _get_te_version +from megatron.core.utils import is_te_min_version def get_te_version(): - """Returns the installed version of transformer engine""" - return packaging.version.Version(version("transformer-engine")) + """Included for backwards compatibility.""" + warnings.warn("`get_te_version` will be deprecated in a future release") + return _get_te_version() class BertModel(LanguageModule): @@ -91,9 +93,7 @@ def __init__( # megatron core pipelining currently depends on model type self.model_type = ModelType.encoder_or_decoder - self.attn_mask_dimensions = self._santiy_check_attention_and_get_attn_mask_dimension( - transformer_layer_spec - ) + self.attn_mask_dimensions = self._sanity_check_attention_and_get_attn_mask_dimension() # Embeddings. if self.pre_process: @@ -152,44 +152,71 @@ def __init__( if self.pre_process or self.post_process: self.setup_embeddings_and_output_layer() - def _santiy_check_attention_and_get_attn_mask_dimension( - self, transformer_layer_spec: ModuleSpec - ) -> str: + # pylint: disable=line-too-long + def _sanity_check_attention_and_get_attn_mask_dimension(self) -> str: """We do some checks and return attention mask dimensions for self attention Transformer engine library underwent a lot of change. So we need to change dimensions of the attention mask depending on the TE version. We also santiy check some arguments. 1. If we use local version of attention dimension of the mask is [b,1,s,s] - 2. If we use transformer engine < 1.7 - (Flash and Fused attention not supported. We use unfused path). - Attn mask dimension is [b,1,s,s] - 2. If we use transformer engine >= 1.7 - (Flash and fused attention supported with attn mask dimension [b,1,1,s]). - Unfused path will use attn mask dimension [b,1,s,s] with attn mask type arbitrary. - Default if you dont set any NVTE_ATTN flag will just use unfused path. + 2. If we use transformer engine > 1.10 we support all 3 backends with padding mask and [b,1,s,s] + 3. If we use transformer engine >= 1.7 but less than 1.10 + a ) Flash and Fused attention uses padding mask with [b,1,1,s] + b ) Unfused attention works with arbitrary mask with [b,1,s,s] + 4. If we use transformer engine < 1.7 + Flash and fused attention is not supported. 
Unfused attention will work with padding mask [b,1,s,s] + + Default if you dont set any NVTE_ATTN flag will it will just use the fused path for transformer engine version >= 1.7 and unfused path for other Args: - transformer_layer_spec (ModuleSpec): _description_ + transformer_layer_spec (ModuleSpec): The transformer layer spec Returns: - str: _description_ + str: A string showing the format of the attn mask dimensions """ - attn_mask_dimensions = "b1ss" - if transformer_layer_spec == bert_layer_with_transformer_engine_spec: - if get_te_version() >= packaging.version.Version("1.7.0"): - # pylint: disable=line-too-long - if os.getenv('NVTE_FLASH_ATTN') == '0' and os.getenv('NVTE_FUSED_ATTN') == '0': - assert ( - transformer_layer_spec.submodules.self_attention.params['attn_mask_type'] - == AttnMaskType.arbitrary - ), "Both NVTE_FLASH_ATTN and NVTE_FUSED_ATTN env flag set to 0. Either unset both of them or set one of them to 1 to use a more optimized attention kernal. Currently using unfused attention path. If you want to proceed with this path set AttnMaskType in module spec to be arbitrary" - else: + attn_mask_dimensions = None + # For local layer spec we just use b1ss + if self.transformer_layer_spec == bert_layer_local_spec: + attn_mask_dimensions = "b1ss" + else: + attn_mask_type = self.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] + flash_attention_enabled = os.getenv('NVTE_FLASH_ATTN') == '1' + fused_attention_enabled = os.getenv('NVTE_FUSED_ATTN') == '1' + # For TE >= 1.10 (We always use padding mask and use b11s) + if is_te_min_version("1.10.0"): + attn_mask_dimensions = "b11s" + if attn_mask_type != AttnMaskType.padding: + warnings.warn( + f'For TE versions >= 1.10 , flash/fused/unfused support padding mask. Setting attention mask from {attn_mask_type} to padding' + ) + self.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] = AttnMaskType.padding + # For 1.7 >= TE < 1.10 flash and fused path use padding mask with b11s and unfused path uses arbitrary mask with b1ss + elif is_te_min_version("1.7.0"): + if flash_attention_enabled or fused_attention_enabled: attn_mask_dimensions = "b11s" + else: + if attn_mask_type != AttnMaskType.arbitrary: + warnings.warn( + f'For TE versions >= 1.7 but < 1.10 , unfused path supports only arbitrary mask. Setting attention mask from {attn_mask_type} to arbitray' + ) + self.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] = AttnMaskType.arbitrary + attn_mask_dimensions = "b1ss" + # For TE < 1.7 we only support unfused attention with b1ss and padding mask else: - assert ( - os.getenv('NVTE_FLASH_ATTN') == '0' and os.getenv('NVTE_FUSED_ATTN') == '0' - ), "Flash and fused attention is not supported with transformer engine version < 1.7. Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer engine >= 1.7" + attn_mask_dimensions = "b1ss" + assert not flash_attention_enabled and not fused_attention_enabled, ( + "Flash and fused attention is not supported with transformer engine version " + "< 1.7. 
Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer " + "engine >= 1.7" + ) + return attn_mask_dimensions def bert_extended_attention_mask(self, attention_mask: Tensor) -> Tensor: diff --git a/megatron/core/models/common/embeddings/__init__.py b/megatron/core/models/common/embeddings/__init__.py index e69de29bb2..865f96da5d 100644 --- a/megatron/core/models/common/embeddings/__init__.py +++ b/megatron/core/models/common/embeddings/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from .rope_utils import apply_rotary_pos_emb +from .rotary_pos_embedding import RotaryEmbedding +from .yarn_rotary_pos_embedding import YarnRotaryEmbedding, _yarn_get_mscale diff --git a/megatron/core/models/common/embeddings/rope_utils.py b/megatron/core/models/common/embeddings/rope_utils.py new file mode 100644 index 0000000000..accb251961 --- /dev/null +++ b/megatron/core/models/common/embeddings/rope_utils.py @@ -0,0 +1,191 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from megatron.core.transformer.transformer_config import TransformerConfig + +import logging + +import torch +from torch import Tensor + +from megatron.core import parallel_state + +logger = logging.getLogger(__name__) + +try: + from apex.transformer.functional import ( + fused_apply_rotary_pos_emb, + fused_apply_rotary_pos_emb_thd, + ) + + HAVE_APPLY_ROPE_FUSION = True +except ImportError: + HAVE_APPLY_ROPE_FUSION = False + + +def get_pos_emb_on_this_cp_rank(pos_emb: Tensor, seq_dim: int) -> Tensor: + """Get the position embedding on the current context parallel rank. + + Args: + pos_emb (Tensor): Positional embedding tensor + seq_dim (int): Sequence dimension + """ + cp_size = parallel_state.get_context_parallel_world_size() + cp_rank = parallel_state.get_context_parallel_rank() + cp_idx = torch.tensor( + [cp_rank, (2 * cp_size - cp_rank - 1)], device="cpu", pin_memory=True + ).cuda(non_blocking=True) + pos_emb = pos_emb.view( + *pos_emb.shape[:seq_dim], 2 * cp_size, -1, *pos_emb.shape[(seq_dim + 1) :] + ) + pos_emb = pos_emb.index_select(seq_dim, cp_idx) + pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :]) + return pos_emb + + +def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor: + """Change sign so the last dimension becomes [-odd, +even] + + Args: + x (Tensor): Input tensor + + Returns: + Tensor: Tensor rotated half + """ + if not rotary_interleaved: + x1, x2 = torch.chunk(x, 2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + else: + x1 = x[:, :, :, ::2] + x2 = x[:, :, :, 1::2] + x_new = torch.stack((-x2, x1), dim=-1) + return x_new.view(x_new.shape[0], x_new.shape[1], x_new.shape[2], -1) + + +def _apply_rotary_pos_emb_bshd( + t: Tensor, + freqs: Tensor, + rotary_interleaved: bool = False, + multi_latent_attention: bool = False, + mscale: float = 1.0, +) -> Tensor: + """Apply rotary positional embedding to input tensor T. + + check https://kexue.fm/archives/8265 for detailed formulas + + Args: + t (Tensor): Input tensor T is of shape [seq_length, ... 
, dim] + freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim] + + Returns: + Tensor: The input tensor after applying RoPE + """ + rot_dim = freqs.shape[-1] + + # ideally t_pass is empty so rotary pos embedding is applied to all tensor t + t, t_pass = t[..., :rot_dim], t[..., rot_dim:] + + if multi_latent_attention: + x1 = t[..., 0::2] + x2 = t[..., 1::2] + t = torch.cat((x1, x2), dim=-1) + + # first part is cosine component + # second part is sine component, need to change signs with _rotate_half method + cos_ = (torch.cos(freqs) * mscale).to(t.dtype) + sin_ = (torch.sin(freqs) * mscale).to(t.dtype) + + t = (t * cos_) + (_rotate_half(t, rotary_interleaved) * sin_) + return torch.cat((t, t_pass), dim=-1) + + +def _apply_rotary_pos_emb_thd( + t: Tensor, + cu_seqlens: Tensor, + freqs: Tensor, + rotary_interleaved: bool = False, + multi_latent_attention: bool = False, + mscale: float = 1.0, +) -> Tensor: + """A baseline implementation of applying RoPE for `thd` format. + + Args: + t (Tensor): Input tensor T is of shape [t, h, d] + cu_seqlens(Tensor): Cumulative sum of sequence lengths in a batch for `t`, + with shape [b + 1] and dtype torch.int32. + freqs (Tensor): Rotary Positional embedding tensor freq is of shape [max_s, 1, 1, d] + + Returns: + Tensor: Shape [t, h, d]. The input tensor after applying RoPE. + """ + + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return torch.cat( + [ + _apply_rotary_pos_emb_bshd( + x.unsqueeze(1), + freqs[: x.size(0)], + rotary_interleaved=rotary_interleaved, + multi_latent_attention=multi_latent_attention, + mscale=mscale, + ) + for x in torch.split(t, seqlens) + ] + ).squeeze(1) + + +def apply_rotary_pos_emb( + t: Tensor, + freqs: Tensor, + config: TransformerConfig, + cu_seqlens: Optional[Tensor] = None, + mscale: float = 1.0, +): + """ + Reroute to the appropriate apply_rotary_pos_emb function depending on + fused/unfused kernels, or bshd (conventional) / thd (packed seq) format + """ + if config.apply_rope_fusion and not HAVE_APPLY_ROPE_FUSION: + # setting apply_rope_fusion in config to False + # so that subsequent queries to this config also return False + config.apply_rope_fusion = False + if not getattr(apply_rotary_pos_emb, "printed_fused_warning", False): + logger.warning( + "Setting apply_rope_fusion to false because its implementation" + " is not included in Apex. 
Try upgrading to the latest version" + ) + apply_rotary_pos_emb.printed_fused_warning = True + + if getattr(config, "multi_latent_attention", False) and config.rotary_interleaved: + logger.warning( + "rotary_interleaved is not supported with multi_latent_attention, setting it to False" + ) + config.rotary_interleaved = False + + if config.apply_rope_fusion: + if cu_seqlens is None: + return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) + else: + return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) + else: + if cu_seqlens is None: + return _apply_rotary_pos_emb_bshd( + t, + freqs, + rotary_interleaved=config.rotary_interleaved, + multi_latent_attention=config.multi_latent_attention, + mscale=mscale, + ) + else: + return _apply_rotary_pos_emb_thd( + t, + cu_seqlens, + freqs, + rotary_interleaved=config.rotary_interleaved, + multi_latent_attention=config.multi_latent_attention, + mscale=mscale, + ) diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py index 0a4e5bf6de..5232faec60 100644 --- a/megatron/core/models/common/embeddings/rotary_pos_embedding.py +++ b/megatron/core/models/common/embeddings/rotary_pos_embedding.py @@ -2,58 +2,50 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING if TYPE_CHECKING: from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_block import TransformerBlock import logging +import math import torch from torch import Tensor, nn from megatron.core import parallel_state +from megatron.core.models.common.embeddings.rope_utils import ( # for backward compatibility; pylint: disable=unused-import + _apply_rotary_pos_emb_bshd, + _apply_rotary_pos_emb_thd, + _rotate_half, + apply_rotary_pos_emb, + get_pos_emb_on_this_cp_rank, +) logger = logging.getLogger(__name__) -try: - from apex.transformer.functional import ( - fused_apply_rotary_pos_emb, - fused_apply_rotary_pos_emb_thd, - ) - HAVE_APPLY_ROPE_FUSION = True -except ImportError: - HAVE_APPLY_ROPE_FUSION = False - - -__all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb'] - - -def get_pos_emb_on_this_cp_rank(pos_emb, seq_dim): - cp_size = parallel_state.get_context_parallel_world_size() - cp_rank = parallel_state.get_context_parallel_rank() - cp_idx = torch.tensor( - [cp_rank, (2 * cp_size - cp_rank - 1)], device="cpu", pin_memory=True - ).cuda(non_blocking=True) - pos_emb = pos_emb.view( - *pos_emb.shape[:seq_dim], 2 * cp_size, -1, *pos_emb.shape[(seq_dim + 1) :] - ) - pos_emb = pos_emb.index_select(seq_dim, cp_idx) - pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :]) - return pos_emb +__all__ = ['RotaryEmbedding'] class RotaryEmbedding(nn.Module): """Rotary Embedding for language model. Args: - kv_channels (int): Projection weights dimension in multi-head attention. Obtained from transformer config - rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. - seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE for longer sequences. The value must be a float larger than 1.0. Defaults to None - rotary_base (int, optional): Base period for rotary position embeddings. Defaults to 10000. - use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly on the GPU. Defaults to False + kv_channels (int): Projection weights dimension in multi-head attention. 
Obtained + from transformer config + rotary_percent (float): Percent of rotary dimension to use for rotary position + embeddings. + rotary_interleaved (bool, optional): If True, interleaved rotary position embeddings. + Defaults to False. + seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE + for longer sequences. The value must be a float larger than 1.0. Defaults to None + rotary_base (int, optional): Base period for rotary position embeddings. Defaults to + 10000. + rope_scaling (bool, optional): Apply rope scaling as used in llama 3.1 + use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly + on the GPU. Defaults to False """ def __init__( @@ -63,6 +55,7 @@ def __init__( rotary_interleaved: bool = False, seq_len_interpolation_factor: float = None, rotary_base: int = 10000, + rope_scaling: bool = False, use_cpu_initialization: bool = False, ) -> None: super().__init__() @@ -78,6 +71,44 @@ def __init__( rotary_base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim) ) + if rope_scaling: + self.inv_freq = self._apply_scaling(self.inv_freq) + + def _apply_scaling( + self, + freqs, + factor=8, + low_freq_factor=1, + high_freq_factor=4, + original_max_position_embeddings=8192, + ): + # This implementation is adapted from: + # https://github.com/huggingface/transformers/blob/2a5a6ad18aa22e98429bb5ecb880660328030ea0/src/transformers/modeling_rope_utils.py#L303-L343 + + factor = factor # `8` in the original implementation + low_freq_factor = low_freq_factor # `1` in the original implementation + high_freq_factor = high_freq_factor # `4` in the original implementation + old_context_len = original_max_position_embeddings # `8192` in the original implementation + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + wavelen = 2 * math.pi / freqs + # wavelen < high_freq_wavelen: do nothing + # wavelen > low_freq_wavelen: divide by factor + inv_freq_llama = torch.where(wavelen > low_freq_wavelen, freqs / factor, freqs) + # otherwise: interpolate between the two, using a smooth factor + smooth_factor = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor + ) + smoothed_inv_freq = ( + 1 - smooth_factor + ) * inv_freq_llama / factor + smooth_factor * inv_freq_llama + is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen) + inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama) + + return inv_freq_llama + def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: """Forward pass of RoPE embedding. 
@@ -111,7 +142,8 @@ def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: # emb [seq_length, .., dim] emb = emb[:, None, None, :] if parallel_state.get_context_parallel_world_size() > 1: - # slice rotary_pos_emb along sequence dimension and select the parition of the current CP rank + # slice rotary_pos_emb along sequence dimension and select the parition of the current + # CP rank emb = get_pos_emb_on_this_cp_rank(emb, 0) return emb @@ -130,8 +162,9 @@ def get_rotary_seq_len( Args: inference_params : Used during Inference time - transformer (TransformerBlock): The transformer block (decoder/encoder) used by the model - transformer_input (Tensor): _description_ + transformer (TransformerBlock): The transformer block (decoder/encoder) used + by the model + transformer_input (Tensor): Input tensor to the transformer transformer_config (TransformerConfig): Transformer config used by the model Returns: @@ -151,102 +184,3 @@ def get_rotary_seq_len( rotary_seq_len *= transformer_config.context_parallel_size return rotary_seq_len - - -def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor: - """Change sign so the last dimension becomes [-odd, +even] - - Args: - x (Tensor): Input tensor - - Returns: - Tensor: Tensor rotated half - """ - if not rotary_interleaved: - x1, x2 = torch.chunk(x, 2, dim=-1) - return torch.cat((-x2, x1), dim=-1) - else: - x1 = x[:, :, :, ::2] - x2 = x[:, :, :, 1::2] - x_new = torch.stack((-x2, x1), dim=-1) - return x_new.view(x_new.shape[0], x_new.shape[1], x_new.shape[2], -1) - - -def apply_rotary_pos_emb_bshd(t: Tensor, freqs: Tensor, rotary_interleaved: bool = False) -> Tensor: - """Apply rotary positional embedding to input tensor T. - - check https://kexue.fm/archives/8265 for detailed formulas - - Args: - t (Tensor): Input tensor T is of shape [seq_length, ... , dim] - freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim] - - Returns: - Tensor: The input tensor after applying RoPE - """ - rot_dim = freqs.shape[-1] - - # ideally t_pass is empty so rotary pos embedding is applied to all tensor t - t, t_pass = t[..., :rot_dim], t[..., rot_dim:] - - # first part is cosine component - # second part is sine component, need to change signs with _rotate_half method - cos_ = torch.cos(freqs).to(t.dtype) - sin_ = torch.sin(freqs).to(t.dtype) - - t = (t * cos_) + (_rotate_half(t, rotary_interleaved) * sin_) - return torch.cat((t, t_pass), dim=-1) - - -def apply_rotary_pos_emb_thd( - t: Tensor, cu_seqlens: Tensor, freqs: Tensor, rotary_interleaved: bool = False -) -> Tensor: - """A baseline implementation of applying RoPE for `thd` format. - - Args: - t (Tensor): Input tensor T is of shape [t, h, d] - cu_seqlens(Tensor): Cumulative sum of sequence lengths in a batch for `t`, - with shape [b + 1] and dtype torch.int32. - freqs (Tensor): Rotary Positional embedding tensor freq is of shape [max_s, 1, 1, d] - - Returns: - Tensor: Shape [t, h, d]. The input tensor after applying RoPE. 
- """ - - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() - return torch.cat( - [ - apply_rotary_pos_emb_bshd(x.unsqueeze(1), freqs[: x.size(0)]) - for x in torch.split(t, seqlens) - ] - ).squeeze(1) - - -def apply_rotary_pos_emb( - t: Tensor, freqs: Tensor, config: TransformerConfig, cu_seqlens: Optional[Tensor] = None -): - """ - Reroute to the appropriate apply_rotary_pos_emb function depending on - fused/unfused kernels, or bshd (conventional) / thd (packed seq) format - """ - if config.apply_rope_fusion and not HAVE_APPLY_ROPE_FUSION: - # setting apply_rope_fusion in config to False so that subsequent queries to this config also return False - config.apply_rope_fusion = False - if not getattr(apply_rotary_pos_emb, "printed_fused_warning", False): - logger.warning( - "Setting apply_rope_fusion to false because its implementation" - " is not included in Apex. Try upgrading to the latest version" - ) - apply_rotary_pos_emb.printed_fused_warning = True - if config.apply_rope_fusion: - if cu_seqlens is None: - return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) - else: - return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) - else: - if cu_seqlens is None: - return apply_rotary_pos_emb_bshd(t, freqs, rotary_interleaved=config.rotary_interleaved) - else: - return apply_rotary_pos_emb_thd( - t, cu_seqlens, freqs, rotary_interleaved=config.rotary_interleaved - ) diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py new file mode 100644 index 0000000000..14d147ea34 --- /dev/null +++ b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py @@ -0,0 +1,169 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from __future__ import annotations + +import logging +import math + +import torch +from torch import Tensor + +from megatron.core import parallel_state +from megatron.core.models.common.embeddings.rope_utils import get_pos_emb_on_this_cp_rank +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding + +logger = logging.getLogger(__name__) + + +class YarnRotaryEmbedding(RotaryEmbedding): + """Yarn Rotary Embedding for language model. + + Args: + kv_channels (int): Projection weights dimension in multi-head attention. Obtained from + transformer config + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + rotary_interleaved (bool, optional): If True, interleaved rotary position embeddings. + Defaults to False. + seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE for + longer sequences. The value must be a float larger than 1.0. Defaults to None + rotary_base (float, optional): Base period for rotary position embeddings. Defaults to + 10000. + use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly on + the GPU. Defaults to False + scaling_factor (float, optional): Scaling factor for Yarn RoPE. Defaults to 1.0. + original_max_position_embeddings (int, optional): Original maximum position embeddings + length. Defaults to 4096. + beta_fast (float, optional): Fast beta value for Yarn RoPE. Defaults to 32. + beta_slow (float, optional): Slow beta value for Yarn RoPE. Defaults to 1. + mscale (float, optional): Mscale value for Yarn RoPE. Defaults to 1. + mscale_all_dim (float, optional): Mscale all dim value for Yarn RoPE. Defaults to 0. 
+ """ + + def __init__( + self, + kv_channels: int, + rotary_percent: float = 1.0, + rotary_interleaved: bool = False, + seq_len_interpolation_factor: float = None, + rotary_base: float = 10000.0, + use_cpu_initialization: bool = False, + scaling_factor: float = 1.0, + original_max_position_embeddings: int = 4096, + beta_fast: float = 32.0, + beta_slow: float = 1.0, + mscale: float = 1.0, + mscale_all_dim: float = 0.0, + ): + self.dim = kv_channels + self.rotary_base = rotary_base + self.scaling_factor = scaling_factor + self.original_max_position_embeddings = original_max_position_embeddings + self.beta_fast = beta_fast + self.beta_slow = beta_slow + self.mscale = mscale + self.mscale_all_dim = mscale_all_dim + + device = 'cpu' if use_cpu_initialization else torch.cuda.current_device() + self.inv_freq_extra = 1.0 / ( + self.rotary_base + ** (torch.arange(0, self.dim, 2, dtype=torch.float32, device=device) / self.dim) + ) + self.inv_freq_inter = 1.0 / ( + self.scaling_factor + * self.rotary_base + ** (torch.arange(0, self.dim, 2, dtype=torch.float32, device=device) / self.dim) + ) + super().__init__( + kv_channels, + rotary_percent, + rotary_interleaved, + seq_len_interpolation_factor, + rotary_base, + use_cpu_initialization, + ) + + def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: + + assert ( + not self.rotary_interleaved + ), "Yarn RoPE does not support interleaved rotary embeddings" + + if self.inv_freq_extra.device.type == 'cpu': + # move `inv_freq_extra` to GPU once at the first micro-batch forward pass + self.inv_freq_extra = self.inv_freq_extra.to(device=torch.cuda.current_device()) + + if self.inv_freq_inter.device.type == 'cpu': + # move `inv_freq_inter` to GPU once at the first micro-batch forward pass + self.inv_freq_inter = self.inv_freq_inter.to(device=torch.cuda.current_device()) + + low, high = _yarn_find_correction_range( + self.beta_fast, + self.beta_slow, + self.dim, + self.rotary_base, + self.original_max_position_embeddings, + ) + inv_freq_mask = 1.0 - _yarn_linear_ramp_mask(low, high, self.dim // 2).to( + device=self.inv_freq_extra.device, dtype=torch.float32 + ) + inv_freq = self.inv_freq_inter * (1 - inv_freq_mask) + self.inv_freq_extra * inv_freq_mask + + seq = ( + torch.arange( + max_seq_len, device=self.inv_freq_extra.device, dtype=self.inv_freq_extra.dtype + ) + + offset + ) + + freqs = torch.outer(seq, inv_freq) + + _mscale = float( + _yarn_get_mscale(self.scaling_factor, self.mscale) + / _yarn_get_mscale(self.scaling_factor, self.mscale_all_dim) + ) + + emb = torch.cat((freqs, freqs), dim=-1) + # emb [seq_length, .., dim] + emb = emb[:, None, None, :] + if parallel_state.get_context_parallel_world_size() > 1: + # slice rotary_pos_emb along sequence dimension + # and select the parition of the current CP rank + emb = get_pos_emb_on_this_cp_rank(emb, 0) + return emb, _mscale + + +# Inverse dim formula to find dim based on number of rotations +def _yarn_find_correction_dim( + num_rotations: float, dim: int, rotary_base: float = 10000, max_position_embeddings: int = 2048 +) -> float: + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / ( + 2 * math.log(rotary_base) + ) + + +# Find dim range bounds based on rotations +def _yarn_find_correction_range( + low_rot: float, + high_rot: float, + dim: int, + rotary_base: float = 10000, + max_position_embeddings: int = 2048, +) -> tuple[int, int]: + low = math.floor(_yarn_find_correction_dim(low_rot, dim, rotary_base, max_position_embeddings)) + high = 
math.ceil(_yarn_find_correction_dim(high_rot, dim, rotary_base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def _yarn_linear_ramp_mask(min: float, max: float, dim: int) -> Tensor: + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +def _yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py index d469f5e4ce..1db68dc886 100755 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -11,6 +11,10 @@ from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules from megatron.core.transformer.moe.shared_experts import SharedExpertMLP +from megatron.core.transformer.multi_latent_attention import ( + MLASelfAttention, + MLASelfAttentionSubmodules, +) from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules @@ -49,6 +53,7 @@ def get_gpt_layer_with_transformer_engine_spec( num_experts: Optional[int] = None, moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, + multi_latent_attention: Optional[bool] = False, fp8: Optional[str] = None, ) -> ModuleSpec: """Use this spec to use lower-level Transformer Engine modules (required for fp8 training). @@ -66,34 +71,63 @@ def get_gpt_layer_with_transformer_engine_spec( mlp = _get_mlp_module_spec( use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=fp8 ) - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=TELayerNormColumnParallelLinear, - core_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, - # TENorm significantly harms convergence when used - # for QKLayerNorm; we instead use the Apex implementation. 
- q_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, - k_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, + + if multi_latent_attention: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=MLASelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=MLASelfAttentionSubmodules( + linear_q_proj=TEColumnParallelLinear, + linear_q_down_proj=TEColumnParallelLinear, + linear_q_up_proj=TEColumnParallelLinear, + linear_kv_down_proj=TEColumnParallelLinear, + linear_kv_up_proj=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=TENorm if qk_layernorm else IdentityOp, + kv_layernorm=TENorm if qk_layernorm else IdentityOp, + ), ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm if num_experts else IdentityOp, + input_layernorm=TENorm if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=TENorm if num_experts else IdentityOp, - mlp=mlp, - mlp_bda=get_bias_dropout_add, - ), - ) + ) + else: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + # TENorm significantly harms convergence when used + # for QKLayerNorm; we instead use the Apex implementation. + q_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, + k_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) def get_gpt_layer_local_spec( num_experts: Optional[int] = None, moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, + multi_latent_attention: Optional[bool] = False, ) -> ModuleSpec: """Use this spec for an implementation using only modules in Megatron-Core. 
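The spec helpers above now branch on a `multi_latent_attention` flag, returning either the standard self-attention layer spec or the MLA layer spec. A hedged usage sketch follows, based only on the signatures shown in this diff and assuming Megatron-Core (plus Transformer Engine and Apex for the TE-based spec) is installed; it is not a prescribed workflow.

from megatron.core.models.gpt.gpt_layer_specs import (
    get_gpt_layer_local_spec,
    get_gpt_layer_with_transformer_engine_spec,
)

# Standard self-attention layer spec built from Transformer Engine modules.
dense_spec = get_gpt_layer_with_transformer_engine_spec(
    num_experts=None, moe_grouped_gemm=False, qk_layernorm=True, multi_latent_attention=False
)

# Multi-latent attention layer spec built from Megatron-Core-only modules.
mla_spec = get_gpt_layer_local_spec(
    num_experts=None, moe_grouped_gemm=False, qk_layernorm=True, multi_latent_attention=True
)

print(type(dense_spec).__name__, type(mla_spec).__name__)  # ModuleSpec ModuleSpec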
@@ -109,31 +143,58 @@ def get_gpt_layer_local_spec( mlp = _get_mlp_module_spec( use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm ) - return ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - input_layernorm=LNImpl, - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.causal}, - submodules=SelfAttentionSubmodules( - linear_qkv=ColumnParallelLinear, - core_attention=DotProductAttention, - linear_proj=RowParallelLinear, - q_layernorm=LNImpl if qk_layernorm else IdentityOp, - k_layernorm=LNImpl if qk_layernorm else IdentityOp, + if multi_latent_attention: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=MLASelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=MLASelfAttentionSubmodules( + linear_q_proj=ColumnParallelLinear, + linear_q_down_proj=ColumnParallelLinear, + linear_q_up_proj=ColumnParallelLinear, + linear_kv_down_proj=ColumnParallelLinear, + linear_kv_up_proj=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=LNImpl if qk_layernorm else IdentityOp, + kv_layernorm=LNImpl if qk_layernorm else IdentityOp, + ), ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl if num_experts else IdentityOp, + input_layernorm=LNImpl if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, ), - self_attn_bda=get_bias_dropout_add, - pre_mlp_layernorm=LNImpl, - mlp=mlp, - mlp_bda=get_bias_dropout_add, - sharded_state_dict_keys_map={ - 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', - 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', - }, - ), - ) + ) + else: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=LNImpl if qk_layernorm else IdentityOp, + k_layernorm=LNImpl if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ) def _get_mlp_module_spec( diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 20f83976c4..bd52f89680 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -69,6 +69,7 @@ def __init__( position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute', rotary_percent: float = 1.0, rotary_base: int = 10000, + rope_scaling: bool = False, seq_len_interpolation_factor: Optional[float] = None, ) -> None: super().__init__(config=config) @@ -90,9 +91,11 @@ def __init__( # TODO: remove this dependency ? self.model_type = ModelType.encoder_or_decoder - # These 2 attributes are needed for TensorRT-LLM export. + # These 4 attributes are needed for TensorRT-LLM export. 
self.max_position_embeddings = max_sequence_length self.rotary_percent = rotary_percent + self.rotary_base = rotary_base + self.rotary_scaling = rope_scaling if self.pre_process: self.embedding = LanguageModelEmbedding( @@ -102,13 +105,14 @@ def __init__( position_embedding_type=position_embedding_type, ) - if self.position_embedding_type == 'rope': + if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention: self.rotary_pos_emb = RotaryEmbedding( kv_channels=self.config.kv_channels, rotary_percent=rotary_percent, rotary_interleaved=self.config.rotary_interleaved, seq_len_interpolation_factor=seq_len_interpolation_factor, rotary_base=rotary_base, + rope_scaling=rope_scaling, use_cpu_initialization=self.config.use_cpu_initialization, ) @@ -185,12 +189,17 @@ def forward( inference_params: InferenceParams = None, packed_seq_params: PackedSeqParams = None, extra_block_kwargs: dict = None, + runtime_gather_output: Optional[bool] = None, ) -> Tensor: """Forward function of the GPT Model This function passes the input tensors through the embedding layer, and then the decoeder and finally into the post processing layer (optional). It either returns the Loss values if labels are given or the final hidden units + + Args: + runtime_gather_output (bool): Gather output at runtime. Default None means + `parallel_output` arg in the constructor will be used. """ # If decoder_input is provided (not None), then input_ids and position_ids are ignored. # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. @@ -207,7 +216,7 @@ def forward( # Rotary positional embeddings (embedding is None for PP intermediate devices) rotary_pos_emb = None - if self.position_embedding_type == 'rope': + if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention: rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( inference_params, self.decoder, decoder_input, self.config ) @@ -230,7 +239,9 @@ def forward( output_weight = None if self.share_embeddings_and_output_weights: output_weight = self.shared_embedding_or_output_weight() - logits, _ = self.output_layer(hidden_states, weight=output_weight) + logits, _ = self.output_layer( + hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output + ) if has_config_logger_enabled(self.config): payload = OrderedDict( diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py index a8ddc94ced..074cfaae93 100644 --- a/megatron/core/models/multimodal/llava_model.py +++ b/megatron/core/models/multimodal/llava_model.py @@ -76,6 +76,7 @@ def __init__( img_w: int = 336, patch_dim: int = 14, language_rotary_base: int = 10000, + language_rope_scaling: bool = False, ) -> None: super().__init__(config=language_transformer_config) @@ -112,6 +113,7 @@ def __init__( pre_process=self.pre_process, post_process=self.post_process, rotary_base=language_rotary_base, + rope_scaling=language_rope_scaling, ) self.share_embeddings_and_output_weights = ( self.language_model.share_embeddings_and_output_weights @@ -123,6 +125,16 @@ def __init__( class_token_len = 1 if self.add_encoder: + self._drop_vision_class_token = drop_vision_class_token + add_class_token = True + if vision_transformer_config.vision_model_type == "siglip": + class_token_len = 0 + add_class_token = False + error_msg = ( + "Siglip does not support vision class token, " + "set disable-vision-class-token to False." 
+ ) + assert not self._drop_vision_class_token, error_msg self.vision_model = CLIPViTModel( vision_transformer_config, vision_transformer_layer_spec, @@ -130,8 +142,9 @@ def __init__( img_w=img_w, class_token_len=class_token_len, patch_dim=patch_dim, + model_subtype=vision_transformer_config.vision_model_type, + add_class_token=add_class_token, ) - self._drop_vision_class_token = drop_vision_class_token # Map (intermediate) vision model outputs to the language model input dimension. self.vision_projection = MultimodalProjector( vision_projection_config, @@ -153,7 +166,12 @@ def __init__( ) self._img_seq_len = get_num_image_embeddings( - img_h, img_w, patch_dim, drop_vision_class_token, class_token_len + img_h, + img_w, + patch_dim, + vision_transformer_config.vision_model_type, + drop_vision_class_token, + class_token_len, ) def shared_embedding_or_output_weight(self): @@ -351,7 +369,9 @@ def _preprocess_data( ] # Put image embeddings to image positions. - final_embedding[images_mask] = image_embeddings.reshape(-1, embed_dim).contiguous() + final_embedding[images_mask] = ( + image_embeddings.permute(1, 0, 2).reshape(-1, embed_dim).contiguous() + ) # Create the final labels and loss mask (if this is the last language model stage). final_labels, final_loss_mask = None, None @@ -429,6 +449,7 @@ def forward( inference_params: Optional[InferenceParams] = None, num_image_tiles: Optional[List[int]] = None, image_token_index: Optional[int] = IMAGE_TOKEN_INDEX, + runtime_gather_output: Optional[bool] = None, ) -> torch.Tensor: """Forward function of the LLaVA model. @@ -445,6 +466,8 @@ def forward( inference_params (InferenceParams): Inference-time parameters including KV cache. num_image_tiles (list of int): Number of tiles per image. Default 1 tile per image. image_token_index (int): ID for input images. + runtime_gather_output (bool): Gather output at runtime. Default None means + `parallel_output` arg in the constructor will be used. Returns: output (torch.Tensor): Loss of shape [b, s] if labels are provided, @@ -463,7 +486,9 @@ def forward( image_embeddings = None elif self.add_encoder and not has_images: # If no images provided, use an empty image embeddings tensor. - image_embeddings = torch.tensor([], dtype=images.dtype, device=images.device) + image_embeddings = torch.tensor([], dtype=images.dtype, device=images.device).reshape( + 0, 0, 0 + ) elif self.add_encoder and has_images: image_embeddings = self.vision_model(images) # [num_tiles, img_seq_len, h_vision] if self._drop_vision_class_token: @@ -528,6 +553,7 @@ def forward( decoder_input=combined_embeddings, labels=new_labels, inference_params=inference_params, + runtime_gather_output=runtime_gather_output, ) if labels is None or loss_mask is None: diff --git a/megatron/core/models/retro/config.py b/megatron/core/models/retro/config.py index f9ed05f470..d4b5c9684b 100644 --- a/megatron/core/models/retro/config.py +++ b/megatron/core/models/retro/config.py @@ -4,11 +4,9 @@ import os from dataclasses import dataclass -from importlib.metadata import version - -from pkg_resources import packaging from megatron.core.transformer import TransformerConfig +from megatron.core.utils import is_te_min_version @dataclass @@ -65,8 +63,7 @@ def __post_init__(self) -> None: super().__post_init__() # Validate Transformer Engine version. 
- te_version = packaging.version.Version(version("transformer-engine")) - if te_version >= packaging.version.Version("1.3"): + if is_te_min_version("1.3"): try: assert os.getenv("NVTE_FLASH_ATTN") == "0" assert os.getenv("NVTE_FUSED_ATTN") == "0" diff --git a/megatron/core/models/vision/clip_vit_model.py b/megatron/core/models/vision/clip_vit_model.py index 110a8687f7..53c3feddee 100644 --- a/megatron/core/models/vision/clip_vit_model.py +++ b/megatron/core/models/vision/clip_vit_model.py @@ -33,12 +33,22 @@ def __init__( transformer_config: TransformerConfig, transformer_layer_spec: ModuleSpec, ln_pre_impl: Union[ModuleSpec, type] = TENorm, + ln_post_impl: Union[ModuleSpec, type] = TENorm, add_class_token: bool = True, class_token_len: int = 1, patch_dim: int = 14, img_h: int = 336, img_w: int = 336, + model_subtype: str = "clip", ) -> None: + + error_msg = f"CLIPViTModel model subtype {model_subtype} is not supported." + assert model_subtype in ["clip", "siglip"], error_msg + + if model_subtype == "siglip": + assert class_token_len == 0, "SigLIP does not support class tokens." + assert not add_class_token, "SigLIP does not support class tokens." + super().__init__(config=transformer_config) if has_config_logger_enabled(transformer_config): @@ -61,12 +71,34 @@ def __init__( self.seq_length = self.num_patches + (self.class_token_len if self.add_class_token else 0) + self.ln_pre = None + self.ln_post = None + if model_subtype == "clip": + self.ln_pre = build_module( + ln_pre_impl, + config=transformer_config, + hidden_size=self.visual_hidden_size, + eps=transformer_config.layernorm_epsilon, + ) + conv_bias = False + padding = 0 + if model_subtype == "siglip": + self.ln_post = build_module( + ln_post_impl, + config=transformer_config, + hidden_size=self.visual_hidden_size, + eps=transformer_config.layernorm_epsilon, + ) + conv_bias = True + padding = "valid" + self.conv1 = torch.nn.Conv2d( in_channels=3, out_channels=self.visual_hidden_size, kernel_size=self.patch_dim, stride=self.patch_dim, - bias=False, + bias=conv_bias, + padding=padding, ) self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda() @@ -79,13 +111,6 @@ def __init__( torch.randn(1, self.class_token_len, self.visual_hidden_size) ) - self.ln_pre = build_module( - ln_pre_impl, - config=transformer_config, - hidden_size=self.visual_hidden_size, - eps=transformer_config.layernorm_epsilon, - ) - self.model_type = ModelType.encoder_or_decoder # Transformer layers. @@ -134,7 +159,8 @@ def forward( assert x.shape[1] == self.seq_length, f"{x.shape[1]} != {self.seq_length}" x = x + self.position_embeddings(self.position_ids) - x = self.ln_pre(x) + if self.ln_pre: + x = self.ln_pre(x) x = x.permute(1, 0, 2) # [b, s, h] -> [s, b, h] # `permute` can make the tensor non-contiguous, breaking pipelining. 
x = x.contiguous() @@ -142,17 +168,23 @@ def forward( x = self.decoder(x, attention_mask) x = x.permute(1, 0, 2) # [s, b, h] -> [b, s, h] x = x.contiguous() - + if self.ln_post: + x = self.ln_post(x) return x -def get_num_image_embeddings(img_h, img_w, patch_dim, disable_vision_class_token, class_token_len): +def get_num_image_embeddings( + img_h, img_w, patch_dim, vision_model_type, disable_vision_class_token, class_token_len +): """Get the number of image embeddings per image tile.""" - add_class_token = not disable_vision_class_token + if vision_model_type == "siglip": + keep_class_token = False + elif vision_model_type == "clip": + keep_class_token = not disable_vision_class_token num_patches_per_dim_h = img_h // patch_dim num_patches_per_dim_w = img_w // patch_dim num_patches = num_patches_per_dim_h * num_patches_per_dim_w - num_image_embeddings_per_tile = num_patches + (class_token_len if add_class_token else 0) + num_image_embeddings_per_tile = num_patches + (class_token_len if keep_class_token else 0) return num_image_embeddings_per_tile diff --git a/megatron/core/num_microbatches_calculator.py b/megatron/core/num_microbatches_calculator.py index 16bd95a7b4..5850e512ca 100644 --- a/megatron/core/num_microbatches_calculator.py +++ b/megatron/core/num_microbatches_calculator.py @@ -320,6 +320,8 @@ def __init__( if rank == 0: logger.info( f'decreasing batch size from {global_batch_size} to {running_global_batch_size}' + f'to keep divisiblity by micro_batch_size={micro_batch_size} * ' + f'data_parallel_size={data_parallel_size}' ) self.num_micro_batches = ( running_global_batch_size // micro_batch_times_data_parallel_size @@ -424,7 +426,7 @@ def __init__( self.rampup_samples_per_increment = self.ramup_samples / num_increments # Initialize number of microbatches. - self.update(0, False) + self.update(0, consistency_check=False, verbose=True) def update(self, consumed_samples: int, consistency_check: bool, verbose: bool = False) -> None: """Update number of microbatches. @@ -450,10 +452,13 @@ def update(self, consumed_samples: int, consistency_check: bool, verbose: bool = if old_current_global_batch_size != self.current_global_batch_size: global_batch_size_changed = True if self.rank == 0 and global_batch_size_changed and verbose: - logger.info( - f'ramping up batch size from {old_current_global_batch_size} to ' - f'{self.current_global_batch_size}' - ) + if old_current_global_batch_size is None: + logger.info(f'setting initial batch size to {self.current_global_batch_size}') + else: + logger.info( + f'ramping up batch size from {old_current_global_batch_size} to ' + f'{self.current_global_batch_size}' + ) # Check consistency of the current global batch size. 
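The updated log messages above describe decreasing the global batch size so it stays divisible by micro_batch_size * data_parallel_size. A small sketch of that rounding rule; `round_down_global_batch_size` is an illustrative helper, not a Megatron-Core function.

def round_down_global_batch_size(
    global_batch_size: int, micro_batch_size: int, data_parallel_size: int
):
    # Round the running global batch size down to the nearest multiple of
    # micro_batch_size * data_parallel_size, then derive the micro-batch count.
    micro_times_dp = micro_batch_size * data_parallel_size
    running_global_batch_size = (global_batch_size // micro_times_dp) * micro_times_dp
    num_micro_batches = running_global_batch_size // micro_times_dp
    return running_global_batch_size, num_micro_batches


# 1250 is not divisible by 16 * 8 = 128, so it is decreased to 1152 (= 9 * 128).
print(round_down_global_batch_size(1250, micro_batch_size=16, data_parallel_size=8))  # (1152, 9)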
if consistency_check and not self.decrease_batch_size_if_needed: @@ -477,7 +482,9 @@ def update(self, consumed_samples: int, consistency_check: bool, verbose: bool = if self.rank == 0 and global_batch_size_changed and verbose: logger.info( f'decreasing batch size from {self.current_global_batch_size} to ' - f'{self.current_running_global_batch_size}' + f'{self.current_running_global_batch_size} to keep divisiblity by ' + f'micro_batch_size={self.micro_batch_size} * ' + f'data_parallel_size={self.data_parallel_size}' ) assert ( self.current_running_global_batch_size % self.micro_batch_times_data_parallel_size diff --git a/megatron/core/package_info.py b/megatron/core/package_info.py index bc385ad268..6135dc52c8 100644 --- a/megatron/core/package_info.py +++ b/megatron/core/package_info.py @@ -2,7 +2,7 @@ MAJOR = 0 -MINOR = 9 +MINOR = 10 PATCH = 0 PRE_RELEASE = 'rc0' diff --git a/megatron/core/packed_seq_params.py b/megatron/core/packed_seq_params.py index fe63e13e99..dff0cc5992 100644 --- a/megatron/core/packed_seq_params.py +++ b/megatron/core/packed_seq_params.py @@ -6,9 +6,15 @@ @dataclass class PackedSeqParams: - # parameters to TEDotProductAttention and fused rope kernels for the `thd` (packed) sequence format, + ''' + parameters to TEDotProductAttention and fused rope kernels for the + `thd` (packed) sequence format + ''' + qkv_format: str = None cu_seqlens_q: Tensor = None cu_seqlens_kv: Tensor = None + cu_seqlens_q_padded: Tensor = None + cu_seqlens_kv_padded: Tensor = None max_seqlen_q: Tensor = None max_seqlen_kv: Tensor = None diff --git a/megatron/core/requirements.txt b/megatron/core/requirements.txt index 08ed5eeb4b..a03ef133e7 100644 --- a/megatron/core/requirements.txt +++ b/megatron/core/requirements.txt @@ -1 +1,2 @@ -torch \ No newline at end of file +torch +packaging diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index ff0be00bb8..903b4ed873 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -69,6 +69,8 @@ def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride): def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor): + """Set default model parallel attributes if not set explicitly already.""" + def maybe_set(attribute, value): if not hasattr(tensor, attribute): setattr(tensor, attribute, value) @@ -78,6 +80,8 @@ def maybe_set(attribute, value): def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor): + """Copy model parallel attributes from one tensor to another.""" + def maybe_copy(attribute): if hasattr(source_tensor, attribute): setattr(destination_tensor, attribute, getattr(source_tensor, attribute)) @@ -116,21 +120,22 @@ def _initialize_affine_weight_cpu( params_dtype=torch.float32, rank=None, world_size=None, + skip_set_tensor_parallel_attributes=False, ): """Initialize affine weight for model parallel. 
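The PackedSeqParams change above adds padded cumulative-sequence-length fields for the `thd` packed format. A rough sketch of how a caller might populate them, assuming megatron-core with this patch is importable; the sequence lengths below are made up.

```python
import torch
from megatron.core.packed_seq_params import PackedSeqParams

# Three packed sequences of lengths 5, 3 and 8, each padded to a multiple of 4.
cu_seqlens = torch.tensor([0, 5, 8, 16], dtype=torch.int32)
cu_seqlens_padded = torch.tensor([0, 8, 12, 20], dtype=torch.int32)

packed_seq_params = PackedSeqParams(
    qkv_format="thd",
    cu_seqlens_q=cu_seqlens,
    cu_seqlens_kv=cu_seqlens,
    cu_seqlens_q_padded=cu_seqlens_padded,
    cu_seqlens_kv_padded=cu_seqlens_padded,
    max_seqlen_q=torch.tensor(8),
    max_seqlen_kv=torch.tensor(8),
)
```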
Build the master weight on all processes and scatter the relevant chunk.""" - set_tensor_model_parallel_attributes( - tensor=weight, is_parallel=True, dim=partition_dim, stride=stride - ) + if not skip_set_tensor_parallel_attributes: + set_tensor_model_parallel_attributes( + tensor=weight, is_parallel=True, dim=partition_dim, stride=stride + ) # Initialize master weight master_weight = torch.empty(output_size, input_size, dtype=torch.float, requires_grad=False) init_method(master_weight) master_weight = master_weight.to(dtype=params_dtype) - # Split and copy per_partition_per_stride_size = divide(per_partition_size, stride) weight_list = torch.split(master_weight, per_partition_per_stride_size, dim=partition_dim) @@ -219,6 +224,11 @@ def __init__( _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1) def forward(self, input_): + """Forward. + + Args: + input_ (torch.Tensor): Input tensor. + """ if self.tensor_model_parallel_size > 1: # Build the mask. input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index) @@ -278,6 +288,7 @@ class LinearWithFrozenWeight(torch.autograd.Function): @staticmethod @custom_fwd def forward(ctx, input, weight, bias, allreduce_dgrad): + """Forward with frozen weight.""" ctx.save_for_backward(weight) ctx.allreduce_dgrad = allreduce_dgrad output = torch.matmul(input, weight.t()) @@ -288,6 +299,7 @@ def forward(ctx, input, weight, bias, allreduce_dgrad): @staticmethod @custom_bwd def backward(ctx, grad_output): + """Backward with frozen weight.""" (weight,) = ctx.saved_tensors grad_input = grad_output.matmul(weight) @@ -389,6 +401,7 @@ def forward( grad_output_buffer, wgrad_deferral_limit, ): + """Forward.""" ctx.save_for_backward(input, weight) ctx.use_bias = bias is not None ctx.gradient_accumulation_fusion = gradient_accumulation_fusion @@ -418,6 +431,7 @@ def forward( @staticmethod @custom_bwd def backward(ctx, grad_output): + """Backward.""" input, weight = ctx.saved_tensors use_bias = ctx.use_bias grad_output_buffer = ctx.grad_output_buffer @@ -847,7 +861,12 @@ def __init__( ) ) - def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): + def forward( + self, + input_: torch.Tensor, + weight: Optional[torch.Tensor] = None, + runtime_gather_output: Optional[bool] = None, + ): """Forward of ColumnParallelLinear Args: @@ -855,6 +874,8 @@ def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): 3D tensor whose order of dimension is [sequence, batch, hidden] weight (optional): weight tensor to use, compulsory when skip_weight_param_allocation is True. + runtime_gather_output (bool): Gather output at runtime. Default None means + `gather_output` arg in the constructor will be used. Returns: - output @@ -927,7 +948,13 @@ def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): ), allreduce_dgrad=allreduce_dgrad, ) - if self.gather_output: + + gather_output = self.gather_output + # Use the runtime gather output if it's set explicitly. + if runtime_gather_output is not None: + gather_output = runtime_gather_output + + if gather_output: # All-gather across the partitions. 
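The new runtime_gather_output argument lets a caller override the constructor-time gather_output on a per-call basis. The tiny sketch below captures the resolution rule (a hypothetical helper, not patch code):

```python
from typing import Optional

def resolve_gather_output(ctor_gather_output: bool,
                          runtime_gather_output: Optional[bool] = None) -> bool:
    # The runtime value wins only when it is set explicitly.
    return ctor_gather_output if runtime_gather_output is None else runtime_gather_output

assert resolve_gather_output(True) is True
assert resolve_gather_output(True, runtime_gather_output=False) is False
```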
assert not self.sequence_parallel output = gather_from_tensor_model_parallel_region(output_parallel) diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py index 3724f81648..4b144d4163 100644 --- a/megatron/core/tensor_parallel/random.py +++ b/megatron/core/tensor_parallel/random.py @@ -16,7 +16,7 @@ get_expert_model_parallel_rank, get_tensor_model_parallel_rank, ) -from megatron.core.utils import safely_set_viewless_tensor_data +from megatron.core.utils import is_te_min_version, safely_set_viewless_tensor_data from .utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks @@ -175,6 +175,8 @@ def initialize_rng_tracker(use_te_rng_tracker: bool = False): return if use_te_rng_tracker: + if not is_te_min_version("1.5.0"): + raise RuntimeError("use_te_rng_tracker requires TransformerEngine version >= 1.5") from megatron.core.extensions.transformer_engine import TECudaRNGStatesTracker _CUDA_RNG_STATE_TRACKER = TECudaRNGStatesTracker() diff --git a/megatron/core/transformer/__init__.py b/megatron/core/transformer/__init__.py index 7cc10776b7..0e3cdcfa57 100644 --- a/megatron/core/transformer/__init__.py +++ b/megatron/core/transformer/__init__.py @@ -2,5 +2,5 @@ from .module import MegatronModule from .spec_utils import ModuleSpec, build_module -from .transformer_config import TransformerConfig +from .transformer_config import MLATransformerConfig, TransformerConfig from .transformer_layer import TransformerLayer, TransformerLayerSubmodules diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py index 6f81787f67..850dec88e1 100644 --- a/megatron/core/transformer/attention.py +++ b/megatron/core/transformer/attention.py @@ -6,7 +6,7 @@ import torch from megatron.core import parallel_state, tensor_parallel -from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.models.common.embeddings import apply_rotary_pos_emb from megatron.core.parallel_state import ( get_data_parallel_group, get_data_parallel_rank, @@ -146,14 +146,14 @@ def custom_forward(*inputs): return hidden_states - def _allocate_memory(self, inference_max_sequence_length, batch_size, dtype): + def _allocate_memory(self, inference_max_sequence_length, batch_size, dim, dtype): """Allocate memory to store kv cache during inference.""" return torch.empty( inference_max_sequence_length, batch_size, self.num_query_groups_per_partition, - self.hidden_size_per_attention_head, + dim, dtype=dtype, device=torch.cuda.current_device(), ) @@ -178,10 +178,10 @@ def _adjust_key_value_for_inference(self, inference_params, key, value, rotary_p inf_max_seq_length = inference_params.max_sequence_length inf_max_batch_size = inference_params.max_batch_size inference_key_memory = self._allocate_memory( - inf_max_seq_length, inf_max_batch_size, key.dtype + inf_max_seq_length, inf_max_batch_size, key.shape[-1], key.dtype ) inference_value_memory = self._allocate_memory( - inf_max_seq_length, inf_max_batch_size, value.dtype + inf_max_seq_length, inf_max_batch_size, value.shape[-1], value.dtype ) inference_params.key_value_memory_dict[self.layer_number] = ( inference_key_memory, diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py index a60a22c0f3..2588980b5b 100644 --- a/megatron/core/transformer/cuda_graphs.py +++ b/megatron/core/transformer/cuda_graphs.py @@ -6,6 +6,8 @@ import torch +from megatron.core.transformer.module import MegatronModule + try: from 
transformer_engine.pytorch import make_graphed_callables from transformer_engine.pytorch.fp8 import FP8GlobalStateManager @@ -233,11 +235,16 @@ def __call__(self, megatron_module, args, kwargs): break if runner is None: - runner = self.create_cudagraph_module(megatron_module, args, kwargs) - self.cudagraph_runners.append(runner) - logging.getLogger(__name__).info( - f"Creating cudagraph; now have {len(self.cudagraph_runners)}" - ) + if self.training and torch.is_grad_enabled(): + runner = self.create_cudagraph_module(megatron_module, args, kwargs) + self.cudagraph_runners.append(runner) + logging.getLogger(__name__).info( + f"Creating cudagraph; now have {len(self.cudagraph_runners)}" + ) + else: + # No cudagraphs were found in inference mode, so fallback to eager since + # tensor.requires_grad is needed to correctly trace the backward graph. + return super(MegatronModule, megatron_module).__call__(*args, **kwargs) tensor_args, tensor_kwargs = self.get_tensor_args(args, kwargs) out = runner(tensor_args, tensor_kwargs, is_first_microbatch=self.is_first_microbatch) diff --git a/megatron/core/transformer/dot_product_attention.py b/megatron/core/transformer/dot_product_attention.py index bbac3fa4a2..d5c014cabf 100644 --- a/megatron/core/transformer/dot_product_attention.py +++ b/megatron/core/transformer/dot_product_attention.py @@ -40,6 +40,7 @@ def __init__( attn_mask_type: AttnMaskType, attention_type: str, attention_dropout: float = None, + softmax_scale: float = None, ): super().__init__(config=config) @@ -67,10 +68,14 @@ def __init__( self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size) coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if softmax_scale is None: + self.softmax_scale = 1.0 / math.sqrt(self.hidden_size_per_attention_head) + else: + self.softmax_scale = softmax_scale + if self.config.apply_query_key_layer_scaling: coeff = self.layer_number - self.norm_factor *= coeff + self.softmax_scale /= coeff self.scale_mask_softmax = FusedScaleMaskSoftmax( input_in_fp16=self.config.fp16, @@ -143,7 +148,7 @@ def forward( query.transpose(0, 1), # [b * np, sq, hn] key.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] beta=0.0, - alpha=(1.0 / self.norm_factor), + alpha=self.softmax_scale, ) # change view to [b, np, sq, sk] diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py index ee4bb690b7..02a2cccca5 100644 --- a/megatron/core/transformer/moe/moe_utils.py +++ b/megatron/core/transformer/moe/moe_utils.py @@ -327,6 +327,7 @@ def topk_softmax_with_capacity( pad_to_capacity: bool = False, drop_policy: str = "probs", use_pre_softmax: bool = False, + deterministic_mode: bool = False, ): """Apply capacity and padding to the top-k selection. 
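The DotProductAttention change above replaces the implicit norm_factor with an explicit softmax_scale that callers (for example MLA) can override. A small mirror of that resolution logic, for illustration:

```python
import math
from typing import Optional

def resolve_softmax_scale(head_dim: int, layer_number: int,
                          apply_query_key_layer_scaling: bool,
                          softmax_scale: Optional[float] = None) -> float:
    scale = 1.0 / math.sqrt(head_dim) if softmax_scale is None else softmax_scale
    if apply_query_key_layer_scaling:
        # Matches the hunk above: the scale is divided by the layer number.
        scale /= layer_number
    return scale

print(resolve_softmax_scale(head_dim=128, layer_number=4, apply_query_key_layer_scaling=True))
```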
Args: @@ -366,7 +367,10 @@ def topk_softmax_with_capacity( if capacity_factor is None: # TopK without capacity - tokens_per_expert = torch.bincount(top_indices.view(-1), minlength=num_experts) + if deterministic_mode: + tokens_per_expert = torch.bincount(top_indices.view(-1), minlength=num_experts) + else: + tokens_per_expert = torch.histc(top_indices, bins=num_experts, min=0, max=num_experts) return probs, top_indices, tokens_per_expert else: # TopK with capacity diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py index 8894dc1df3..3e85ec53c5 100644 --- a/megatron/core/transformer/moe/router.py +++ b/megatron/core/transformer/moe/router.py @@ -74,7 +74,8 @@ def routing(self, logits: torch.Tensor): logits (torch.Tensor): Logits tensor. Returns: - Tuple[torch.Tensor, torch.Tensor]: Tuple of tensors representing max probs and the indices. + Tuple[torch.Tensor, torch.Tensor]: + Tuple of tensors representing max probs and the indices. """ raise NotImplementedError("Routing function not implemented.") @@ -155,6 +156,7 @@ def aux_loss_load_balancing(self, logits: torch.Tensor): pad_to_capacity=self.config.moe_pad_expert_input_to_capacity, drop_policy=self.config.moe_token_drop_policy, use_pre_softmax=self.config.moe_router_pre_softmax, + deterministic_mode=self.config.deterministic_mode, ) if self.training: @@ -172,8 +174,10 @@ def apply_load_balancing_loss( """Applies auxiliary loss to the MoE layer. Args: - probs (torch.Tensor): The probs output by the router for each token. [num_tokens, num_experts] - num_local_tokens_per_expert (torch.Tensor): The number of tokens per expert. [num_experts] + probs (torch.Tensor): + The probs output by the router for each token. [num_tokens, num_experts] + num_local_tokens_per_expert (torch.Tensor): + The number of tokens per expert. [num_experts] activation (torch.Tensor): The activation tensor to attach the gradient function to. Returns: @@ -279,6 +283,7 @@ def routing(self, logits: torch.Tensor): pad_to_capacity=self.config.moe_pad_expert_input_to_capacity, drop_policy=self.config.moe_token_drop_policy, use_pre_softmax=self.config.moe_router_pre_softmax, + deterministic_mode=self.config.deterministic_mode, ) else: raise ValueError(f"Unsupported MoE routing type: {self.routing_type}") diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py index e23ea4ea0f..db1b1920fa 100644 --- a/megatron/core/transformer/moe/token_dispatcher.py +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -184,13 +184,23 @@ def token_permutation( self.global_local_map = None with torch.no_grad(): - tokens_per_expert = torch.bincount( - local_indices.view(-1), minlength=self.config.num_moe_experts - ) - if self.num_local_experts < self.config.num_moe_experts: - tokens_per_expert = tokens_per_expert[ - self.local_expert_indices[0] : self.local_expert_indices[-1] + 1 - ] + # The indices of local_indices that give its sorted order along dim 0. 
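The deterministic_mode switch above chooses between torch.bincount (deterministic) and torch.histc (faster) for the per-expert token counts. A quick check that the two paths agree on a toy routing; top_indices is cast to float because histc expects a floating-point input.

```python
import torch

num_experts = 4
top_indices = torch.tensor([0, 2, 2, 3, 1, 0, 2])

via_bincount = torch.bincount(top_indices.view(-1), minlength=num_experts)
via_histc = torch.histc(top_indices.float(), bins=num_experts, min=0, max=num_experts)

assert torch.equal(via_bincount, via_histc.long())
print(via_bincount)  # tensor([2, 1, 3, 1])
```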
+ self.indices = torch.argsort(local_indices, dim=0) + if self.config.deterministic_mode: + tokens_per_expert = torch.bincount( + local_indices.view(-1), minlength=self.config.num_moe_experts + ) + if self.num_local_experts < self.config.num_moe_experts: + tokens_per_expert = tokens_per_expert[ + self.local_expert_indices[0] : self.local_expert_indices[-1] + 1 + ] + else: + tokens_per_expert = torch.histc( + local_indices, + bins=self.num_local_experts, + min=self.local_expert_indices[0], + max=self.local_expert_indices[-1], + ) tokens_per_expert = tokens_per_expert.cpu().to(torch.long) # Stage2: permute the tokens locally so that they are grouped by their expert assignment @@ -382,7 +392,14 @@ def preprocess(self, indices: torch.Tensor) -> torch.Tensor: Returns: torch.Tensor: Tensor containing the number of tokens assigned to local expert. """ - num_local_tokens_per_expert = torch.bincount(indices.view(-1), minlength=self.num_experts) + if self.config.deterministic_mode: + num_local_tokens_per_expert = torch.bincount( + indices.view(-1), minlength=self.num_experts + ) + else: + num_local_tokens_per_expert = torch.histc( + indices, bins=self.num_experts, min=0, max=self.num_experts + ) # num_local_tokens_per_expert: [num_experts] tp_rank = parallel_state.get_tensor_model_parallel_rank() diff --git a/megatron/core/transformer/moe/upcycling_utils.py b/megatron/core/transformer/moe/upcycling_utils.py index 66fe86aee5..b905fc99be 100644 --- a/megatron/core/transformer/moe/upcycling_utils.py +++ b/megatron/core/transformer/moe/upcycling_utils.py @@ -56,7 +56,40 @@ def _covert_to_moe_state_dict(state_dict, moe_model): router_key = mlp_weight_key.replace('mlp.linear_fc1.weight', 'mlp.router.weight') new_state_dict[router_key] = moe_state_dict[router_key].data.data.clone() - if mlp.config.moe_grouped_gemm: + use_te_grouped_gemm = 'decoder.layers.0.mlp.experts.linear_fc1.weight0' in moe_state_dict + + if mlp.config.moe_grouped_gemm and use_te_grouped_gemm: + for mlp_weight_key in mlp_fc1_weight_keys: + weight_tensor = new_state_dict.pop(mlp_weight_key) + for expert_i in range(mlp.num_local_experts): + new_key = mlp_weight_key.replace( + 'mlp.linear_fc1.weight', f'mlp.experts.linear_fc1.weight{expert_i}' + ) + new_state_dict[new_key] = weight_tensor.clone() + + for mlp_weight_key in mlp_fc2_weight_keys: + weight_tensor = new_state_dict.pop(mlp_weight_key) + for expert_i in range(mlp.num_local_experts): + new_key = mlp_weight_key.replace( + 'mlp.linear_fc2.weight', f'mlp.experts.linear_fc2.weight{expert_i}' + ) + new_state_dict[new_key] = weight_tensor.clone() + + for extra_state_key in mlp_fc1_extra_state_keys: + new_state_dict.pop(extra_state_key) + new_key = extra_state_key.replace( + 'mlp.linear_fc1._extra_state', 'mlp.experts.linear_fc1._extra_state' + ) + new_state_dict[new_key] = None + + for extra_state_key in mlp_fc2_extra_state_keys: + new_state_dict.pop(extra_state_key) + new_key = extra_state_key.replace( + 'mlp.linear_fc2._extra_state', 'mlp.experts.linear_fc2._extra_state' + ) + new_state_dict[new_key] = None + + elif mlp.config.moe_grouped_gemm: for mlp_weight_key in mlp_fc1_weight_keys: weight_tensor = new_state_dict.pop(mlp_weight_key) shape = weight_tensor.shape @@ -76,6 +109,7 @@ def _covert_to_moe_state_dict(state_dict, moe_model): ) new_key = mlp_weight_key.replace('mlp.linear_fc2.weight', 'mlp.experts.weight2') new_state_dict[new_key] = weight_tensor + else: def covert_to_experts(keys): diff --git a/megatron/core/transformer/multi_latent_attention.py 
b/megatron/core/transformer/multi_latent_attention.py new file mode 100644 index 0000000000..d637e2b448 --- /dev/null +++ b/megatron/core/transformer/multi_latent_attention.py @@ -0,0 +1,375 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + + +import math +from dataclasses import dataclass +from typing import Union + +import torch + +from megatron.core import parallel_state +from megatron.core.models.common.embeddings import ( + YarnRotaryEmbedding, + _yarn_get_mscale, + apply_rotary_pos_emb, +) +from megatron.core.transformer.attention import Attention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import MLATransformerConfig + + +@dataclass +class MLASelfAttentionSubmodules: + """Submodules for the MLA self-attention layer.""" + + linear_q_proj: Union[ModuleSpec, type] = None + linear_q_down_proj: Union[ModuleSpec, type] = None + linear_q_up_proj: Union[ModuleSpec, type] = None + linear_kv_down_proj: Union[ModuleSpec, type] = None + linear_kv_up_proj: Union[ModuleSpec, type] = None + core_attention: Union[ModuleSpec, type] = None + linear_proj: Union[ModuleSpec, type] = None + q_layernorm: Union[ModuleSpec, type] = None + kv_layernorm: Union[ModuleSpec, type] = None + + +class MultiLatentAttention(Attention): + """Multi-Latent Attention layer abstract class. + + This layer only contains common modules required for the "self attn" and + "cross attn" specializations. + """ + + def __init__( + self, + config: MLATransformerConfig, + submodules: Union[MLASelfAttentionSubmodules], + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + ) -> None: + world_size = parallel_state.get_tensor_model_parallel_world_size() + assert ( + world_size == 1 + ), "MLA is not supported with Tensor Parallelism yet, \ + use Expert Parallelism and Pipeline Parallelism for better performance." + + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attention_type=attention_type, + attn_mask_type=attn_mask_type, + ) + + self.query_projection_size = self.config.v_head_dim * self.config.num_attention_heads + + self.q_head_dim = self.config.qk_head_dim + self.config.qk_pos_emb_head_dim + + mscale = _yarn_get_mscale(self.config.rotary_scaling_factor, self.config.mscale) + self.softmax_scale = mscale * mscale / math.sqrt(self.q_head_dim) + + self.rotary_pos_emb = YarnRotaryEmbedding( + self.config.qk_pos_emb_head_dim, + rotary_base=self.config.rotary_base, + scaling_factor=self.config.rotary_scaling_factor, + original_max_position_embeddings=self.config.max_position_embeddings, + beta_fast=self.config.beta_fast, + beta_slow=self.config.beta_slow, + mscale=self.config.mscale, + mscale_all_dim=self.config.mscale_all_dim, + ) + + self.core_attention = build_module( + submodules.core_attention, + config=self.config, + layer_number=self.layer_number, + attn_mask_type=self.attn_mask_type, + attention_type=self.attention_type, + softmax_scale=self.softmax_scale, + k_channels=self.q_head_dim, + v_channels=self.config.v_head_dim, + ) + + # Output. 
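For the softmax scale set up above, a worked example with the default MLATransformerConfig sizes from this patch (qk_head_dim=128, qk_pos_emb_head_dim=64, rotary_scaling_factor=40, mscale=0.707). The mscale formula below follows the DeepSeek-style YaRN convention and is an assumption here; _yarn_get_mscale in megatron-core is the authoritative version.

```python
import math

def yarn_get_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
    # Assumed DeepSeek-style formula; see _yarn_get_mscale for the real one.
    if scale <= 1.0:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

qk_head_dim, qk_pos_emb_head_dim = 128, 64
q_head_dim = qk_head_dim + qk_pos_emb_head_dim      # 192
m = yarn_get_mscale(scale=40, mscale=0.707)
softmax_scale = m * m / math.sqrt(q_head_dim)
print(q_head_dim, round(softmax_scale, 4))          # 192, ~0.1147
```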
+ self.linear_proj = build_module( + submodules.linear_proj, + self.query_projection_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='proj', + ) + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + position_ids=None, + ): + assert rotary_pos_emb is None, "Rotary position embeddings should not be passed into MLA." + + # hidden_states: [sq, b, h] + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. + # query: [96, 1, 16, 128], key:[96, 1, 16, 128], value:[96, 1, 16, 128] + query, key, value = self.get_query_key_value_tensors( + hidden_states, + key_value_states, + position_ids, + packed_seq_params, + inference_params=inference_params, + ) + + # =================================================== + # Adjust key, value for inference + # =================================================== + # rotary_pos_emb = None + key, value, _, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb=None + ) + + # ================================== + # core attention computation + # ================================== + # Need corresponding TE change + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, key, value, attention_mask, packed_seq_params=packed_seq_params + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + packed_seq_params=packed_seq_params, + attn_mask_type=attn_mask_type, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + # ================= + # Output. [sq, b, h] + # ================= + output, bias = self.linear_proj(core_attn_out) + + return output, bias + + +class MLASelfAttention(MultiLatentAttention): + """MLA Self-attention layer class + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. 
+ """ + + def __init__( + self, + config: MLATransformerConfig, + submodules: MLASelfAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + attention_type="self", + ) + + if self.config.q_lora_rank is None: + # Not projectiing query + self.linear_q_proj = build_module( + submodules.linear_q_proj, + self.config.hidden_size, + self.config.num_attention_heads * self.q_head_dim, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + else: + + self.linear_q_down_proj = build_module( + submodules.linear_q_down_proj, + self.config.hidden_size, + self.config.q_lora_rank, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_q_up_proj = build_module( + submodules.linear_q_up_proj, + self.config.q_lora_rank, + self.config.num_attention_heads * self.q_head_dim, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_kv_down_proj = build_module( + submodules.linear_kv_down_proj, + self.config.hidden_size, + self.config.kv_lora_rank + self.config.qk_pos_emb_head_dim, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_kv_up_proj = build_module( + submodules.linear_kv_up_proj, + self.config.kv_lora_rank, + self.config.num_attention_heads * (self.config.qk_head_dim + self.config.v_head_dim), + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + if self.config.q_lora_rank is not None: + self.q_layernorm = build_module( + submodules.q_layernorm, + hidden_size=self.config.q_lora_rank, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + + self.kv_layernorm = build_module( + submodules.kv_layernorm, + hidden_size=self.config.kv_lora_rank, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + + def get_query_key_value_tensors( + self, + hidden_states, + key_value_states=None, + position_ids=None, + packed_seq_params=None, + inference_params=None, + ): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. 
+ """ + # s = sequence length, b = batch size, h = hidden size, n = num attention heads + # Attention heads [s, b, n*h] + assert ( + hidden_states.ndim == 3 + ), f"hidden_states should be 3D, [s, b, n*h], got {hidden_states.ndim}D" + q_len, bsz, _ = hidden_states.size() + + if self.config.q_lora_rank is not None: + q_compressed, _ = self.linear_q_down_proj(hidden_states) + q_compressed = self.q_layernorm(q_compressed) + q, _ = self.linear_q_up_proj(q_compressed) + else: + # hidden_states:[s, b, 2048], q: [s, b, n * 192] + q, _ = self.linear_q_proj(hidden_states) + + # q: [s, b, n, 192] + q = q.view(q_len, bsz, self.num_attention_heads_per_partition, self.q_head_dim) + + # q: [s, b, n, 128], q_pos_emb: [s, b, n, 64] + q_no_pe, q_pos_emb = torch.split( + q, [self.config.qk_head_dim, self.config.qk_pos_emb_head_dim], dim=-1 + ) + + # kv_combined: [s, b, 576] + kv_combined, _ = self.linear_kv_down_proj(hidden_states) + + # kv_compressed:[s, b, 512], k_pos_emb: [s, b, 64] + kv_compressed, k_pos_emb = torch.split( + kv_combined, [self.config.kv_lora_rank, self.config.qk_pos_emb_head_dim], dim=-1 + ) + + # kv: [s, b, 2048] + kv, _ = self.linear_kv_up_proj(self.kv_layernorm(kv_compressed)) + + # kv: [s, b, n, 256] + kv = kv.view( + q_len, + bsz, + self.num_attention_heads_per_partition, + self.config.qk_head_dim + self.config.v_head_dim, + ) + + # k_no_pe: [s, b, n, 128], value: [s, b, n, 128] + k_no_pe, value = torch.split(kv, [self.config.qk_head_dim, self.config.v_head_dim], dim=-1) + + # rotary_pos_emb:[s, b, 1, 64] + rotary_pos_emb = self.rotary_pos_emb(max_seq_len=self.config.max_position_embeddings) + + if len(rotary_pos_emb) == 2: + mscale = rotary_pos_emb[1] + rotary_pos_emb = rotary_pos_emb[0] + + if inference_params is not None: + # add offset to the sequence start for inference + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + q_len + rotary_pos_emb = rotary_pos_emb[sequence_start:sequence_end] + + # [s, b, 64] -> [s, b, 1, 64] + k_pos_emb = torch.unsqueeze(k_pos_emb, 2) + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + + # q_pos_emb: [s, b, n, 64], k_pos_emb:[s, b, 1, 64] + q_pos_emb = apply_rotary_pos_emb( + q_pos_emb, rotary_pos_emb, config=self.config, cu_seqlens=cu_seqlens_q, mscale=mscale + ) + k_pos_emb = apply_rotary_pos_emb( + k_pos_emb, rotary_pos_emb, config=self.config, cu_seqlens=cu_seqlens_kv, mscale=mscale + ) + + # query: [s, b, n, 192] + query = torch.cat([q_no_pe, q_pos_emb], dim=-1) + + # key: [s, b, n, 192] + key = torch.cat([k_no_pe, k_pos_emb], dim=-1) + + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + + return query, key, value diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 46f6796909..3a88f1ab22 100755 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -2,10 +2,8 @@ from contextlib import nullcontext from dataclasses import dataclass -from importlib.metadata import version from typing import List, Optional, Union -import packaging import torch from torch import Tensor @@ -19,7 +17,7 @@ from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import BaseTransformerLayer from megatron.core.transformer.utils import sharded_state_dict_default -from megatron.core.utils 
import make_viewless_tensor +from megatron.core.utils import is_te_min_version, make_viewless_tensor try: from megatron.core.extensions.transformer_engine import ( @@ -375,10 +373,9 @@ def get_cuda_graph_optional_args( optional_inputs = {} optional_inputs['is_first_microbatch'] = self.current_microbatch == 0 try: - import transformer_engine.pytorch as te + import transformer_engine.pytorch as te # pylint: disable=unused-import - _te_version = packaging.version.Version(version("transformer-engine")) - if _te_version < packaging.version.Version("1.10.0"): + if is_te_min_version("1.10.0", check_equality=False): assert not any( [attention_mask, context, context_mask, rotary_pos_emb] ), "Keyword Arguments not supported with CUDA graph." diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py index f16a0117a3..a63171686a 100644 --- a/megatron/core/transformer/transformer_config.py +++ b/megatron/core/transformer/transformer_config.py @@ -1,14 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. from dataclasses import dataclass -from importlib.metadata import version from typing import Callable, Optional, Tuple import torch.nn.functional as F -from pkg_resources import packaging from ..model_parallel_config import ModelParallelConfig -from ..utils import init_method_normal, scaled_init_method_normal +from ..utils import get_te_version, init_method_normal, is_te_min_version, scaled_init_method_normal @dataclass @@ -112,6 +110,9 @@ class TransformerConfig(ModelParallelConfig): """Whether cross entropy loss is calculated over the actual number of non-padded tokens in the global batch, versus the default behavior of assuming all tokens are non-padded.""" + multi_latent_attention: bool = False + """Whether to use multi-latent attention.""" + #################### # initialization #################### @@ -262,7 +263,6 @@ class TransformerConfig(ModelParallelConfig): """When there are multiple experts per rank, compress multiple local (potentially small) gemms in a single kernel launch to improve the utilization and performance by leveraging the Grouped GEMM feature introduced since CUTLASS 2.8 (https://github.com/fanshiqing/grouped_gemm). - """ moe_aux_loss_coeff: float = 0 # 1e-2 would be a good start value for load balance loss. @@ -282,6 +282,7 @@ class TransformerConfig(ModelParallelConfig): moe_token_dispatcher_type: str = "allgather" """The type of token dispatcher to use. The default is 'allgather'. Options are 'allgather' and 'alltoall'.""" + moe_per_layer_logging: bool = False """Enable per-layer logging for MoE, currently supports auxiliary loss and z loss.""" @@ -504,12 +505,62 @@ def __post_init__(self): if self.num_moe_experts and self.fp8: # TE version below 1.7.0 will raise Error when handle zeros tokens for expert - te_version = packaging.version.Version(version("transformer-engine")) - if te_version < packaging.version.Version("1.7.0.dev0"): + if not is_te_min_version("1.7.0.dev0"): raise ValueError( "Only transformer-engine>=1.7.0 supports MoE FP8 training, " - f"but your version is {te_version}." + f"but your version is {get_te_version()}." ) if self.moe_grouped_gemm: raise ValueError("Grouped GEMM of MoE not support fp8 for now.") + + +@dataclass +class MLATransformerConfig(TransformerConfig): + """Configuration object for megatron-core Multi-Latent Attention (MLA) transformers. + + The initialization function has an argument for each parameter, including those in + ModelParallelConfig. 
Included YaRN RoPE parameters that is fused in MLA. + """ + + multi_latent_attention: bool = True + """Whether to use Multi-Latent Attention.""" + + q_lora_rank: int = 512 + """Rank of Query tensor's low rank representation.""" + + kv_lora_rank: int = 512 + """Rank of Key and Value tensors' low rank representation.""" + + qk_head_dim: int = 128 + """Dimension of the head in the QK projection. q_head_dim = qk_head_dim + qk_pos_emb_head_dim""" + + qk_pos_emb_head_dim: int = 64 + """Dimension of the position embedding in the QK projection.""" + + v_head_dim: int = 128 + """Dimension of the head in the V projection.""" + + rotary_base: float = 10000 + """Rotary base for the rotary embeddings.""" + + rotary_scaling_factor: float = 40 + """Rotary scaling factor for the rotary embeddings.""" + + normalization: str = "RMSNorm" + """Default normalization layer for MLA models is RMSNorm.""" + + max_position_embeddings: int = 163840 + """Maximum position embeddings for the original model.""" + + beta_fast: float = 32 + """Beta fast for YaRN RoPE.""" + + beta_slow: float = 1 + """Beta slow for YaRN RoPE.""" + + mscale: float = 0.707 + """Mscale for YaRN RoPE in Multi-Latent Attention.""" + + mscale_all_dim: float = 0.707 + """Mscale all dimensions for YaRN RoPE in Multi-Latent Attention.""" diff --git a/megatron/core/utils.py b/megatron/core/utils.py index b0de950ef6..f3910926ab 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -15,10 +15,12 @@ from dataclasses import dataclass from datetime import datetime from functools import reduce +from importlib.metadata import version from types import TracebackType from typing import Any, Dict, List, Optional, Tuple, Type, Union import torch +from packaging.version import Version as PkgVersion from megatron.core import parallel_state from megatron.core.dist_checkpointing.mapping import ShardedTensor @@ -26,6 +28,33 @@ logger = logging.getLogger(__name__) +_te_version = None + + +def get_te_version(): + """Get TE version from __version__; if not available use pip's. 
Use caching.""" + + def get_te_version_str(): + import transformer_engine as te + + if hasattr(te, '__version__'): + return str(te.__version__) + else: + return version("transformer-engine") + + global _te_version + if _te_version is None: + _te_version = PkgVersion(get_te_version_str()) + return _te_version + + +def is_te_min_version(version, check_equality=True): + """Check if minimum version of `transformer-engine` is installed.""" + if check_equality: + return get_te_version() >= PkgVersion(version) + return get_te_version() > PkgVersion(version) + + def ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator) diff --git a/megatron/inference/gpt/model_provider.py b/megatron/inference/gpt/model_provider.py index 2e92a96e9e..0df0168fa5 100644 --- a/megatron/inference/gpt/model_provider.py +++ b/megatron/inference/gpt/model_provider.py @@ -64,6 +64,7 @@ def model_provider(pre_process=True, post_process=True, parallel_output=True) -> "position_embedding_type": args.position_embedding_type, "rotary_percent": args.rotary_percent, "rotary_base": args.rotary_base, + "rope_scaling": args.use_rope_scaling, } model = model_type(**model_kwargs) diff --git a/megatron/legacy/model/rms_norm.py b/megatron/legacy/model/rms_norm.py index 7e4424c7b0..21ba00c600 100644 --- a/megatron/legacy/model/rms_norm.py +++ b/megatron/legacy/model/rms_norm.py @@ -8,7 +8,8 @@ class RMSNorm(torch.nn.Module): def __init__(self, dim: int, eps: float = 1e-6, - sequence_parallel: bool = False): + sequence_parallel: bool = False, + config: dict = None): """RMS Normaliation module Args: diff --git a/megatron/legacy/model/transformer.py b/megatron/legacy/model/transformer.py index 7d723df024..dda550551a 100644 --- a/megatron/legacy/model/transformer.py +++ b/megatron/legacy/model/transformer.py @@ -13,11 +13,11 @@ from megatron import core from megatron.core import mpu, tensor_parallel from megatron.core.enums import ModelType +from megatron.legacy.model.enums import AttnMaskType, LayerType, AttnType +from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.legacy.model.fused_bias_gelu import bias_gelu_impl +from megatron.core.models.common.embeddings import apply_rotary_pos_emb from megatron.core.jit import jit_fuser -from megatron.core.models.common.embeddings.rotary_pos_embedding import ( - RotaryEmbedding, - apply_rotary_pos_emb, -) from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.parallel_state import ( get_tensor_and_expert_parallel_group, @@ -1406,21 +1406,15 @@ def __init__(self, config, self.transformer_engine_v_0_8 = False if self.transformer_impl == 'transformer_engine': global transformer_engine - from importlib.metadata import version - import transformer_engine - from pkg_resources import packaging - te_version = packaging.version.Version(version("transformer-engine")) - if te_version >= packaging.version.Version("0.8.0"): + if core.utils.is_te_min_version("0.8.0"): self.transformer_engine_v_0_8 = True - if te_version >= packaging.version.Version("0.10.0"): + if core.utils.is_te_min_version("0.10.0"): self.transformer_engine_v_0_10 = True - if te_version >= packaging.version.Version("0.11.0"): + if core.utils.is_te_min_version("0.11.0"): self.transformer_engine_v_0_11 = True - del version, packaging - assert not args.squared_relu, ("TransformerEngine does not support squared " "relu activation.") 
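The new get_te_version()/is_te_min_version() helpers centralize the packaging-based comparison used throughout this patch. A standalone sketch of the comparison semantics, using packaging directly and without importing megatron:

```python
from packaging.version import Version

def is_min_version(installed: str, required: str, check_equality: bool = True) -> bool:
    if check_equality:
        return Version(installed) >= Version(required)
    return Version(installed) > Version(required)

assert is_min_version("1.10.0", "1.7.0.dev0")
assert not is_min_version("1.10.0", "1.10.0", check_equality=False)
```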
diff --git a/megatron/training/activations.py b/megatron/training/activations.py index fee84bddd0..c6ce9f1de1 100644 --- a/megatron/training/activations.py +++ b/megatron/training/activations.py @@ -16,3 +16,7 @@ def squared_relu(x: torch.Tensor) -> torch.Tensor: @jit_fuser def quick_gelu(x: torch.Tensor) -> torch.Tensor: return x * torch.sigmoid(1.702 * x) + +@jit_fuser +def fast_gelu(x: torch.Tensor) -> torch.Tensor: + return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 3dcfe4f2b2..e3d876a5f2 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -17,7 +17,7 @@ get_config_path as get_retro_config_path, get_gpt_data_dir as get_retro_data_dir, ) -from megatron.core.transformer import TransformerConfig +from megatron.core.transformer import TransformerConfig, MLATransformerConfig from megatron.training.activations import squared_relu from megatron.training.utils import update_use_dist_ckpt @@ -42,6 +42,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False): parser = _add_biencoder_args(parser) parser = _add_vision_args(parser) parser = _add_moe_args(parser) + parser = _add_mla_args(parser) parser = _add_logging_args(parser) parser = _add_straggler_detector_args(parser) parser = _add_inference_args(parser) @@ -288,7 +289,8 @@ def validate_args(args, defaults={}): # Overlap P2P communication is disabled if not using the interleaved schedule. args.overlap_p2p_comm = False args.align_param_gather = False - if args.rank == 0: + # Only print warning if PP size > 1. + if args.rank == 0 and args.pipeline_model_parallel_size > 1: print('WARNING: Setting args.overlap_p2p_comm and args.align_param_gather to False ' 'since non-interleaved schedule does not support overlapping p2p communication ' 'and aligned param AG') @@ -654,10 +656,13 @@ def _check_arg_is_not_none(args, arg): def core_transformer_config_from_args(args, config_class=None): - + # Config class. config_class = config_class or TransformerConfig + if args.multi_latent_attention: + config_class = MLATransformerConfig + # Translate args to core transformer configuration kw_args = {} for f in dataclasses.fields(config_class): @@ -842,6 +847,8 @@ def _add_network_size_args(parser): help='Use interleaved rotary embedding.') group.add_argument('--rotary-seq-len-interpolation-factor', type=int, default=None, help='Sequence length interpolation factor for rotary embeddings.') + group.add_argument('--use-rope-scaling', action='store_true', + help='Apply rope scaling as used in llama3.1') group.add_argument('--no-position-embedding', action='store_false', help='Disable position embedding. 
Deprecated: use --position-embedding-type', @@ -876,7 +883,9 @@ def _add_network_size_args(parser): help='Disable BERT binary head.', dest='bert_binary_head') group.add_argument('--untie-embeddings-and-output-weights', action='store_true', - help='Untie embeddings and output weights.'), + help='Untie embeddings and output weights.') + group.add_argument('--multi-latent-attention', action='store_true', + help='Use multi-latent attention for model.') return parser @@ -1151,6 +1160,9 @@ def _add_training_args(parser): group.add_argument('--disable-tp-comm-bulk-wgrad', action='store_false', help='Disables the Reduce-Scatter overlap with bprop weight gradient GEMM.', dest='tp_comm_bulk_wgrad') + group.add_argument('--tp-comm-bootstrap-backend', default='nccl', type=str, + choices=['nccl', 'mpi', 'gloo'], + help='Set the bootstrapping backend of Tensor parallel communications.') group.add_argument('--use-cpu-initialization', action='store_true', default=None, help='If set, initialize weights on the CPU. This eliminates init differences based on tensor parallelism.') @@ -1910,6 +1922,23 @@ def _add_moe_args(parser): return parser +def _add_mla_args(parser): + group = parser.add_argument_group(title="mla") + group.add_argument('--q-lora-rank', type=int, default=None, + help="Rank of Query tensor's low rank representation.") + group.add_argument('--kv-lora-rank', type=int, default=32, + help="Rank of Key and Value tensors' low rank representation.") + group.add_argument('--qk-head-dim', type=int, default=128, + help="Dimension of the head in the QK projection. q_head_dim = qk_head_dim + qk_pos_emb_head_dim") + group.add_argument('--qk-pos-emb-head-dim', type=int, default=64, + help="Dimension of the position embedding in the QK projection.") + group.add_argument('--v-head-dim', type=int, default=128, + help="Dimension of the head in the V projection.") + group.add_argument('--rotary-scaling-factor', type=float, default=1.0, + help="Rotary scaling factor for the rotary embeddings.") + + return parser + def _add_experimental_args(parser): group = parser.add_argument_group(title='experimental') diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py index cb4b7ace4d..3de49f6c57 100644 --- a/megatron/training/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -1131,7 +1131,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri 'consumed_train_samples', 0) args.skipped_train_samples = getattr(checkpoint_args, 'skipped_train_samples', 0) - update_num_microbatches(consumed_samples=args.consumed_train_samples) + update_num_microbatches(consumed_samples=args.consumed_train_samples, verbose=True) args.consumed_valid_samples = getattr(checkpoint_args, 'consumed_valid_samples', 0) else: diff --git a/megatron/training/initialize.py b/megatron/training/initialize.py index b2ef8a8f45..ad68ce8cb7 100644 --- a/megatron/training/initialize.py +++ b/megatron/training/initialize.py @@ -4,8 +4,6 @@ import logging import random import os -import packaging -import packaging.version import time import numpy as np @@ -24,6 +22,7 @@ from megatron.core.fusions.fused_bias_dropout import bias_dropout_add_fused_train from megatron.core.fusions.fused_bias_gelu import bias_gelu from megatron.core.fusions.fused_bias_swiglu import bias_swiglu +from megatron.core.utils import get_te_version, is_te_min_version logger = logging.getLogger(__name__) @@ -213,12 +212,21 @@ def _initialize_tp_communicators(): input_shape = [(args.seq_length * args.micro_batch_size) // 
args.context_parallel_size , args.hidden_size] - #We create a MPI process group, which is needed to bootstrap the pipelined - #tensor-model-parallel communication overlap - torch.distributed.new_group(backend='mpi') - - te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size, - use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs,) + if is_te_min_version("1.9.0"): + # The process group with the target bootstrap backend is created in Transformer Engine. + te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size, + use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs, + bootstrap_backend = args.tp_comm_bootstrap_backend) + else: + if args.tp_comm_bootstrap_backend != 'mpi': + warnings.warn( + f"Transformer Engine v{get_te_version()} supports only MPI bootstrap backend." + ) + # Create a MPI process group to help with TP communication overlap bootstrap. + torch.distributed.new_group(backend='mpi') + + te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size, + use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs) def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks): """Initialize torch.distributed and core model parallel.""" diff --git a/megatron/training/tokenizer/tokenizer.py b/megatron/training/tokenizer/tokenizer.py index 226ae1e799..af0d493f87 100644 --- a/megatron/training/tokenizer/tokenizer.py +++ b/megatron/training/tokenizer/tokenizer.py @@ -2,15 +2,14 @@ """Megatron tokenizers.""" -import math -from abc import ABC, abstractmethod import base64 import json +import math +import types +from abc import ABC, abstractmethod from pathlib import Path from typing import Dict, List, Optional -import types - from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer from .bert_tokenization import FullTokenizer as FullBertTokenizer @@ -20,27 +19,28 @@ def build_tokenizer(args, **kwargs): """Initialize tokenizer.""" if args.rank == 0: - print('> building {} tokenizer ...'.format(args.tokenizer_type), - flush=True) + print('> building {} tokenizer ...'.format(args.tokenizer_type), flush=True) # Select and instantiate the tokenizer. 
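The initialization change above picks the userbuffers bootstrap backend based on the Transformer Engine version: TE >= 1.9 accepts --tp-comm-bootstrap-backend directly, while older versions fall back to an MPI process group. A compact sketch of that decision (a hypothetical helper):

```python
def choose_bootstrap_backend(te_at_least_1_9: bool, requested: str) -> str:
    if te_at_least_1_9:
        return requested  # 'nccl', 'mpi' or 'gloo'
    if requested != "mpi":
        print(f"warning: this Transformer Engine only supports the MPI bootstrap, ignoring {requested!r}")
    return "mpi"

print(choose_bootstrap_backend(te_at_least_1_9=False, requested="nccl"))  # -> mpi
```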
if args.tokenizer_type == 'BertWordPieceLowerCase': assert args.vocab_file is not None - tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, - lower_case=True, - vocab_extra_ids=args.vocab_extra_ids) + tokenizer = _BertWordPieceTokenizer( + vocab_file=args.vocab_file, lower_case=True, vocab_extra_ids=args.vocab_extra_ids + ) elif args.tokenizer_type == 'BertWordPieceCase': assert args.vocab_file is not None - tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, - lower_case=False, - vocab_extra_ids=args.vocab_extra_ids) + tokenizer = _BertWordPieceTokenizer( + vocab_file=args.vocab_file, lower_case=False, vocab_extra_ids=args.vocab_extra_ids + ) elif args.tokenizer_type == 'GPT2BPETokenizer': assert args.vocab_file is not None assert args.merge_file is not None tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) elif args.tokenizer_type == 'SentencePieceTokenizer': assert args.tokenizer_model is not None - tokenizer = _SentencePieceTokenizer(args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids) + tokenizer = _SentencePieceTokenizer( + args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids + ) elif args.tokenizer_type == 'GPTSentencePieceTokenizer': assert args.tokenizer_model is not None tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) @@ -65,13 +65,11 @@ def build_tokenizer(args, **kwargs): assert args.vocab_size is not None tokenizer = _NullTokenizer(args.vocab_size) else: - raise NotImplementedError('{} tokenizer is not ' - 'implemented.'.format(args.tokenizer_type)) + raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type)) # Add vocab size (if not already set from a checkpoint). if getattr(args, "padded_vocab_size", None) is None: - args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, - args) + args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args) return tokenizer @@ -81,13 +79,14 @@ def _vocab_size_with_padding(orig_vocab_size, args, logging_enabled=True): still having GPU friendly size.""" after = orig_vocab_size - multiple = args.make_vocab_size_divisible_by * \ - args.tensor_model_parallel_size + multiple = args.make_vocab_size_divisible_by * args.tensor_model_parallel_size after = int(math.ceil(after / multiple) * multiple) if args.rank == 0 and logging_enabled: - print(' > padded vocab (size: {}) with {} dummy tokens ' - '(new size: {})'.format( - orig_vocab_size, after - orig_vocab_size, after), flush=True) + print( + ' > padded vocab (size: {}) with {} dummy tokens ' + '(new size: {})'.format(orig_vocab_size, after - orig_vocab_size, after), + flush=True, + ) return after @@ -97,10 +96,14 @@ def __init__(self, pretrained_model_name_or_path, **kwargs): try: import transformers except ImportError: - raise EnvironmentError(f"The transformers library must be installed to use huggingface_tokenizer_provider") + raise EnvironmentError( + f"The transformers library must be installed to use huggingface_tokenizer_provider" + ) # TODO(bnorick): download tokenizer once to lustre and use force offline to make sure all tasks read it from there - self._tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) + self._tokenizer = transformers.AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs + ) self._vocab = self._tokenizer.get_vocab() self._inv_vocab = {token_id: token for token, token_id in self._vocab.items()} @@ -146,8 +149,7 @@ def 
__init__(self, vocab_file, lower_case=True, vocab_extra_ids=0): self._additional_special_tokens = [] # (dsachan) Add BOS and EOS tokens - SPECIAL_TOKENS = {'eos_token': '[EOS]', - 'bos_token': '[BOS]'} + SPECIAL_TOKENS = {'eos_token': '[EOS]', 'bos_token': '[BOS]'} self._bos_token = '[BOS]' self.add_token(self._bos_token) self._bos_token_id = self.vocab.get(self._bos_token) @@ -160,7 +162,8 @@ def __init__(self, vocab_file, lower_case=True, vocab_extra_ids=0): # These can be used as sentinel tokens in T5 model inputs additional_special_tokens = [] additional_special_tokens.extend( - ["".format(i) for i in range(vocab_extra_ids)]) + ["".format(i) for i in range(vocab_extra_ids)] + ) self.add_additional_special_tokens(additional_special_tokens) def add_token(self, token): @@ -195,6 +198,10 @@ def decode(self, ids): tokens = self.tokenizer.convert_ids_to_tokens(ids) return self.tokenizer.convert_tokens_to_string(tokens) + def detokenize(self, token_ids): + """Copy of decode() method for inference pipeline compatibility""" + return self.decode(token_ids) + def decode_token_ids(self, token_ids): tokens = self.tokenizer.convert_ids_to_tokens(token_ids) exclude_list = ['[PAD]', '[CLS]'] @@ -227,32 +234,37 @@ def mask(self): @property def bos(self): - """ Id of the beginning of sentence token in the vocabulary.""" + """Id of the beginning of sentence token in the vocabulary.""" return self._bos_token_id @property def eos(self): - """ Id of the end of sentence token in the vocabulary.""" + """Id of the end of sentence token in the vocabulary.""" return self._eos_token_id + @property + def eod(self): + """Copy of eod property for inference pipeline compatibility""" + return self.eos + @property def bos_token(self): - """ Beginning of sentence token id """ + """Beginning of sentence token id""" return self._bos_token @property def eos_token(self): - """ End of sentence token id """ + """End of sentence token id""" return self._eos_token @property def additional_special_tokens(self): - """ All the additional special tokens you may want to use (list of strings).""" + """All the additional special tokens you may want to use (list of strings).""" return self._additional_special_tokens @property def additional_special_tokens_ids(self): - """ Ids of all the additional special tokens in the vocabulary (list of integers).""" + """Ids of all the additional special tokens in the vocabulary (list of integers).""" return [self.vocab.get(token) for token in self._additional_special_tokens] @additional_special_tokens.setter @@ -266,8 +278,9 @@ class _GPT2BPETokenizer(MegatronTokenizer): def __init__(self, vocab_file, merge_file): super().__init__(vocab_file, merge_file) - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - special_tokens=[], max_len=None) + self.tokenizer = GPT2Tokenizer( + vocab_file, merge_file, errors='replace', special_tokens=[], max_len=None + ) self.eod_id = self.tokenizer.encoder['<|endoftext|>'] @property @@ -300,6 +313,7 @@ def __init__(self, model_file, vocab_extra_ids=0): super().__init__(model_file, vocab_extra_ids=vocab_extra_ids) import sentencepiece + self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_file) self._initalize(vocab_extra_ids) @@ -462,7 +476,7 @@ def additional_special_tokens_ids(self): class _GPTSentencePieceTokenizer(_SentencePieceTokenizer): """SentencePieceTokenizer-Megatron wrapper""" - def __init__(self, model_file,): + def __init__(self, model_file): super().__init__(model_file, vocab_extra_ids=0) def _initalize(self, 
vocab_extra_ids): @@ -502,7 +516,7 @@ def additional_special_tokens_ids(self): class _Llama2Tokenizer(_SentencePieceTokenizer): """SentencePieceTokenizer-Megatron wrapper""" - def __init__(self, model_file,): + def __init__(self, model_file): super().__init__(model_file, vocab_extra_ids=0) def _initalize(self, vocab_extra_ids): @@ -549,10 +563,7 @@ def additional_special_tokens_ids(self): return None -def reload_mergeable_ranks( - path: str, - max_vocab: Optional[int] = None, -) -> Dict[bytes, int]: +def reload_mergeable_ranks(path: str, max_vocab: Optional[int] = None) -> Dict[bytes, int]: """ Reload our tokenizer JSON file and convert it to Tiktoken format. """ @@ -585,9 +596,12 @@ def reload_mergeable_ranks( return ranks -PATTERN_TIKTOKEN = r"[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+" +PATTERN_TIKTOKEN = ( + r"[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+" +) PATTERN_TIKTOKEN_V2 = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + class CustomTikTokenizer(MegatronTokenizer): def __init__( self, @@ -602,9 +616,10 @@ def __init__( pattern=pattern, vocab_size=vocab_size, num_special_tokens=num_special_tokens, - special_tokens=special_tokens + special_tokens=special_tokens, ) import tiktoken + from .. import print_rank_0 # To prevent circular import. if vocab_size is None: @@ -614,20 +629,30 @@ def __init__( SPECIAL_TOKENS = ["", "", ""] if special_tokens is None: special_tokens = SPECIAL_TOKENS.copy() - assert len(special_tokens) == len(set(special_tokens)), f"Special tokens should be unique: {special_tokens}" + assert len(special_tokens) == len( + set(special_tokens) + ), f"Special tokens should be unique: {special_tokens}" assert len(special_tokens) <= num_special_tokens < self._vocab_size - assert set(SPECIAL_TOKENS) <= set(special_tokens), f"Custom special tokens should include {SPECIAL_TOKENS}" + assert set(SPECIAL_TOKENS) <= set( + special_tokens + ), f"Custom special tokens should include {SPECIAL_TOKENS}" - special_filler = ["".format(id=i) for i in range(len(special_tokens), num_special_tokens)] + special_filler = [ + "".format(id=i) for i in range(len(special_tokens), num_special_tokens) + ] if special_filler: print_rank_0(f"Adding special tokens {special_filler[0]}, ..., {special_filler[-1]}") special_tokens = special_tokens + special_filler assert len(set(special_tokens)) == len(special_tokens) == num_special_tokens, special_tokens inner_vocab_size = self._vocab_size - num_special_tokens - token_to_id_without_special_tokens = reload_mergeable_ranks(path, max_vocab=inner_vocab_size) + token_to_id_without_special_tokens = reload_mergeable_ranks( + path, max_vocab=inner_vocab_size + ) # Create space for special tokens. 
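The CustomTikTokenizer changes above reserve the first num_special_tokens ids and shift the BPE ranks up by that amount. A toy illustration of the resulting id layout; the token strings below are hypothetical, since the original special-token literals did not survive extraction of this patch.

```python
special_tokens = ["<unk>", "<s>", "</s>"]         # hypothetical names
num_special_tokens = 5
filler = [f"<SPECIAL_{i}>" for i in range(len(special_tokens), num_special_tokens)]
special_tokens = special_tokens + filler          # ids 0..4 are reserved

mergeable_ranks = {b"hello": 0, b"world": 1}      # toy inner vocabulary
shifted = {tok: rank + num_special_tokens for tok, rank in mergeable_ranks.items()}
print(special_tokens)
print(shifted)                                    # real BPE ids start at 5
```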
- token_to_id_without_special_tokens = {t: i + num_special_tokens for t, i in token_to_id_without_special_tokens.items()} + token_to_id_without_special_tokens = { + t: i + num_special_tokens for t, i in token_to_id_without_special_tokens.items() + } special_tokens = {t: i for i, t in enumerate(special_tokens)} self._unk_id = special_tokens[""] @@ -650,7 +675,6 @@ def __init__( self._id_to_token = {v: k for k, v in self._token_to_id.items()} assert set(range(self._vocab_size)) == set(self._id_to_token.keys()) - @property def bos(self) -> int: return self._bos_id diff --git a/megatron/training/training.py b/megatron/training/training.py index 7d60f41f5c..d5ee16be5f 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -28,6 +28,7 @@ ) from megatron.training.checkpointing import load_checkpoint from megatron.training.checkpointing import save_checkpoint +from megatron.training.checkpointing import checkpoint_exists from megatron.legacy.model import Float16Module from megatron.core.distributed import DistributedDataParallelConfig from megatron.core.distributed import DistributedDataParallel as DDP @@ -205,6 +206,7 @@ def pretrain( args_defaults={}, get_embedding_ranks=None, get_position_embedding_ranks=None, + non_loss_data_func=None, ): """Main training program. @@ -233,6 +235,10 @@ def pretrain( to it. It is used for programs to add their own arguments. args_defaults: a dictionary from argument-name to argument-value. It to set already parse arguments. + get_embedding_ranks (TODO): + get_position_embedding_ranks (TODO): + non_loss_data_func (callable): A custom function to call during evaluation. + It can run e.g. benchmarks. """ # Initalize and get arguments, timers, and Tensorboard writer. @@ -356,7 +362,8 @@ def pretrain( forward_step_func, model, optimizer, opt_param_scheduler, train_data_iterator, valid_data_iterator, - process_non_loss_data_func, config, checkpointing_context) + process_non_loss_data_func, config, checkpointing_context, + non_loss_data_func) print_datetime('after training is done') @@ -381,14 +388,16 @@ def pretrain( evaluate_and_print_results(prefix, forward_step_func, valid_data_iterator, model, iteration, process_non_loss_data_func, config, - verbose=True, write_to_tensorboard=not args.skip_train) + verbose=True, write_to_tensorboard=not args.skip_train, + non_loss_data_func=non_loss_data_func) if args.do_test: prefix = f'iteration {iteration} on test set' evaluate_and_print_results(prefix, forward_step_func, test_data_iterator, model, iteration, process_non_loss_data_func, config, - verbose=True, write_to_tensorboard=not args.skip_train) + verbose=True, write_to_tensorboard=not args.skip_train, + non_loss_data_func=non_loss_data_func) wandb_writer = get_wandb_writer() if wandb_writer: @@ -634,7 +643,8 @@ def setup_model_and_optimizer(model_provider_func, opt_param_scheduler = get_optimizer_param_scheduler(optimizer) if args.moe_use_upcycling: - assert not os.path.exists( + torch.distributed.barrier() + assert not checkpoint_exists( args.save ), ("The upcycling destination directory already exists. " "Please check if --moe-use-upcycling is mistakenly enabled. " @@ -642,15 +652,18 @@ def setup_model_and_optimizer(model_provider_func, "All subsequent runs should remove this flag. 
") num_experts = args.num_experts args.num_experts = None + expert_model_parallel_size = args.expert_model_parallel_size + args.expert_model_parallel_size = 1 dense_model_for_upcycling = get_model(model_provider_func, model_type) args.num_experts = num_experts + args.expert_model_parallel_size = expert_model_parallel_size _, args.num_floating_point_operations_so_far = upcycling_utils.load_and_upcycle_model( load_checkpoint, unwrapped_model, dense_model_for_upcycling, load_kwargs = {'model': dense_model_for_upcycling, 'optimizer': None, 'opt_param_scheduler': None} ) - args.iteration = 0 + args.iteration = 1 save_checkpoint(args.iteration, model, None, None, args.num_floating_point_operations_so_far) torch.distributed.barrier() del dense_model_for_upcycling @@ -1095,7 +1108,7 @@ def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler, def train(forward_step_func, model, optimizer, opt_param_scheduler, train_data_iterator, valid_data_iterator, - process_non_loss_data_func, config, checkpointing_context): + process_non_loss_data_func, config, checkpointing_context, non_loss_data_func): """Train the model function.""" args = get_args() timers = get_timers() @@ -1331,7 +1344,8 @@ def get_e2e_base_metrics(): evaluate_and_print_results(prefix, forward_step_func, valid_data_iterator, model, iteration, process_non_loss_data_func, - config, False) + config, verbose=False, write_to_tensorboard=True, + non_loss_data_func=non_loss_data_func) eval_duration += timers('eval-time').elapsed() eval_iterations += args.eval_iters timers('eval-time').stop() @@ -1456,7 +1470,8 @@ def evaluate(forward_step_func, model, process_non_loss_data_func, config, - verbose=False): + verbose=False, + non_loss_data_func=None): """Evaluation.""" args = get_args() timers = get_timers() @@ -1534,7 +1549,9 @@ def evaluate(forward_step_func, return None, None, True collected_non_loss_data = None - if process_non_loss_data_func is not None and is_last_rank(): + if non_loss_data_func is not None: + collected_non_loss_data = non_loss_data_func(model) + elif process_non_loss_data_func is not None and is_last_rank(): collected_non_loss_data = forward_backward_func( forward_step_func=forward_step_func, data_iterator=data_iterator, @@ -1562,7 +1579,7 @@ def evaluate(forward_step_func, def evaluate_and_print_results(prefix, forward_step_func, data_iterator, model, iteration, process_non_loss_data_func, config, - verbose=False, write_to_tensorboard=True): + verbose=False, write_to_tensorboard=True, non_loss_data_func=None): """Helper function to evaluate and dump results on screen.""" args = get_args() if write_to_tensorboard: @@ -1574,7 +1591,7 @@ def evaluate_and_print_results(prefix, forward_step_func, total_loss_dict, collected_non_loss_data, timelimit = evaluate( forward_step_func, data_iterator, model, - process_non_loss_data_func, config, verbose) + process_non_loss_data_func, config, verbose, non_loss_data_func) # Timelimit hit during evaluation if timelimit: return diff --git a/megatron/training/yaml_arguments.py b/megatron/training/yaml_arguments.py index f81d4dee5d..3c6c39b07f 100644 --- a/megatron/training/yaml_arguments.py +++ b/megatron/training/yaml_arguments.py @@ -16,7 +16,7 @@ import torch.nn.functional as F -from megatron.core.transformer import TransformerConfig +from megatron.core.transformer import TransformerConfig, MLATransformerConfig # Taken from https://stackoverflow.com/questions/65414773/parse-environment-variable-from-yaml-with-pyyaml # Allows for yaml to use environment variables @@ 
-442,7 +442,10 @@ def squared_relu(x): kw_args['scaled_init_method'] = torch.nn.init.xavier_uniform_ # Return Transformer config. - return TransformerConfig(**kw_args) + if getattr(args, "multi_latent_attention", False): + return MLATransformerConfig(**kw_args) + else: + return TransformerConfig(**kw_args) def load_yaml(yaml_path): print(f"warning using experimental yaml arguments feature, argparse arguments will be ignored") diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 0bd85b76e1..3b7f8db012 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -73,9 +73,9 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat transformer_layer_spec = import_module(args.spec) else: if use_te: - transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm, args.fp8) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention, args.fp8) else: - transformer_layer_spec = get_gpt_layer_local_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm) + transformer_layer_spec = get_gpt_layer_local_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention) build_model_context = nullcontext build_model_context_args = {} @@ -105,7 +105,8 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, position_embedding_type=args.position_embedding_type, rotary_percent=args.rotary_percent, - rotary_base=args.rotary_base + rotary_base=args.rotary_base, + rope_scaling=args.use_rope_scaling ) return model diff --git a/pretrain_vlm.py b/pretrain_vlm.py index 7777603e53..6b1848e96c 100644 --- a/pretrain_vlm.py +++ b/pretrain_vlm.py @@ -46,10 +46,12 @@ def model_provider( model (megatron.core.models.multimodal.llava_model.LLaVAModel): A multimodal model """ args = get_args() + vision_model_type = "clip" num_image_embeddings = get_num_image_embeddings( - args.img_h, args.img_w, args.patch_dim, args.disable_vision_class_token, 1 + args.img_h, args.img_w, args.patch_dim, vision_model_type, args.disable_vision_class_token, 1 ) + old_seq_length = args.seq_length # decoder_seq_length denotes the language model sequence length. 
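    # Illustrative arithmetic (example values, not from this patch): a 336x336 image with
    # patch_dim 14 and the class token kept gives (336 // 14) ** 2 + 1 = 577 image embeddings,
    # so a text seq_length of 2048 becomes a decoder_seq_length of 2048 + 577 = 2625.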
args.decoder_seq_length = args.seq_length + num_image_embeddings @@ -87,6 +89,7 @@ def model_provider( vision_transformer_config.num_layers = args.encoder_num_layers vision_transformer_config.first_pipeline_num_layers = None vision_transformer_config.last_pipeline_num_layers = None + vision_transformer_config.vision_model_type = vision_model_type vision_projection_type = "mlp" vision_projection_config = deepcopy(language_transformer_config) @@ -128,6 +131,7 @@ def model_provider( parallel_output=parallel_output, language_position_embedding_type=args.position_embedding_type, language_rotary_percent=args.rotary_percent, + language_rope_scaling=args.use_rope_scaling, pre_process=pre_process, post_process=post_process, add_encoder=add_encoder, @@ -137,6 +141,12 @@ def model_provider( patch_dim=args.patch_dim, ) + model.freeze( + freeze_language_model=args.freeze_LM, + freeze_vision_model=args.freeze_ViT, + freeze_vision_projection=False, + ) + return model @@ -270,7 +280,18 @@ def forward_step(data_iterator, model: LLaVAModel): def add_vlm_extra_args(parser): """Extra arguments.""" group = parser.add_argument_group(title='vision language model specific arguments') - group.add_argument("--disable-vision-class-token", action="store_true", default=False) + group.add_argument( + '--freeze-LM', action='store_true', default=False, help="Freeze language model weights" + ) + group.add_argument( + '--freeze-ViT', action='store_true', default=False, help="Freeze vision model (ViT) weights" + ) + group.add_argument( + "--disable-vision-class-token", + action="store_true", + default=False, + help="Drop vision model class token", + ) return parser diff --git a/tests/functional_tests/jet_recipes/_build-mcore.yaml b/tests/functional_tests/jet_recipes/_build-mcore.yaml new file mode 100644 index 0000000000..81b38b69ce --- /dev/null +++ b/tests/functional_tests/jet_recipes/_build-mcore.yaml @@ -0,0 +1,11 @@ +type: build +format_version: 1 +maintainers: [maanug] +spec: + name: mcore-pyt + platforms: [linux/amd64] + source: + # The image tag will be added via `jet-tests.yaml` + # Tags are one of {buildcache, $CI_PIPELINE_ID} + image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci + \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/_build-nemo.yaml b/tests/functional_tests/jet_recipes/_build-nemo.yaml new file mode 100644 index 0000000000..eb2b318ab5 --- /dev/null +++ b/tests/functional_tests/jet_recipes/_build-nemo.yaml @@ -0,0 +1,10 @@ +type: build +format_version: 1 +maintainers: [maanug] +spec: + name: mcore-nemo + platforms: [linux/amd64] + source: + # The image tag will be added via `jet-tests.yaml` + # Tags are one of {buildcache, $CI_PIPELINE_ID} + image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/_build-pyt.yaml b/tests/functional_tests/jet_recipes/_build-pyt.yaml deleted file mode 100644 index d24836e44c..0000000000 --- a/tests/functional_tests/jet_recipes/_build-pyt.yaml +++ /dev/null @@ -1,23 +0,0 @@ -type: build -format_version: 1 -maintainers: [maanug] -spec: - name: mcore-pyt - platforms: [linux/amd64] - source: - # The image tag will be added via `jet-tests.yaml` - # Tags are one of {buildcache, $CI_PIPELINE_ID} - image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci - - ---- -type: build -format_version: 1 -maintainers: [maanug] -spec: - name: mcore-nemo - platforms: [linux/amd64] - source: - # The image tag will be added via `jet-tests.yaml` - # Tags are one of {buildcache, 
$CI_PIPELINE_ID} - image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/bert.yaml b/tests/functional_tests/jet_recipes/bert.yaml index 75aac2faab..088436e8ea 100644 --- a/tests/functional_tests/jet_recipes/bert.yaml +++ b/tests/functional_tests/jet_recipes/bert.yaml @@ -13,7 +13,7 @@ spec: /workspace/data/bert_data: text/the_pile/bert_shard00 script: |- ls - cd /workspace/megatron-lm + cd /opt/megatron-lm ARGUMENTS=( "DATA_PATH=/workspace/data/bert_data" @@ -32,7 +32,7 @@ products: time_limit: [12000] test_case: - bert_mr_mcore_tp2_pp2_dgx_a100_1N8G - - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G + # - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G - bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G - bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G - bert_mr_tp1_pp4_vp2_dgx_a100_1N8G diff --git a/tests/functional_tests/jet_recipes/gpt-nemo.yaml b/tests/functional_tests/jet_recipes/gpt-nemo.yaml index 87a6fb2c23..f14d2f0afa 100644 --- a/tests/functional_tests/jet_recipes/gpt-nemo.yaml +++ b/tests/functional_tests/jet_recipes/gpt-nemo.yaml @@ -16,8 +16,8 @@ spec: cd /opt/NeMo ARGUMENTS=( - "DATA_PATH=''" - "DATA_CACHE_PATH=''" + "DATA_PATH='-'" + "DATA_CACHE_PATH='-'" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/tensorboard" "CHECKPOINT_PATH=/workspace/checkpoints" diff --git a/tests/functional_tests/jet_recipes/gpt.yaml b/tests/functional_tests/jet_recipes/gpt.yaml index abaef86b81..8c09d0bd13 100644 --- a/tests/functional_tests/jet_recipes/gpt.yaml +++ b/tests/functional_tests/jet_recipes/gpt.yaml @@ -12,7 +12,7 @@ spec: /workspace/data/gpt3_data: text/the_pile/shard00 script: |- ls - cd /workspace/megatron-lm + cd /opt/megatron-lm ARGUMENTS=( "DATA_PATH=/workspace/data/gpt3_data" diff --git a/tests/functional_tests/jet_recipes/multimodal-llava.yaml b/tests/functional_tests/jet_recipes/multimodal-llava.yaml index 7a20b1145a..3149f5664f 100644 --- a/tests/functional_tests/jet_recipes/multimodal-llava.yaml +++ b/tests/functional_tests/jet_recipes/multimodal-llava.yaml @@ -13,11 +13,11 @@ spec: scope: null script: |- ls - cd /workspace/megatron-lm + cd /opt/megatron-lm ARGUMENTS=( - "DATA_PATH=''" - "DATA_CACHE_PATH=''" + "DATA_PATH='-'" + "DATA_CACHE_PATH='-'" "OUTPUT_PATH={assets_dir}" "TENSORBOARD_PATH={assets_dir}/tensorboard" "CHECKPOINT_PATH=/workspace/checkpoints" diff --git a/tests/functional_tests/jet_recipes/t5.yaml b/tests/functional_tests/jet_recipes/t5.yaml index 947023b0eb..dbbbc508d2 100644 --- a/tests/functional_tests/jet_recipes/t5.yaml +++ b/tests/functional_tests/jet_recipes/t5.yaml @@ -13,7 +13,7 @@ spec: /workspace/data/t5_data: text/the_pile/t5_shard00 script: |- ls - cd /workspace/megatron-lm + cd /opt/megatron-lm ARGUMENTS=( "DATA_PATH=/workspace/data/t5_data" @@ -31,6 +31,12 @@ products: - scope: [mr] time_limit: [12000] test_case: + - t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G + - t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G + - t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G + - t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G + - t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G + - t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G - t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G - t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G - scope: [weekly] diff --git a/tests/functional_tests/python_test_utils/jet/common.py b/tests/functional_tests/python_test_utils/jet/common.py new file mode 100644 index 0000000000..5ee31bc232 --- /dev/null +++ 
b/tests/functional_tests/python_test_utils/jet/common.py @@ -0,0 +1,140 @@ +import copy +import itertools +import pathlib +from typing import List, Optional + +import jetclient +import yaml + +BASE_PATH = pathlib.Path(__file__).parent.resolve() + + +def flatten_products( + workload_manifest: jetclient.JETWorkloadManifest, +) -> jetclient.JETWorkloadManifest: + """Flattens a nested dict of products""" + workload_manifest.products = [ + dict(zip(inp.keys(), values)) + for inp in workload_manifest.products + for values in itertools.product(*inp.values()) + ] + + return workload_manifest + + +def flatten_workload( + workload_manifest: jetclient.JETWorkloadManifest, +) -> List[jetclient.JETWorkloadManifest]: + """Flattens a workload with products into a list of workloads that don't have products.""" + workload_manifest = dict(workload_manifest) + products = workload_manifest.pop("products") + workload_manifests = [] + for product in products: + workload = copy.deepcopy(workload_manifest) + workload['spec'] = {k: v for k, v in workload['spec'] if k not in product.keys()} + workload['spec'] = dict(**dict(workload['spec']), **product) + workload_manifests.append(jetclient.JETWorkloadManifest(**workload)) + return workload_manifests + + +def load_config(config_path: str) -> jetclient.JETWorkloadManifest: + """Loads and parses a yaml file into a JETWorkloadManifest""" + with open(config_path) as stream: + try: + return jetclient.JETWorkloadManifest(**yaml.safe_load(stream)) + except yaml.YAMLError as exc: + raise exc + + +def load_and_flatten(config_path: str) -> List[jetclient.JETWorkloadManifest]: + """Wrapper function for doing all the fun at once.""" + return flatten_workload(flatten_products(load_config(config_path=config_path))) + + +def filter_by_test_case( + workload_manifests: List[jetclient.JETWorkloadManifest], test_case: str +) -> jetclient.JETWorkloadManifest: + """Returns a workload with matching name. 
Raises an error if there no or more than a single workload.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + if workload_manifest.spec.test_case == test_case + ) + + if len(workload_manifests) > 1: + raise ValueError("Duplicate test_case found!") + + if len(workload_manifests) == 0: + raise ValueError("No test_case found!") + + return workload_manifests[0] + + +def filter_by_scope( + workload_manifests: List[jetclient.JETWorkloadManifest], scope: str +) -> List[jetclient.JETWorkloadManifest]: + """Returns all workload with matching scope.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + if workload_manifest.spec.scope == scope + ) + + if len(workload_manifests) == 0: + raise ValueError("No test_case found!") + + return workload_manifests + + +def filter_by_model( + workload_manifests: List[jetclient.JETWorkloadManifest], model: str +) -> List[jetclient.JETWorkloadManifest]: + """Returns all workload with matching model.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + if workload_manifest.spec.model == model + ) + + if len(workload_manifests) == 0: + raise ValueError("No test_case found!") + + return workload_manifests + + +def load_workloads( + container_tag: str, + scope: Optional[str] = None, + model: Optional[str] = None, + test_case: Optional[str] = None, + container_image: Optional[str] = None, +) -> List[jetclient.JETWorkloadManifest]: + """Return all workloads from disk that match scope and platform.""" + recipes_dir = BASE_PATH / ".." / ".." / "jet_recipes" + local_dir = BASE_PATH / ".." / ".." / "local_recipes" + + workloads: List[jetclient.JETWorkloadManifest] = [] + build_workloads: List[jetclient.JETClient] = [] + for file in list(recipes_dir.glob("*.yaml")) + list(local_dir.glob("*.yaml")): + workloads += load_and_flatten(config_path=file) + if file.stem.startswith("_build"): + build_workloads.append(load_config(config_path=file)) + + if scope: + workloads = filter_by_scope(workload_manifests=workloads, scope=scope) + + if model: + workloads = filter_by_model(workload_manifests=workloads, model=model) + + if test_case: + workloads = [filter_by_test_case(workload_manifests=workloads, test_case=test_case)] + + for workload in list(workloads): + for build_workload in build_workloads: + if ( + workload.spec.build == build_workload.spec.name + ) and build_workload not in workloads: + container_image = container_image or build_workload.spec.source.image + build_workload.spec.source.image = f"{container_image}:{container_tag}" + workloads.append(build_workload) + return workloads diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py new file mode 100644 index 0000000000..c7338d3181 --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py @@ -0,0 +1,113 @@ +import pathlib +from typing import Optional + +import click +import yaml + +from tests.functional_tests.python_test_utils.jet import common + +BASE_PATH = pathlib.Path(__file__).parent.resolve() + + +@click.command() +@click.option("--scope", required=True, type=str, help="Test scope") +@click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on") +@click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on") +@click.option("--output-path", required=True, type=str, help="Path to write 
GitLab job to")
+@click.option("--container-image", required=True, type=str, help="LTS container image to use")
+@click.option("--container-image-dev", required=True, type=str, help="Dev container image to use")
+@click.option("--container-tag", required=True, type=str, help="Container tag to use")
+@click.option(
+    "--run-name", required=False, type=str, help="Run name (only relevant for release tests)"
+)
+@click.option(
+    "--wandb-experiment",
+    required=False,
+    type=str,
+    help="Wandb experiment (only relevant for release tests)",
+)
+def main(
+    scope: str,
+    a100_cluster: str,
+    h100_cluster: str,
+    output_path: str,
+    container_image: str,
+    container_image_dev: str,
+    container_tag: str,
+    run_name: Optional[str] = None,
+    wandb_experiment: Optional[str] = None,
+):
+    test_cases = [
+        test_case
+        for test_case in common.load_workloads(scope=scope, container_tag=container_tag)
+        if test_case.type != "build"
+    ]
+
+    gitlab_pipeline = {
+        "stages": list(set([test_case.spec.model for test_case in test_cases])),
+        "default": {"interruptible": True},
+    }
+
+    for test_case in test_cases:
+        if test_case.spec.platforms == "dgx_a100":
+            cluster = a100_cluster
+        elif test_case.spec.platforms == "dgx_h100":
+            cluster = h100_cluster
+        else:
+            raise ValueError(f"Platform {test_case.spec.platforms} unknown")
+
+        script = [
+            "export PYTHONPATH=$(pwd); "
+            "python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py",
+            f"--model {test_case.spec.model}",
+            f"--test-case {test_case.spec.test_case}",
+            f"--container-tag {container_tag}",
+            f"--cluster {cluster}",
+        ]
+
+        with open(
+            pathlib.Path(
+                BASE_PATH
+                / ".."
+                / ".."
+                / "test_cases"
+                / test_case.spec.model
+                / test_case.spec.test_case
+                / "model_config.yaml"
+            )
+        ) as stream:
+            try:
+                test_case_dict = yaml.safe_load(stream)
+            except yaml.YAMLError as exc:
+                print(exc)
+
+        if 'EXPERIMENTAL' in test_case_dict and test_case_dict['EXPERIMENTAL']:
+            script.append(f"--container-image {container_image_dev}")
+
+        if run_name is not None and wandb_experiment is not None:
+            script.append(f"--run-name {run_name}")
+            script.append(
+                f"--wandb-experiment {wandb_experiment}-{test_case.spec.model}-{test_case.spec.test_case}"
+            )
+
+        gitlab_pipeline[test_case.spec.test_case] = {
+            "stage": f"{test_case.spec.model}",
+            "image": f"{container_image}:{container_tag}",
+            "tags": ["mcore-docker-node-jet"],
+            "rules": [
+                {"if": '$CI_PIPELINE_SOURCE == "parent_pipeline"'},
+                {"if": '$CI_MERGE_REQUEST_ID'},
+            ],
+            "timeout": "7 days",
+            "needs": [{"pipeline": '$PARENT_PIPELINE_ID', "job": "jet-generate"}],
+            "script": [" ".join(script)],
+            "artifacts": {"paths": ["results/"]},
+        }
+
+    with open(output_path, 'w') as outfile:
+        yaml.dump(gitlab_pipeline, outfile, default_flow_style=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py b/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py
new file mode 100644
index 0000000000..bc9ad22302
--- /dev/null
+++ b/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py
@@ -0,0 +1,62 @@
+"""Generate launch scripts for local execution.
+
+This script generates pre-filled launch scripts that allow local execution of Megatron-LM functional tests inside containerized environments (i.e. Slurm enroot or Docker).
+
+This script will generate scripts into `$(pwd)/test_cases`.
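+
+A minimal illustrative invocation, assuming the script is run from the repository root
+(the `gpt` model filter is only an example value):
+
+    export PYTHONPATH=$(pwd)
+    python tests/functional_tests/python_test_utils/jet/generate_local_jobs.py \
+        --model gpt \
+        --output-path /opt/megatron-lm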
+""" + +import pathlib +from typing import Optional + +import click +import jetclient +import yaml + +from tests.functional_tests.python_test_utils.jet import common + + +def load_script(config_path: str) -> str: + with open(config_path) as stream: + try: + jetclient.JETWorkloadManifest(**yaml.safe_load(stream)).spec.script + except yaml.YAMLError as exc: + raise exc + + +@click.command() +@click.option("--model", required=False, type=str, help="Filters all tests by matching model") +@click.option("--scope", required=False, type=str, help="Filters all tests by matching scope") +@click.option( + "--test-case", required=False, type=str, help="Returns a single test-case with matching name." +) +@click.option( + "--output-path", + required=True, + type=str, + help="Directory where the functional test will write its artifacts to (Tensorboard logs)", + default="/opt/megatron-lm", +) +def main(model: Optional[str], scope: Optional[str], test_case: Optional[str], output_path: str): + workloads = common.load_workloads( + container_image='none', scope=scope, model=model, test_case=test_case, container_tag='none' + ) + + for workload in workloads: + if workload.type == "build": + continue + magic_values = dict(workload.spec) + magic_values["assets_dir"] = output_path + + file_path = ( + pathlib.Path.cwd() + / "test_cases" + / workload.spec.model + / f"{workload.spec.test_case}.sh" + ) + file_path.parent.mkdir(parents=True, exist_ok=True) + with open(file_path, "w", encoding="utf-8") as fh: + fh.write(workload.spec.script.format(**magic_values)) + + +if __name__ == "__main__": + main() diff --git a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py new file mode 100644 index 0000000000..3e243c542a --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py @@ -0,0 +1,216 @@ +import os +import pathlib +import re +import signal +import sys +import tempfile +from typing import List, Optional, Tuple + +import click +import jetclient +import yaml +from jetclient.services.dtos.pipeline import PipelineStatus + +from tests.functional_tests.python_test_utils.jet import common + +BASE_PATH = pathlib.Path(__file__).parent.resolve() + + +def resolve_cluster_config(cluster: str) -> str: + if cluster == "dgxh100_eos": + return "mcore/eos" + if cluster == "dgxa100_dracooci": + return "mcore/draco-oci" + if cluster == "dgxa100_dracooci-ord": + return "mcore/draco-oci-ord" + if cluster == "dgxh100_coreweave": + return "mcore/coreweave" + raise ValueError(f"Unknown cluster {cluster} provided.") + + +def register_pipeline_terminator(pipeline: jetclient.JETPipeline): + def sigterm_handler(_signo, _stack_frame): + print(f"Trying to terminate pipeline {pipeline.jet_id}") + pipeline.cancel() + print(f"Pipeline {pipeline.jet_id} terminated") + sys.exit(0) + + signal.signal(signal.SIGINT, sigterm_handler) + signal.signal(signal.SIGTERM, sigterm_handler) + + +def launch_and_wait_for_completion( + test_case: str, + container_image: str, + container_tag: str, + cluster: str, + account: str, + run_name: Optional[str], + wandb_experiment: Optional[str], +) -> jetclient.JETPipeline: + pipeline = jetclient.JETClient( + customer='mcore', gitlab_ci_token=os.getenv("RO_API_TOKEN"), env="prod" + ).workloads.submit( + workloads=common.load_workloads( + test_case=test_case, container_image=container_image, container_tag=container_tag + ), + config_id=resolve_cluster_config(cluster), + custom_config={ + "launchers": {cluster: 
{"account": account}}, + "executors": { + "jet-ci": { + "environments": { + cluster: { + "variables": { + "RUN_NAME": run_name or "", + "WANDB_API_KEY": os.getenv("WANDB_API_KEY") or "", + "WANDB_EXPERIMENT": wandb_experiment or "", + } + } + } + } + }, + }, + wait_for_validation=True, + ) + + register_pipeline_terminator(pipeline=pipeline) + + print( + f"Pipeline triggered; inspect it here: https://gitlab-master.nvidia.com/dl/jet/ci/-/pipelines/{pipeline.jet_id}", + flush=True, + ) + + pipeline.wait(max_wait_time=60 * 60 * 24 * 7) + print(f"Pipeline terminated; status: {pipeline.get_status()}") + return pipeline + + +def download_job_assets(job: jetclient.JETJob, iteration: int = 0) -> List[str]: + logs = job.get_logs() + if not logs: + return [""] + + assets_base_path = BASE_PATH / ".." / ".." / ".." / ".." / "results" / f"iteration={iteration}" + + for restart_idx, log in enumerate(logs): + assets = log.get_assets() + assets_path = assets_base_path / f"restart={restart_idx}" + assets_path.mkdir(parents=True, exist_ok=True) + for log_filename in assets.keys(): + with open(assets_path / log_filename, "w") as fh: + assets[log_filename].download(pathlib.Path(fh.name)) + + +def download_job_logs(job: jetclient.JETJob) -> List[str]: + logs = job.get_logs() + if not logs: + return [""] + + assets = logs[0].get_assets() + log_filename = [key for key in assets.keys() if key.endswith(".log")][0] + + with tempfile.NamedTemporaryFile() as tmp_file: + assets[log_filename].download(pathlib.Path(tmp_file.name)) + with open(pathlib.Path(tmp_file.name), "r") as fh: + return fh.readlines() + + +def parse_iterations_from_logs(logs: List[str]) -> Optional[Tuple[int, int]]: + for log_row in logs[::-1]: + match = re.search(r"iteration\s+(\d+)\s*/\s*(\d+)", log_row) + if match is not None: + return int(match.group(1)), int(match.group(2)) + + +@click.command() +@click.option("--model", required=True, type=str, help="Model") +@click.option("--test-case", required=True, type=str, help="Test case") +@click.option( + "--account", + required=False, + type=str, + help="Slurm account to use", + default="coreai_dlalgo_mcore", +) +@click.option("--cluster", required=True, type=str, help="Cluster to run on") +@click.option("--container-tag", required=True, type=str, help="Base image of Mcore image") +@click.option("--container-image", required=False, type=str, help="Base image of Mcore image") +@click.option( + "--run-name", required=False, type=str, help="Run name (only relevant for release tests)" +) +@click.option( + "--wandb-experiment", + required=False, + type=str, + help="Wandb experiment (only relevant for release tests)", +) +def main( + model: str, + test_case: str, + account: str, + cluster: str, + container_tag: str, + container_image: Optional[str] = None, + run_name: Optional[str] = None, + wandb_experiment: Optional[str] = None, +): + + with open( + pathlib.Path( + BASE_PATH / ".." / ".." 
/ "test_cases" / model / test_case / "model_config.yaml" + ) + ) as stream: + try: + test_case_dict = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + + test_type = test_case_dict['TEST_TYPE'] + + if test_type == "release" and (run_name is None or wandb_experiment is None): + print(f"Not all arguments provided ({run_name=}, {wandb_experiment=})") + sys.exit(1) + + n_attempts = 0 + n_iteration = 0 + while True and n_attempts < 3: + pipeline = launch_and_wait_for_completion( + test_case=test_case, + container_image=container_image, + container_tag=container_tag, + cluster=cluster, + account=account, + run_name=run_name, + wandb_experiment=wandb_experiment, + ) + + main_job = [job for job in pipeline.get_jobs() if job.name.startswith("basic")][0] + + logs = download_job_logs(job=main_job) + concat_logs = "\n".join(logs) + print(f"Logs:\n{concat_logs}") + + download_job_assets(job=main_job, iteration=n_iteration) + + if test_type != "release": + success = pipeline.get_status() == PipelineStatus.SUCCESS + sys.exit(int(not success)) # invert for exit 0 + + parsed_result = parse_iterations_from_logs(logs=logs) + if not parsed_result: + print("Weird log, no iterations found") + n_attempts += 1 + continue + + current_iteration, total_iterations = parsed_result + if current_iteration == total_iterations: + + success = pipeline.get_status() == PipelineStatus.SUCCESS + sys.exit(int(not success)) # invert for exit 0 + n_iteration += 1 + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tests/functional_tests/shell_test_utils/_run_training.sh b/tests/functional_tests/shell_test_utils/_run_training.sh index d43a3af77f..12dd359c65 100644 --- a/tests/functional_tests/shell_test_utils/_run_training.sh +++ b/tests/functional_tests/shell_test_utils/_run_training.sh @@ -41,15 +41,9 @@ done cat $TRAINING_PARAMS_PATH | envsubst >$TRAINING_PARAMS_PATH.tmp mv $TRAINING_PARAMS_PATH.tmp $TRAINING_PARAMS_PATH -# Run before script -SCRIPT=$(cat $TRAINING_PARAMS_PATH | yq '.BEFORE_SCRIPT') -if [[ "$SCRIPT" != null ]]; then - eval "$SCRIPT" -fi; - # Pull env vars to export ENV_VARS=$(yq '... 
comments="" | .ENV_VARS | to_entries | .[] | [.key + "=" + .value] | join(" ")' $TRAINING_PARAMS_PATH) -for ARGUMENT in $ENV_VARS; do +while IFS= read -r ARGUMENT; do KEY=$(echo $ARGUMENT | cut -f1 -d=) KEY_LENGTH=${#KEY} @@ -57,7 +51,13 @@ for ARGUMENT in $ENV_VARS; do export "$KEY"="$VALUE" echo "$KEY=$VALUE" -done +done <<< "$ENV_VARS" + +# Run before script +SCRIPT=$(cat $TRAINING_PARAMS_PATH | yq '.BEFORE_SCRIPT') +if [[ "$SCRIPT" != null ]]; then + eval "$SCRIPT" +fi; # Exit earlier to leave time for properly saving checkpoint if [[ $(echo "$TRAINING_SCRIPT_PATH" | tr '[:upper:]' '[:lower:]') == *nemo* ]]; then diff --git a/tests/functional_tests/shell_test_utils/notify.sh b/tests/functional_tests/shell_test_utils/notify.sh index 277d46add1..1bb2ea5c3c 100644 --- a/tests/functional_tests/shell_test_utils/notify.sh +++ b/tests/functional_tests/shell_test_utils/notify.sh @@ -1,6 +1,6 @@ set -euxo pipefail -collect_jet_jobs () { +collect_jobs () { PAGE=1 PER_PAGE=100 RESULTS="[]" @@ -11,7 +11,7 @@ collect_jet_jobs () { -s \ --globoff \ --header "PRIVATE-TOKEN: $RO_API_TOKEN" \ - "https://${GITLAB_ENDPOINT}/api/v4/projects/70847/pipelines/${JET_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE" + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE" ) # Combine the results RESULTS=$(jq -s '.[0] + .[1]' <<< "$RESULTS $RESPONSE") @@ -85,31 +85,16 @@ if [[ $DOWNSTREAM_PIPELINE_ID == null ]]; then else set +x - JET_PIPELINE_JSON=$(curl \ - --fail \ - --silent \ - --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ - "https://${GITLAB_ENDPOINT}/api/v4/projects/70847/pipelines/${DOWNSTREAM_PIPELINE_ID}/bridges?per_page=100" - ) + JOBS=$(echo "$(collect_jobs)" | jq '[.[] | {id, name, status}]') + echo $JOBS set -x - JET_PIPELINE_ID=$(jq '.[0].downstream_pipeline.id' <<< "$JET_PIPELINE_JSON") - set +x - JET_LOGS=$(echo "$(collect_jet_jobs)" \ - | jq '[ - .[] - | select(.name | startswith("build/") | not) - | select(.name | contains("3 logs_after") | not) - | select(.name | contains("1 logs_before") | not) - ]' - ) - - FAILED_JET_LOGS=$(echo "$JET_LOGS" \ + FAILED_JOBS=$(echo "$JOBS" \ | jq --arg GITLAB_ENDPOINT "$GITLAB_ENDPOINT" '[ .[] | select(.status != "success") | { - "name": (.name[6:] | split(" ")[0]), + name, id, "url": ("https://" + $GITLAB_ENDPOINT + "/dl/jet/ci/-/jobs/" + (.id | tostring)), } @@ -117,29 +102,34 @@ else ) set -x - for row in $(echo "${FAILED_JET_LOGS}" | jq -r '.[] | @base64'); do + for row in $(echo "${FAILED_JOBS}" | jq -r '.[] | @base64'); do _jq() { echo ${row} | base64 --decode | jq -r ${1} } JOB_ID=$(_jq '.id') - SLURM_FAILURE=$(jet \ - -c -df json -th logs query --raw \ - -c "obj_status.s_message" \ - --eq obj_ci.l_job_id "$JOB_ID" \ - | jq '.[0].obj_status.s_message' \ - | tr -d '"' - ) - FAILED_JET_LOGS=$(echo "$FAILED_JET_LOGS" \ - | jq \ - --argjson JOB_ID "$JOB_ID" \ - --arg SLURM_FAILURE "$SLURM_FAILURE" ' - .[] |= ((select(.id==$JOB_ID) += { - "slurm_failure_reason": $SLURM_FAILURE})) - ') + FULL_LOG=$(curl \ + --location \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/jobs/${JOB_ID}/trace") + + if [[ "$FULL_LOG" == *exception* ]]; then + LAST_EXCEPTION_POS=$(echo "$FULL_LOG" | grep -o -b 'exception' | tail -1 | cut -d: -f1) + SHORT_LOG=${FULL_LOG:$LAST_EXCEPTION_POS-500:499} + else + SHORT_LOG=${FULL_LOG: -1000} + fi + + FAILED_JOBS=$(echo "$FAILED_JOBS" \ + | jq \ + --argjson JOB_ID "$JOB_ID" \ + --arg SLURM_FAILURE 
"$SHORT_LOG" ' + .[] |= ((select(.id==$JOB_ID) += { + "slurm_failure_reason": $SLURM_FAILURE})) + ') done - NUM_FAILED=$(echo "$FAILED_JET_LOGS" | jq 'length') - NUM_TOTAL=$(echo "$JET_LOGS" | jq 'length') + NUM_FAILED=$(echo "$FAILED_JOBS" | jq 'length') + NUM_TOTAL=$(echo "$JOBS" | jq 'length') if [[ $NUM_FAILED -eq 0 ]]; then BLOCKS='[ @@ -152,7 +142,7 @@ else } ]' else - BLOCKS=$(echo -e "$FAILED_JET_LOGS" \ + BLOCKS=$(echo "$FAILED_JOBS" \ | jq --arg DATE "$DATE" --arg CONTEXT "$CONTEXT" --arg URL "$PIPELINE_URL" --arg NUM_FAILED "$NUM_FAILED" --arg NUM_TOTAL "$NUM_TOTAL" ' [ { @@ -170,7 +160,7 @@ else "type": "mrkdwn", "text": ( "• Job: <" +.url + "|" + .name + ">" - + "\n SLURM failure reason: \n```" + .slurm_failure_reason[-2000:] + "```" + + "\n SLURM failure reason: \n```" + .slurm_failure_reason + "```" ) } diff --git a/tests/functional_tests/shell_test_utils/notify_unit_tests.sh b/tests/functional_tests/shell_test_utils/notify_unit_tests.sh new file mode 100644 index 0000000000..46be8b078e --- /dev/null +++ b/tests/functional_tests/shell_test_utils/notify_unit_tests.sh @@ -0,0 +1,186 @@ +set -euxo pipefail + +collect_jobs () { + PAGE=1 + PER_PAGE=100 + RESULTS="[]" + + while true; do + # Fetch the paginated results + RESPONSE=$(curl \ + -s \ + --globoff \ + --header "PRIVATE-TOKEN: $RO_API_TOKEN" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE" + ) + # Combine the results + RESULTS=$(jq -s '.[0] + .[1]' <<< "$RESULTS $RESPONSE") + + # Check if there are more pages + if [[ $(jq 'length' <<< "$RESPONSE") -lt $PER_PAGE ]]; then + break + fi + + # Increment the page number + PAGE=$((PAGE + 1)) + done + + echo "$RESULTS" +} + +CI_PIPELINE_ID=${1:-16595865} +CI_PROJECT_ID=${CI_PROJECT_ID:-19378} +PIPELINE_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/pipelines/$CI_PIPELINE_ID +JOB_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/jobs/ +CONTEXT="unit-tests-extended" + +# Fetch Elastic logs +set +x +PIPELINE_JSON=$(curl \ + --fail \ + --silent \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs" + ) || ret_code=$? 
+set -x +if [[ ${ret_code:-0} -ne 0 ]]; then + echo CI_PIPELINE_ID=$CI_PIPELINE_ID does not exist + exit 1 +fi + +UNIT_TESTS_JOBS=$(echo -E $PIPELINE_JSON | jq '[.[] | select(.name | startswith("unit_tests"))]') + +if [[ $UNIT_TESTS_JOBS == null ]]; then + FAILED_JOBS=$(curl \ + --fail \ + --silent \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs?per_page=100" \ + | jq --arg JOB_URL "$JOB_URL" '[.[] | select(.status == "failed") | ("<" + $JOB_URL + (.id | tostring) + "|" + .name + ">")] | join("\n• Job: ")' | tr -d '"') + curl \ + -X POST \ + -H "Content-type: application/json" \ + --data ' + { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "<'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>:\n" + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "\n• Job: '"$FAILED_JOBS"'" + } + }, + ] + + }' \ + $WEBHOOK_URL + +else + FAILED_JOBS=$(echo -E "$UNIT_TESTS_JOBS" \ + | jq --arg GITLAB_ENDPOINT "$GITLAB_ENDPOINT" --arg JOB_URL "$JOB_URL" '[ + .[] + | select(.status != "success") + | { + name, + id, + "url": ($JOB_URL + (.id | tostring)), + } + ]' + ) + set -x + + for row in $(echo "${FAILED_JOBS}" | jq -r '.[] | @base64'); do + _jq() { + echo ${row} | base64 --decode | jq -r ${1} + } + JOB_ID=$(_jq '.id') + FULL_LOG=$(curl \ + --location \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/jobs/${JOB_ID}/trace") + + if [[ "$FULL_LOG" == *exception* ]]; then + LAST_EXCEPTION_POS=$(echo "$FULL_LOG" | grep -o -b 'exception' | tail -1 | cut -d: -f1) + SHORT_LOG=${FULL_LOG:$LAST_EXCEPTION_POS-500:499} + else + SHORT_LOG=${FULL_LOG: -1000} + fi + + FAILED_JOBS=$(echo "$FAILED_JOBS" \ + | jq \ + --argjson JOB_ID "$JOB_ID" \ + --arg SLURM_FAILURE "$SHORT_LOG" ' + .[] |= ((select(.id==$JOB_ID) += { + "slurm_failure_reason": $SLURM_FAILURE})) + ') + done + + NUM_FAILED=$(echo "$FAILED_JOBS" | jq 'length') + NUM_TOTAL=$(echo "$UNIT_TESTS_JOBS" | jq 'length') + + if [[ $NUM_FAILED -eq 0 ]]; then + BLOCKS='[ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":doge3d: <'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>: All '$NUM_TOTAL' passed" + } + } + ]' + else + BLOCKS=$(echo "$FAILED_JOBS" \ + | jq --arg DATE "$DATE" --arg CONTEXT "$CONTEXT" --arg URL "$PIPELINE_URL" --arg NUM_FAILED "$NUM_FAILED" --arg NUM_TOTAL "$NUM_TOTAL" ' + [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": (":doctorge: <" + $URL + "|Report of " + $DATE + " (" + $CONTEXT + ")>: " + $NUM_FAILED + " of " + $NUM_TOTAL + " failed") + } + } + ] + [ + .[] + | { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ( + "• Job: <" +.url + "|" + .name + ">" + + "\n SLURM failure reason: \n```" + .slurm_failure_reason + "```" + + ) + } + } + ] + [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ("===============================================") + } + } + ]' + ) + fi + + for row in $(echo "${BLOCKS}" | jq -r '.[] | @base64'); do + _jq() { + echo ${row} | base64 --decode + } + + curl \ + -X POST \ + -H "Content-type: application/json" \ + --data '{"blocks": '["$(_jq)"]'}' \ + $WEBHOOK_URL + done + +fi \ No newline at end of file diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh index 7578d25c2d..c9c16b43c6 100644 --- a/tests/functional_tests/shell_test_utils/run_ci_test.sh +++ 
b/tests/functional_tests/shell_test_utils/run_ci_test.sh @@ -4,11 +4,11 @@ set -exo pipefail echo "------ARGUMENTS LIST --------" for ARGUMENT in "$@"; do + echo $ARGUMENT KEY=$(echo $ARGUMENT | cut -f1 -d=) KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - + VALUE=$(eval echo ${ARGUMENT:$KEY_LENGTH+1}) export "$KEY"="$VALUE" echo "$KEY=$VALUE" done diff --git a/tests/functional_tests/shell_test_utils/run_ci_test_locally.sh b/tests/functional_tests/shell_test_utils/run_ci_test_locally.sh deleted file mode 100644 index 3ee776ce9b..0000000000 --- a/tests/functional_tests/shell_test_utils/run_ci_test_locally.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/bin/bash - -####################################################################################### -# -# Script for capturing a reference model. -# -# It will train a model until a target iteration was hit. -# -# -######################################################################################## - -set -exo pipefail - -echo "------ARGUMENTS LIST --------" -for ARGUMENT in "$@"; do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" - echo "$KEY=$VALUE" -done -echo "---------------------------------" - -# Check that mandatory vars are set -MANDATORY_VARS=( - "MODEL" - "VARIANT" - "TRAINING_SCRIPT_PATH" - "OUTPUT_PATH" - "IMAGE_TAG" - "NODES" - "PPP" - "PARTITION" - "ITERATIONS" - "WANDB_API_KEY" - "CLUSTER" - "DATASET" - "WANDB_EXPERIMENT" - "GPUS_PER_NODE" -) -for mandatory_var in "${MANDATORY_VARS[@]}"; do - if [[ -z "${!mandatory_var}" ]]; then - echo 'Providing $'$mandatory_var' is mandatory.' - exit 1 - fi -done - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -ROOT_DIR=$(realpath $SCRIPT_DIR/../../../) - -# Fetch dataset base path via JET and refresh DATA_BELDN -DATA_PATH=$(jet -c -tf plain -th artifacts registry list -c storages.$CLUSTER.identifier -f "key == '$DATASET'") -DATA_BLEND=$(eval echo "$DATA_BLEND") - -######################################################################################## -# Dont change below -######################################################################################## - -SLURM_LOGS=$OUTPUT_PATH/slurm_logs/ -mkdir -p $SLURM_LOGS - -# Container settings -ARGUMENTS=( - "TRAINING_SCRIPT_PATH=${TRAINING_SCRIPT_PATH}" - "TEST_CASE_PATH=./tests/functional_tests/test_cases/$MODEL/$VARIANT" - "OUTPUT_PATH=${OUTPUT_PATH}" - "TENSORBOARD_PATH=${OUTPUT_PATH}/tensorboard" - "CHECKPOINT_PATH=${OUTPUT_PATH}/checkpoints" - "DATA_PATH=${DATA_PATH}" - "DATA_CACHE_PATH=${OUTPUT_PATH}/data-cache" - "WANDB_API_KEY=${WANDB_API_KEY}" - "WANDB_EXPERIMENT=${WANDB_EXPERIMENT}" - "DATA_BLEND=\"${DATA_BLEND}\"" -) - -if [[ -n $LOAD_PATH ]]; then - ARGUMENTS+=("LOAD_PATH=${LOAD_PATH}") -fi - -echo ${ARGUMENTS[@]} - -while : -do - -if [[ $(cat "${OUTPUT_PATH}/checkpoints/latest_checkpointed_iteration.txt" || echo 0) -ge $ITERATIONS ]]; then - break -fi - -# Fire of sbatch -echo '#!/bin/bash' > sbatch.sh - -if [[ $GPUS_PER_NODE != null ]]; then - echo '#SBATCH --gres=gpu:8' >> sbatch.sh -fi -echo "#SBATCH --nodes=$NODES -#SBATCH --account $PPP -#SBATCH --partition $PARTITION -#SBATCH --ntasks-per-node=1 -#SBATCH --time "04:00:00" -#SBATCH --job-name=$PPP:mcore:release:$MODEL -#SBATCH --dependency=singleton -#SBATCH --output=/dev/null -#SBATCH --error=/dev/null -#SBATCH --exclusive - -# Prepare SLURM job -echo "SLURM_JOB_ID=\$SLURM_JOB_ID" > "$SLURM_LOGS/\${SLURM_JOB_ID}.log" - -srun \ - --ntasks-per-node=1 \ - 
--container-image='gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci:$IMAGE_TAG' \ - --container-mounts='${DATA_PATH}:${DATA_PATH},${OUTPUT_PATH}:${OUTPUT_PATH}' \ - --container-workdir=/workspace/megatron-lm \ - bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${ARGUMENTS[@]}>>'$SLURM_LOGS/\${SLURM_JOB_ID}.log' 2>&1" >> sbatch.sh - -set +e -sbatch -W sbatch.sh -set -e -done - -# Write golden values into repo if this run should become a reference -cp $OUTPUT_PATH/golden_values.json > ./golden_values.json diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml index 941e8b7bdb..bf88792152 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml @@ -25,14 +25,14 @@ MODEL_ARGS: --micro-batch-size: 4 --rampup-batch-size: "384 384 97656250" --global-batch-size: 1152 - --train-samples: 4882812 + --train-samples: 19531250 --manual-gc: true # Transformer Engine args --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml index 941e8b7bdb..9453db100c 100644 --- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml @@ -32,7 +32,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml index ee149b884e..9516076dc6 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml @@ -30,7 +30,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: Llama2Tokenizer --tokenizer-model: ${DATA_PATH}/tokenizer.model --data-path: ${DATA_BLEND} @@ -88,6 +88,7 @@ MODEL_ARGS: --auto-detect-ckpt-format: true --load: ${LOAD_PATH} --save: ${OUTPUT_PATH}/checkpoints + --no-ckpt-fully-parallel-save: true --save-interval: 500 # Add initialization args diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml index 1fe7611a81..585d9bb2c7 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml @@ -33,7 +33,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + 
--data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml index d80246eecd..22607416a3 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml @@ -33,7 +33,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: GPTSentencePieceTokenizer --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model --data-path: $DATA_BLEND diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml index b2f6983a62..39421a887e 100644 --- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml @@ -31,7 +31,7 @@ MODEL_ARGS: --transformer-impl: transformer_engine # Data args - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --tokenizer-type: Llama2Tokenizer --tokenizer-model: ${DATA_PATH}/tokenizer.model --data-path: ${DATA_BLEND} @@ -89,6 +89,7 @@ MODEL_ARGS: --auto-detect-ckpt-format: true --load: ${LOAD_PATH} --save: ${OUTPUT_PATH}/checkpoints + --no-ckpt-fully-parallel-save: true --save-interval: 500 # Add initialization args diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json index bd193a724d..f4b39082a6 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13442, 9.13256, 9.12852, 9.11273, 9.05533, 9.04358, 8.98427, 8.93519, 8.89295, 8.79396]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3478477.0, 3585145.0, 3475635.0, 3384010.0, 3700478.0, 3480110.0, 3398548.0, 3454436.0, 3425849.0, 3585758.0]},"iteration_timing_avg": 0.2253964705882353} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13495, 9.13325, 9.12905, 9.11323, 9.05401, 9.04233, 8.98255, 8.93258, 8.88937, 8.78788]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3477473.0, 3584371.0, 3475194.0, 3382773.0, 3699802.0, 3478715.0, 3397967.0, 3453615.0, 3424973.0, 3585127.0]},"iteration_timing_avg": 0.2253964705882353} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values.json index 
de82457c30..03e0dd0e9b 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values.json +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values.json @@ -1 +1 @@ -{"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3558381.0, 3664861.0, 3555505.0, 3463866.0, 3780904.0, 3560200.0, 3478189.0, 3534510.0, 3506002.0, 3665772.0]},"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.16219, 9.16263, 9.15739, 9.1412, 9.09523, 9.07236, 9.01592, 8.96749, 8.92204, 8.8314]}} \ No newline at end of file +{"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3557301.0, 3663955.0, 3555196.0, 3462888.0, 3780083.0, 3559007.0, 3477262.0, 3533752.0, 3505033.0, 3665096.0]},"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.16173, 9.16211, 9.15686, 9.14022, 9.09396, 9.07146, 9.01401, 8.9651, 8.91881, 8.82578]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values.json index 0ce1048997..96f345a702 100644 --- a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values.json +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values.json @@ -1 +1 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.19789, 9.20022, 9.19547, 9.17248, 9.11862, 9.10315, 9.0418, 8.98727, 8.9443, 8.84512]},"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3718539.0, 3825032.0, 3715374.0, 3623934.0, 3940675.0, 3720162.0, 3638165.0, 3695121.0, 3666164.0, 3825842.0]}, "iteration_timing_avg": 0.5847132352941178} \ No newline at end of file +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.19864, 9.20112, 9.19598, 9.17297, 9.1171, 9.10232, 9.04013, 8.98432, 8.94016, 8.83862]},"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3717564.0, 3824205.0, 3714643.0, 3622971.0, 3939727.0, 3718836.0, 3637293.0, 3694227.0, 3665382.0, 3825257.0]}, "iteration_timing_avg": 0.5847132352941178} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values.json new file mode 100644 index 0000000000..bcff777664 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [19.39068, 0.66038, 0.65673, 0.66493, 0.65894, 0.6473, 0.65746, 0.64942, 0.66259, 0.65247, 0.65165, 0.64944, 0.81313, 0.65069, 0.64982, 0.65247, 0.65149, 0.65284, 0.64913, 0.6496]}, "forward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3.63253, 0.27412, 0.26777, 0.27338, 0.26922, 0.26445, 0.27043, 0.26308, 0.27178, 0.26246, 0.26565, 0.26691, 0.42095, 0.26741, 0.26653, 0.26546, 0.26547, 0.26403, 0.26266, 0.26606]}, "backward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.0264, 0.24005, 
0.23751, 0.24162, 0.24102, 0.23888, 0.24027, 0.23829, 0.24182, 0.24308, 0.24109, 0.23964, 0.23841, 0.24005, 0.23898, 0.23896, 0.24052, 0.23894, 0.24242, 0.23863]}, "forward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [8.32911, 0.07441, 0.07755, 0.07578, 0.07557, 0.07223, 0.0737, 0.07404, 0.07108, 0.07174, 0.07137, 0.07162, 0.07437, 0.07185, 0.07129, 0.07247, 0.0719, 0.07573, 0.07292, 0.07122]}, "forward-send-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.47287, 0.00053, 0.00063, 0.00048, 0.00045, 0.00047, 0.00046, 0.00045, 0.00046, 0.00063, 0.00044, 0.00046, 0.00047, 0.00045, 0.00056, 0.00046, 0.00045, 0.00046, 0.00045, 0.00044]}, "backward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.1444, 0.13179, 0.12767, 0.13592, 0.1279, 0.12912, 0.13033, 0.1328, 0.13106, 0.13249, 0.12957, 0.12877, 0.13334, 0.12829, 0.12815, 0.13128, 0.12985, 0.13117, 0.12901, 0.1277]}, "backward-send-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00065, 0.00056, 0.00066, 0.00067, 0.0006, 0.00059, 0.00064, 0.00067, 0.00068, 0.0006, 0.00056, 0.00058, 0.00059, 0.00056, 0.00064, 0.00058, 0.00049, 0.00079, 0.00081, 0.0006]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [12.49425, 0.23291, 0.228, 0.22475, 0.22786, 0.22525, 0.22534, 0.22597, 0.23004, 0.22656, 0.22342, 0.22577, 0.38374, 0.22857, 0.22673, 0.22371, 0.22908, 0.23017, 0.23145, 0.23191]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5.02478, 0.00608, 0.00441, 0.00414, 0.0093, 0.00347, 0.00363, 0.00527, 0.0093, 0.00705, 0.00369, 0.00633, 0.00834, 0.00352, 0.0034, 0.00565, 0.00346, 0.00354, 0.00341, 0.0035]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 2e-05, 2e-05, 3e-05, 3e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.47745, 0.00052, 0.00064, 0.00053, 0.00052, 0.0006, 0.00052, 0.00062, 0.00052, 0.00056, 0.00065, 0.00056, 0.00054, 0.00053, 0.00058, 0.00052, 0.00052, 0.00052, 0.00055, 0.00053]}, "all-grads-sync-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.43086, 0.00036, 0.00041, 0.00037, 0.00032, 0.00037, 0.00048, 0.00044, 0.00043, 0.00045, 0.00034, 0.00044, 0.00037, 0.00043, 0.00044, 0.00032, 0.00032, 0.00045, 0.00045, 0.00045]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00053, 0.00034, 0.00032, 0.00033, 0.00034, 0.00031, 0.00033, 0.00035, 0.00032, 0.00033, 0.00036, 0.00035, 0.00033, 0.00033, 0.00034, 0.00035, 0.00033, 0.00034, 0.00032, 0.00035]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.26638, 0.00127, 0.00123, 0.00144, 0.00125, 0.00123, 0.00128, 0.00162, 0.00128, 0.00131, 0.00138, 0.00133, 0.00142, 0.0013, 0.00136, 0.00137, 0.00133, 0.00135, 0.00129, 0.00136]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.01282, 0.00738, 0.00728, 0.00736, 0.00738, 0.00733, 0.00738, 0.00735, 0.00731, 0.00727, 0.00897, 0.00755, 0.0073, 0.00721, 0.00734, 0.00746, 0.00736, 0.00734, 0.00737, 0.00726]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00984, 0.00108, 0.00105, 
0.00108, 0.00105, 0.00105, 0.00107, 0.00104, 0.00105, 0.00106, 0.00106, 0.00105, 0.0012, 0.00106, 0.00105, 0.00105, 0.00105, 0.00106, 0.00104, 0.00106]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0011, 0.00101, 0.00102, 0.00102, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.0015, 0.00102, 0.00101, 0.00101, 0.00102, 0.00268, 0.00101, 0.00101]}, "optimizer-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.29197, 0.01172, 0.01152, 0.01191, 0.01165, 0.01156, 0.0117, 0.01199, 0.01159, 0.01161, 0.0134, 0.01194, 0.01269, 0.01155, 0.01172, 0.01186, 0.01173, 0.01343, 0.01172, 0.01165]}, "learning-rate": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "batch-size": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.41489, 9.20451, 8.62156, 8.34435, 8.08472, 7.96931, 7.68116, 7.39495, 7.26108, 7.19145, 7.31028, 7.16653, 7.05979, 6.99436, 6.85568, 6.93225, 6.95525, 7.02522, 6.66561, 6.93924]}, "lm loss vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.41489, 9.20451, 8.62156, 8.34435, 8.08472, 7.96931, 7.68116, 7.39495, 7.26108, 7.19145, 7.31028, 7.16653, 7.05979, 6.99436, 6.85568, 6.93225, 6.95525, 7.02522, 6.66561, 6.93924]}, "loss-scale": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [31.51239, 2.98952, 3.27663, 2.61225, 2.39588, 1.99758, 1.81287, 1.93167, 1.62175, 1.51416, 1.16291, 1.32388, 1.20328, 1.10814, 1.5007, 2.15295, 1.65903, 1.42013, 2.08526, 1.2754]}, "grad-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [31.51239, 2.98952, 3.27663, 2.61225, 2.39588, 1.99758, 1.81287, 1.93167, 1.62175, 1.51416, 1.16291, 1.32388, 1.20328, 1.10814, 1.5007, 2.15295, 1.65903, 1.42013, 2.08526, 1.2754]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [115745.0, 111070.0, 117081.0, 112381.0, 118700.0, 116957.0, 111399.0, 114013.0, 118460.0, 116959.0, 111499.0, 115613.0, 108489.0, 119947.0, 115772.0, 116922.0, 119841.0, 120380.0, 121396.0, 118455.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [115745.0, 111070.0, 117081.0, 112381.0, 118700.0, 116957.0, 111399.0, 114013.0, 118460.0, 116959.0, 111499.0, 115613.0, 108489.0, 119947.0, 115772.0, 
116922.0, 119841.0, 120380.0, 121396.0, 118455.0]}, "params-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [309.46707, 309.48447, 309.52603, 309.57944, 309.64523, 309.72018, 309.80231, 309.8884, 309.97391, 310.05591, 310.13483, 310.20755, 310.27094, 310.32535, 310.37161, 310.40887, 310.43597, 310.45648, 310.47238, 310.48444]}, "params-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [309.46707, 309.48447, 309.52603, 309.57944, 309.64523, 309.72018, 309.80231, 309.8884, 309.97391, 310.05591, 310.13483, 310.20755, 310.27094, 310.32535, 310.37161, 310.40887, 310.43597, 310.45648, 310.47238, 310.48444]}, "iteration-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.7057, 0.68569, 0.68236, 0.69077, 0.68415, 0.67238, 0.68288, 0.67481, 0.6874, 0.67748, 0.6785, 0.67478, 0.83941, 0.6755, 0.67503, 0.67787, 0.67668, 0.67904, 0.67443, 0.67541]}, "lm loss validation": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.86582]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.86582]}, "lm loss validation ppl": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [958.93542]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [958.93542]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..076389c3d6 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 2 + --deterministic-mode: true + --ckpt-format: torch +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml new 
file mode 100644 index 0000000000..b0d00b8f83 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 2 + --deterministic-mode: true + --ckpt-format: torch +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values.json new file mode 100644 index 0000000000..c59b98b90a --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [11.55278, 0.77358, 0.76856, 0.77172, 0.75887, 0.76061, 0.75836, 0.76125, 0.76192, 0.76187, 0.76171, 0.76045, 0.7599, 0.76535, 0.76121, 0.76796, 0.76998, 0.76511, 0.76167, 0.75816]}, "forward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [6.97639, 0.39525, 0.3898, 0.39437, 0.37749, 0.38195, 0.37908, 0.37821, 0.38433, 0.38023, 0.38359, 0.37973, 0.37768, 0.37754, 0.38336, 0.38173, 0.39026, 0.38845, 0.38337, 0.37691]}, "backward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3.32964, 0.37495, 0.37481, 0.37567, 0.37884, 0.37558, 0.37486, 0.37929, 0.37612, 0.37965, 0.37608, 0.37503, 0.37843, 0.38541, 0.37552, 0.38094, 0.37923, 0.37628, 0.37437, 0.37757]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05]}, "all-grads-sync-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.89543, 0.00188, 0.00211, 
0.00164, 0.00165, 0.00162, 0.00162, 0.00162, 0.00184, 0.00165, 0.00164, 0.00208, 0.00162, 0.00167, 0.0016, 0.00168, 0.00165, 0.00163, 0.00164, 0.00161]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00146, 0.00105, 0.00105, 0.00102, 0.00107, 0.00107, 0.00107, 0.00109, 0.00105, 0.00106, 0.00107, 0.00106, 0.00106, 0.00106, 0.00108, 0.00108, 0.00107, 0.00104, 0.00103, 0.0011]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.50022, 0.00376, 0.00381, 0.00329, 0.00321, 0.00354, 0.00371, 0.00375, 0.00366, 0.00301, 0.00349, 0.00372, 0.00349, 0.00369, 0.00297, 0.00283, 0.00369, 0.00377, 0.00388, 0.00369]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.04986, 0.02302, 0.02299, 0.02588, 0.02338, 0.0231, 0.02293, 0.0231, 0.02309, 0.02329, 0.02328, 0.02332, 0.02304, 0.02327, 0.02287, 0.02321, 0.02315, 0.0234, 0.02312, 0.02327]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0158, 0.00219, 0.00221, 0.00411, 0.0022, 0.0022, 0.00216, 0.0022, 0.00217, 0.00218, 0.00218, 0.00225, 0.00233, 0.00219, 0.00223, 0.00222, 0.00212, 0.0022, 0.00222, 0.00225]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00301, 0.00302, 0.00302, 0.00339, 0.003, 0.00302, 0.00302, 0.00301, 0.00301, 0.00301, 0.003, 0.00301, 0.00302, 0.00304, 0.003, 0.00301, 0.00299, 0.00304, 0.00303, 0.00303]}, "optimizer-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.57167, 0.03386, 0.03382, 0.03847, 0.03353, 0.03358, 0.03363, 0.03394, 0.03377, 0.03326, 0.03368, 0.03412, 0.03363, 0.03407, 0.03281, 0.03316, 0.03373, 0.03419, 0.03396, 0.034]}, "learning-rate": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "batch-size": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.32677, 9.4141, 8.86401, 8.56564, 8.28782, 8.1035, 7.83676, 7.53769, 7.39294, 7.29345, 7.37746, 7.22535, 7.11277, 7.06759, 6.91832, 6.96664, 6.97845, 7.04885, 6.7213, 6.98241]}, "lm loss vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.32677, 9.4141, 8.86401, 8.56564, 8.28782, 8.1035, 7.83676, 7.53769, 7.39294, 7.29345, 7.37746, 7.22535, 7.11277, 7.06759, 6.91832, 6.96664, 6.97845, 7.04885, 6.7213, 6.98241]}, "loss-scale": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.26434, 2.17404, 2.50103, 2.08973, 1.92522, 1.69977, 1.63605, 1.57256, 1.48469, 1.29632, 1.00932, 1.0148, 0.95539, 1.04571, 0.94482, 0.77816, 1.07456, 1.17593, 1.12335, 0.8491]}, "grad-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.26434, 2.17404, 2.50103, 2.08973, 1.92522, 1.69977, 1.63605, 1.57256, 1.48469, 1.29632, 1.00932, 1.0148, 0.95539, 1.04571, 0.94482, 0.77816, 1.07456, 1.17593, 1.12335, 0.8491]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [43306.0, 40955.0, 43967.0, 41614.0, 44764.0, 43923.0, 41108.0, 42464.0, 44664.0, 43899.0, 41152.0, 43230.0, 39719.0, 45367.0, 43334.0, 43903.0, 45349.0, 45688.0, 46166.0, 44691.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [43306.0, 40955.0, 43967.0, 41614.0, 44764.0, 43923.0, 41108.0, 42464.0, 44664.0, 43899.0, 41152.0, 43230.0, 39719.0, 45367.0, 43334.0, 43903.0, 45349.0, 45688.0, 46166.0, 44691.0]}, "params-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [283.80362, 283.8273, 283.86472, 283.9053, 283.95062, 284.00027, 284.05212, 284.1051, 284.15643, 284.20459, 284.25775, 284.30682, 284.34848, 284.38312, 284.41144, 284.43539, 284.45441, 284.46988, 284.48172, 284.49054]}, "params-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [283.80362, 283.8273, 283.86472, 283.9053, 283.95062, 284.00027, 284.05212, 284.1051, 284.15643, 284.20459, 284.25775, 284.30682, 284.34848, 284.38312, 284.41144, 284.43539, 284.45441, 284.46988, 284.48172, 284.49054]}, "iteration-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [13.15856, 0.82951, 0.82427, 0.83168, 0.8147, 0.81581, 0.81386, 0.8171, 0.8176, 0.81664, 0.81719, 0.81685, 0.81547, 0.82136, 0.81551, 0.82315, 0.82591, 0.82132, 0.81777, 0.81414]}, "lm loss validation": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.9202]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.9202]}, "lm loss validation ppl": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [1012.5238]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [1012.5238]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..d1b9e8429e --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + 
--vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..540d4c1b73 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values.json new file mode 100644 index 0000000000..d932464f76 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values.json @@ -0,0 +1,763 @@ +{ + "forward-backward-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 14.18678, + 0.67885, + 0.68278, + 0.68333, + 0.67855, + 0.68179, + 0.68809, + 0.67808, + 0.67889, + 0.69586, + 0.69577, + 0.67938, + 0.68076, + 0.68551, + 0.69108, + 0.67821, + 0.68422, + 0.68947, + 0.67891, + 0.68614 + ] + }, + 
"forward-compute-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 8.91183, + 0.31386, + 0.31455, + 0.31529, + 0.31399, + 0.31376, + 0.3168, + 0.31219, + 0.31205, + 0.32539, + 0.32943, + 0.31424, + 0.31569, + 0.32161, + 0.32188, + 0.31166, + 0.31627, + 0.31935, + 0.31029, + 0.32078 + ] + }, + "backward-compute-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 4.25414, + 0.3682, + 0.37658, + 0.37755, + 0.37333, + 0.37381, + 0.37727, + 0.37278, + 0.37206, + 0.37541, + 0.37183, + 0.37214, + 0.37101, + 0.37247, + 0.37485, + 0.36955, + 0.37359, + 0.3825, + 0.37545, + 0.37777 + ] + }, + "layernorm-grads-all-reduce-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.00004, + 0.00003, + 0.00003, + 0.00002, + 0.00002, + 0.00002, + 0.00002, + 0.00003, + 0.00002, + 0.00003, + 0.00002, + 0.00003, + 0.00002, + 0.00002, + 0.00004, + 0.00003, + 0.00002, + 0.00002, + 0.00002, + 0.00002 + ] + }, + "embedding-grads-all-reduce-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.00005, + 0.00004, + 0.00004, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00004, + 0.00004, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00003, + 0.00003, + 0.00003 + ] + }, + "all-grads-sync-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.9061, + 0.00163, + 0.00202, + 0.00163, + 0.00157, + 0.00156, + 0.00183, + 0.0016, + 0.00183, + 0.00157, + 0.00157, + 0.00158, + 0.00168, + 0.00158, + 0.00169, + 0.00156, + 0.00157, + 0.00157, + 0.00156, + 0.00185 + ] + }, + "optimizer-copy-to-main-grad-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.0011, + 0.00104, + 0.00102, + 0.00101, + 0.00097, + 0.00098, + 0.001, + 0.00096, + 0.00096, + 0.00099, + 0.00095, + 0.00097, + 0.00096, + 0.00098, + 0.00097, + 0.00098, + 0.00095, + 0.00099, + 0.00098, + 0.00099 + ] + }, + "optimizer-clip-main-grad-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1.59317, + 0.00265, + 0.00282, + 0.00284, + 0.00289, + 0.00298, + 0.00282, + 0.00294, + 0.00302, + 0.00301, + 0.00304, + 0.00294, + 0.00253, + 0.00296, + 0.00251, + 0.00227, + 0.00282, + 0.00287, + 0.00308, + 0.00276 + ] + }, + "optimizer-count-zeros-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.04375, + 0.02396, + 0.02387, + 0.02381, + 0.02385, + 0.02393, + 0.0241, + 0.02406, + 0.02393, + 0.024, + 0.02396, + 0.024, + 0.0241, + 0.02397, + 0.024, + 0.02378, + 0.0238, + 0.02393, + 0.02395, + 0.02405 + ] + }, + "optimizer-inner-step-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.01715, + 0.00212, + 0.0021, + 0.00212, + 0.00212, + 0.00211, + 0.00218, + 0.00213, + 0.00212, + 0.00214, + 0.00211, + 0.00226, + 0.00211, + 0.00209, + 0.00211, + 0.00218, + 0.00207, + 0.00211, + 0.00213, + 0.00218 + ] + }, + "optimizer-copy-main-to-model-params-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.00281, + 0.00282, + 0.00281, + 0.00283, + 0.00281, + 0.00283, + 0.00289, + 0.00286, + 0.00281, + 0.00284, + 0.00282, + 0.00431, + 0.00295, + 0.00284, + 0.00283, + 0.00283, + 0.18259, + 0.00284, + 0.00283, + 0.00295 + ] + }, + "optimizer-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1.65881, + 0.03322, + 0.03326, + 0.03323, + 0.03329, + 0.03345, + 0.03361, + 0.03357, + 0.03352, + 
0.03364, + 0.03349, + 0.03532, + 0.03332, + 0.03347, + 0.03313, + 0.03267, + 0.21285, + 0.03336, + 0.03358, + 0.03357 + ] + }, + "learning-rate": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.0001, + 0.0001, + 0.00009, + 0.00009, + 0.00008, + 0.00008, + 0.00007, + 0.00007, + 0.00006, + 0.00006, + 0.00005, + 0.00005, + 0.00005, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00002, + 0.00002, + 0.00001 + ] + }, + "learning-rate vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.0001, + 0.0001, + 0.00009, + 0.00009, + 0.00008, + 0.00008, + 0.00007, + 0.00007, + 0.00006, + 0.00006, + 0.00005, + 0.00005, + 0.00005, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00002, + 0.00002, + 0.00001 + ] + }, + "batch-size": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32 + ] + }, + "batch-size vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32 + ] + }, + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.3267, + 9.41409, + 8.86422, + 8.56557, + 8.28779, + 8.10356, + 7.83669, + 7.53761, + 7.39304, + 7.29344, + 7.37755, + 7.22522, + 7.11288, + 7.06761, + 6.91847, + 6.96686, + 6.97827, + 7.04883, + 6.72143, + 6.98255 + ] + }, + "lm loss vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.3267, + 9.41409, + 8.86422, + 8.56557, + 8.28779, + 8.10356, + 7.83669, + 7.53761, + 7.39304, + 7.29344, + 7.37755, + 7.22522, + 7.11288, + 7.06761, + 6.91847, + 6.96686, + 6.97827, + 7.04883, + 6.72143, + 6.98255 + ] + }, + "loss-scale": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ] + }, + "loss-scale vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ] + }, + "grad-norm": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 21.2635, + 2.17416, + 2.50475, + 2.08972, + 1.9252, + 1.69975, + 1.63606, + 1.57261, + 1.48503, + 1.29641, + 1.00944, + 1.01609, + 0.95592, + 1.04635, + 0.94502, + 0.7775, + 1.07117, + 1.16813, + 1.12672, + 0.85024 + ] + }, + "grad-norm vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 21.2635, + 2.17416, + 2.50475, + 2.08972, + 1.9252, + 1.69975, + 1.63606, + 1.57261, + 1.48503, + 1.29641, + 1.00944, + 1.01609, + 0.95592, + 1.04635, + 0.94502, + 0.7775, + 1.07117, + 1.16813, + 1.12672, + 0.85024 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43318, + 40956, + 43957, + 41617, + 44756, + 43946, + 41064, + 42479, + 44668, + 43904, + 41151, + 43235, + 39712, + 45373, + 43360, + 43896, + 45353, + 45682, + 46166, + 44693 + ] + }, + "num-zeros vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43318, + 40956, + 43957, + 41617, + 44756, + 43946, + 41064, + 42479, + 44668, + 43904, + 41151, + 43235, + 39712, + 45373, + 43360, + 43896, + 45353, + 45682, + 46166, + 44693 + ] + }, + "params-norm": { + "start_step": 0, + "end_step": 
100, + "step_interval": 5, + "values": [ + 283.80362, + 283.8273, + 283.86469, + 283.90527, + 283.95059, + 284.00024, + 284.05206, + 284.10507, + 284.15643, + 284.20459, + 284.25775, + 284.30685, + 284.34851, + 284.38309, + 284.41144, + 284.43536, + 284.45441, + 284.46985, + 284.48169, + 284.49057 + ] + }, + "params-norm vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 283.80362, + 283.8273, + 283.86469, + 283.90527, + 283.95059, + 284.00024, + 284.05206, + 284.10507, + 284.15643, + 284.20459, + 284.25775, + 284.30685, + 284.34851, + 284.38309, + 284.41144, + 284.43536, + 284.45441, + 284.46985, + 284.48169, + 284.49057 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 15.87098, + 0.73261, + 0.73669, + 0.73696, + 0.73228, + 0.73561, + 0.74191, + 0.73193, + 0.73279, + 0.75004, + 0.74974, + 0.73772, + 0.73447, + 0.73951, + 0.74553, + 0.73119, + 0.9162, + 0.74318, + 0.73275, + 0.74014 + ] + }, + "lm loss validation": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 6.92026 + ] + }, + "lm loss validation vs samples": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 6.92026 + ] + }, + "lm loss validation ppl": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 1012.58173 + ] + }, + "lm loss validation ppl vs samples": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 1012.58173 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..6aae44ca71 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: local + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: regular diff --git 
a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..6e9731d4ce --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: local + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml index c5dbbb35ea..64784c36a6 100644 --- a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml +++ b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml @@ -46,7 +46,7 @@ MODEL_ARGS: --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt --tokenizer-type: BertWordPieceCase --split: 99982,9,9 - --data-cache-path: ${OUTPUT_PATH}/cache + --data-cache-path: ${DATA_CACHE_PATH} --vocab-extra-ids: 100 # EVAL_AND_LOGGING_ARGS diff --git a/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py b/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py index d42b73b8af..5a31d9d3d4 100644 --- a/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py +++ b/tests/unit_tests/dist_checkpointing/models/test_sequential_mlp.py @@ -1,10 +1,7 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
-from importlib.metadata import version - import pytest import torch -from pkg_resources import packaging from megatron.core import parallel_state from megatron.core.dist_checkpointing import load, load_plain_tensors, save @@ -21,11 +18,10 @@ from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed from megatron.core.transformer.moe.experts import SequentialMLP, TEGroupedMLP from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version from tests.unit_tests.dist_checkpointing import TempNamedDir from tests.unit_tests.test_utilities import Utils -_te_version = packaging.version.Version(version("transformer-engine")) - def initialize_expert_layer(seed, glu=True, moe_grouped_gemm=False, **config_kwargs): torch.manual_seed(seed) @@ -69,7 +65,7 @@ def get_pp_offsets(): moe_grouped_gemm_options = [False] -if _te_version >= packaging.version.Version("1.9.0.dev0"): +if is_te_min_version("1.9.0.dev0"): moe_grouped_gemm_options.append(True) @@ -155,7 +151,7 @@ def test_parallel_reconfiguration_e2e( assert not any(map(bool, diffs)), diffs @pytest.mark.skipif( - _te_version < packaging.version.Version("1.9.0.dev0"), + not is_te_min_version("1.9.0.dev0"), reason="TE Grouped MLP is only supported in TE 1.9.0.dev0 and later.", ) @pytest.mark.parametrize( diff --git a/tests/unit_tests/dist_checkpointing/test_fp8.py b/tests/unit_tests/dist_checkpointing/test_fp8.py index a93f263d50..1238d09f76 100644 --- a/tests/unit_tests/dist_checkpointing/test_fp8.py +++ b/tests/unit_tests/dist_checkpointing/test_fp8.py @@ -51,6 +51,7 @@ def get_ten(dtype: str = 'fp8'): (False, (2, 4), (2, 4), None), ], ) + @pytest.mark.skip(reason="Flaky test") def test_fp8_save_load( self, tmp_path_dist_ckpt, use_fpsl, src_tp_pp, dest_tp_pp, load_exchange_algo ): diff --git a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py index e1f3eb75f4..d5d5cdce8f 100644 --- a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py +++ b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py @@ -29,6 +29,7 @@ def teardown_method(self, method): Utils.destroy_model_parallel() @pytest.mark.parametrize(('tp,pp'), [(2, 4)]) + @pytest.mark.skip(reason="Flaky test") def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp): Utils.initialize_model_parallel(tp, pp) num_floating_point_operations_so_far = 0 diff --git a/tests/unit_tests/export/trtllm/__init__.py b/tests/unit_tests/export/trtllm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py b/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py new file mode 100644 index 0000000000..5a0aa0e9c5 --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py @@ -0,0 +1,100 @@ +import pytest +import torch +from pytest_mock import mocker + +from megatron.core.export.data_type import DataType +from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_model import GPT_DICT +from megatron.core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter import ( + DistributedTRTLLMModelWeightsConverter, +) +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import 
TransformerConfig +from tests.unit_tests.test_utilities import Utils + +_SEQUENCE_LENGTH = 64 +_VOCAB_SIZE = 256 + + +class TestTRTLLMDistributedGPUConverter: + + def setup_method(self, method): + Utils.initialize_model_parallel(2, 1) + model_parallel_cuda_manual_seed(123) + + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, + num_attention_heads=2, + use_cpu_initialization=True, + pipeline_dtype=torch.float32, + add_qkv_bias=False, + add_bias_linear=False, + ) + self.gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=_VOCAB_SIZE, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_get_model_weights_converter(self, mocker): + device = torch.device("cuda") + self.gpt_model.to(device) + + transformer_config = self.gpt_model.config + + mocker.patch( + "megatron.core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.str_dtype_to_torch", + return_value=torch.float32, + ) + + dtype = DataType.bfloat16 + distributed_converter = DistributedTRTLLMModelWeightsConverter( + transformer_config, dtype, activation="gelu" + ) + + model_state_dict = {} + for key, val in self.gpt_model.state_dict().items(): + # val is non for _extra_state layers . We filter it out + if val is not None: + model_state_dict[key] = val + + distributed_converter.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=GPT_DICT, + tokenizer_vocab_size=_VOCAB_SIZE, + ) + + expected_result = { + 'transformer.vocab_embedding.weight': torch.Size([128, 64]), + 'transformer.position_embedding.weight': torch.Size([32, 64]), + 'lm_head.weight': torch.Size([128, 64]), + 'transformer.ln_f.weight': torch.Size([64]), + 'transformer.ln_f.bias': torch.Size([64]), + 'transformer.layers.0.input_layernorm.weight': torch.Size([64]), + 'transformer.layers.0.input_layernorm.bias': torch.Size([64]), + 'transformer.layers.0.attention.dense.weight': torch.Size([64, 32]), + 'transformer.layers.0.attention.qkv.weight': torch.Size([96, 64]), + 'transformer.layers.0.post_layernorm.weight': torch.Size([64]), + 'transformer.layers.0.post_layernorm.bias': torch.Size([64]), + 'transformer.layers.0.mlp.fc.weight': torch.Size([128, 64]), + 'transformer.layers.0.mlp.proj.weight': torch.Size([64, 128]), + 'transformer.layers.1.input_layernorm.weight': torch.Size([64]), + 'transformer.layers.1.input_layernorm.bias': torch.Size([64]), + 'transformer.layers.1.attention.dense.weight': torch.Size([64, 32]), + 'transformer.layers.1.attention.qkv.weight': torch.Size([96, 64]), + 'transformer.layers.1.post_layernorm.weight': torch.Size([64]), + 'transformer.layers.1.post_layernorm.bias': torch.Size([64]), + 'transformer.layers.1.mlp.fc.weight': torch.Size([128, 64]), + 'transformer.layers.1.mlp.proj.weight': torch.Size([64, 128]), + } + + for key, value in distributed_converter.trtllm_model_weights.items(): + assert ( + expected_result[key] == value.shape + ), f"Shape mismatch for {key}. 
Expected {expected_result[key]} but got {value.shape}" diff --git a/tests/unit_tests/export/trtllm/test_trtllm_helper.py b/tests/unit_tests/export/trtllm/test_trtllm_helper.py new file mode 100644 index 0000000000..53c0a5ffea --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_helper.py @@ -0,0 +1,73 @@ +import pytest + +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.model_type import ModelType + + +# TODO : Remove importorskip and handle with mocker +class TestTRTLLMHelper: + + def test_exceptions(self, mocker): + pytest.importorskip('tensorrt_llm') + + from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper + + trtllm_helper = TRTLLMHelper( + transformer_config=None, + model_type=ModelType.gpt, + share_embeddings_and_output_weights=True, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + on_device_distributed_conversion=True, + vocab_size=None, + gpus_per_node=2, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + on_device_distributed_conversion=True, + ModelType=ModelType.falcon, + vocab_size=100, + gpus_per_node=2, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + export_config=ExportConfig(), + on_device_distributed_conversion=True, + vocab_size=100, + gpus_per_node=2, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + vocab_size=100, + on_device_distributed_conversion=True, + gpus_per_node=None, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + export_config=ExportConfig(use_embedding_sharing=False), + on_device_distributed_conversion=False, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + export_config=ExportConfig(use_embedding_sharing=True), + vocab_size=100, + ) diff --git a/tests/unit_tests/export/trtllm/test_trtllm_layers.py b/tests/unit_tests/export/trtllm/test_trtllm_layers.py new file mode 100644 index 0000000000..b2e88852e5 --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_layers.py @@ -0,0 +1,111 @@ +import pytest + +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers, get_layer_name_without_prefix + + +class TestTRTLLMLayers: + + def test_rename_input_layer_names_to_trtllm_layer_names_without_layer_numbers(self): + + conversion_dict = { + "transformer.layers.attn.dense.bias": TRTLLMLayers.attention_dense_bias, + "transformer.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + + converted_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + assert ( + converted_dict[TRTLLMLayers.attention_dense_bias.value] == 0 + ), "Something wrong with conversion dict" + assert ( + converted_dict[TRTLLMLayers.mlp_fc_weight.value] == 1 + ), "Something wrong with conversion dict" + + def test_rename_input_layer_names_to_trtllm_layer_names_exception(self): + + with 
pytest.raises(AssertionError): + conversion_dict = { + "transformer.layers.attn.dense.bias": "randomValue", + "transformer.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + with pytest.raises(Exception): + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + del conversion_dict["attn.dense.bias"] + TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + with pytest.raises(Exception): + conversion_dict = { + "transformer.layers.attn.dense.bias": TRTLLMLayers.attention_dense_bias, + "transformer.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + + TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=True, + ) + + def test_rename_input_layer_names_to_trtllm_layer_names_with_layer_numbers(self): + + conversion_dict = { + "decoder.lm_head.weight": TRTLLMLayers.lm_head, + "decoder.layers.attn.dense.bias": TRTLLMLayers.attention_dense_bias, + "deocder.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "decoder.lm_head.weight": 2, + "decoder.layers.0.attn.dense.bias": 0, + "deocder.layers.43.mlp.fc1.weight": 1, + } + + converted_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + assert ( + converted_dict['transformer.layers.0.attention.dense.bias'] == 0 + ), "Something wrong with conversion of layer names" + assert ( + converted_dict['transformer.layers.43.mlp.fc.weight'] == 1 + ), "Something wrong with conversion of layer names" + assert ( + converted_dict['lm_head.weight'] == 2 + ), "Something wrong with conversion of layer names" + + def test_get_layer_name_without_prefix(self): + layer_name_without_prefix = get_layer_name_without_prefix( + TRTLLMLayers.attention_dense_weight + ) + assert ( + layer_name_without_prefix == "attention.dense.weight" + ), f"get_layer_name_without_prefix returned {layer_name_without_prefix}, expected attention.dense.weight" diff --git a/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py b/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py new file mode 100644 index 0000000000..e431326f0b --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py @@ -0,0 +1,169 @@ +import torch +from pytest_mock import mocker + +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers +from megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter import ( + SingleDeviceTRTLLMModelWeightsConverter, +) +from megatron.core.transformer.transformer_config import TransformerConfig + + +class TestTRTLLMSingleDeviceConverter: + def test_get_model_weights_converter(self, mocker): + + export_config 
= ExportConfig(inference_tp_size=2) + + vocab_size = 10 + hidden_dim = 4 + seq_len = 8 + num_layers = 2 + num_attn_heads = 2 + + model_config = TransformerConfig( + num_layers=num_layers, + num_attention_heads=num_attn_heads, + num_query_groups=0, + hidden_size=hidden_dim, + ffn_hidden_size=hidden_dim * 4, + ) + + dtype = DataType.bfloat16 + + model_state_dict = { + "decoder.position_embedding.weight": torch.randn(seq_len, hidden_dim), + "decoder.word_embedding.weight": torch.randn(vocab_size, hidden_dim), + "decoder.lm_head.weight": torch.randn(vocab_size, hidden_dim), + "decoder.final_layernorm.weight": torch.randn(hidden_dim), + "decoder.layers.input_layernorm.weight": torch.randn(num_layers, hidden_dim), + "decoder.layers.attention.qkv.weight": torch.randn( + num_layers, hidden_dim * 3, hidden_dim + ), + "decoder.layers.attention.qkv.bias": torch.randn(num_layers, hidden_dim * 3), + "decoder.layers.attention.dense.weight": torch.randn( + num_layers, hidden_dim, hidden_dim + ), + "deocder.layers.mlp.fc.weight": torch.randn(num_layers, 4 * hidden_dim, hidden_dim), + "decoder.layers.mlp.fc.expert": torch.randn(num_layers, hidden_dim, hidden_dim * 4), + "decoder.layers.mlp.proj.expert": torch.randn(num_layers, hidden_dim * 4, hidden_dim), + } + + trtllm_conversion_dict = { + "decoder.position_embedding.weight": TRTLLMLayers.position_embedding, + "decoder.word_embedding.weight": TRTLLMLayers.vocab_embedding, + "decoder.final_layernorm.weight": TRTLLMLayers.final_layernorm_weight, + "decoder.lm_head.weight": TRTLLMLayers.lm_head, + "decoder.layers.input_layernorm.weight": TRTLLMLayers.input_layernorm_weight, + "decoder.layers.attention.qkv.weight": TRTLLMLayers.attention_qkv_weight, + "decoder.layers.attention.qkv.bias": TRTLLMLayers.attention_qkv_bias, + "decoder.layers.attention.dense.weight": TRTLLMLayers.attention_dense_weight, + "deocder.layers.mlp.fc.weight": TRTLLMLayers.mlp_fc_weight, + "decoder.layers.mlp.fc.expert": TRTLLMLayers.mlp_fc_weight_mixture_of_experts, + "decoder.layers.mlp.proj.expert": TRTLLMLayers.mlp_projection_weight_mixture_of_experts, + } + + mocker.patch( + "megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.str_dtype_to_torch", + return_value=torch.float32, + ) + + trtllm_model_weights_converter_cpu = SingleDeviceTRTLLMModelWeightsConverter( + export_config, model_config, dtype, activation="swiglu" + ) + + mocker.patch( + "megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.pad_vocab_size", + return_value=10, + ) + + trtllm_model_weights_converter_cpu.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=trtllm_conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + expected_shapes = { + 'transformer.vocab_embedding.weight': (10, 4), + 'transformer.position_embedding.weight': (8, 4), + 'lm_head.weight': (10, 4), + 'transformer.ln_f.weight': (4,), + 'transformer.layers.0.input_layernorm.weight': (4,), + 'transformer.layers.1.input_layernorm.weight': (4,), + 'transformer.layers.0.attention.qkv.weight.0.bin': (6, 4), + 'transformer.layers.0.attention.qkv.weight.1.bin': (6, 4), + 'transformer.layers.1.attention.qkv.weight.0.bin': (6, 4), + 'transformer.layers.1.attention.qkv.weight.1.bin': (6, 4), + 'transformer.layers.0.attention.qkv.bias.0.bin': (6,), + 'transformer.layers.0.attention.qkv.bias.1.bin': (6,), + 'transformer.layers.1.attention.qkv.bias.0.bin': (6,), + 'transformer.layers.1.attention.qkv.bias.1.bin': (6,), + 
'transformer.layers.0.attention.dense.weight.0.bin': (4, 2), + 'transformer.layers.0.attention.dense.weight.1.bin': (4, 2), + 'transformer.layers.1.attention.dense.weight.0.bin': (4, 2), + 'transformer.layers.1.attention.dense.weight.1.bin': (4, 2), + 'transformer.layers.0.mlp.gate.weight.0.bin': (4, 4), + 'transformer.layers.0.mlp.gate.weight.1.bin': (4, 4), + 'transformer.layers.0.mlp.fc.weight.0.bin': (16, 2), + 'transformer.layers.0.mlp.fc.weight.1.bin': (16, 2), + 'transformer.layers.1.mlp.gate.weight.0.bin': (4, 4), + 'transformer.layers.1.mlp.gate.weight.1.bin': (4, 4), + 'transformer.layers.1.mlp.fc.weight.0.bin': (16, 2), + 'transformer.layers.1.mlp.fc.weight.1.bin': (16, 2), + 'transformer.layers.0.mlp.proj.weight.0.bin': (4, 8), + 'transformer.layers.0.mlp.proj.weight.1.bin': (4, 8), + 'transformer.layers.1.mlp.proj.weight.0.bin': (4, 8), + 'transformer.layers.1.mlp.proj.weight.1.bin': (4, 8), + } + + for key, value in trtllm_model_weights_converter_cpu.trtllm_model_weights.items(): + assert ( + expected_shapes[key] == value.shape + ), f"Shape mismatch for {key}. Expected {expected_shapes[key]} but got {value.shape}" + + class SampleMapping: + + def __init__(self): + self.tp_size = 2 + self.tp_rank = 1 + + def pp_layers(self, num_layers): + return [0, 1] + + def is_first_pp_rank(self): + return True + + def is_last_pp_rank(self): + return True + + trtllm_model_weights_per_gpu = ( + trtllm_model_weights_converter_cpu.get_local_model_weights_per_gpu( + mapping=SampleMapping(), trtllm_model_config=None + ) + ) + + expected_result_per_gpu = { + 'transformer.layers.0.input_layernorm.weight': (4,), + 'transformer.layers.1.input_layernorm.weight': (4,), + 'transformer.layers.0.attention.qkv.weight': (6, 4), + 'transformer.layers.1.attention.qkv.weight': (6, 4), + 'transformer.layers.0.attention.qkv.bias': (6,), + 'transformer.layers.1.attention.qkv.bias': (6,), + 'transformer.layers.0.attention.dense.weight': (4, 2), + 'transformer.layers.1.attention.dense.weight': (4, 2), + 'transformer.layers.0.mlp.gate.weight': (4, 4), + 'transformer.layers.0.mlp.fc.weight': (16, 2), + 'transformer.layers.1.mlp.gate.weight': (4, 4), + 'transformer.layers.1.mlp.fc.weight': (16, 2), + 'transformer.layers.0.mlp.proj.weight': (4, 8), + 'transformer.layers.1.mlp.proj.weight': (4, 8), + 'transformer.vocab_embedding.weight': (10, 4), + 'transformer.position_embedding.weight': (8, 4), + 'lm_head.weight': (5, 4), + 'transformer.ln_f.weight': (4,), + } + + for key, value in trtllm_model_weights_per_gpu.items(): + assert ( + expected_result_per_gpu[key] == value.shape + ), f"Shape mismatch for {key}. 
Expected {expected_result_per_gpu[key]} but got {value.shape}" diff --git a/tests/unit_tests/inference/engines/test_mcore_engine.py b/tests/unit_tests/inference/engines/test_mcore_engine.py index 161284ceeb..835aeed22d 100644 --- a/tests/unit_tests/inference/engines/test_mcore_engine.py +++ b/tests/unit_tests/inference/engines/test_mcore_engine.py @@ -93,3 +93,30 @@ def test_generate(self): ), f"Status should be completed but its {result.status}" assert result.generated_length > 0, f"Generated length should be greater than zero" assert result.generated_text is not None, f'Generated text should not be None' + + def test_generate_empty_prompt(self): + self.mock_tokenizer.vocab_size = self.vocab_size + self.mock_tokenizer.eod = self.vocab_size - 1 + self.mock_tokenizer.bos = self.vocab_size - 2 + # Generating random length integer prompts + self.mock_tokenizer.tokenize.return_value = [ + random.randint(0, self.vocab_size - 1) for _ in range(random.randint(5, 10)) + ] + # Generates some random string + self.mock_tokenizer.detokenize.return_value = ''.join( + random.choices(string.ascii_letters, k=random.randint(4, 10)) + ) + + prompts = ["" for i in range(self.batch_size)] + results: List[InferenceRequest] = self.mcore_engine.generate( + prompts, + add_BOS=True, + common_inference_params=CommonInferenceParams(num_tokens_to_generate=10), + ) + + for result in results: + assert ( + result.status == Status.COMPLETED + ), f"Status should be completed but its {result.status}" + assert result.generated_length > 0, f"Generated length should be greater than zero" + assert result.generated_text is not None, f'Generated text should not be None' diff --git a/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py new file mode 100644 index 0000000000..b9ece5c395 --- /dev/null +++ b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py @@ -0,0 +1,124 @@ +from argparse import Namespace +from copy import deepcopy +from unittest import mock + +import numpy as np +import torch + +from megatron.core import parallel_state +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( + T5InferenceWrapper, +) +from megatron.core.models.T5.t5_model import T5Model +from megatron.core.models.T5.t5_spec import ( + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_transformer_engine_block_spec, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestT5InferenceWrapper: + + def setup_model(self, tensor_parallel_size, pipeline_parallel_size): + Utils.initialize_model_parallel( + tensor_model_parallel_size=tensor_parallel_size, + pipeline_model_parallel_size=pipeline_parallel_size, + ) + model_parallel_cuda_manual_seed(123) + self.vocab_size = 100 + self.batch_size = 8 + self.encoder_sequence_length = 32 + self.decoder_sequence_length = 16 + hidden_size = 768 + + transformer_config = TransformerConfig( + num_layers=12, + hidden_size=hidden_size, + num_attention_heads=12, + tensor_model_parallel_size=tensor_parallel_size, + pipeline_model_parallel_size=pipeline_parallel_size, + ) + + encoder_config = deepcopy(transformer_config) + 
encoder_config.num_layers = transformer_config.num_layers + + encoder_layers_per_pipeline = ( + encoder_config.num_layers // encoder_config.pipeline_model_parallel_size + ) + decoder_layers_per_pipeline = ( + transformer_config.num_layers // transformer_config.pipeline_model_parallel_size + ) + en_block_spec = get_t5_encoder_with_transformer_engine_block_spec( + encoder_layers_per_pipeline + ) + de_block_spec = get_t5_decoder_with_transformer_engine_block_spec( + decoder_layers_per_pipeline + ) + + t5_model = T5Model( + config=transformer_config, + encoder_config=encoder_config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + vocab_size=self.vocab_size, + max_sequence_length=self.encoder_sequence_length, + parallel_output=True, + pre_process=True, + post_process=True, + add_encoder=True, + add_decoder=True, + ).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=hidden_size, + inference_batch_times_seqlen_threshold=20, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) + + self.inference_wrapped_model = T5InferenceWrapper(t5_model, inference_wrapper_config) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_inference_only_tensor_parallel(self): + self.setup_model(tensor_parallel_size=4, pipeline_parallel_size=1) + + batch_prompt_tokens = ( + torch.randint( + low=0, high=self.vocab_size, size=(self.batch_size, self.decoder_sequence_length) + ) + .int() + .cuda() + ) + batch_encoder_prompts = ["sample prompt encoders"] * self.batch_size + mock_tokenizer = mock.Mock() + mock_tokenizer.pad = self.vocab_size - 1 + mock_tokenizer.additional_special_tokens_ids = list(range(100)) + mock_tokenizer.tokenize.return_value = np.random.randint( + self.vocab_size, size=self.encoder_sequence_length + ).tolist() + + self.inference_wrapped_model.prep_model_for_inference( + prompts_tokens=batch_prompt_tokens, + encoder_prompts=batch_encoder_prompts, + tokenizer=mock_tokenizer, + ) + + inference_input = self.inference_wrapped_model.get_batch_for_context_window( + 0, self.decoder_sequence_length + ) + + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + + assert logits.shape == ( + self.batch_size, + self.decoder_sequence_length, + self.vocab_size, + ), f"Shape mismatch . 
Expected {(self.batch_size, self.decoder_sequence_length, self.vocab_size)}, but got {logits.shape}" diff --git a/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py new file mode 100644 index 0000000000..14c9a88852 --- /dev/null +++ b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py @@ -0,0 +1,143 @@ +import random +import string +import time +from collections import OrderedDict +from copy import deepcopy +from typing import Dict +from unittest import mock + +import numpy as np +import pytest +import torch + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( + T5InferenceWrapper, +) +from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import ( + EncoderDecoderTextGenerationController, +) +from megatron.core.models.T5.t5_model import T5Model +from megatron.core.models.T5.t5_spec import ( + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_transformer_engine_block_spec, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestEncoderDecoderTextGenerationController: + + def setup_method(self, method): + Utils.initialize_model_parallel( + tensor_model_parallel_size=4, pipeline_model_parallel_size=1 + ) + model_parallel_cuda_manual_seed(123) + self.vocab_size = 100 + self.batch_size = 8 + self.encoder_sequence_length = 32 + self.decoder_sequence_length = 16 + hidden_size = 768 + + transformer_config = TransformerConfig( + num_layers=12, + hidden_size=hidden_size, + num_attention_heads=12, + tensor_model_parallel_size=4, + pipeline_model_parallel_size=1, + ) + + encoder_config = deepcopy(transformer_config) + encoder_config.num_layers = transformer_config.num_layers + + encoder_layers_per_pipeline = ( + encoder_config.num_layers // encoder_config.pipeline_model_parallel_size + ) + decoder_layers_per_pipeline = ( + transformer_config.num_layers // transformer_config.pipeline_model_parallel_size + ) + en_block_spec = get_t5_encoder_with_transformer_engine_block_spec( + encoder_layers_per_pipeline + ) + de_block_spec = get_t5_decoder_with_transformer_engine_block_spec( + decoder_layers_per_pipeline + ) + + t5_model = T5Model( + config=transformer_config, + encoder_config=encoder_config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + vocab_size=self.vocab_size, + max_sequence_length=self.encoder_sequence_length, + parallel_output=True, + pre_process=True, + post_process=True, + add_encoder=True, + add_decoder=True, + ).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=hidden_size, + inference_batch_times_seqlen_threshold=20, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) + + inference_wrapped_model = T5InferenceWrapper(t5_model, inference_wrapper_config) + + self.mock_tokenizer = mock.Mock() + + 
self.text_generation_controller = EncoderDecoderTextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=self.mock_tokenizer + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_generate_all_output_tokens_static_batch(self): + self.mock_tokenizer.vocab_size = self.vocab_size + self.mock_tokenizer.eod = self.vocab_size - 1 + self.mock_tokenizer.pad = self.vocab_size - 2 + self.mock_tokenizer.additional_special_tokens_ids = list(range(100)) + self.mock_tokenizer.detokenize.return_value = ''.join( + random.choices(string.ascii_letters, k=random.randint(4, 10)) + ) + self.mock_tokenizer.tokenize.return_value = np.random.randint( + self.vocab_size, size=(self.encoder_sequence_length - 5) + ).tolist() + + active_requests: Dict[int, InferenceRequest] = OrderedDict() + for i in range(self.batch_size): + prompt = "decoder_sample" + prompt_tokens = np.random.randint( + self.vocab_size, size=self.decoder_sequence_length + ).tolist() + encoder_prompt = "encoder_sample" + inference_request = InferenceRequest( + request_id=i, + prompt=prompt, + encoder_prompt=encoder_prompt, + inference_parameters=CommonInferenceParams(num_tokens_to_generate=10), + arrival_time=time.time(), + prompt_tokens=prompt_tokens, + status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS, + ) + active_requests[i] = inference_request + + requests = self.text_generation_controller.generate_all_output_tokens_static_batch( + active_requests + ) + + for request_id, request in requests.items(): + assert ( + request.status == Status.COMPLETED + ), f"Status should be completed but its {request.status}" + assert request.generated_length > 0, f"Generated length should be greater than zero" + assert request.generated_text is not None, "Generated text should not be None" diff --git a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py index a9f15faf80..df7109e021 100644 --- a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py +++ b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py @@ -26,7 +26,7 @@ from tests.unit_tests.test_utilities import Utils -class TestTextGenerationController: +class TestSimpleTextGenerationController: def setup_method(self, method): Utils.initialize_model_parallel( diff --git a/tests/unit_tests/models/test_bert_model.py b/tests/unit_tests/models/test_bert_model.py index 30d4aec024..186ce5c34e 100644 --- a/tests/unit_tests/models/test_bert_model.py +++ b/tests/unit_tests/models/test_bert_model.py @@ -5,17 +5,20 @@ import pytest import torch -from pkg_resources import packaging +from packaging.version import Version as PkgVersion from pytest_mock import mocker -from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec +from megatron.core.models.bert.bert_layer_specs import ( + bert_layer_local_spec, + bert_layer_with_transformer_engine_spec, +) from megatron.core.models.bert.bert_model import BertModel from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version from tests.unit_tests.test_utilities import Utils -_te_version = packaging.version.Version(version("transformer-engine")) - class 
TestBertModel: @@ -91,110 +94,131 @@ def test_post_process_forward(self): assert logits[0].shape[2] == self.bert_model.vocab_size -class TestBertModelAssertions: +class TestBertModelAttentionDimensions: - @pytest.mark.internal - def test_te_assertions_te_less_than_1_7(self, mocker): - os.environ.pop('NVTE_FLASH_ATTN', None) + def teardown_method(self, method): + Utils.destroy_model_parallel() os.environ.pop('NVTE_FUSED_ATTN', None) - tp = 1 - pp = 1 - Utils.initialize_model_parallel(tp, pp) + os.environ.pop('NVTE_FLASH_ATTN', None) + os.environ.pop('NVTE_UNFUSED_ATTN', None) + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( + self.transformer_config = TransformerConfig( num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True, - perform_initialization=True, - tensor_model_parallel_size=tp, - pipeline_model_parallel_size=pp, pipeline_dtype=torch.bfloat16, ) + # This should convert arbitray mask to padding mask + self.bert_model = BertModel( + config=self.transformer_config, + num_tokentypes=0, + transformer_layer_spec=bert_layer_with_transformer_engine_spec, + vocab_size=100, + max_sequence_length=4, + ) + + @pytest.mark.internal + def test_local_spec(self, mocker): + self.bert_model.transformer_layer_spec = bert_layer_local_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + assert ( + attn_mask_dimensions == "b1ss" + ), f"Expected b1ss for attn_mask_dimensions but got {attn_mask_dimensions}" + + @pytest.mark.internal + def test_transformer_engine_version_1_10(self, mocker): + bert_layer_with_transformer_engine_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] == AttnMaskType.arbitrary + + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.10")) + self.bert_model.transformer_layer_spec = bert_layer_with_transformer_engine_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + attn_mask_type = self.bert_model.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] + assert ( + attn_mask_type == AttnMaskType.padding + ), f"Exepcted attn mask type to be padding, but got {attn_mask_type}" + assert ( + attn_mask_dimensions == "b11s" + ), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}" + + @pytest.mark.internal + def test_transformer_engine_version_1_7_to_1_10_flash_attn(self, mocker): + os.environ['NVTE_FLASH_ATTN'] = '1' + + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8")) + self.bert_model.transformer_layer_spec = bert_layer_with_transformer_engine_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + assert ( + attn_mask_dimensions == "b11s" + ), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}" + + @pytest.mark.internal + def test_transformer_engine_version_1_7_to_1_10_rng_error(self, mocker): + os.environ['NVTE_FLASH_ATTN'] = '0' + os.environ['NVTE_FUSED_ATTN'] = '0' + bert_layer_with_transformer_engine_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] == AttnMaskType.padding + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8")) with pytest.raises(Exception) as exc_info: - mocker.patch( - "megatron.core.models.bert.bert_model.get_te_version", - return_value=packaging.version.Version("1.4"), - ) self.bert_model = BertModel( - 
config=transformer_config, + config=self.transformer_config, num_tokentypes=0, transformer_layer_spec=bert_layer_with_transformer_engine_spec, vocab_size=100, max_sequence_length=4, ) - - assert ( - str(exc_info.value) - == "Flash and fused attention is not supported with transformer engine version < 1.7. Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer engine >= 1.7" + assert str(exc_info.value) == ( + "Linear.__init__() got an unexpected keyword argument 'rng_tracker_name' when " + "instantiating TERowParallelLinear when instantiating SelfAttention when " + "instantiating TransformerLayer" ) @pytest.mark.internal - def test_te_assertions_te_equal_to_1_7_exception(self, mocker): + def test_transformer_engine_version_1_7_to_1_10_unfused_attention(self, mocker): os.environ['NVTE_FLASH_ATTN'] = '0' os.environ['NVTE_FUSED_ATTN'] = '0' - tp = 1 - pp = 1 - Utils.initialize_model_parallel(tp, pp) - model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( - num_layers=2, - hidden_size=12, - num_attention_heads=4, - use_cpu_initialization=True, - perform_initialization=True, - tensor_model_parallel_size=tp, - pipeline_model_parallel_size=pp, - pipeline_dtype=torch.bfloat16, - ) + bert_layer_with_transformer_engine_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] == AttnMaskType.padding + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8")) + self.bert_model.transformer_layer_spec = bert_layer_with_transformer_engine_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + attn_mask_type = self.bert_model.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] + assert ( + attn_mask_type == AttnMaskType.arbitrary + ), f"Exepcted attn mask type to be arbitrary, but got {attn_mask_type}" + assert ( + attn_mask_dimensions == "b1ss" + ), f"Expected b1ss for attn_mask_dimensions but got {attn_mask_dimensions}" + Utils.destroy_model_parallel() + + @pytest.mark.internal + def test_transformer_engine_version_less_than_1_7(self, mocker): + os.environ['NVTE_FLASH_ATTN'] = '1' with pytest.raises(Exception) as exc_info: - mocker.patch( - "megatron.core.models.bert.bert_model.get_te_version", - return_value=packaging.version.Version("1.7"), - ) + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.5")) self.bert_model = BertModel( - config=transformer_config, + config=self.transformer_config, num_tokentypes=0, transformer_layer_spec=bert_layer_with_transformer_engine_spec, vocab_size=100, max_sequence_length=4, ) - assert ( - str(exc_info.value) - == "Both NVTE_FLASH_ATTN and NVTE_FUSED_ATTN env flag set to 0. Either unset both of them or set one of them to 1 to use a more optimized attention kernal. Currently using unfused attention path. 
If you want to proceed with this path set AttnMaskType in module spec to be arbitrary" - ) - - @pytest.mark.internal - def test_te_assertions_te_equal_to_1_7_no_exception(self, mocker): - os.environ.pop('NVTE_FLASH_ATTN', None) - os.environ.pop('NVTE_FUSED_ATTN', None) - tp = 1 - pp = 1 - Utils.initialize_model_parallel(tp, pp) - model_parallel_cuda_manual_seed(123) - transformer_config = TransformerConfig( - num_layers=2, - hidden_size=12, - num_attention_heads=4, - use_cpu_initialization=True, - perform_initialization=True, - tensor_model_parallel_size=tp, - pipeline_model_parallel_size=pp, - pipeline_dtype=torch.bfloat16, - ) - - mocker.patch( - "megatron.core.models.bert.bert_model.get_te_version", - return_value=packaging.version.Version("1.7"), + assert str(exc_info.value) == ( + "Flash and fused attention is not supported with transformer engine version " + "< 1.7. Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer " + "engine >= 1.7" ) - self.bert_model = BertModel( - config=transformer_config, - num_tokentypes=0, - transformer_layer_spec=bert_layer_with_transformer_engine_spec, - vocab_size=100, - max_sequence_length=4, - ) - Utils.destroy_model_parallel() diff --git a/tests/unit_tests/models/test_llava_model.py b/tests/unit_tests/models/test_llava_model.py index e246ef466a..b3142fb807 100644 --- a/tests/unit_tests/models/test_llava_model.py +++ b/tests/unit_tests/models/test_llava_model.py @@ -18,16 +18,22 @@ def setup_method(self, method): Utils.initialize_model_parallel(1, 1) model_parallel_cuda_manual_seed(123) + self.language_hidden_size = 64 + self.language_num_attention_heads = 4 + language_config = TransformerConfig( - num_layers=3, hidden_size=128, num_attention_heads=8, use_cpu_initialization=False + num_layers=3, + hidden_size=self.language_hidden_size, + num_attention_heads=self.language_num_attention_heads, + use_cpu_initialization=False, ) vision_config = TransformerConfig( - num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=False + num_layers=2, hidden_size=16, num_attention_heads=2, use_cpu_initialization=False ) vision_projection_config = TransformerConfig( num_layers=2, - hidden_size=128, - ffn_hidden_size=72, + hidden_size=self.language_hidden_size, + ffn_hidden_size=32, num_attention_heads=1, use_cpu_initialization=False, ) @@ -36,10 +42,11 @@ def setup_method(self, method): vision_layer_spec = deepcopy(language_layer_spec) vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules) + vision_config.vision_model_type = "clip" self.model = LLaVAModel( language_transformer_config=language_config, language_transformer_layer_spec=language_layer_spec, - language_vocab_size=2048, + language_vocab_size=8192, language_max_sequence_length=4096, vision_transformer_config=vision_config, vision_transformer_layer_spec=vision_layer_spec, @@ -60,7 +67,7 @@ def test_constructor(self): assert isinstance(self.model, LLaVAModel) num_weights = sum([p.numel() for p in self.model.parameters()]) - assert num_weights == 1832520 + assert num_weights == 1488736 @pytest.mark.internal def test_set_input_tensor(self): @@ -73,12 +80,18 @@ def test_set_input_tensor(self): def test_preprocess_data(self): self.model.cuda() - image_embedding_value = torch.tensor(123.0) + hidden_size = 72 + # 3 images with 1 tile and 2 image with 2 tiles = 7 tiles. 
- image_embeddings = image_embedding_value * torch.ones((577, 7, 128)).cuda() + image_embeddings = ( + 1e-5 + * torch.arange(577 * 7 * hidden_size, dtype=torch.float) + .reshape(577, 7, hidden_size) + .cuda() + ) image_token_index = -200 - input_ids = torch.arange(0, 1024, dtype=torch.int).expand(5, 1024).cuda() + input_ids = torch.arange(1024).expand(5, 1024).cuda() input_ids[0, 0] = image_token_index # image before text input_ids[1, 100] = image_token_index # image in between input_ids[2, -1] = image_token_index # image at the end @@ -86,8 +99,14 @@ def test_preprocess_data(self): input_ids[4, 50] = image_token_index # two images in between input_ids[4, 150] = image_token_index - language_embedding_value = torch.tensor(999.0) - language_embeddings = language_embedding_value * torch.ones((5, 1024, 128)).cuda() + # Offset by 1000 to distinguish from image embeddings. + language_embeddings = ( + 1000.0 + + 1e-5 + * torch.arange(5 * 1024 * hidden_size, dtype=torch.float) + .reshape(5, 1024, hidden_size) + .cuda() + ) # Labels are input_ids shifted to left by one. labels = torch.arange(1, 1025, dtype=torch.int).expand(5, 1024).cuda() @@ -121,14 +140,14 @@ def test_preprocess_data(self): # The fifth sample has 2 images with 3 tiles and 1024 text tokens. max_seq_len = 3 * img_seq_len - 2 + 1024 - assert embeddings.shape == torch.Size((max_seq_len, 5, 128)) + assert embeddings.shape == torch.Size((max_seq_len, 5, hidden_size)) assert labels.shape == torch.Size((5, max_seq_len)) assert loss_mask.shape == labels.shape # First sample where image is before text (index 0). - expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:577] = image_embedding_value - expected_embeddings[577:1600] = language_embedding_value + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:577] = image_embeddings[:, 0] + expected_embeddings[577:1600] = language_embeddings[0, 1:] expected_embeddings[1600:] = 0 # padding expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() @@ -144,15 +163,16 @@ def test_preprocess_data(self): expected_loss_mask[696:1600] = 1 expected_loss_mask[1600:] = 0 - assert torch.allclose(embeddings[:, 0], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 0], expected_embeddings) assert torch.allclose(labels[0], expected_labels) assert torch.allclose(loss_mask[0], expected_loss_mask) # Second sample where image is in between (index 100). The image has 2 tiles. - expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:100] = language_embedding_value - expected_embeddings[100:1254] = image_embedding_value - expected_embeddings[1254:2177] = language_embedding_value + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:100] = language_embeddings[1, :100] + expected_embeddings[100:677] = image_embeddings[:, 1] + expected_embeddings[677:1254] = image_embeddings[:, 2] + expected_embeddings[1254:2177] = language_embeddings[1, 101:] expected_embeddings[2177:] = 0 # padding expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() @@ -172,14 +192,14 @@ def test_preprocess_data(self): expected_loss_mask[1273:2177] = 1 expected_loss_mask[2177:] = 0 # padding - assert torch.allclose(embeddings[:, 1], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 1], expected_embeddings) assert torch.allclose(labels[1], expected_labels) assert torch.allclose(loss_mask[1], expected_loss_mask) # Third sample where image is at the end. 
- expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:1023] = language_embedding_value - expected_embeddings[1023:1600] = image_embedding_value + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:1023] = language_embeddings[2, :1023] + expected_embeddings[1023:1600] = image_embeddings[:, 3] expected_embeddings[1600:] = 0 # padding expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() @@ -195,13 +215,13 @@ def test_preprocess_data(self): expected_loss_mask[1023:1600] = 0 expected_loss_mask[1600:] = 0 # padding - assert torch.allclose(embeddings[:, 2], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 2], expected_embeddings) assert torch.allclose(labels[2], expected_labels) assert torch.allclose(loss_mask[2], expected_loss_mask) # Fourth sample where there is no image. - expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:1024] = language_embedding_value + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:1024] = language_embeddings[3] expected_embeddings[1024:] = 0 # padding expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() @@ -212,17 +232,18 @@ def test_preprocess_data(self): expected_loss_mask[:1024] = 1 expected_loss_mask[1024:] = 0 # padding - assert torch.allclose(embeddings[:, 3], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 3], expected_embeddings) assert torch.allclose(labels[3], expected_labels) assert torch.allclose(loss_mask[3], expected_loss_mask) - # Fifth sample has two images in between. The first image has two tiles. - expected_embeddings = torch.empty(max_seq_len).cuda() - expected_embeddings[:50] = language_embedding_value - expected_embeddings[50:1204] = image_embedding_value # two tiles - expected_embeddings[1204:1303] = language_embedding_value - expected_embeddings[1303:1880] = image_embedding_value - expected_embeddings[1880:] = language_embedding_value + # Fifth sample has two images in between (indices 50 and 150). The first image has two tiles. + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:50] = language_embeddings[4, :50] + expected_embeddings[50:627] = image_embeddings[:, 4] # two tiles + expected_embeddings[627:1204] = image_embeddings[:, 5] + expected_embeddings[1204:1303] = language_embeddings[4, 51:150] + expected_embeddings[1303:1880] = image_embeddings[:, 6] + expected_embeddings[1880:] = language_embeddings[4, 151:] expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() expected_labels[:49] = torch.arange(1, 50) @@ -238,7 +259,7 @@ def test_preprocess_data(self): expected_loss_mask[1302:1880] = 0 expected_loss_mask[1880:] = 1 - assert torch.allclose(embeddings[:, 4], expected_embeddings.unsqueeze(1)) + assert torch.allclose(embeddings[:, 4], expected_embeddings) assert torch.allclose(labels[4], expected_labels) assert torch.allclose(loss_mask[4], expected_loss_mask) @@ -309,7 +330,7 @@ def test_forward(self): loss_mask=None, num_image_tiles=num_image_tiles, ) - assert logits.shape == torch.Size((5, max_seq_len, 2048)) + assert logits.shape == torch.Size((5, max_seq_len, 8192)) # Try without labels and with inference params. 
inference_params = InferenceParams(5, max_seq_len) @@ -323,7 +344,7 @@ def test_forward(self): num_image_tiles=num_image_tiles, inference_params=inference_params, ) - assert logits.shape == torch.Size((5, max_seq_len, 2048)) + assert logits.shape == torch.Size((5, max_seq_len, 8192)) # Check KV cache got populated correctly. kv_dict = inference_params.key_value_memory_dict @@ -332,7 +353,11 @@ def test_forward(self): for layer_no in range(1, 4): # 3 layers in the model. layer_kv = kv_dict[layer_no] # Expected shape is [sequence_len, batch_size, num_heads, hidden_size_per_head] - assert layer_kv[0].shape == layer_kv[1].shape == torch.Size((max_seq_len, 5, 8, 16)) + assert ( + layer_kv[0].shape + == layer_kv[1].shape + == torch.Size((max_seq_len, 5, self.language_num_attention_heads, 16)) + ) @pytest.mark.internal def test_save_load(self, tmp_path): @@ -353,3 +378,62 @@ def test_freeze(self): for param in self.model.vision_projection.parameters(): assert param.requires_grad + + +class TestLLaVAModelSigLIP: + @pytest.mark.internal # The model is under active development and its methods may change. + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + + language_config = TransformerConfig( + num_layers=3, hidden_size=128, num_attention_heads=8, use_cpu_initialization=False + ) + vision_config = TransformerConfig( + num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=False + ) + vision_projection_config = TransformerConfig( + num_layers=2, + hidden_size=128, + ffn_hidden_size=72, + num_attention_heads=1, + use_cpu_initialization=False, + ) + + language_layer_spec = get_gpt_layer_with_transformer_engine_spec() + vision_layer_spec = deepcopy(language_layer_spec) + vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules) + + vision_config.vision_model_type = "siglip" + self.model = LLaVAModel( + language_transformer_config=language_config, + language_transformer_layer_spec=language_layer_spec, + language_vocab_size=2048, + language_max_sequence_length=4096, + vision_transformer_config=vision_config, + vision_transformer_layer_spec=vision_layer_spec, + drop_vision_class_token=False, + vision_projection_config=vision_projection_config, + vision_projection_layer_spec=vision_projection_spec, + img_h=336, + img_w=336, + patch_dim=14, + ) + + @pytest.mark.internal + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.internal + def test_constructor(self): + assert isinstance(self.model, LLaVAModel) + + num_weights = sum([p.numel() for p in self.model.parameters()]) + assert num_weights == 1832456 + + @pytest.mark.internal + def test_set_input_tensor(self): + expected_shape = (1, 2, 3, 4) + input_tensor = torch.zeros(expected_shape) + self.model.set_input_tensor(input_tensor) + assert self.model.vision_model.decoder.input_tensor.shape == expected_shape diff --git a/tests/unit_tests/tensor_parallel/test_initialization.py b/tests/unit_tests/tensor_parallel/test_initialization.py index 9fcc38c259..039ad071a7 100644 --- a/tests/unit_tests/tensor_parallel/test_initialization.py +++ b/tests/unit_tests/tensor_parallel/test_initialization.py @@ -4,13 +4,16 @@ import torch import megatron.core.parallel_state as ps -from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec from megatron.core.tensor_parallel.layers import ( ColumnParallelLinear, RowParallelLinear, VocabParallelEmbedding, ) from megatron.core.tensor_parallel.random import 
model_parallel_cuda_manual_seed +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TERowParallelLinear, +) from megatron.core.transformer.transformer_config import TransformerConfig from tests.unit_tests.test_utilities import Utils @@ -21,6 +24,9 @@ class Test: num_layers=1, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True ) + def teardown_method(self, method): + Utils.destroy_model_parallel() + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_embedding_init(self): @@ -117,3 +123,79 @@ def test_col_init(self): rank = ps.get_tensor_model_parallel_rank() assert tp4.shape[0] * 4 == tp1.shape[0] assert torch.equal(tp1[rank * 4 : (rank + 1) * 4], tp4) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(100) + def test_te_col_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + tp1 = TEColumnParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + config=self.transformer_config, + skip_bias_add=False, + gather_output=False, + is_expert=False, + ).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. + tp4 = TEColumnParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + config=self.transformer_config, + skip_bias_add=False, + gather_output=False, + is_expert=False, + ).weight + + if torch.distributed.get_rank() == 0: + assert tp4.shape[0] * 4 == tp1.shape[0] + assert torch.allclose(tp1[:4], tp4) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(100) + def test_te_row_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + tp1 = TERowParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + input_is_parallel=True, + config=self.transformer_config, + skip_bias_add=False, + is_expert=False, + ).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. + tp4 = TERowParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + input_is_parallel=True, + config=self.transformer_config, + skip_bias_add=False, + is_expert=False, + ).weight + + if torch.distributed.get_rank() == 0: + assert tp4.shape[1] * 4 == tp1.shape[1] + assert torch.allclose(tp1[:, :4], tp4) diff --git a/tests/unit_tests/transformer/moe/test_grouped_mlp.py b/tests/unit_tests/transformer/moe/test_grouped_mlp.py index a78921ad10..043bdc8c58 100644 --- a/tests/unit_tests/transformer/moe/test_grouped_mlp.py +++ b/tests/unit_tests/transformer/moe/test_grouped_mlp.py @@ -1,17 +1,15 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
-from importlib.metadata import version - import pytest import torch import torch.nn.functional as F -from pkg_resources import packaging from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.transformer.moe import grouped_gemm_util as gg from megatron.core.transformer.moe.experts import TEGroupedMLP from megatron.core.transformer.moe.moe_layer import MoELayer from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version from megatron.legacy.model import Float16Module from megatron.training.arguments import parse_args from megatron.training.initialize import _set_random_seed @@ -21,8 +19,6 @@ if torch.cuda.is_available(): DEVICE_CAPABILITY = torch.cuda.get_device_capability() -_te_version = packaging.version.Version(version("transformer-engine")) - class TestParallelGroupedMLP: @@ -218,7 +214,7 @@ def test_gradient_with_no_tokens_allocated(self): @pytest.mark.skipif( - _te_version < packaging.version.Version("1.9.0.dev0"), + not is_te_min_version("1.9.0.dev0"), reason="TE Grouped MLP is only supported in TE 1.9.0.dev0 and later.", ) class TestTEGroupedMLP: diff --git a/tests/unit_tests/transformer/moe/test_sequential_mlp.py b/tests/unit_tests/transformer/moe/test_sequential_mlp.py index 40a0caf31a..514e098bfd 100644 --- a/tests/unit_tests/transformer/moe/test_sequential_mlp.py +++ b/tests/unit_tests/transformer/moe/test_sequential_mlp.py @@ -1,7 +1,6 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. from importlib.metadata import version -import packaging import pytest import torch @@ -16,10 +15,9 @@ from megatron.core.transformer.moe.experts import SequentialMLP from megatron.core.transformer.moe.moe_layer import MoELayer from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version from tests.unit_tests.test_utilities import Utils -te_version = packaging.version.Version(version("transformer-engine")) - class TestParallelSequentialMLP: @@ -117,7 +115,7 @@ def setup_method(self, method): ) @pytest.mark.skipif( - te_version < packaging.version.Version("1.7.0"), + not is_te_min_version("1.7.0"), reason="Transformer Engine under v1.7.0 doesn't support MoE training.", ) @pytest.mark.internal @@ -133,7 +131,7 @@ def test_constructor(self): ) @pytest.mark.skipif( - te_version < packaging.version.Version("1.7.0"), + not is_te_min_version("1.7.0"), reason="Transformer Engine under v1.7.0 doesn't support MoE training.", ) @pytest.mark.internal @@ -155,7 +153,7 @@ def test_gpu_forward(self): assert torch.equal(output_local, output_te) @pytest.mark.skipif( - te_version < packaging.version.Version("1.7.0"), + not is_te_min_version("1.7.0"), reason="Transformer Engine under v1.7.0 doesn't support MoE training.", ) @pytest.mark.internal @@ -179,7 +177,7 @@ def test_gpu_forward_with_one_local_expert(self): assert torch.equal(output_local, output_te) @pytest.mark.skipif( - te_version < packaging.version.Version("1.7.0"), + not is_te_min_version("1.7.0"), reason="Transformer Engine under v1.7.0 doesn't support MoE training.", ) @pytest.mark.internal diff --git a/tests/unit_tests/transformer/test_multi_latent_attention.py b/tests/unit_tests/transformer/test_multi_latent_attention.py new file mode 100644 index 0000000000..4188d7b069 --- /dev/null +++ b/tests/unit_tests/transformer/test_multi_latent_attention.py @@ -0,0 +1,160 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +import os +from importlib.metadata import version + +import pytest +import torch +import transformer_engine as te + +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.multi_latent_attention import MLASelfAttention +from megatron.core.transformer.transformer_config import MLATransformerConfig +from megatron.core.utils import is_te_min_version +from tests.unit_tests.test_utilities import Utils + + +class TestParallelMLAAttention: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + self.transformer_config = MLATransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + use_cpu_initialization=True, + q_lora_rank=32, + kv_lora_rank=32, + qk_head_dim=128, + v_head_dim=128, + qk_pos_emb_head_dim=64, + rotary_base=10000, + ) + self.parallel_attention = MLASelfAttention( + self.transformer_config, + get_gpt_layer_with_transformer_engine_spec( + multi_latent_attention=True + ).submodules.self_attention.submodules, + layer_number=1, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.parallel_attention, MLASelfAttention) + assert self.parallel_attention.layer_number == 1 + + num_weights = sum([p.numel() for p in self.parallel_attention.parameters()]) + assert num_weights == 65036 + + def test_cpu_forward(self): + # we can't currently do this because the global memory buffer is on GPU + pass + + def test_gpu_forward(self): + if is_te_min_version("1.10.0"): + + # use flash attention for hopper, future may support fused attention for ampere + os.environ['NVTE_FUSED_ATTN'] = "0" + os.environ['NVTE_FLASH_ATTN'] = "1" + + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 2 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + output, bias = self.parallel_attention(hidden_states, attention_mask) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + + def test_fused_rope_gpu_forward(self): + if is_te_min_version("1.10.0"): + # use flash attention for hopper, future may support fused attention for ampere + os.environ['NVTE_FUSED_ATTN'] = "0" + os.environ['NVTE_FLASH_ATTN'] = "1" + + self.parallel_attention.config.apply_rope_fusion = True + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 2 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + rotary_pos_emb = torch.ones( + sequence_length, 1, 1, self.parallel_attention.config.kv_channels + ).cuda() + output, bias = self.parallel_attention( + hidden_states, attention_mask, rotary_pos_emb=rotary_pos_emb + ) + + assert config.recompute_granularity 
is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + self.parallel_attention.config.apply_rope_fusion = False + + def test_checkpointed_gpu_forward(self): + if is_te_min_version("1.10.0"): + # use flash attention for hopper, future may support fused attention for ampere + os.environ['NVTE_FUSED_ATTN'] = "0" + os.environ['NVTE_FLASH_ATTN'] = "1" + + transformer_config = self.transformer_config + transformer_config.recompute_granularity = 'selective' + checkpointed_parallel_attention = MLASelfAttention( + transformer_config, + get_gpt_layer_with_transformer_engine_spec( + multi_latent_attention=True + ).submodules.self_attention.submodules, + layer_number=1, + ) + config = checkpointed_parallel_attention.config + + sequence_length = 32 + micro_batch_size = 2 + + checkpointed_parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + ( + sequence_length, + micro_batch_size, + checkpointed_parallel_attention.config.hidden_size, + ) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + output, bias = checkpointed_parallel_attention(hidden_states, attention_mask) + + assert config.recompute_granularity == 'selective' + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size diff --git a/tests/unit_tests/transformer/test_spec_customization.py b/tests/unit_tests/transformer/test_spec_customization.py index 80c3bf7577..a9a245b861 100755 --- a/tests/unit_tests/transformer/test_spec_customization.py +++ b/tests/unit_tests/transformer/test_spec_customization.py @@ -2,12 +2,10 @@ import sys from dataclasses import dataclass, fields -from importlib.metadata import version import pytest import torch import transformer_engine as te -from pkg_resources import packaging from megatron.core.extensions.transformer_engine import ( TEDotProductAttention, @@ -26,6 +24,7 @@ from megatron.core.transformer.transformer_block import TransformerBlock, TransformerBlockSubmodules from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules +from megatron.core.utils import is_te_min_version from tests.unit_tests.test_utilities import Utils @@ -134,8 +133,7 @@ def test_build_module(self): assert id(bda_op) == id(get_bias_dropout_add) def test_sliding_window_attention(self): - te_version = packaging.version.Version(version("transformer-engine")) - if te_version < packaging.version.Version("1.2.0"): + if not is_te_min_version("1.2.0"): print("SWA not tested because TE version is not >= 1.2.0", file=sys.stderr) return diff --git a/tools/checkpoint/loader_llama_mistral.py b/tools/checkpoint/loader_llama_mistral.py index 1b5fec9afd..ea803c5543 100644 --- a/tools/checkpoint/loader_llama_mistral.py +++ b/tools/checkpoint/loader_llama_mistral.py @@ -385,15 +385,10 @@ def load_checkpoint_to_model(args): '''Set model params.''' from pretrain_gpt import model_provider - if "llama" in args.model_size or "yi" in args.model_size: - from transformers import LlamaForCausalLM as ModelForCausalLM - elif "mistral" in args.model_size: - from transformers import MistralForCausalLM as ModelForCausalLM - else: - raise 
AttributeError(f"args.model_size={args.model_size} not supported") + from transformers import AutoModelForCausalLM # Load Huggingface model. - hf_model = ModelForCausalLM.from_pretrained(args.load, torch_dtype=args.params_dtype, low_cpu_mem_usage=True, device_map="cpu") + hf_model = AutoModelForCausalLM.from_pretrained(args.load, torch_dtype=args.params_dtype, low_cpu_mem_usage=True, device_map="cpu") # Init Megatron model. model = model_provider(True, True).to(args.params_dtype) diff --git a/tools/checkpoint/saver_mcore.py b/tools/checkpoint/saver_mcore.py index aea481abed..6aec90e41b 100644 --- a/tools/checkpoint/saver_mcore.py +++ b/tools/checkpoint/saver_mcore.py @@ -3,11 +3,10 @@ import os import sys import torch -from importlib.metadata import version -from pkg_resources import packaging from setter import ModelSetter from utils import get_mcore_transformer_block_key, print_memory_usage +from megatron.core.utils import get_te_version, is_te_min_version class MCoreSetter(ModelSetter): @@ -288,9 +287,8 @@ def add_arguments(parser): def save_checkpoint(queue, args): # Transformer engine >= 0.12.0, for CPU initialization. - te_version = packaging.version.Version(version("transformer-engine")) - assert te_version >= packaging.version.Version("0.12.0"), \ - "transformer engine version: %s (>=0.12.0 required)." % te_version + assert is_te_min_version("0.12.0"), \ + "transformer engine version: %s (>=0.12.0 required)." % get_te_version() # Search in directory above this sys.path.append(os.path.abspath( diff --git a/tools/run_text_generation_server.py b/tools/run_text_generation_server.py index 861d8d6d73..e5b3f08a58 100644 --- a/tools/run_text_generation_server.py +++ b/tools/run_text_generation_server.py @@ -83,7 +83,8 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, position_embedding_type=args.position_embedding_type, rotary_percent=args.rotary_percent, - rotary_base=args.rotary_base + rotary_base=args.rotary_base, + rope_scaling=args.use_rope_scaling ) return model @@ -122,6 +123,8 @@ def add_text_generate_args(parser): assert len(model) == 1, "Above condition should have caught this" model = model[0] + model.eval() + if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: server = MegatronServer(model) server.run("0.0.0.0",port=args.port)
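Several diffs in this patch series (test_bert_model.py, test_grouped_mlp.py, test_sequential_mlp.py, test_spec_customization.py, test_multi_latent_attention.py, saver_mcore.py) replace ad-hoc packaging.version.Version(version("transformer-engine")) comparisons with the shared get_te_version() / is_te_min_version() helpers from megatron.core.utils. As a rough orientation, a helper of that shape can be built on importlib.metadata and packaging as sketched below; this is an illustrative approximation, not the actual Megatron-LM implementation, and details such as caching or handling of an uninstalled package may differ.

# Minimal sketch of a version-gating helper, assuming transformer-engine is
# installed (importlib.metadata.version raises PackageNotFoundError otherwise).
from functools import lru_cache
from importlib.metadata import version

from packaging.version import Version as PkgVersion


@lru_cache(maxsize=None)
def get_te_version() -> PkgVersion:
    # Parse the installed Transformer Engine version once and cache the result.
    return PkgVersion(version("transformer-engine"))


def is_te_min_version(min_version: str) -> bool:
    # True if the installed Transformer Engine is at least min_version.
    return get_te_version() >= PkgVersion(min_version)

Centralizing the check is what lets the tests above gate themselves with @pytest.mark.skipif(not is_te_min_version("1.9.0.dev0"), ...) and lets test_bert_model.py simulate different Transformer Engine releases by patching a single function, megatron.core.utils.get_te_version, instead of relying on module-level _te_version constants.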