From c28bc05a14e40ca0f8d0ead91dc103aa597110c1 Mon Sep 17 00:00:00 2001 From: Walid Baruni Date: Thu, 16 Jan 2025 15:57:23 +0200 Subject: [PATCH] add llama benchmarking --- .gitignore | 1 + llama-benchmarking/Dockerfile | 13 + .../llama3.1_24.11.1/README_405b.md | 154 ++++++++++++ .../llama3.1_24.11.1/README_70b.md | 163 ++++++++++++ .../llama3.1_24.11.1/README_8b.md | 162 ++++++++++++ .../llama3.1_24.11.1/configure.sh | 168 +++++++++++++ .../llama3.1_24.11.1/generate_dataset.sh | 47 ++++ llama-benchmarking/llama3.1_24.11.1/launch.sh | 76 ++++++ .../llama3.1_24.11.1/llama3.1_405b.yaml | 235 ++++++++++++++++++ .../llama3.1_24.11.1/llama3.1_70b.yaml | 230 +++++++++++++++++ .../llama3.1_24.11.1/llama3.1_8b.yaml | 174 +++++++++++++ llama-benchmarking/llama3.1_24.11.1/setup.sh | 49 ++++ llama-benchmarking/run_training.sh | 113 +++++++++ 13 files changed, 1585 insertions(+) create mode 100644 llama-benchmarking/Dockerfile create mode 100644 llama-benchmarking/llama3.1_24.11.1/README_405b.md create mode 100644 llama-benchmarking/llama3.1_24.11.1/README_70b.md create mode 100644 llama-benchmarking/llama3.1_24.11.1/README_8b.md create mode 100755 llama-benchmarking/llama3.1_24.11.1/configure.sh create mode 100755 llama-benchmarking/llama3.1_24.11.1/generate_dataset.sh create mode 100755 llama-benchmarking/llama3.1_24.11.1/launch.sh create mode 100644 llama-benchmarking/llama3.1_24.11.1/llama3.1_405b.yaml create mode 100644 llama-benchmarking/llama3.1_24.11.1/llama3.1_70b.yaml create mode 100644 llama-benchmarking/llama3.1_24.11.1/llama3.1_8b.yaml create mode 100644 llama-benchmarking/llama3.1_24.11.1/setup.sh create mode 100644 llama-benchmarking/run_training.sh diff --git a/.gitignore b/.gitignore index 72896b4..52768d0 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ bacalhau **/Dockerfile !tools/**/Dockerfile !scale-tester/**/Dockerfile +!llama-benchmarking/**/Dockerfile **/.py .vscode js-helloworld/outputs/* diff --git a/llama-benchmarking/Dockerfile b/llama-benchmarking/Dockerfile new file mode 100644 index 0000000..0b18df6 --- /dev/null +++ b/llama-benchmarking/Dockerfile @@ -0,0 +1,13 @@ +FROM nvcr.io/nvidia/nemo:24.12 + +WORKDIR /workspace + +# Create config directory and copy configs +RUN mkdir -p /workspace/cfg +COPY llama3.1_24.11.1/llama3.1_*.yaml /workspace/cfg/ + +# Copy training script +COPY run_training.sh /workspace/ +RUN chmod +x /workspace/run_training.sh + +ENTRYPOINT ["/bin/bash"] \ No newline at end of file diff --git a/llama-benchmarking/llama3.1_24.11.1/README_405b.md b/llama-benchmarking/llama3.1_24.11.1/README_405b.md new file mode 100644 index 0000000..1cfc412 --- /dev/null +++ b/llama-benchmarking/llama3.1_24.11.1/README_405b.md @@ -0,0 +1,154 @@ +# Overview + +This recipe contains information and scripts to produce performance results for the Llama 3.1 training workload. The scripts help perform environment setup, dataset setup, and launch benchmark jobs. +This variant of the workload is best-suited for GPU clusters with + +* At least 576 GPUs with at least 80 GB memory each. Training of this 405-billion parameter variant of the workload will not fit on fewer GPUs with less memory. + * 32 GPUs with at least 80GB memory is the minimum when running proxy configs: <576 GPUs. +* H100 GPUs. This workload runs with BF16 or FP8, which are both supported by H100 GPUs. + +# Expected Performance + +Performance for Llama 3.1 training is measured by seconds per iteration, or in other words seconds per training step. 
This metric is logged for every training step in a .out file which is generated inside of the `$STAGE_PATH/results/$GSW_VERSION/$DTYPE/405b/$JOB_TOTAL_GPUS` folder. + +Since the performance fluctuates significantly at the beginning, we are using the last training step timing to obtain throughput value. + +```shell +grep train_step_timing results/*.out +Epoch 0: : 100%|██████████| 50/50 [16:26<00:00, v_num=gjbq, reduced_train_loss=11.70, global_step=49.00, consumed_samples=12600.0, train_step_timing in s=12.80] +``` + +To obtain throughput as a tokens per second measurement, follow this formula: +```shell +(sequence length) * (global batch size) / (training_step_timing) = (throughput in tokens per second) +``` + +E.g. 8192 * 252 / 12.84 = 160778 + +To calculate time to train estimate: +```shell +(total tokens) / (throughput in tokens per second) / (number of seconds in a day) = (time to train in days) +``` +E.g. 1e12 / 160778 / 86400 = 71.99 days + + +To calculate the model flops utilization (MFU): +```shell +MFU = (global batch size) * (model flops) / (training step time) / (number of GPUs) / (peak GPU FLOPS) +``` + +The peak theoretical throughput for H100 FP8 is 1979 TFLOPS and for H100 BF16 is 989 TFLOPS. + +The model flops for Llama 3.1 405b for GBS=1 is 2.17E+16. Calculation shown [here](#notes). + +E.g. Llama 3.1 405b FP8 on 576x H100 GPUs (GBS=252) +```shell +peak FP8 FLOPS for H100 = 1979 TFLOPS +training step time = 11.24 +model flops = 2.17E+16 +MFU = 252 * 2.17E+16 / 11.24 / 576 / 1979E+12 = 42.71% +``` + +| Llama 3.1 405b 24.09 BF16 (TP=8, PP=9, CP=2, VP=7, MBS=1, GA=63) | Throughput on 32x H100 GPUs | Throughput on 96x H100 GPUs | Throughput on 192x H100 GPUs | Throughput on 576x H100 GPUs | Throughput on 1152x H100 GPUs | Throughput on 2304x H100 GPUs | +|---|---|---|---|---|---|---| +| Layers | 7 | 21 | 42 | 126 | 126 | 126 | +| GBS | 126 | 126 | 252 | 252 | 504 | 1008 | +| PP | 1 | 3 | 3 | 9 | 9 | 9 | +| VP | n/a | 7 | 7 | 7 | 7 | 7 | +| Training step time (seconds per step) | 8.85 | 8.7 5 | 16.93 | 17.20 | 17.52 | 17.62 | +| Throughput in tokens per second | 116632 | 117965 | 121936 | 120022 | 235660 | 468646 | +| Model flops utilization | 58.63% | 56.17% | 57.25% | 55.78% | 54.76% | 54.45% | +| Time to train 1T tokens in days | n/a | n/a | n/a | 96.43 | 49.11 | 24.7 | + +| Llama 3.1 405b 24.09 FP8 (TP=8, PP=9, CP=2, VP=7, MBS=1, GA=63) | Throughput on 32x H100 GPUs | Throughput on 96x H100 GPUs | Throughput on 192x H100 GPUs | Throughput on 576x H100 GPUs | Throughput on 1152x H100 GPUs | Throughput on 2304x H100 GPUs | +|---|---|---|---|---|---|---| +| Layers | 7 | 21 | 42 | 126 | 126 | 126 | +| GBS | 126 | 126 | 252 | 252 | 504 | 1008 | +| PP | 1 | 3 | 3 | 9 | 9 | 9 | +| VP | n/a | 7 | 7 | 7 | 7 | 7 | +| Training step time (seconds per step) | 5.80 | 5.71 | 11.00 | 11.24 | 11.31 | 12.35 | +| Throughput in tokens per second | 178118 | 180674 | 187740 | 183664 | 365055 | 668626 | +| Model flops utilization | 44.77% | 43.01% | 44.07% | 42.71% | 42.45% | 38.87% | +| Time to train 1T tokens in days | n/a | n/a | n/a | 63.02 | 31.71 | 17.31 | + +For proxy configs (<576 GPUs scales) we don't provide time to train estimates to avoid misleading conclusions. Proxy configs are not realistic and were created to allow fit of Llama model to smaller number of GPUs than intended. + +# Prerequisites + +This recipe requires access to Llama 3.1. Instructions are below if needed. 
+
+# Request Access
+A HuggingFace account is required and you will need to [create a HuggingFace access token](https://huggingface.co/settings/tokens) in order to run the training script. Add the generated token to your environment via `export HF_TOKEN=`.
+
+Access to Llama 3.1 must be requested through [Meta's website](https://llama.meta.com/llama-downloads/) then requested on the [HuggingFace Llama](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B) page. The approval process is not automatic and could take a day or more.
+
+# Prepare Environment
+
+Create a staging area by running the attached setup.sh. The script converts the docker image from `nvcr.io/nvidia/nemo:24.09` to the `nvidia+nemo+24.09.sqsh` file under the `$STAGE_PATH` folder and copies NeMo Launcher code from the container.
+
+```shell
+# Set the path where all artifacts will be downloaded
+export STAGE_PATH= (e.g. /lustre/myproject/nemo)
+# Set the Slurm partition to launch against
+export SLURM_PARTITION="batch"
+# Set the Slurm account to launch against
+export SLURM_ACCOUNT="account_name"
+# Set the number of GPUs per node according to Slurm's gres, this is usually 8 or null - https://slurm.schedmd.com/gres.html
+export SLURM_GPUS_PER_NODE=null
+# Set HuggingFace token
+export HF_TOKEN=
+
+# Run the setup
+bash ./setup.sh
+```
+
+# Prepare Dataset
+Llama 3.1 405B uses synthetic data for training. A dataset does not need to be prepared. Note that Llama 3.1 405B uses the GPT2BPETokenizer as a proxy.
+
+# Run Training
+
+NeMo Launcher uses the Hydra framework to process command line arguments and pass them down as hyperparameters to a multi-node job performing the training.
+
+The training will run for the first 50 steps and will stop afterwards. Log files and results will be located under the `$STAGE_PATH/results/$GSW_VERSION/$DTYPE/405b/$JOB_TOTAL_GPUS` folder.
+
+Below is a command template for launching Llama 3.1 405b model training.
+```shell
+DTYPE= MODEL_SIZE=405b sbatch -A ${SLURM_ACCOUNT} -p ${SLURM_PARTITION} -N ${NUM_NODES} ./launch.sh
+```
+Where:
+- `DTYPE` and `MODEL_SIZE` are **required** environment variables.
+  - `DTYPE` can be either `fp8` or `bf16`.
+  - `MODEL_SIZE` should be `405b` in this case.
+- `NUM_NODES` can be calculated as `N_GPUS / N_GPUS_PER_NODE`. `N_GPUS_PER_NODE` is 8 for DGX H100, so for the 576 GPU scale `NUM_NODES` should be `576 / 8 = 72`.
+
+**Note:** on some clusters it might be necessary to pass `--gres=gpu:8` to sbatch if you encounter errors such as GPU not found. See https://slurm.schedmd.com/gres.html
+
+The following applies only to the full model scales: 576, 1152, and 2304 GPUs. Configurations and global batch size change for proxy configs (<576 GPUs).
+>>>
+It is important to maintain these values for model parallelism settings in order to accurately assess performance results for completed jobs against the expected baseline for the non-proxy 405b configurations:
+* `training.model.tensor_model_parallel_size=8`
+* `training.model.pipeline_model_parallel_size=9`
+* `training.model.virtual_pipeline_model_parallel_size=7`
+* `training.model.context_parallel_size=2`
+
+The global batch size (`training.model.global_batch_size`) should scale with the total number of GPUs. The starting global batch size for 576 GPUs is 252, so it should be set to `<total number of GPUs> * 252 / 576`; see the sketch below.
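+As a minimal sketch of that scaling rule (variable names are illustrative), the expected global batch size for a non-proxy scale can be computed as follows; the 1152 GPU value matches the table above:
+
+```shell
+# Expected global batch size for a full-scale 405b run
+N_GPUS=1152
+GBS=$(( N_GPUS * 252 / 576 ))   # 1152 * 252 / 576 = 504
+echo "training.model.global_batch_size=${GBS}"
+```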
+>>> + +# Notes + +```shell +model flops = (sequence length) * ((attention flops) + (mlp flops) + (embedding flops)) + +model flops breakdown: + attention flops = 12 * (number of layers) * (hidden size)^2 * (1 + (number of query groups)/(number of attention heads) + (sequence length)/(hidden size)) + mlp flops = 18 * (number of layers) * (FFN size) * (hidden size) + embedding flops = 6 * (vocab size) * (hidden size) + +Llama 3.1 405b calculation: + sequence length = 8192 + attention flops = 12 * 126 * 16384^2 * (1 + 16/128 + 8192/16384) = 659,545,915,392 + mlp flops = 18 * 126 * 53248 * 16384 = 1,978,637,746,176 + embedding flops = 6 * 128256 * 16384 = 12,608,077,824 + + model flops = 8129 * (659,545,915,392 + 1,978,637,746,176 + 12,608,077,824) = 2.17E16 +``` diff --git a/llama-benchmarking/llama3.1_24.11.1/README_70b.md b/llama-benchmarking/llama3.1_24.11.1/README_70b.md new file mode 100644 index 0000000..34c07ad --- /dev/null +++ b/llama-benchmarking/llama3.1_24.11.1/README_70b.md @@ -0,0 +1,163 @@ +# Overview + +This recipe contains information and scripts to produce performance results for the Llama 3.1 training workload. The scripts help perform environment setup, dataset setup, and launch benchmark jobs. +This variant of the workload is best-suited for GPU clusters with + +* At least 64 GPUs with at least 80 GB memory each. Training of this 70-billion parameter variant of the workload will not fit on fewer GPUs with less memory. +* H100 GPUs. This workload runs with BF16 or FP8, which are both supported by H100 GPUs. + +# Expected Performance + +Performance for Llama 3.1 training is measured by seconds per iteration, or in other words seconds per training step. This metric is logged for every training step in a .out file which is generated inside of the `$STAGE_PATH/results/$GSW_VERSION/${DTYPE}/70b/$JOB_TOTAL_GPUS` folder. + +Since the performance fluctuates significantly at the beginning, we are using the last training step timing to obtain throughput value. + +```shell +grep train_step_timing results/*.out +Epoch 0: : 100%|██████████| 100/100 [20:15<00:00, reduced_train_loss=6.370, global_step=99.00, consumed_samples=12800.0, train_step_timing in s=11.20] +``` + +To obtain throughput as a tokens per second measurement, follow this formula: +```shell +(sequence length) * (global batch size) / (training_step_timing) = (throughput in tokens per second) +``` + +E.g. 8192 * 128 / 11.31 = 92712 + +To calculate time to train estimate: +```shell +(total tokens) / (throughput in tokens per second) / (number of seconds in a day) = (time to train in days) +``` +E.g. 1e12 / 92712 / 86400 = 124.84 days + + +To calculate the model flops utilization (MFU): +```shell +MFU = (global batch size) * (model flops) / (training step time) / (number of GPUs) / (peak GPU FLOPS) +``` + +The peak theoretical throughput for H100 FP8 is 1979 TFLOPS and for H100 BF16 is 989 TFLOPS. + +The model flops for Llama 3.1 70b for GBS=1 is 3.94E+15. Calculation shown [here](#notes). + +E.g. 
Llama 3.1 70b FP8 on 64x H100 GPUs (GBS=128) +```shell +peak FLOPS for H100 = 1979 TFLOPS +training step time = 11.31 +model flops = 3.94E+15 +MFU = 128 * 3.94E+15 / 11.31 / 64 / 1979E+12 = 35.21% +``` + +| Llama 3.1 70b 24.09 BF16 (TP=4, PP=4, CP=2, VP=5, MBS=1, GA=64) | Throughput on 64x H100 GPUs (GBS=128) | Throughput on 128x H100 GPUs (GBS=256) | Throughput on 256x H100 GPUs (GBS=512) | Throughput on 512x H100 GPUs (GBS=1024) | Throughput on 1024x H100 GPUs (GBS=2048) | Throughput on 2048x H100 GPUs (GBS=4096) +|---|:---:|:---:|:---:|:---:|:---:|:---:| +| Training step time (seconds per step) | 14.72 | 14.73 | 14.8 | 14.89 | 14.92 | 14.98 +| Throughput in tokens per second | 71235 | 142373 | 283399 | 563372 | 1124478 | 2248957 +| Model flops utilization | 54.10% | 54.06% | 53.81% | 53.48% | 53.38% | 53.38% +| Time to train 1T tokens in days | 162.48 | 81.29 | 40.84 | 20.54 | 10.29 | 5.15 + +| Llama 3.1 70b 24.09 FP8 (TP=4, PP=4, CP=2, VP=5, MBS=1, GA=64) | Throughput on 64x H100 GPUs (GBS=128) | Throughput on 128x H100 GPUs (GBS=256) | Throughput on 256x H100 GPUs (GBS=512) | Throughput on 512x H100 GPUs (GBS=1024) | Throughput on 1024x H100 GPUs (GBS=2048) | Throughput on 2048x H100 GPUs (GBS=4096) +|---|:---:|:---:|:---:|:---:|:---:|:---:| +| Training step time (seconds per step) | 11.01 | 10.93 | 11.16 | 11.18 | 11.28 | 11.39 +| Throughput in tokens per second | 95239 | 191871 | 375834 | 750323 | 1487342 | 2945955 +| Model flops utilization | 36.17% | 36.43% | 35.68% | 35.62% | 35.30% | 34.96% +| Time to train 1T tokens in days | 121.53 | 60.32 | 30.80 | 15.43 | 7.78 | 3.93 + +# Prerequisites + +This recipe requires access to Llama 3.1. Instructions are below if needed. + +# Request Access +A HuggingFace account is required and you will need to [create a HuggingFace access token](https://huggingface.co/settings/tokens) in order to run the training script. Add the generated token to your environment via ```export HF_TOKEN=```. + +Access to Llama 3.1 must be requested through [Meta's website](https://llama.meta.com/llama-downloads/) then requested on the [HuggingFace Llama](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) page. The approval process is not automatic and could take a day or more. + +# Prepare Environment + +Create a staging area by running the attached setup.sh. The script converts the docker image from `nvcr.io/nvidia/nemo:24.09` to the `nvidia+nemo+24.09.sqsh` file under the $STAGE_PATH folder and copies NeMo Launcher code from the container. The setup script also downloads Llama3 tokenizer related files from HuggingFace [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) repo using `HF_TOKEN` obtained in the previous step. **Note:** Llama3.1 8B and 70B use the same tokenizer and for this recipe we use the Llama 3 tokenizer. + +```shell +# Set the path where all artifacts will be downloaded +export STAGE_PATH= (e.g. /lustre/myproject/nemo) +# Set the Slurm partition to launch against +export SLURM_PARTITION="batch" +# Set the Slurm account to launch against +export SLURM_ACCOUNT="account_name" +# Set the number of GPUs per node according to Slurm's gres, this is usually 8 or null - https://slurm.schedmd.com/gres.html +export SLURM_GPUS_PER_NODE=null +# Set HuggingFace token +export HF_TOKEN= + +# Run the setup +bash ./setup.sh +``` + +# Prepare Dataset +Pre-training a GPT-3 model requires a text-based dataset to be downloaded and pre-processed for the NeMo Framework to ingest the data optimally. 
[The Pile](https://huggingface.co/datasets/monology/pile-uncopyrighted) is often used as the dataset for pre-training models. The NeMo Framework contains helper scripts to download and pre-process the dataset. The following steps outline how to download and pre-process the dataset on DGX Cloud, with an explanation of key points afterwards.
+
+Make sure `$STAGE_PATH/llama3.1-dataset/llama` contains the tokenizer files downloaded in the previous step.
+
+Run the `generate_dataset.sh` script. The script launches several Slurm jobs that download the dataset from The Pile, pre-process it and save it in a form suitable for subsequent training. The resulting dataset files will be saved under the `$STAGE_PATH/llama3.1-dataset` folder. The dataset creation may use up to 100 GB. Make sure you have sufficient disk space available.
+
+```shell
+bash ./generate_dataset.sh
+```
+
+If the dataset generation step was successful, there should be two .idx and two .bin files in the `$STAGE_PATH/llama3.1-dataset` folder.
+
+```shell
+my-llama_00_text_document.bin
+my-llama_00_text_document.idx
+my-llama_01_text_document.bin
+my-llama_01_text_document.idx
+```
+
+If that is not the case, check the log files in `$STAGE_PATH/results.data_preparation`.
+
+# Run Training
+
+NeMo Launcher uses the Hydra framework to process command line arguments and pass them down as hyperparameters to a multi-node job performing the training.
+
+The training will run for the first 50 steps and will stop afterwards. Log files and results will be located under the `$STAGE_PATH/results/$GSW_VERSION/${DTYPE}/70b/$JOB_TOTAL_GPUS` folder.
+
+Below is a command template for launching Llama 3.1 70b model training.
+```shell
+DTYPE= MODEL_SIZE=70b sbatch -A ${SLURM_ACCOUNT} -p ${SLURM_PARTITION} -N ${NUM_NODES} ./launch.sh
+```
+
+Where:
+- `DTYPE` and `MODEL_SIZE` are **required** environment variables.
+  - `DTYPE` can be either `fp8` or `bf16`.
+  - `MODEL_SIZE` should be `70b` in this case.
+- `NUM_NODES` can be calculated as `N_GPUS / N_GPUS_PER_NODE`. `N_GPUS_PER_NODE` is 8 for DGX H100, so for the 128 GPU scale `NUM_NODES` should be `128 / 8 = 16`.
+
+**Note:** on some clusters it might be necessary to pass `--gres=gpu:8` to sbatch if you encounter errors such as GPU not found. See https://slurm.schedmd.com/gres.html
+
+It is important to maintain these values for model parallelism settings in order to accurately assess performance results for completed jobs against the expected baseline:
+* `training.model.tensor_model_parallel_size=4`
+* `training.model.pipeline_model_parallel_size=4`
+* `training.model.virtual_pipeline_model_parallel_size=5`
+* `training.model.context_parallel_size=2`
+
+The global batch size (`training.model.global_batch_size`) should be set to `<number of nodes> * 16`, e.g. `16 * 16 = 256` in the example above; see the sketch below.
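+As a minimal sketch of the arithmetic above (assuming DGX H100 nodes with 8 GPUs each, fp8 chosen only as an example, variable names illustrative), the node count and the global batch size to expect at a 128 GPU scale can be derived as follows; configure.sh derives the same batch size automatically from the GPU count, so the value here is just a sanity check:
+
+```shell
+# 70b example: 128 GPUs, 8 GPUs per node
+N_GPUS=128
+NUM_NODES=$(( N_GPUS / 8 ))   # 128 / 8 = 16 nodes
+GBS=$(( NUM_NODES * 16 ))     # expected global batch size: 16 * 16 = 256
+DTYPE=fp8 MODEL_SIZE=70b sbatch -A ${SLURM_ACCOUNT} -p ${SLURM_PARTITION} -N ${NUM_NODES} ./launch.sh
+```
+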
+ +# Notes + +```shell +model flops = (sequence length) * ((attention flops) + (mlp flops) + (embedding flops)) + +model flops breakdown: + attention flops = 12 * (number of layers) * (hidden size)^2 * (1 + (number of query groups)/(number of attention heads) + (sequence length)/(hidden size)) + mlp flops = 18 * (number of layers) * (FFN size) * (hidden size) + embedding flops = 6 * (vocab size) * (hidden size) + +Llama 3.1 70b calculation: + sequence length = 8192 + attention flops = 12 * 80 * 8192^2 * (1 + 8/64 + 8192/8192) = 136,902,082,560 + mlp flops = 18 * 80 * 28672 * 8192 = 338,228,674,560 + embedding flops = 6 * 128256 * 8192 = 6,304,038,912 + + model flops = 8129 * (136,902,082,560 + 338,228,674,560 + 6,304,038,912) = 3.94E+15 +``` + diff --git a/llama-benchmarking/llama3.1_24.11.1/README_8b.md b/llama-benchmarking/llama3.1_24.11.1/README_8b.md new file mode 100644 index 0000000..1184de2 --- /dev/null +++ b/llama-benchmarking/llama3.1_24.11.1/README_8b.md @@ -0,0 +1,162 @@ +# Overview + +This recipe contains information and scripts to produce performance results for the Llama 3.1 training workload. The scripts help perform environment setup, dataset setup, and launch benchmark jobs. +This variant of the workload is best-suited for GPU clusters with + +* At least 8 GPUs with at least 80 GB memory each. Training of this 8-billion parameter variant of the workload will not fit on fewer GPUs with less memory. +* H100 GPUs. This workload runs with BF16 or FP8, which are both supported by H100 GPUs. + +# Expected Performance + +Performance for Llama 3.1 training is measured by seconds per iteration, or in other words seconds per training step. This metric is logged for every training step in a .out file which is generated inside of the `$STAGE_PATH/results/$GSW_VERSION/${DTYPE}/8b/$JOB_TOTAL_GPUS` folder. + +Since the performance fluctuates significantly at the beginning, we are using the last training step timing to obtain throughput value. + +```shell +grep train_step_timing results/*.out +Epoch 0: : 100%|██████████| 100/100 [18:58<00:00, reduced_train_loss=6.190, global_step=99.00, consumed_samples=12800.0, train_step_timing in s=11.10] +``` + +To obtain throughput as a tokens per second measurement, follow this formula: +```shell +(sequence length) * (global batch size) / (training_step_timing) = (throughput in tokens per second) +``` + +E.g. 8192 * 128 / 11.1 = 94466 + +To calculate time to train estimate: +```shell +(total tokens) / (throughput in tokens per second) / (number of seconds in a day) = (time to train in days) +``` +E.g. 1e12 / 94466 / 86400 = 122.52 days + + +To calculate the model flops utilization (MFU): +```shell +MFU = (global batch size) * (model flops) / (training step time) / (number of GPUs) / (peak GPU FLOPS) +``` + +The peak theoretical throughput for H100 FP8 is 1979 TFLOPS and for H100 BF16 is 989 TFLOPS. + +The model flops for Llama 3.1 8b for GBS=1 is 4.74E+14. Calculation shown [here](#notes). + +E.g. 
Llama 3.1 8b FP8 on 8x H100 GPUs (GBS=128) +```shell +peak FLOPS for H100 = 1979 TFLOPS +training step time = 11.1 s +model flops = 4.74E+14 + +MFU = 128 * 4.74E+14 / 11.1 / 8 / 1979E+12 = 34.52% +``` + +| Llama 3.1 8b 24.09 BF16 (TP=1, PP=1, CP=2, MBS=1, GA=32) | Throughput on 8x H100 GPUs (GBS=128) | Throughput on 16x H100 GPUs (GBS=256) | Throughput on 32x H100 GPUs (GBS=512) | Throughput on 64x H100 GPUs (GBS=1024) | Throughput on 128x H100 GPUs (GBS=2048) | +|---|:---:|:---:|:---:|:---:|:---:| +| Training step time (seconds per step) | 12.91 | 13.00 | 13.05 | 13.08 | 13.10 | +| Throughput in tokens per second | 81222 | 161319 | 321403 | 641331 | 1280704 | +| Model flops utilization | 59.37% | 58.96% | 58.73% | 58.60% | 58.51% | +| Time to train 1T tokens in days | 142.5 | 71.75 | 36.01 | 18.05 | 9.04 | + +| Llama 3.1 8b 24.09 FP8 (TP=1, PP=1, CP=2, MBS=1, GA=32) | Throughput on 8x H100 GPUs (GBS=128) | Throughput on 16x H100 GPUs (GBS=256) | Throughput on 32x H100 GPUs (GBS=512) | Throughput on 64x H100 GPUs (GBS=1024) | Throughput on 128x H100 GPUs (GBS=2048) | +|---|:---:|:---:|:---:|:---:|:---:| +| Training step time (seconds per step) | 9.67 | 9.73 | 9.75 | 9.80 | 9.82 | +| Throughput in tokens per second | 108436 | 215535 | 430185 | 855980 | 1708474 | +| Model flops utilization | 39.63% | 39.39% | 39.31% | 39.10% | 39.02% | +| Time to train 1T tokens in days | 106.74 | 53.70 | 26.90 | 13.52 | 6.77 | + +# Prerequisites + +This recipe requires access to Llama 3.1. Instructions are below if needed. + +# Request Access +A HuggingFace account is required and you will need to [create a HuggingFace access token](https://huggingface.co/settings/tokens) in order to run the training script. Add the generated token to your environment via ```export HF_TOKEN=```. + +Access to Llama 3.1 must be requested through [Meta's website](https://llama.meta.com/llama-downloads/) then requested on the [HuggingFace Llama](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) page. The approval process is not automatic and could take a day or more. + +# Prepare Environment + +Create a staging area by running the attached setup.sh. The script converts the docker image from `nvcr.io/nvidia/nemo:24.09` to the `nvidia+nemo+24.09.sqsh` file under the $STAGE_PATH folder and copies NeMo Launcher code from the container. The setup script also downloads Llama3 tokenizer related files from HuggingFace [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) repo using `HF_TOKEN` obtained in the previous step. **Note:** Llama3.1 8B and 70B use the same tokenizer and for this recipe we use the Llama 3 tokenizer. + +```shell +# Set the path where all artifacts will be downloaded +export STAGE_PATH= (e.g. /lustre/myproject/nemo) +# Set the Slurm partition to launch against +export SLURM_PARTITION="batch" +# Set the Slurm account to launch against +export SLURM_ACCOUNT="account_name" +# Set the number of GPUs per node according to Slurm's gres, this is usually 8 or null - https://slurm.schedmd.com/gres.html +export SLURM_GPUS_PER_NODE=null +# Set HuggingFace token +export HF_TOKEN= + +# Run the setup +bash ./setup.sh +``` + +# Prepare Dataset +Pre-training a GPT-3 model requires a text-based dataset to be downloaded and pre-processed for the NeMo Framework to ingest the data optimally. [The Pile](https://huggingface.co/datasets/monology/pile-uncopyrighted) is often used as the dataset for pre-training models. The NeMo Framework contains helper scripts to download and pre-process the dataset. 
The following steps outline how to download and pre-process the dataset on DGX Cloud, with an explanation of key points afterwards.
+
+Make sure `$STAGE_PATH/llama3.1-dataset/llama` contains the tokenizer files downloaded in the previous step.
+
+Run the `generate_dataset.sh` script. The script launches several Slurm jobs that download the dataset from The Pile, pre-process it and save it in a form suitable for subsequent training. The resulting dataset files will be saved under the `$STAGE_PATH/llama3.1-dataset` folder. The dataset creation may use up to 100 GB. Make sure you have sufficient disk space available.
+
+```shell
+bash ./generate_dataset.sh
+```
+
+If the dataset generation step was successful, there should be two .idx and two .bin files in the `$STAGE_PATH/llama3.1-dataset` folder.
+
+```shell
+my-llama_00_text_document.bin
+my-llama_00_text_document.idx
+my-llama_01_text_document.bin
+my-llama_01_text_document.idx
+```
+
+If that is not the case, check the log files in `$STAGE_PATH/results.data_preparation`.
+
+# Run Training
+
+NeMo Launcher uses the Hydra framework to process command line arguments and pass them down as hyperparameters to a multi-node job performing the training.
+
+The training will run for the first 50 steps and will stop afterwards. Log files and results will be located under the `$STAGE_PATH/results/$GSW_VERSION/${DTYPE}/8b/$JOB_TOTAL_GPUS` folder.
+
+Below is a command template for launching Llama 3.1 8b model training.
+```shell
+DTYPE= MODEL_SIZE=8b sbatch -A ${SLURM_ACCOUNT} -p ${SLURM_PARTITION} -N ${NUM_NODES} ./launch.sh
+```
+
+Where:
+- `DTYPE` and `MODEL_SIZE` are **required** environment variables.
+  - `DTYPE` can be either `fp8` or `bf16`.
+  - `MODEL_SIZE` should be `8b` in this case.
+- `NUM_NODES` can be calculated as `N_GPUS / N_GPUS_PER_NODE`. `N_GPUS_PER_NODE` is 8 for DGX H100, so for the 128 GPU scale `NUM_NODES` should be `128 / 8 = 16`.
+
+**Note:** on some clusters it might be necessary to pass `--gres=gpu:8` to sbatch if you encounter errors such as GPU not found. See https://slurm.schedmd.com/gres.html
+
+It is important to maintain these values for model parallelism settings in order to accurately assess performance results for completed jobs against the expected baseline:
+* `training.model.tensor_model_parallel_size=1`
+* `training.model.pipeline_model_parallel_size=1`
+* `training.model.context_parallel_size=2`
+
+The global batch size (`training.model.global_batch_size`) should be set to `<number of nodes> * 128`, e.g. `16 * 128 = 2048` in the example above; see the sketch below.
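+As a quick cross-check of the throughput and MFU formulas in the Expected Performance section, a minimal sketch that reproduces the worked 8b FP8 example (8x H100, GBS=128, 11.1 s per step); the variable names are illustrative:
+
+```shell
+# Reproduce the 8b FP8 worked example from the Expected Performance section
+SEQ_LEN=8192
+GBS=128
+STEP_TIME=11.1        # train_step_timing in seconds, taken from the .out log
+NUM_GPUS=8
+MODEL_FLOPS=4.74e14   # model flops for GBS=1, derived in the Notes section
+PEAK_FLOPS=1979e12    # peak H100 FP8 throughput
+
+python3 -c "print('tokens/s:', round(${SEQ_LEN}*${GBS}/${STEP_TIME}))"                    # ~94466
+python3 -c "print('MFU:', ${GBS}*${MODEL_FLOPS}/${STEP_TIME}/${NUM_GPUS}/${PEAK_FLOPS})"  # ~0.345, i.e. 34.5%
+```
+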
+ +# Notes + +```shell +model flops = (sequence length) * ((attention flops) + (mlp flops) + (embedding flops)) + +model flops breakdown: + attention flops = 12 * (number of layers) * (hidden size)^2 * (1 + (number of query groups)/(number of attention heads) + (sequence length)/(hidden size)) + mlp flops = 18 * (number of layers) * (FFN size) * (hidden size) + embedding flops = 6 * (vocab size) * (hidden size) + +Llama 3.1 8b calculation: + sequence length = 8192 + attention flops = 12 * 32 * 4096^2 * (1 + 8/32 + 8192/4096) = 20,937,965,568 + mlp flops = 18 * 32 * 14336 * 4096 = 33,822,867,456 + embedding flops = 6 * 128256 * 4096 = 3,152,019,456 + + model flops = 8192 * (20,937,965,568 + 33,822,867,456 + 3,152,019,456) = 4.74E+14 +``` diff --git a/llama-benchmarking/llama3.1_24.11.1/configure.sh b/llama-benchmarking/llama3.1_24.11.1/configure.sh new file mode 100755 index 0000000..156379f --- /dev/null +++ b/llama-benchmarking/llama3.1_24.11.1/configure.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# For each dataset a user elects to use, the user is responsible for +# checking if the dataset license is fit for the intended purpose. + +set -eu -o pipefail + +export GSW_VERSION=${GSW_VERSION?"Required variable GSW_VERSION is not set in the container. Aborting"} + +if [[ ! "$MODEL_SIZE" =~ ^(8b|70b|405b)$ ]]; then + echo "FATAL: unsupported MODEL_SIZE $MODEL_SIZE for llama 3.1" >&2 + exit 1 +fi + +if [[ ! "$DTYPE" =~ ^(bf16|fp8)$ ]]; then + echo "FATAL: unsupported DTYPE $DTYPE for llama 3.1 $MODEL_SIZE" >&2 + exit 1 +fi + +# setup +export PYTHONUNBUFFERED=1 +export SLURM_UNBUFFEREDIO=1 +export TORCHX_MAX_RETRIES=0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TOKENIZERS_PARALLELISM=False +export TRANSFORMERS_OFFLINE=1 +export TORCH_NCCL_AVOID_RECORD_STREAMS=1 +export NCCL_NVLS_ENABLE=0 +export NVTE_DP_AMAX_REDUCE_INTERVAL=0 +export NVTE_ASYNC_AMAX_REDUCTION=1 +export NVTE_APPLY_QK_LAYER_SCALING=0 +export NVTE_FLASH_ATTN=0 +export NVTE_FUSED_ATTN=1 +export NEMO_LOG_MEMORY_USAGE=1 +export NVTE_FWD_LAYERNORM_SM_MARGIN=8 +export NVTE_BWD_LAYERNORM_SM_MARGIN=8 +export HYDRA_FULL_ERROR=1 + +export PRE_CMD=" + cd /opt/NeMo; + git rev-parse HEAD; + export PYTHONPATH=/opt/NeMo:\${PYTHONPATH}; + export CUDA_DEVICE_MAX_CONNECTIONS=1; + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7;" + +export PROFILE_ENABLED=${ENABLE_PROFILE:-false} + +export ENV_VARS="" +export CONFIG_OVERRIDES="" + +MAX_STEPS=${MAX_STEPS:-50} + +if [[ "$MODEL_SIZE" = "8b" ]]; then + DEFAULT_PROFILE_RANKS="0,1,2,3,4,5,6,7" + CONFIG_OVERRIDES+=" model.tokenizer.type=/dataset/llama" + # Upstream uses GBS 128 for 8 GPUs, scale it with number of total gpus. 
+ GBS=${GBS:-$((128 * JOB_TOTAL_GPUS / 8))} +elif [[ "$MODEL_SIZE" = "70b" ]]; then + DEFAULT_PROFILE_RANKS="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15" + CONFIG_OVERRIDES+=" model.tokenizer.type=/dataset/llama" + # Upstream uses GBS 128 for 64 GPUs, scale it with number of total gpus. + GBS=${GBS:-$((128 * JOB_TOTAL_GPUS / 64))} +elif [[ "$MODEL_SIZE" = "405b" ]]; then + DEFAULT_PROFILE_RANKS="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15" + export SYNTHETIC_DATA_ENABLED=true + export NCCL_P2P_NET_CHUNKSIZE=262144 + CONFIG_OVERRIDES+=" model.data.data_impl=\"mock\"" + CONFIG_OVERRIDES+=" model.data.data_prefix=\"\"" + CONFIG_OVERRIDES+=" model.ub_tp_comm_overlap_cfg.proj_fprop.fp8_buf=$FP8_ENABLED" + CONFIG_OVERRIDES+=" model.ub_tp_comm_overlap_cfg.fc2_fprop.fp8_buf=$FP8_ENABLED" + + # Upstream uses GBS 252 for 576 GPUs, scale it with number of total gpus. + GBS=${GBS:-$((252 * JOB_TOTAL_GPUS / 576))} + if [[ $JOB_TOTAL_GPUS = 192 ]]; then + GBS=252 + NUM_LAYERS=42 + PP=3 + elif [[ $JOB_TOTAL_GPUS = 96 ]]; then + GBS=126 + NUM_LAYERS=21 + PP=3 + elif [[ $JOB_TOTAL_GPUS = 32 ]]; then + GBS=126 + NUM_LAYERS=7 + PP=1 + VP=null + CONFIG_OVERRIDES+=" model.defer_embedding_wgrad_compute=false" + fi +fi + +CONFIG_OVERRIDES+=" ++data_dir=/dataset" +CONFIG_OVERRIDES+=" run.results_dir=$RESULT_DIR" +CONFIG_OVERRIDES+=" model.data.index_mapping_dir=$INDEX_MAPPING_DIR" +CONFIG_OVERRIDES+=" trainer.num_nodes=$SLURM_JOB_NUM_NODES" +CONFIG_OVERRIDES+=" trainer.devices=$SLURM_NTASKS_PER_NODE" +CONFIG_OVERRIDES+=" trainer.max_steps=$MAX_STEPS" +CONFIG_OVERRIDES+=" trainer.val_check_interval=$MAX_STEPS" +CONFIG_OVERRIDES+=" model.global_batch_size=$GBS" +CONFIG_OVERRIDES+=" model.fp8=$FP8_ENABLED" +CONFIG_OVERRIDES+=" model.fp8_hybrid=$FP8_ENABLED" +CONFIG_OVERRIDES+=" +model.fp8_params=$FP8_ENABLED" +[[ -n ${TP-} ]] && CONFIG_OVERRIDES+=" model.tensor_model_parallel_size=$TP" +[[ -n ${PP-} ]] && CONFIG_OVERRIDES+=" model.pipeline_model_parallel_size=$PP" +[[ -n ${VP-} ]] && CONFIG_OVERRIDES+=" model.virtual_pipeline_model_parallel_size=$VP" +[[ -n ${CP-} ]] && CONFIG_OVERRIDES+=" model.context_parallel_size=$CP" +if [[ -n ${SEQ_LEN-} ]]; then + CONFIG_OVERRIDES+=" model.encoder_seq_length=$SEQ_LEN" + CONFIG_OVERRIDES+=" model.max_position_embeddings=$SEQ_LEN" + CONFIG_OVERRIDES+=" model.data.seq_length=$SEQ_LEN" +fi +[[ -n ${NUM_LAYERS-} ]] && CONFIG_OVERRIDES+=" model.num_layers=$NUM_LAYERS" +CONFIG_OVERRIDES+=" model.nsys_profile.enabled=${PROFILE_ENABLED^} " + +# capture command line overrides prior to optimizations +BASE_CONFIG=$CONFIG_OVERRIDES + +# prototype for handling optimizations +if [[ -n "${OPTIMIZATION_NAME:-""}" ]] && [[ -n "${OPTIMIZATION_CODE:-""}" ]]; then + # inject optimization parameters into command line + CONFIG_OVERRIDES+=" "$OPTIMIZATION_CODE +else + OPTIMIZATION_NAME="" + OPTIMIZATION_CODE="" +fi + +export INFO_STR="GSW: MODEL=${MODEL} FRAMEWORK=${FRAMEWORK} MODEL_SIZE=${MODEL_SIZE} JOB_NUM_NODES=${SLURM_JOB_NUM_NODES} GPUS_PER_NODE=${SLURM_NTASKS_PER_NODE} DTYPE=${DTYPE} SYNTHETIC_DATA=${SYNTHETIC_DATA_ENABLED^} GSW_VERSION=${GSW_VERSION} FW_VERSION=${FW_VERSION} IMAGE=\'${IMAGE}\' JOB_ID=${SLURM_JOB_ID} JOB_MODE=training OPTIMIZATION_NAME=\'${OPTIMIZATION_NAME}\' OPTIMIZATION_CODE=\'${OPTIMIZATION_CODE}\' BASE_CONFIG=\'${BASE_CONFIG}\'" + +export PROFILE_START_STEP=${RUN_CONF_PROFILE_START_STEP:-20} +export PROFILE_STOP_STEP=${RUN_CONF_PROFILE_STOP_STEP:-30} +export PROFILE_RANKS=${DEFAULT_PROFILE_RANKS:-"0,1,2,3,4,5,6,7"} +export PROFILE_GPU_METRICS=${RUN_CONF_PROFILE_GPU_METRICS:-false} + 
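+# When ENABLE_PROFILE=true, the block below wraps the training command with Nsight
+# Systems (nsys): the ranks listed in PROFILE_RANKS are profiled between
+# PROFILE_START_STEP and PROFILE_STOP_STEP, and reports are written to $RESULT_DIR/nsys.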
+if [[ "${PROFILE_ENABLED,,}" = true ]]; then + NSYS_EXTRA_OPTIONS="" + if [[ "$SLURM_LOCALID" = "0" ]] && [[ "${PROFILE_GPU_METRICS,,}" = true ]]; then + NSYS_EXTRA_OPTIONS="--gpu-metrics-device=all" + fi + PROFILE_CMD="which nsys && nsys --version && nsys status --env && \ + mkdir -p ${RESULT_DIR}/nsys && \ + nsys profile --output ${RESULT_DIR}/nsys/${MODEL}-${MODEL_SIZE}-${DTYPE}_${JOB_TOTAL_GPUS}g_${SLURM_JOB_ID}_%q{SLURM_NODEID}_%q{SLURM_LOCALID} \ + --nic-metrics=true $NSYS_EXTRA_OPTIONS --inherit-environment true --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop --stop-on-exit true --trace cuda,nvtx --sample none --cpuctxsw none" + PROFILE_CFG="model.nsys_profile.start_step=$PROFILE_START_STEP model.nsys_profile.end_step=$PROFILE_STOP_STEP model.nsys_profile.ranks=[$PROFILE_RANKS]" +else + PROFILE_CMD="" + PROFILE_CFG="" +fi + +export COMMAND_LINE="$ENV_VARS \ + echo $INFO_STR; \ + $PRE_CMD $PROFILE_CMD python3 -u /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + --config-path=/cfg \ + --config-name=llama3.1_${MODEL_SIZE}.yaml \ + $CONFIG_OVERRIDES $PROFILE_CFG" diff --git a/llama-benchmarking/llama3.1_24.11.1/generate_dataset.sh b/llama-benchmarking/llama3.1_24.11.1/generate_dataset.sh new file mode 100755 index 0000000..df0e70a --- /dev/null +++ b/llama-benchmarking/llama3.1_24.11.1/generate_dataset.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# For each dataset a user elects to use, the user is responsible for +# checking if the dataset license is fit for the intended purpose. 
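+
+# Assumes setup.sh has already been run: it stages the NeMo Launcher code under
+# $STAGE_PATH/launcher_scripts and downloads the Llama 3 tokenizer files used below.
+# STAGE_PATH, SLURM_ACCOUNT and SLURM_PARTITION must be exported (see the READMEs).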
+ +set -eu -o pipefail + +IMAGE=${IMAGE:-$STAGE_PATH/nvidia+nemo+24.09.sqsh} +TOKENIZER_PATH=$STAGE_PATH/llama3.1-dataset/llama +NUM_NODES=2 + +mkdir -p "$STAGE_PATH/launcher_scripts/results" +python3 "$STAGE_PATH/launcher_scripts/main.py" \ + launcher_scripts_path="$STAGE_PATH/launcher_scripts" \ + data_preparation=llama/download_llama_pile \ + stages="[data_preparation]" \ + data_dir="$STAGE_PATH/llama3.1-dataset" \ + data_preparation.run.results_dir="$STAGE_PATH/results.data_preparation" \ + data_preparation.run.node_array_size=$NUM_NODES \ + data_preparation.file_numbers='0-1' \ + data_preparation.rm_downloaded=True \ + data_preparation.rm_extracted=True \ + data_preparation.download_tokenizer_url=null \ + data_preparation.tokenizer_model=null \ + data_preparation.tokenizer_save_dir=null \ + data_preparation.tokenizer_library=huggingface \ + +data_preparation.tokenizer_type="$TOKENIZER_PATH" \ + cluster.gpus_per_node="${SLURM_GPUS_PER_NODE:-null}" \ + cluster.account="$SLURM_ACCOUNT" \ + cluster.partition="$SLURM_PARTITION" \ + "cluster.srun_args=[\"--container-writable\",\"--no-container-mount-home\"]" \ + container="$IMAGE" diff --git a/llama-benchmarking/llama3.1_24.11.1/launch.sh b/llama-benchmarking/llama3.1_24.11.1/launch.sh new file mode 100755 index 0000000..937fa47 --- /dev/null +++ b/llama-benchmarking/llama3.1_24.11.1/launch.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# For each dataset a user elects to use, the user is responsible for +# checking if the dataset license is fit for the intended purpose. + +# Parameters +#SBATCH --job-name=nemo_llama3.1 +#SBATCH --dependency=singleton +#SBATCH --exclusive +#SBATCH --mem=0 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=1:00:00 + +if [ ${BASH_VERSION:0:1} -lt 4 ] || [ ${BASH_VERSION:0:1} -eq 4 -a ${BASH_VERSION:2:1} -lt 2 ]; then + printf "Unsupported %s version: %s\n" "${BASH}" "${BASH_VERSION}" >&2 + echo "Requires Bash 4.2 or greater." 
>&2 + exit 1 +fi + +set -eu -o pipefail + +export GSW_VERSION=24.11 +export FRAMEWORK=nemo +export MODEL=llama3.1 +export FW_VERSION=24.09 +export SYNTHETIC_DATA_ENABLED=False # 405b is true + +export IMAGE=${RUN_CONF_IMAGE:-$STAGE_PATH/nvidia+nemo+${FW_VERSION}.sqsh} + +export DTYPE=${DTYPE:-fp8} +export DTYPE=${DTYPE,,} +if [[ "${DTYPE}" = fp8 ]]; then + export FP8_ENABLED=true +else + export FP8_ENABLED=false +fi + +export JOB_TOTAL_GPUS=${SBATCH_GPUS:-$(( ${SLURM_JOB_NUM_NODES} * ${SLURM_NTASKS_PER_NODE} ))} + +export RESULT_DIR=$STAGE_PATH/results/$GSW_VERSION/$DTYPE/$MODEL_SIZE/$JOB_TOTAL_GPUS +export RESULT_FILES_NAME=log-${FRAMEWORK}_${MODEL}_${MODEL_SIZE}_${JOB_TOTAL_GPUS} + +export DATA_DIR=$STAGE_PATH/llama3.1-dataset +export INDEX_MAPPING_DIR=${RUN_CONF_INDEX_DIR:-$STAGE_PATH}/index_mapping + +mkdir -p "$RESULT_DIR" +mkdir -p $INDEX_MAPPING_DIR + +# SRUN_OUTPUT and SRUN_ERROR are Slurm environment variables to control output/error file locations. +export SLURM_MPI_TYPE=${SLURM_MPI_TYPE:-"pmix"} +export SRUN_OUTPUT=${SRUN_OUTPUT-${RESULT_DIR}/${RESULT_FILES_NAME}_%j.out} +export SRUN_ERROR=${SRUN_ERROR-${RESULT_DIR}/${RESULT_FILES_NAME}_%j.err} + +# Workload specific configuration +source ./configure.sh + +srun \ + --container-image "$IMAGE" \ + --container-mounts $RESULT_DIR,$INDEX_MAPPING_DIR,$DATA_DIR:/dataset,$STAGE_PATH/cfg:/cfg \ + --container-writable \ + --no-container-mount-home bash -c "$COMMAND_LINE" diff --git a/llama-benchmarking/llama3.1_24.11.1/llama3.1_405b.yaml b/llama-benchmarking/llama3.1_24.11.1/llama3.1_405b.yaml new file mode 100644 index 0000000..10d0d1f --- /dev/null +++ b/llama-benchmarking/llama3.1_24.11.1/llama3.1_405b.yaml @@ -0,0 +1,235 @@ +run: + name: llama3.1_405b + results_dir: /results/ + time_limit: "0-02:30:00" + dependency: "singleton" +trainer: + num_nodes: 72 + devices: 8 + accelerator: gpu + precision: bf16-mixed + logger: false # logger provided by exp_manager + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 100 + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 1 + val_check_interval: 100 + limit_val_batches: 1 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: /results/ + exp_dir: null + name: megatron_llama + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: false + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: 72 + log_step_timing: true + log_tflops_per_sec_per_gpu: false + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 + seconds_to_sleep: 60 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 252 + rampup_batch_size: null + tensor_model_parallel_size: 8 + pipeline_model_parallel_size: 9 + virtual_pipeline_model_parallel_size: 7 + context_parallel_size: 2 + encoder_seq_length: 8192 + max_position_embeddings: 8192 + num_layers: 126 + hidden_size: 16384 + ffn_hidden_size: 53248 + num_attention_heads: 128 + num_query_groups: 16 + init_method_std: 0.02 + use_scaled_init_method: true + 
hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + scale_positional_embedding: true + tokenizer: + library: 'megatron' + type: 'GPT2BPETokenizer' + model: null + delimiter: null # only used for tabular tokenizer + vocab_file: ${data_dir}/vocab.json + merge_file: ${data_dir}/merges.txt + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + cross_entropy_loss_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: true + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true + defer_embedding_wgrad_compute: true + wgrad_deferral_limit: 50 + deterministic_mode: false + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + ub_tp_comm_overlap: true + use_flash_attention: true + overlap_p2p_comm: true + batch_p2p_comm: false + gc_interval: 5 + nsys_profile: + enabled: False + trace: [nvtx,cuda] + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + optim: + name: distributed_fused_adam + lr: 0.0003 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + contiguous_param_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 2000 + constant_steps: 0 + min_lr: 2.9999999999999997e-05 + grad_sync_dtype: bf16 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 8192 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .5 + - ${data_dir}/my-llama_00_text_document + - .5 + - ${data_dir}/my-llama_01_text_document + ub_tp_comm_overlap_cfg: + fc1_dgrad: + cga_size: 2 + method: bulk + num_sm: 2 + set_sm_margin: 0 + fc1_fprop: + aggregate: 1 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + fc1_wgrad: + cga_size: 2 + method: bulk + num_sm: 2 + set_sm_margin: 0 + fc2_dgrad: + aggregate: 1 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + fc2_fprop: + cga_size: 2 + fp8_buf: false + method: pipeline + num_sm: 8 + num_splits: 4 + set_sm_margin: 1 + proj_dgrad: + aggregate: 1 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + proj_fprop: + cga_size: 2 + fp8_buf: false + method: pipeline + num_sm: 24 + num_splits: 4 + set_sm_margin: 1 + qkv_dgrad: + cga_size: 2 + method: bulk + num_sm: 2 + set_sm_margin: 0 + qkv_fprop: + aggregate: 1 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + qkv_wgrad: + cga_size: 2 + method: bulk + num_sm: 24 + set_sm_margin: 0 diff --git a/llama-benchmarking/llama3.1_24.11.1/llama3.1_70b.yaml b/llama-benchmarking/llama3.1_24.11.1/llama3.1_70b.yaml new file mode 100644 index 0000000..06ddfde --- /dev/null +++ b/llama-benchmarking/llama3.1_24.11.1/llama3.1_70b.yaml @@ -0,0 +1,230 @@ +run: + name: llama3_1_70b + results_dir: /results/ + time_limit: 0:30:00 + dependency: singleton +trainer: + num_nodes: 8 + devices: 8 + accelerator: gpu + precision: bf16-mixed + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 25 + max_time: 05:23:30:00 + log_every_n_steps: 1 + val_check_interval: 25 + limit_val_batches: 1 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: /results/ + exp_dir: null + name: megatron_llama + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: llama3_1_70b + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: false + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: 16 + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + seconds_to_sleep: 60 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + 
virtual_pipeline_model_parallel_size: 5 + context_parallel_size: 2 + encoder_seq_length: 8192 + max_position_embeddings: 8192 + num_layers: 80 + hidden_size: 8192 + ffn_hidden_size: 28672 + num_attention_heads: 64 + num_query_groups: 8 + init_method_std: 0.008944 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: huggingface + type: meta-llama/Meta-Llama-3.1-70B + use_fast: true + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + cross_entropy_loss_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: true + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true + defer_embedding_wgrad_compute: true + wgrad_deferral_limit: 22 + deterministic_mode: false + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + ub_tp_comm_overlap: true + use_flash_attention: true + overlap_p2p_comm: true + batch_p2p_comm: false + gc_interval: 25 + nsys_profile: + enabled: false + trace: + - nvtx + - cuda + start_step: 10 + end_step: 10 + ranks: + - 0 + gen_shape: false + optim: + name: distributed_fused_adam + lr: 0.0003 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 75 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + contiguous_param_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 2000 + constant_steps: 0 + min_lr: 2.9999999999999997e-05 + grad_sync_dtype: bf16 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 8192 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: true + reset_attention_mask: true + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - 0.5 + - /${data_dir}/my-llama_00_text_document + - 0.5 + - /${data_dir}/my-llama_01_text_document + ub_tp_comm_overlap_cfg: + fc1_dgrad: + cga_size: 2 + method: bulk + num_sm: 2 + set_sm_margin: 0 + fc1_fprop: + aggregate: 0 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + fc1_wgrad: + cga_size: 2 + method: bulk + num_sm: 4 + set_sm_margin: 0 + fc2_dgrad: + aggregate: 0 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + fc2_fprop: + cga_size: 2 + method: pipeline + num_sm: 16 + num_splits: 4 + set_sm_margin: 1 + proj_dgrad: + aggregate: 0 + 
method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + proj_fprop: + cga_size: 2 + method: pipeline + num_sm: 24 + num_splits: 4 + set_sm_margin: 1 + qkv_dgrad: + cga_size: 2 + method: bulk + num_sm: 4 + set_sm_margin: 0 + qkv_fprop: + aggregate: 0 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + qkv_wgrad: + cga_size: 2 + method: bulk + num_sm: 24 + set_sm_margin: 0 diff --git a/llama-benchmarking/llama3.1_24.11.1/llama3.1_8b.yaml b/llama-benchmarking/llama3.1_24.11.1/llama3.1_8b.yaml new file mode 100644 index 0000000..f65e24d --- /dev/null +++ b/llama-benchmarking/llama3.1_24.11.1/llama3.1_8b.yaml @@ -0,0 +1,174 @@ +run: + name: llama3_1_8b + results_dir: /results/ + time_limit: 0:20:00 + dependency: singleton +trainer: + num_nodes: 1 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 25 + max_time: 05:23:30:00 + log_every_n_steps: 1 + val_check_interval: 25 + limit_val_batches: 1 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: /results/ + exp_dir: null + name: megatron_llama + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: llama3_1_8b + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: false + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: 1 + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + seconds_to_sleep: 60 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + context_parallel_size: 2 + encoder_seq_length: 8192 + max_position_embeddings: 8192 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 14336 + num_attention_heads: 32 + num_query_groups: 8 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + cross_entropy_loss_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: huggingface + type: meta-llama/Meta-Llama-3-8B + use_fast: true + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: true + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + 
+  activations_checkpoint_method: null
+  activations_checkpoint_num_layers: null
+  num_micro_batches_with_partial_activation_checkpoints: null
+  activations_checkpoint_layers_per_pipeline: null
+  sequence_parallel: false
+  deterministic_mode: false
+  transformer_engine: true
+  fp8: false
+  fp8_e4m3: false
+  fp8_hybrid: false
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1024
+  fp8_amax_compute_algo: max
+  ub_tp_comm_overlap: false
+  use_flash_attention: true
+  gc_interval: 25
+  nsys_profile:
+    enabled: false
+    trace:
+    - nvtx
+    - cuda
+    start_step: 10
+    end_step: 10
+    ranks:
+    - 0
+    gen_shape: false
+  optim:
+    name: distributed_fused_adam
+    lr: 0.0003
+    weight_decay: 0.1
+    betas:
+    - 0.9
+    - 0.95
+    bucket_cap_mb: 75
+    overlap_grad_sync: true
+    overlap_param_sync: true
+    contiguous_grad_buffer: true
+    contiguous_param_buffer: true
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 2000
+      constant_steps: 0
+      min_lr: 2.9999999999999997e-05
+    grad_sync_dtype: bf16
+  data:
+    data_impl: mmap
+    splits_string: 99990,8,2
+    seq_length: 8192
+    skip_warmup: true
+    num_workers: 2
+    dataloader_type: single
+    reset_position_ids: true
+    reset_attention_mask: true
+    eod_mask_loss: false
+    index_mapping_dir: null
+    data_prefix:
+    - 0.5
+    - /${data_dir}/my-llama_00_text_document
+    - 0.5
+    - /${data_dir}/my-llama_01_text_document
+  defer_embedding_wgrad_compute: false
diff --git a/llama-benchmarking/llama3.1_24.11.1/setup.sh b/llama-benchmarking/llama3.1_24.11.1/setup.sh
new file mode 100644
index 0000000..409fe8e
--- /dev/null
+++ b/llama-benchmarking/llama3.1_24.11.1/setup.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# For each dataset a user elects to use, the user is responsible for
+# checking if the dataset license is fit for the intended purpose.
+
+#SBATCH --exclusive
+#SBATCH --mem=0
+#SBATCH --time=00:40:00
+
+set -eu -o pipefail
+
+export HF_TOKEN=${HF_TOKEN?"Required variable HF_TOKEN"}
+
+# create the staging folder
+mkdir -p $STAGE_PATH
+mkdir -p $STAGE_PATH/cfg
+
+# copy configs to the staging path
+cp -f llama3*.yaml "${STAGE_PATH}/cfg"
+
+cp -f gen*.sh configure.sh launch.sh $STAGE_PATH
+
+# create the squash file
+srun -N 1 -t 00:30:00 --pty bash -c "enroot import --output ${STAGE_PATH}/nvidia+nemo+24.09.sqsh docker://nvcr.io#nvidia/nemo:24.09"
+
+# copy the launcher configuration out of the container into $STAGE_PATH;
+# this is required for dataset generation.
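+# NOTE (assumption): the srun commands in this script rely on Slurm with the enroot/pyxis
+# container plugin (providing --container-image/--container-mounts) and on $STAGE_PATH
+# being on storage visible to the allocated node.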
+srun -N 1 -t 00:10:00 --container-mounts=$STAGE_PATH --container-image="$STAGE_PATH/nvidia+nemo+24.09.sqsh" bash -c "cp -r /opt/NeMo-Framework-Launcher/launcher_scripts $STAGE_PATH; cp /opt/NeMo-Framework-Launcher/requirements.txt $STAGE_PATH"
+
+# install required Python modules for generating dataset
+pip install -r "$STAGE_PATH/requirements.txt"
+
+# download Llama 3 tokenizer files from huggingface meta-llama/Meta-Llama-3-8B for data prep and training
+huggingface-cli download --force-download --local-dir "$STAGE_PATH/llama3.1-dataset/llama" meta-llama/Meta-Llama-3-8B config.json special_tokens_map.json tokenizer.json tokenizer_config.json generation_config.json
diff --git a/llama-benchmarking/run_training.sh b/llama-benchmarking/run_training.sh
new file mode 100644
index 0000000..dc09541
--- /dev/null
+++ b/llama-benchmarking/run_training.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+
+# Exit on error, undefined variables, and ensure pipe errors are caught
+set -euo pipefail
+
+###############################################
+# Environment Configuration
+###############################################
+
+# Model configuration
+: "${MODEL_SIZE:=8b}"    # Model size options: 8b, 70b, 405b
+: "${DTYPE:=bf16}"       # Data type options: fp8, bf16
+: "${NUM_GPUS:=8}"       # Number of GPUs for training
+
+# Training hyperparameters
+: "${GLOBAL_BATCH_SIZE:=128}"    # Total batch size across all GPUs
+: "${SEQ_LEN:=8192}"             # Sequence length for training
+: "${MAX_STEPS:=10}"             # Number of training steps (reduced for POC)
+
+# Data configuration
+: "${USE_SYNTHETIC_DATA:=true}"  # Set to false to use real dataset
+: "${DATA_DIR:=/inputs}"         # Directory containing the real dataset if used
+: "${RESULT_DIR:=/outputs}"      # Directory for outputs (logs, checkpoints)
+
+# Create output directory
+mkdir -p "${RESULT_DIR}"
+
+###############################################
+# Environment Settings for NeMo
+###############################################
+
+# These settings optimize NeMo's performance and behavior
+export PYTHONUNBUFFERED=1                 # Ensure Python output is unbuffered
+export TOKENIZERS_PARALLELISM=False       # Avoid tokenizer warnings
+export TRANSFORMERS_OFFLINE=1             # Don't try to download models
+export TORCH_NCCL_AVOID_RECORD_STREAMS=1  # NCCL optimization
+export CUDA_DEVICE_MAX_CONNECTIONS=1      # GPU connection optimization
+
+###############################################
+# Build Configuration Overrides
+###############################################
+
+# Start with empty config overrides
+CONFIG_OVERRIDES=""
+
+# Basic training configuration
+CONFIG_OVERRIDES+=" trainer.num_nodes=1"                           # Single node training
+CONFIG_OVERRIDES+=" trainer.devices=${NUM_GPUS}"                   # Number of GPUs to use
+CONFIG_OVERRIDES+=" trainer.max_steps=${MAX_STEPS}"                # Training duration
+CONFIG_OVERRIDES+=" model.global_batch_size=${GLOBAL_BATCH_SIZE}"  # Global batch size
+CONFIG_OVERRIDES+=" model.data.seq_length=${SEQ_LEN}"              # Sequence length
+CONFIG_OVERRIDES+=" run.results_dir=${RESULT_DIR}"                 # Output directory
+
+# Data configuration based on synthetic vs real data
+if [[ "${USE_SYNTHETIC_DATA}" == "true" ]]; then
+    echo "Using synthetic data for training"
+    CONFIG_OVERRIDES+=" model.data.data_impl=mock"    # Enable synthetic data
+    CONFIG_OVERRIDES+=" model.data.data_prefix=\"\""  # Empty prefix for synthetic
+else
+    echo "Using real data for training from ${DATA_DIR}"
+    CONFIG_OVERRIDES+=" model.data.data_impl=mmap"    # Memory-mapped real data
+    # Configure data prefix with two splits (standard configuration)
+    CONFIG_OVERRIDES+=" model.data.data_prefix=[\"0.5\",\"${DATA_DIR}/my-llama_00_text_document\",\"0.5\",\"${DATA_DIR}/my-llama_01_text_document\"]"
+fi
+
+###############################################
+# Select Configuration File
+###############################################
+
+CONFIG_NAME="llama3.1_${MODEL_SIZE}.yaml"
+
+###############################################
+# Verify Environment and Start Training
+###############################################
+
+# Print training configuration
+echo "Starting Llama 3.1 training with configuration:"
+echo "- Model size: ${MODEL_SIZE}"
+echo "- Data type: ${DTYPE}"
+echo "- Number of GPUs: ${NUM_GPUS}"
+echo "- Global batch size: ${GLOBAL_BATCH_SIZE}"
+echo "- Sequence length: ${SEQ_LEN}"
+echo "- Training steps: ${MAX_STEPS}"
+echo "- Using synthetic data: ${USE_SYNTHETIC_DATA}"
+
+# Verify critical paths and files
+echo -e "\nVerifying critical paths..."
+if [ ! -f "/workspace/cfg/${CONFIG_NAME}" ]; then
+    echo "ERROR: Config file not found: /workspace/cfg/${CONFIG_NAME}"
+    exit 1
+fi
+
+if [ ! -f "/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py" ]; then
+    echo "ERROR: NeMo training script not found"
+    exit 1
+fi
+
+# Check for dataset only if using real data
+if [[ "${USE_SYNTHETIC_DATA}" == "false" ]]; then
+    if ! ls "${DATA_DIR}"/my-llama_00_text_document* >/dev/null 2>&1; then
+        echo "WARNING: Dataset files not found in ${DATA_DIR}"
+        echo "Please ensure dataset is properly mounted when using real data"
+    fi
+fi
+
+# Start training
+echo -e "\nLaunching training..."
+python3 /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \
+    --config-path=/workspace/cfg \
+    --config-name="${CONFIG_NAME}" \
+    ${CONFIG_OVERRIDES}
+
+echo -e "\nTraining completed successfully!"
\ No newline at end of file
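
A minimal smoke-test invocation of the container added by this patch is sketched below. The `llama-benchmark` image tag, the host `./outputs` mount, and a single 8-GPU host with the NVIDIA Container Toolkit are assumptions for illustration, not part of the recipe itself; the environment variables shown simply restate the script's defaults.

```shell
# Build the image from the llama-benchmarking directory added by this patch
docker build -t llama-benchmark llama-benchmarking/

# Run a short synthetic-data benchmark; logs and results land in ./outputs on the host
docker run --rm --gpus all \
  -e MODEL_SIZE=8b -e DTYPE=bf16 -e NUM_GPUS=8 \
  -e USE_SYNTHETIC_DATA=true -e MAX_STEPS=10 \
  -v "$PWD/outputs:/outputs" \
  llama-benchmark /workspace/run_training.sh
```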