diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..f3dda8c --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,32 @@ + +*Description:* + +*Issue #, sim, or t.corp if available:* + +* Link to RTD for my changes: https://github.com/aws-neuron/aws-neuron-samples-staging/YOUR_BRANCH_NAME/ + +* Submitter Checklist + * Tested on: Neuron SDK, release_version, Instance_type. + * I've completely filled out the form above! + **(MANDATORY) PR needs test run output** + + * I have provided the output with expected metrics in a metrics.json file + + * I have attached metrics.json in the PR + + * I have attached golden_step_loss.txt + + * I have added a screenshot of the plotted loss curve + + * (If applicable) I've automated a test to safeguard my changes from regression. + * (If applicable) I've posted test collateral to prove my change was effective and not harmful. + * (If applicable) I've added someone from QA to the list of reviewers. Do this if you didn't make an automated test or if you feel it's appropriate for another reason. + * (If applicable) I've reviewed the licenses of updated and new binaries and their dependencies to make sure all licenses are on the pre-approved Amazon license list. +* Reviewer Checklist + * I've verified the changes render correctly on RTD (link above) + * I've ensured the submitter completed the form + * (If appropriate) I've verified the metrics.json file provided by the submitter + + + + diff --git a/.github/workflows/aggregate-prs.yml b/.github/workflows/aggregate-prs.yml index a98f0ea..1d06538 100644 --- a/.github/workflows/aggregate-prs.yml +++ b/.github/workflows/aggregate-prs.yml @@ -1,19 +1,20 @@ -name: Aggregate PRs into Staging Branch for Automated Testing +name: Merge PR into Dynamic Branch on Label -on: - pull_request: - types: [opened, reopened, synchronize, closed] +on: + pull_request_target: + types: [labeled, synchronize] branches: - master jobs: - merge-to-target: - if: github.event.pull_request.state == 'open' && !github.event.pull_request.draft + merge-to-dynamic-branch: + if: github.event.label.name != 'do-not-merge' # Excludes PRs labeled with do-not-merge runs-on: ubuntu-latest steps: - name: Checkout Repository uses: actions/checkout@v2 with: + ref: ${{ github.event.pull_request.head.ref }} fetch-depth: 0 - name: Configure Git @@ -21,16 +22,43 @@ jobs: git config user.name "GitHub Actions" git config user.email "actions@github.com" - - name: Merge PR into Testing Branch + - name: Check PR Labels and Merge for New Commit Events + if: github.event.action == 'synchronize' run: | - git fetch origin - git checkout -b testing origin/testing - git merge ${{ github.event.pull_request.head.sha }} --no-ff --no-commit - git commit -m "Merged PR #${{ github.event.pull_request.number }}" - git push origin testing + LABELS_JSON=$(gh pr view ${{ github.event.pull_request.number }} --json labels) + LABELS=$(echo "$LABELS_JSON" | jq -r '.labels[].name') + for LABEL_BRANCH in $LABELS; do + # Check if the branch exists + if git show-ref --verify --quiet refs/heads/$LABEL_BRANCH; then + echo "Branch $LABEL_BRANCH already exists." + else + echo "Branch $LABEL_BRANCH does not exist, creating it."
+ git branch $LABEL_BRANCH origin/master + fi + git checkout $LABEL_BRANCH + + # Merge PR changes into dynamic branch + git merge ${{ github.event.pull_request.head.sha }} --no-ff --no-commit + git commit -m "Merged PR #${{ github.event.pull_request.number }} due to new commits on labeled PR" + git push origin $LABEL_BRANCH + done + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Cleanup if PR Closed - if: github.event.action == 'closed' + - name: Merge for Labeled Event + if: github.event.action == 'labeled' run: | - # Add commands to reset or clean up target branch - # Example: git reset --hard origin/master + LABEL_BRANCH=${{ github.event.label.name }} + # Check if the branch exists + if git show-ref --verify --quiet refs/heads/$LABEL_BRANCH; then + echo "Branch $LABEL_BRANCH already exists." + else + echo "Branch $LABEL_BRANCH does not exist, creating it." + git branch $LABEL_BRANCH origin/master + fi + git checkout $LABEL_BRANCH + + # Merge PR changes into dynamic branch + git merge ${{ github.event.pull_request.head.sha }} --no-ff --no-commit + git commit -m "Merged PR #${{ github.event.pull_request.number }} due to label '$LABEL_BRANCH'" + git push origin $LABEL_BRANCH diff --git a/torch-neuronx/README.md b/torch-neuronx/README.md index 576081e..e3d0689 100644 --- a/torch-neuronx/README.md +++ b/torch-neuronx/README.md @@ -20,10 +20,10 @@ The following samples are available for training: | [hf_bert_jp](training/hf_bert_jp) | Fine-tuning & Deployment Hugging Face BERT Japanese model | DataParallel | | [hf_sentiment_analysis](training/hf_sentiment_analysis) | Examples of training Hugging Face bert-base-cased model for a text classification task with Trn1 Single Neuron and Distributed Training | DataParallel | | [customop_mlp](training/customop_mlp) | Examples of training a multilayer perceptron model with a custom Relu operator on a single Trn1 | DataParallel | -| [tp_dp_gpt_neox_20b_hf_pretrain](training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain) [Deprecated] | Please note the following sample location has changed to [NeuronX Distributed Repository](https://github.com/aws-neuron/neuronx-distributed). Training GPT-NEOX 20B model using neuronx-distributed | Tensor Parallel & DataParallel | -| [tp_dp_gpt_neox_6.9b_hf_pretrain](training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain) [Deprecated] | Please note the following sample location has changed to [NeuronX Distributed Repository](https://github.com/aws-neuron/neuronx-distributed). Training GPT-NEOX 6.9B model using neuronx-distributed | Tensor Parallel & DataParallel | -| [tp_zero1_llama2_7b_hf_pretrain](training/llama2/tp_zero1_llama2_7b_hf_pretrain) [Deprecated] | Please note the following sample location has changed to [NeuronX Distributed Repository](https://github.com/aws-neuron/neuronx-distributed). Training Llama-2 7B model using neuronx-distributed | Tensor Parallel | -| [tp_pp_llama2_70b_hf_pretrain](training/llama2/tp_pp_llama2_70b_hf_pretrain) [Deprecated] | Please note the following sample location has changed to [NeuronX Distributed Repository](https://github.com/aws-neuron/neuronx-distributed). 
Training Llama-2 70B model using neuronx-distributed | Tensor Parallel & Pipeline Parallel | +| [tp_dp_gpt_neox_20b_hf_pretrain](https://github.com/aws-neuron/neuronx-distributed/tree/main/examples/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain) | Training GPT-NEOX 20B model using neuronx-distributed | Tensor Parallel & DataParallel | +| [tp_dp_gpt_neox_6.9b_hf_pretrain](https://github.com/aws-neuron/neuronx-distributed/tree/main/examples/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain) | Training GPT-NEOX 6.9B model using neuronx-distributed | Tensor Parallel & DataParallel | +| [tp_zero1_llama2_7b_hf_pretrain](https://github.com/aws-neuron/neuronx-distributed/tree/main/examples/training/llama2/tp_zero1_llama2_7b_hf_pretrain) | Training Llama-2 7B model using neuronx-distributed | Tensor Parallel | +| [tp_pp_llama2_70b_hf_pretrain](https://github.com/aws-neuron/neuronx-distributed/tree/main/examples/training/llama2/tp_pp_llama2_hf_pretrain) | Training Llama-2 70B model using neuronx-distributed | Tensor Parallel & Pipeline Parallel | ## Inference diff --git a/torch-neuronx/inference/hf_pretrained_clip_base_inference_on_inf2.ipynb b/torch-neuronx/inference/hf_pretrained_clip_base_inference_on_inf2.ipynb index 6e80b5a..456303e 100644 --- a/torch-neuronx/inference/hf_pretrained_clip_base_inference_on_inf2.ipynb +++ b/torch-neuronx/inference/hf_pretrained_clip_base_inference_on_inf2.ipynb @@ -54,7 +54,7 @@ "source": [ "%env TOKENIZERS_PARALLELISM=True #Supresses tokenizer warnings making errors easier to detect\n", "# torchvision version pinned to avoid pulling in torch 2.0\n", - "!pip install -U transformers torchvision==0.14.1 opencv-python Pillow" + "!pip install -U transformers opencv-python Pillow" ] }, { diff --git a/torch-neuronx/inference/hf_pretrained_clip_large_inference_on_inf2.ipynb b/torch-neuronx/inference/hf_pretrained_clip_large_inference_on_inf2.ipynb index e0e365d..278117b 100644 --- a/torch-neuronx/inference/hf_pretrained_clip_large_inference_on_inf2.ipynb +++ b/torch-neuronx/inference/hf_pretrained_clip_large_inference_on_inf2.ipynb @@ -54,7 +54,7 @@ "source": [ "%env TOKENIZERS_PARALLELISM=True #Supresses tokenizer warnings making errors easier to detect\n", "# torchvision version pinned to avoid pulling in torch 2.0\n", - "!pip install -U transformers torchvision==0.14.1 opencv-python Pillow" + "!pip install -U transformers opencv-python Pillow" ] }, { diff --git a/torch-neuronx/inference/hf_pretrained_perceiver_multimodal_inference.ipynb b/torch-neuronx/inference/hf_pretrained_perceiver_multimodal_inference.ipynb index 88135f0..a99e04a 100644 --- a/torch-neuronx/inference/hf_pretrained_perceiver_multimodal_inference.ipynb +++ b/torch-neuronx/inference/hf_pretrained_perceiver_multimodal_inference.ipynb @@ -41,6 +41,7 @@ "- `opencv-python-headless`\n", "- `imageio`\n", "- `scipy`\n", + "- `accelerate`\n", "Furthermore, it requires the `ffmpeg` video-audio converter which is used to extract audio from the input videos.\n", "\n", "`torch-neuronx` and `neuronx-cc` should be installed when you configure your environment following the Inf2 setup guide. 
The remaining dependencies can be installed below:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%env TOKENIZERS_PARALLELISM=True #Supresses tokenizer warnings making errors easier to detect\n", - "!pip install transformers==4.30.2 opencv-python-headless==4.8.0.74 imageio scipy opencv-python==4.8.0.74\n", + "!pip install transformers==4.30.2 opencv-python-headless==4.8.0.74 imageio scipy accelerate opencv-python==4.8.0.74\n", "\n", "!wget https://johnvansickle.com/ffmpeg/builds/ffmpeg-git-amd64-static.tar.xz\n", "!tar xvf ffmpeg-git-amd64-static.tar.xz\n", diff --git a/torch-neuronx/inference/hf_pretrained_sdxl_base_1024_inference.ipynb b/torch-neuronx/inference/hf_pretrained_sdxl_base_1024_inference.ipynb index 7ebe899..b860e5a 100644 --- a/torch-neuronx/inference/hf_pretrained_sdxl_base_1024_inference.ipynb +++ b/torch-neuronx/inference/hf_pretrained_sdxl_base_1024_inference.ipynb @@ -34,13 +34,12 @@ "metadata": {}, "source": [ "**Install Dependencies**\n", - "\n", "This tutorial requires the following pip packages to be installed:\n", "- `torch-neuronx`\n", "- `neuronx-cc`\n", - "- `diffusers==0.20.0`\n", - "- `transformers==4.26.1`\n", - "- `accelerate==0.16.0`\n", + "- `diffusers==0.20.2`\n", + "- `transformers==4.33.1`\n", + "- `accelerate==0.22.0`\n", "- `matplotlib`\n", "\n", "`torch-neuronx` and `neuronx-cc` will be installed when you configure your environment following the Inf2 setup guide. The remaining dependencies can be installed below:" ] }, { @@ -53,7 +52,7 @@ "outputs": [], "source": [ "%env TOKENIZERS_PARALLELISM=True #Supresses tokenizer warnings making errors easier to detect\n", - "!pip install diffusers==0.20.0 transformers==4.26.1 accelerate==0.16.0 matplotlib" + "!pip install diffusers==0.20.2 transformers==4.33.1 accelerate==0.22.0 matplotlib" ] }, { @@ -79,7 +78,8 @@ "from diffusers import DiffusionPipeline\n", "from diffusers.models.unet_2d_condition import UNet2DConditionOutput\n", "from diffusers.models.attention_processor import Attention\n", - " \n", + "from transformers.models.clip.modeling_clip import CLIPTextModelOutput\n", + "\n", "from matplotlib import pyplot as plt\n", "from matplotlib import image as mpimg\n", "import time\n", @@ -96,12 +96,12 @@ "source": [ "**Define utility classes and functions**\n", "\n", - "The following section defines some utility classes and functions. In particular, we define a double-wrapper for the UNet. These wrappers enable `torch_neuronx.trace` to trace the wrapped models for compilation with the Neuron compiler. In addition, the `get_attention_scores_neuron` utility function performs optimized attention score calculation and is used to replace the origianl `get_attention_scores` function in the `diffusers` package via a monkey patch (see the next code block under \"Compile UNet and save\" for usage)." + "The following section defines some utility classes and functions. In particular, we define a double-wrapper for the UNet and text encoders. These wrappers enable `torch_neuronx.trace` to trace the wrapped models for compilation with the Neuron compiler. The second wrapper enables the compiled model (which is a TorchScript object and so loses the pre-compilation attributes) to be used in the pipeline without having to modify the pipeline source code. In addition, the `get_attention_scores_neuron` utility function performs optimized attention score calculation and is used to replace the original `get_attention_scores` function in the `diffusers` package via a monkey patch (see the next code block under \"Compile UNet and save\" for usage)." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -160,7 +160,29 @@ " encoder_hidden_states,\n", " added_cond_kwargs[\"text_embeds\"],\n", " added_cond_kwargs[\"time_ids\"])[0]\n", - " return UNet2DConditionOutput(sample=sample)" + " return UNet2DConditionOutput(sample=sample)\n", + " \n", + "\n", + "class TextEncoderOutputWrapper(nn.Module):\n", + " def __init__(self, traceable_text_encoder, original_text_encoder):\n", + " super().__init__()\n", + " self.traceable_text_encoder = traceable_text_encoder\n", + " self.config = original_text_encoder.config\n", + " self.dtype = original_text_encoder.dtype\n", + " self.device = original_text_encoder.device\n", + "\n", + " def forward(self, text_input_ids, output_hidden_states=True):\n", + " out_tuple = self.traceable_text_encoder(text_input_ids)\n", + " return CLIPTextModelOutput(text_embeds=out_tuple[0], last_hidden_state=out_tuple[1], hidden_states=out_tuple[2])\n", + " \n", + "class TraceableTextEncoder(nn.Module):\n", + " def __init__(self, text_encoder):\n", + " super().__init__()\n", + " self.text_encoder = text_encoder\n", + "\n", + " def forward(self, text_input_ids):\n", + " out_tuple = self.text_encoder(text_input_ids, output_hidden_states=True, return_dict=False)\n", + " return out_tuple" ] }, { @@ -171,9 +193,10 @@ "**Compile the model into an optimized TorchScript and save the TorchScript**\n", "\n", "In the following section, we will compile parts of the Stable Diffusion pipeline for execution on Neuron. Note that this only needs to be done once: After you have compiled and saved the model by running the following section of code, you can reuse it any number of times without having to recompile. In particular, we will compile:\n", - "1. The VAE decoder;\n", - "2. The UNet, and\n", - "3. The VAE_post_quant_conv\n", + "1. The text encoders (text_encoder, text_encoder_2);\n", + "2. The VAE decoder;\n", + "3. The UNet, and\n", + "4.
The VAE_post_quant_conv\n", "These blocks are chosen because they represent the bulk of the compute in the pipeline, and performance benchmarking has shown that running them on Neuron yields significant performance benefit.\n", "\n", "Several points worth noting are:\n", @@ -193,6 +216,58 @@ "# Model ID for SD XL version pipeline\n", "model_id = \"stabilityai/stable-diffusion-xl-base-1.0\"\n", "\n", + "# --- Compile Text Encoders and save ---\n", + "\n", + "pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float32)\n", + "\n", + "\n", + "# Apply wrappers to make text encoders traceable\n", + "traceable_text_encoder = copy.deepcopy(TraceableTextEncoder(pipe.text_encoder))\n", + "traceable_text_encoder_2 = copy.deepcopy(TraceableTextEncoder(pipe.text_encoder_2))\n", + "\n", + "del pipe\n", + "\n", + "text_input_ids_1 = torch.tensor([[49406, 736, 1615, 49407, 49407, 49407, 49407, 49407, 49407, 49407,\n", + " 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,\n", + " 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,\n", + " 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,\n", + " 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,\n", + " 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,\n", + " 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,\n", + " 49407, 49407, 49407, 49407, 49407, 49407, 49407]])\n", + "\n", + "\n", + "text_input_ids_2 = torch.tensor([[49406, 736, 1615, 49407, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0]])\n", + "\n", + "\n", + "# Text Encoder 1\n", + "neuron_text_encoder = torch_neuronx.trace(\n", + " traceable_text_encoder,\n", + " text_input_ids_1,\n", + " compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder'),\n", + ")\n", + "\n", + "text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt')\n", + "torch.jit.save(neuron_text_encoder, text_encoder_filename)\n", + "\n", + "\n", + "# Text Encoder 2\n", + "neuron_text_encoder_2 = torch_neuronx.trace(\n", + " traceable_text_encoder_2,\n", + " text_input_ids_2,\n", + " compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder_2'),\n", + ")\n", + "\n", + "text_encoder_2_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder_2/model.pt')\n", + "torch.jit.save(neuron_text_encoder_2, text_encoder_2_filename)\n", + "\n", "# --- Compile VAE decoder and save ---\n", "\n", "# Only keep the model being compiled in RAM to minimze memory pressure\n", @@ -296,14 +371,16 @@ }, "outputs": [], "source": [ - "# --- Load all compiled models ---\n", + "# --- Load all compiled models and run pipeline ---\n", "COMPILER_WORKDIR_ROOT = 'sdxl_compile_dir_1024'\n", "model_id = \"stabilityai/stable-diffusion-xl-base-1.0\"\n", + "text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt')\n", + "text_encoder_2_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder_2/model.pt')\n", "decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder/model.pt')\n", "unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt')\n", "post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt')\n", "\n", - "pipe = DiffusionPipeline.from_pretrained(model_id, 
torch_dtype=torch.float32, low_cpu_mem_usage=True)\n", + "pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float32)\n", "\n", "# Load the compiled UNet onto two neuron cores.\n", "pipe.unet = NeuronUNet(UNetWrap(pipe.unet))\n", @@ -313,6 +390,8 @@ "# Load other compiled models onto a single neuron core.\n", "pipe.vae.decoder = torch.jit.load(decoder_filename)\n", "pipe.vae.post_quant_conv = torch.jit.load(post_quant_conv_filename)\n", + "pipe.text_encoder = TextEncoderOutputWrapper(torch.jit.load(text_encoder_filename), pipe.text_encoder)\n", + "pipe.text_encoder_2 = TextEncoderOutputWrapper(torch.jit.load(text_encoder_2_filename), pipe.text_encoder_2)\n", "\n", "# Run pipeline\n", "prompt = [\"a photo of an astronaut riding a horse on mars\",\n", diff --git a/torch-neuronx/training/aws-batch/llama2/docker/llama_batch_training.sh b/torch-neuronx/training/aws-batch/llama2/docker/llama_batch_training.sh index 476ee64..7301a24 100644 --- a/torch-neuronx/training/aws-batch/llama2/docker/llama_batch_training.sh +++ b/torch-neuronx/training/aws-batch/llama2/docker/llama_batch_training.sh @@ -62,9 +62,6 @@ OUTPUT_DIR="/llama_checkpoints" CURRENT_BATCH_JOB_ID=$(echo "$AWS_BATCH_JOB_ID" | sed 's/#.*//') CHECKPOINT_PATH="$CHECKPOINT_SAVE_URI$CURRENT_BATCH_JOB_ID" -NODE_ID=0 -WORLD_SIZE=1 - if [ -v AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS ] then export MASTER_ADDR=$AWS_BATCH_JOB_MAIN_NODE_PRIVATE_IPV4_ADDRESS @@ -72,7 +69,7 @@ else export MASTER_ADDR=`ip -f inet addr show eth0 | grep -Po 'inet \K[\d.]+'` fi -DP=$(($NEURON_RT_NUM_CORES * $WORLD_SIZE / $TP_DEGREE)) +DP=$(($NEURON_RT_NUM_CORES * $NTASKS / $TP_DEGREE)) ACC_STEPS=$(($GBS / $MBS / $DP)) EXTRA_ARGS=" " diff --git a/torch-neuronx/training/hf_image_classification/VisionPerceiverConv.ipynb b/torch-neuronx/training/hf_image_classification/VisionPerceiverConv.ipynb index 5cdec2d..47c2e5a 100644 --- a/torch-neuronx/training/hf_image_classification/VisionPerceiverConv.ipynb +++ b/torch-neuronx/training/hf_image_classification/VisionPerceiverConv.ipynb @@ -190,4 +190,3 @@ "nbformat": 4, "nbformat_minor": 2 } - diff --git a/torch-neuronx/training/hf_summarization/BartLarge.ipynb b/torch-neuronx/training/hf_summarization/BartLarge.ipynb index 3989c0c..c686fd7 100644 --- a/torch-neuronx/training/hf_summarization/BartLarge.ipynb +++ b/torch-neuronx/training/hf_summarization/BartLarge.ipynb @@ -38,7 +38,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install -U optimum-neuron==0.0.15 accelerate==0.23.0 datasets>=1.8.0 sentencepiece!=0.1.92 protobuf==3.20.3 rouge-score nltk py7zr evaluate\n", + "%pip install -U optimum-neuron==0.0.15 accelerate==0.23.0 datasets>=1.8.0 sentencepiece!=0.1.92 rouge-score nltk py7zr evaluate\n", "# now restart the kernel" ] }, diff --git a/torch-neuronx/training/hf_summarization/T5Large.ipynb b/torch-neuronx/training/hf_summarization/T5Large.ipynb index c312547..c770160 100644 --- a/torch-neuronx/training/hf_summarization/T5Large.ipynb +++ b/torch-neuronx/training/hf_summarization/T5Large.ipynb @@ -38,7 +38,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install -U optimum-neuron==0.0.15 accelerate==0.23.0 datasets>=1.8.0 sentencepiece!=0.1.92 protobuf==3.20.3 rouge-score nltk py7zr evaluate\n", + "%pip install -U optimum-neuron==0.0.15 accelerate==0.23.0 datasets>=1.8.0 sentencepiece!=0.1.92 rouge-score nltk py7zr evaluate\n", "# now restart the kernel" ] }, diff --git a/torch-neuronx/training/hf_text_classification/XlmRobertaBase.ipynb 
b/torch-neuronx/training/hf_text_classification/XlmRobertaBase.ipynb index a1f7d4d..d52e09c 100644 --- a/torch-neuronx/training/hf_text_classification/XlmRobertaBase.ipynb +++ b/torch-neuronx/training/hf_text_classification/XlmRobertaBase.ipynb @@ -12,7 +12,7 @@ "1. First compile the model using the utility `neuron_parallel_compile` to compile the model to run on the AWS Trainium device.\n", "1. Run the fine-tuning script to train the model based on the associated task (e.g. mrpc). The training job will use 2 workers with data parallel to speed up the training. If you have a larger instance (trn1.32xlarge) you can increase the worker count to 8 or 32.\n", "\n", - "It has been tested and run on a trn1.2xlarge\n", + "It has been tested and run on a trn1.32xlarge\n", "\n", "**Reference:** https://huggingface.co/xlm-roberta-base" ] @@ -73,13 +73,13 @@ "outputs": [], "source": [ "model_name = \"xlm-roberta-base\"\n", - "env_var_options = \"\"\n", - "num_workers = 2\n", + "env_var_options = \"XLA_USE_BF16=1 NEURON_CC_FLAGS=\\'--model-type=transformer --verbose=info\\'\"\n", + "num_workers = 32\n", "task_name = \"mrpc\"\n", - "batch_size = 8\n", - "max_seq_length = 128\n", + "batch_size = 16\n", + "max_seq_length = 512\n", "learning_rate = 2e-05\n", - "num_train_epochs = 5\n", + "num_train_epochs = 100\n", "model_base_name = model_name" ] }, diff --git a/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/activation_checkpoint.py b/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/activation_checkpoint.py deleted file mode 100644 index 48ba1a4..0000000 --- a/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/activation_checkpoint.py +++ /dev/null @@ -1,115 +0,0 @@ -from typing import Any, Dict, Iterator, Tuple -import torch.nn as nn - -import torch -from torch_xla.utils.checkpoint import checkpoint as torch_checkpoint -from neuronx_distributed.parallel_layers.parallel_state import rmsg -from neuronx_distributed.utils.logger import get_logger -from torch.distributed.utils import _replace_by_prefix - -logger = get_logger() - -_CHECKPOINT_WRAPPED_MODULE = "mod" -_CHECKPOINT_PREFIX = _CHECKPOINT_WRAPPED_MODULE + "." - -class CheckPointWrapper(torch.nn.Module): - def __init__(self, mod) -> None: - super().__init__() - self.mod = mod - # state_dict post hook to remove prefix to allow loading into a - # non-checkpoint wrapped module. - self._register_state_dict_hook(self._post_state_dict_hook) - # load_state_dict pre-hook to allow loading back into - # checkpoint-wrapped module. - self._register_load_state_dict_pre_hook( - self._pre_load_state_dict_hook, with_module=True - ) - - - def forward(self, *args, **kwargs): - ordered_args = list(args) - for value in kwargs.values(): - ordered_args += [value] - - # Note: checkpoint cannot accept kwargs - return torch_checkpoint(self.mod, *ordered_args, use_reentrant=True) - - def named_parameters( - self, - *args, - **kwargs, - ) -> Iterator[Tuple[str, torch.nn.Parameter]]: - """ - Overrides :meth:`named_parameters()` to intercept parameter names and - remove all occurrences of ``_CHECKPOINT_PREFIX``. 
- """ - for param_name, param in super().named_parameters(*args, **kwargs): - updated_name = param_name.replace(_CHECKPOINT_PREFIX, "") - yield updated_name, param - - def named_modules(self,*args,**kwargs): - for module_name, module in super().named_modules(*args, **kwargs): - updated_name = module_name.replace(_CHECKPOINT_PREFIX, "") - yield updated_name, module - - @staticmethod - def _post_state_dict_hook( - module: nn.Module, - state_dict: Dict[str, Any], - prefix: str, - *args: Any, - ) -> Dict[str, Any]: - """ - _post_state_dict_hook() is called after the state_dict() of this - FSDP module is executed. For ``checkpoint_wrapper``, it will strip - checkpoint-wrapped module prefix so that this module can be loaded into - non-checkpointed modules. It would still be able to be loaded into - checkpoint-wrapped modules as this class adds the prefix back before - loading the state_dict. - """ - _replace_by_prefix(state_dict, f"{prefix}{_CHECKPOINT_PREFIX}", prefix) - return state_dict - - @staticmethod - def _pre_load_state_dict_hook( - module: nn.Module, - state_dict: Dict[str, Any], - prefix: str, - *args: Any, - ) -> None: - """ - ``_pre_state_dict_hook` is called before ``self._load_from_state_dict()`` - is called. For ``checkpoint_wrapper``, it will add back the module - prefix so that non-checkpointed modules can be loaded into - checkpoint_wrapper modules properly. - """ - _replace_by_prefix(state_dict, prefix, prefix + f"{_CHECKPOINT_PREFIX}") - - - -def apply_checkpoint(dist_model, layers_to_checkpoint=None): - checkpoint_wrapper_added = False - if layers_to_checkpoint is not None and len(layers_to_checkpoint) == 0: - raise RuntimeError( - rmsg(f"invalid input layers_to_checkpoint {layers_to_checkpoint}, can't be empty") - ) - for name, module in dist_model.local_module.named_children(): - # checkpoint layers that are provided in input - # if layers not provide in input, then checkpoint if it is transformer layer - if (layers_to_checkpoint and name in layers_to_checkpoint) or ( - not layers_to_checkpoint and type(module) == dist_model.transformer_layer_cls - ): - # add_module replaces old module with our own custom module. - # https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module.add_module - dist_model.local_module.add_module(name, CheckPointWrapper(module)) - checkpoint_wrapper_added = True - if layers_to_checkpoint is not None and not checkpoint_wrapper_added: - logger.warning( - rmsg(f"layers_to_checkpoint {layers_to_checkpoint} do not exist in the graph") - ) - elif layers_to_checkpoint is None and not checkpoint_wrapper_added: - logger.warning( - rmsg( - f"During applying activation checkpointing, transformer_layer_cls {dist_model.transformer_layer_cls.__name__} can not be found in stage {dist_model.pipeline_parallel_rank}, skipping..." 
- ) - ) \ No newline at end of file diff --git a/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/config.json b/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/config.json deleted file mode 100644 index 740e605..0000000 --- a/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/config.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 8192, - "initializer_range": 0.02, - "intermediate_size": 28672, - "max_position_embeddings": 4096, - "model_type": "llama", - "num_attention_heads": 64, - "num_hidden_layers": 80, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "tie_word_embeddings": false, - "torch_dtype": "float16", - "transformers_version": "4.31.0", - "use_cache": true, - "vocab_size": 32000, - "sequence_parallel_enabled": false, - "selective_checkpoint_enabled": false, - "move_model_to_device":false -} - \ No newline at end of file diff --git a/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/lr.py b/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/lr.py deleted file mode 100644 index 9715509..0000000 --- a/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/lr.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# CosineAnnealing lr scheduler adopted from Nemo -# https://github.com/NVIDIA/NeMo/blob/main/nemo/core/optim/lr_scheduler.py#L403 -# Used for benchmarking with Nemo - -from torch.optim.lr_scheduler import _LRScheduler -import warnings -import math - -class WarmupAnnealHoldPolicy(_LRScheduler): - """Adds warmup kwargs and warmup logic to lr policy. - All arguments should be passed as kwargs for clarity, - Args: - warmup_steps: Number of training steps in warmup stage - warmup_ratio: Ratio of warmup steps to total steps - max_steps: Total number of steps while training or `None` for - infinite training - min_lr: Minimum lr to hold the learning rate after decay at. - constant_steps: Number of steps to keep lr constant at. - constant_ratio: Ratio of steps to keep lr constant. - """ - - def __init__( - self, - optimizer, - *, - warmup_steps=None, - warmup_ratio=None, - constant_steps=None, - constant_ratio=None, - max_steps=None, - min_lr=0.0, - last_epoch=-1, - ): - assert not ( - warmup_steps is not None and warmup_ratio is not None - ), "Either use particular number of step or ratio" - assert not ( - constant_steps is not None and constant_ratio is not None - ), "Either use constant_steps or constant_ratio" - assert warmup_ratio is None or max_steps is not None, "If there is a ratio, there should be a total steps" - - # It is necessary to assign all attributes *before* __init__, - # as class is wrapped by an inner class. 
- self.max_steps = max_steps - - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - self.warmup_steps = int(warmup_ratio * max_steps) - else: - self.warmup_steps = 0 - - if constant_steps is not None: - self.constant_steps = constant_steps - elif constant_ratio is not None: - self.constant_steps = int(constant_ratio * max_steps) - else: - self.constant_steps = 0 - - self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) - - self.min_lr = min_lr - super().__init__(optimizer, last_epoch) - - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn( - "To get the last learning rate computed by the scheduler, please use `get_last_lr()`.", UserWarning - ) - - step = self.last_epoch - - # Warmup steps - if self.warmup_steps > 0 and step <= self.warmup_steps: - return self._get_warmup_lr(step) - - # Constant steps after warmup and decay - if self.constant_steps > 0 and (self.warmup_steps + self.decay_steps) < step <= self.max_steps: - return self._get_constant_lr(step) - - # Min lr after max steps of updates - if step > self.max_steps: - return [self.min_lr for _ in self.base_lrs] - - return self._get_lr(step) - - def _get_warmup_lr(self, step): - lr_val = (step + 1) / (self.warmup_steps + 1) - return [initial_lr * lr_val for initial_lr in self.base_lrs] - - def _get_constant_lr(self, step): - return [self.min_lr for _ in self.base_lrs] - - def _get_lr(self, step): - """Simple const lr policy""" - return self.base_lrs - -def _cosine_annealing(initial_lr, step, max_steps, min_lr): - mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) - out_lr = (initial_lr - min_lr) * mult + min_lr - return out_lr - -def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step, decay_steps, min_lr): - - assert max_lr > min_lr - # Use linear warmup for the initial part. - if warmup_steps > 0 and step <= warmup_steps: - return max_lr * float(step) / float(warmup_steps) - - # For any steps larger than `decay_steps`, use `min_lr`. - if step > warmup_steps + decay_steps: - return min_lr - - # If we are done with the warmup period, use the decay style. - num_steps_ = step - warmup_steps - decay_steps_ = decay_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = max_lr - min_lr - - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - - return min_lr + coeff * delta_lr - -class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, **kwargs) - - def _get_lr(self, step): - for initial_lr in self.base_lrs: - if initial_lr < self.min_lr: - raise ValueError( - f"{self} received an initial learning rate that was lower than the minimum learning rate." - ) - - if self.constant_steps is None or self.constant_steps == 0: - new_lrs = [ - _cosine_annealing( - initial_lr=initial_lr, - step=step - self.warmup_steps, - max_steps=self.max_steps - self.warmup_steps, - min_lr=self.min_lr, - ) - for initial_lr in self.base_lrs - ] - else: - new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) - return new_lrs - - def _get_warmup_lr(self, step): - if self.constant_steps is None or self.constant_steps == 0: - return super()._get_warmup_lr(step) - else: - # Use linear warmup for the initial part. 
- return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_constant_lr(self, step): - # Only called when `constant_steps` > 0. - return self._get_linear_warmup_with_cosine_annealing_lr(step) - - def _get_linear_warmup_with_cosine_annealing_lr(self, step): - # Cosine Schedule for Megatron LM, slightly different warmup schedule + constant LR at the end. - new_lrs = [ - _linear_warmup_with_cosine_annealing( - max_lr=self.base_lrs[0], - warmup_steps=self.warmup_steps, - step=step, - decay_steps=self.decay_steps, - min_lr=self.min_lr, - ) - for _ in self.base_lrs - ] - return new_lrs diff --git a/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/run_llama_70b_tp_pp.sh b/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/run_llama_70b_tp_pp.sh deleted file mode 100755 index ced30c9..0000000 --- a/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/run_llama_70b_tp_pp.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/bin/bash -set -ex - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -sudo sysctl -w net.ipv4.ip_local_reserved_ports=44000 - -export FI_EFA_USE_DEVICE_RDMA=1 -export FI_PROVIDER=efa -export FI_EFA_FORK_SAFE=1 - -export NEURON_FUSE_SOFTMAX=1 -export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=5 -export MALLOC_ARENA_MAX=128 -export XLA_DOWNCAST_BF16=1 -export NEURON_CC_FLAGS="--model-type=transformer --distribution-strategy=llm-training --enable-saturate-infinity --cache_dir=/home/ubuntu/cache_dir_neuron/" - -PROCESSES_PER_NODE=32 -WORLD_SIZE=1 -NODEID=0 -HOSTNAME=`hostname` -if [ -v SLURM_NTASKS ]; then - # SLURM runs - IPS="" - for h in $(scontrol show hostname); do - IPS="$IPS $(nslookup $h | awk '/^Address: / { print $2 }')"; - done - HOSTS=(${IPS//\ / }) - NODEID=$SLURM_NODEID - NTASKS=$SLURM_NTASKS - WORLD_SIZE=$SLURM_NTASKS - JOB_ID=$SLURM_JOB_ID - export NEMO_EXPM_VERSION=$SLURM_JOB_ID - export EXPLICIT_LOGDIR=null - LOG_PATH=logs/$SLURM_JOB_ID/$NODEID - - MASTER_ADDR=${HOSTS[0]} - MASTER_PORT=44000 - DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NTASKS --node_rank $NODEID --master_addr $MASTER_ADDR --master_port $MASTER_PORT" -else - DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE" - LOG_PATH=logs -fi -mkdir -p $LOG_PATH -echo "Nodeinfo NODEID $NODEID hostname $HOSTNAME" -echo $DISTRIBUTED_ARGS - -# Global batch size -GBS=512 -# Input sequence length -SEQ_LEN=4096 -# Pipeline parallel degree -PP_DEGREE=8 -# Tensor parallel degree -TP_DEGREE=8 -# Data paralell size -DP=$(($PROCESSES_PER_NODE * $WORLD_SIZE / $TP_DEGREE / $PP_DEGREE)) -# Batch size per model replica -BS=$(($GBS / $DP)) -# Number microbatches for pipeline execution -# Setting same as BS so each microbatch contains a single datasample -NUM_MICROBATCHES=$BS -DATA_PATH="~/examples_datasets/wikicorpus_llama2_7B_tokenized_4k" - - -if [ "$NEURON_EXTRACT_GRAPHS_ONLY" = "1" ]; then - max_steps=10 - tb_dir="~/tensorboard/llama70B_compile" -else - max_steps=30000 - tb_dir="~/tensorboard/llama70B_32nodes_${JOB_ID}" - mkdir -p $tb_dir -fi - -torchrun $DISTRIBUTED_ARGS run_llama_nxd.py \ - --train_batch_size $BS \ - --use_meta_device_init 1 \ - --training_dir $DATA_PATH \ - --training_config $SCRIPT_DIR \ - --max_steps $max_steps \ - --seq_len $SEQ_LEN \ - --pipeline_parallel_size $PP_DEGREE \ - --tensor_parallel_size $TP_DEGREE \ - --num_microbatches $NUM_MICROBATCHES \ - --lr 0.00015 \ - --min_lr 1e-05 \ - --beta1 0.9 \ - --beta2 0.95 \ - --weight_decay 0.1 \ - --warmup_steps 2000 \ - --constant_steps 0 \ - --use_zero1_optimizer 1 \ - 
--tb_dir $tb_dir |& tee $LOG_PATH/log \ No newline at end of file diff --git a/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/run_llama_nxd.py b/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/run_llama_nxd.py deleted file mode 100644 index 835fe3c..0000000 --- a/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/run_llama_nxd.py +++ /dev/null @@ -1,415 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Modifications Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import random -import time -import queue - -import numpy as np -import torch -import torch.distributed as dist -import torch_xla.core.xla_model as xm -import torch_xla.distributed.xla_multiprocessing as xmp -from neuronx_distributed.parallel_layers.parallel_state import ( - get_data_parallel_rank, - get_data_parallel_size, - get_tensor_model_parallel_rank, - initialize_model_parallel, -) -from neuronx_distributed.parallel_layers import ColumnParallelLinear, RowParallelLinear, ParallelEmbedding -from neuronx_distributed.parallel_layers.grads import clip_grad_norm -from neuronx_distributed.pipeline import NxDPPModel -from neuronx_distributed.parallel_layers import parallel_state -from neuronx_distributed.optimizer import NeuronZero1Optimizer -from neuronx_distributed.parallel_layers import mappings -from neuronx_distributed.parallel_layers.checkpointing import save, load -from neuronx_distributed.utils import model_utils -from transformers import LlamaConfig -import transformers.modeling_utils as modeling_utils -# For delayed parameter inititalization -# Check https://pytorch.org/torchdistx/latest/deferred_init.html -try: - from torchdistx import deferred_init -except ImportError: - deferred_init = None - -from modeling_llama_nxd import LlamaForCausalLM, LlamaRMSNorm, LlamaDecoderLayer -from adamw_fp32_optim_params import AdamW_FP32OptimParams -from activation_checkpoint import apply_checkpoint -from training_utils import get_param_groups_by_weight_decay, get_learning_rate_scheduler, create_llama_pretraining_dataset, create_partition - - -def allreduce_sequence_parallel_gradients(optimizer): - """ All-reduce layernorm parameters across model parallel nodes when sequence parallelism is used. 
- Modified from megatron-lm: - https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 - """ - from neuronx_distributed.parallel_layers.mappings import reduce_from_tensor_model_parallel_region - grads = [] - for param_group in optimizer.__getstate__()['param_groups']: - for group, params in param_group.items(): - if group == 'params': - for p in params: - if isinstance(p, torch.Tensor) and p.grad is not None: - sequence_parallel_param = getattr(p, 'sequence_parallel_enabled', False) - if sequence_parallel_param: - grads.append(p.grad.data) - xm.master_print("# sequence parallel parameters = ", len(grads)) - for grad in grads: - # sum v.s. average: sum - reduce_from_tensor_model_parallel_region(grad) - - - -def save_checkpoint(args, model, optimizer, lr_scheduler, batch_idx, total_steps): - """ - Save model/optimizer checkpoint with script level configs - """ - ckpt_start = time.time() - base_dir = f"{args.checkpoint_dir}/step{total_steps}" - local_state_dict = model.local_state_dict() - # save model states to disk - save(local_state_dict, f"{base_dir}/model", save_xser=args.save_load_xser>0) - local_state_dict = optimizer.state_dict() - # save optimizer states to disk - save(local_state_dict, f"{base_dir}/optimizer", save_xser=args.save_load_xser>0) - user_content = {"total_steps": total_steps, "lr_scheduler": lr_scheduler.state_dict(), "batch_idx": batch_idx, "cli_args": args.__dict__} - ckpt_end = time.time() - if torch.distributed.get_rank() == 0: - torch.save(user_content, f"{base_dir}/user_content.pt") - print(f"step {total_steps} checkpoint saved to {base_dir}, total time {ckpt_end-ckpt_start}s") - # Delete older checkpoints - if args.num_kept_checkpoint > 0: - import os, shutil - ckpt_to_del = f"{args.checkpoint_dir}/step{total_steps-args.checkpoint_freq*args.num_kept_checkpoint}" - if dist.get_rank() == 0 and os.path.exists(ckpt_to_del): - print(f"deleting old checkpoint {ckpt_to_del}") - shutil.rmtree(ckpt_to_del) - -class Throughput: - def __init__( - self, batch_size, world_size, grad_accum_usteps, moving_avg_window_size=10 - ): - self.seqs_per_iteration = batch_size * world_size * grad_accum_usteps - self.moving_avg_window_size = moving_avg_window_size - self.moving_avg_window = queue.Queue() - self.window_time = 0 - self.start_time = time.time() - - def get_throughput(self): - step_time = time.time() - self.start_time - self.start_time += step_time - self.window_time += step_time - self.moving_avg_window.put(step_time) - window_size = self.moving_avg_window.qsize() - if window_size > self.moving_avg_window_size: - self.window_time -= self.moving_avg_window.get() - window_size -= 1 - throughput = window_size * self.seqs_per_iteration / self.window_time - return throughput - -def train_llama(args): - if dist.get_rank() == 0: - print(f"args {args}") - torch.manual_seed(args.seed) - np.random.seed(args.seed) - random.seed(args.seed) - # Initialize model parallelism groups - initialize_model_parallel( - pipeline_model_parallel_size=args.pipeline_parallel_size, - tensor_model_parallel_size=args.tensor_parallel_size, - ) - dp_rank = get_data_parallel_rank() - dp_size = get_data_parallel_size() - tp_rank = get_tensor_model_parallel_rank() - # load the config file if resume from checkpoint - if args.loading_step != -1: - user_content = torch.load(f"{args.checkpoint_dir}/step{args.loading_step}/user_content.pt") - else: - user_content = None - - # Set up Llama config - config = 
LlamaConfig.from_pretrained(args.training_config) - config.use_cache = False - config.return_dict = False - config.sequence_parallel_enabled = args.use_sequence_parallel > 0 - config.selective_checkpoint_enabled = args.use_selective_checkpoint > 0 - config.max_position_embeddings = max(config.max_position_embeddings, args.seq_len) - if args.num_layer != -1: - config.num_hidden_layers = args.num_layer - if args.hidden_size != -1: - config.hidden_size = args.hidden_size - - # Create model with different options - # Either deferred_init or meta device initialization will be required to avoid host OOM for 70B model - if args.use_deferred_init > 0 and deferred_init is not None: - # Create model with PT's deferred initialization - # All tensors will be in fake tensor mode: https://pytorch.org/torchdistx/latest/fake_tensor.html - model = deferred_init.deferred_init(LlamaForCausalLM, config) - elif args.use_meta_device_init > 0: - # Create model on meta device - # Parameters will be meta tensors, so reinit will be required - # Buffers will be on CPU - def init_weights(module): - """ - Re-init weights after partition - Referred from HF transformers https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L690 - """ - if isinstance(module, torch.nn.Linear): - module.weight.data.normal_(mean=0.0, std=config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, torch.nn.Embedding): - module.weight.data.normal_(mean=0.0, std=config.initializer_range) - if module.padding_idx: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, LlamaRMSNorm): - module.weight.data.fill_(1.0) - elif isinstance(module, (ParallelEmbedding, RowParallelLinear, ColumnParallelLinear)): - module.init_weight_cpu() - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - with model_utils.init_on_device(device=torch.device("meta")): - model = LlamaForCausalLM(config) - else: - # Create model directly on host device - model = LlamaForCausalLM(config) - num_params = sum([np.prod(p.size()) for p in model.parameters()]) - if dist.get_rank() == 0: - print(f"# total parameters: {num_params}") - print(f"model config {config}") - # Create the PP partitions - pipeline_cuts = create_partition(config.num_hidden_layers, args.pipeline_parallel_size) - if torch.distributed.get_rank() == 0: - print(f"pipeline_cuts {pipeline_cuts}") - # Create NxD PP model, tracing and partition happens internally - model = NxDPPModel( - model, - transformer_layer_cls=LlamaDecoderLayer, - num_microbatches=args.num_microbatches, - output_loss_value_spec=(True, False), - input_names=["input_ids", "attention_mask", "labels"], - pipeline_cuts=pipeline_cuts, - trace_file_path=args.trace_file_path, - param_init_fn=init_weights if args.use_meta_device_init > 0 else None, - leaf_module_cls=[LlamaRMSNorm.__name__], - autowrap_modules=[mappings], - use_zero1_optimizer=args.use_zero1_optimizer > 0, - ) - # Applying activation checkpoint for transformer layers - if not config.selective_checkpoint_enabled: - apply_checkpoint(model) - # Only move the local module to device - # If either meta device/deferred init is used, only local module will be materialized - model.move_model_to_device() - - # Load the model weight - if args.loading_step != -1: - load(f"{args.checkpoint_dir}/step{args.loading_step}/model", model_or_optimizer=model, model_key=None, load_xser=args.save_load_xser>0) - elif args.pretrained_weight_dir is not None: - 
load(args.pretrained_weight_dir, model_or_optimizer=model, model_key=None, load_xser=args.save_load_xser>0, strict=False) - - param_groups = get_param_groups_by_weight_decay(model) - if args.use_zero1_optimizer > 0: - if args.use_fp32_optimizer > 0: - opt_cls = AdamW_FP32OptimParams - else: - opt_cls = torch.optim.AdamW - optimizer = NeuronZero1Optimizer( - param_groups, - opt_cls, - lr=args.lr, - pin_layout=False, - sharding_groups=parallel_state.get_data_parallel_group(as_list=True), - betas=(args.beta1, args.beta2), - weight_decay=args.weight_decay, - ) - elif args.use_fp32_optimizer > 0: - optimizer = AdamW_FP32OptimParams( - param_groups, betas=(args.beta1, args.beta2), lr=args.lr, weight_decay=args.weight_decay - ) - else: - optimizer = torch.optim.AdamW( - param_groups, betas=(args.beta1, args.beta2), lr=args.lr, weight_decay=args.weight_decay - ) - - if args.loading_step != -1: - load(f"{args.checkpoint_dir}/step{args.loading_step}/optimizer", model_or_optimizer=optimizer, model_key=None, load_xser=args.save_load_xser>0) - - lr_scheduler = get_learning_rate_scheduler(optimizer, args) - if args.loading_step != -1: - lr_scheduler_state = user_content["lr_scheduler"] - lr_scheduler.load_state_dict(lr_scheduler_state) - - train_dataloader = create_llama_pretraining_dataset(args.training_dir, args.train_batch_size, args.seed, dp_size, dp_rank) - if user_content is not None and "batch_idx" in user_content: - resume_batch_idx = user_content["batch_idx"] - else: - resume_batch_idx = None - - print("Creating sample dataloader finised") - - - total_steps = 0 if user_content is None else user_content["total_steps"] - # Only print/logging on the last PP rank of the first PP group - # Since loss is only in the last PP rank - should_print = ( - model.pipeline_parallel_rank == args.pipeline_parallel_size - 1 and dp_rank == 0 and tp_rank == 0 - ) - if should_print and args.tb_dir != "": - from torch.utils.tensorboard import SummaryWriter - tb_dir = args.tb_dir - import os - import shutil - - exist = os.path.exists(tb_dir) - if exist: - shutil.rmtree(tb_dir) - writer = SummaryWriter(log_dir=tb_dir) - else: - writer = None - - epoch = 0 - throughput = Throughput( - args.train_batch_size, dp_size, 1 - ) - while True: - if torch.distributed.get_rank() == 0: - print(f"Epoch {epoch}") - for batch_idx, batch in enumerate(train_dataloader): - if resume_batch_idx is not None and batch_idx <= resume_batch_idx: - if torch.distributed.get_rank() == 0: - print(f"skipping batch {batch_idx}") - continue - start = time.time() - input_ids = batch["input_ids"] - attention_mask = batch["attention_mask"] - labels = batch["labels"] - # Enavle auto-mix-precision if needed - with torch.autocast(enabled=args.use_amp > 0, dtype=torch.bfloat16, device_type="cuda"): - # Calling model.run_train instead of model forward to use the PP runtime - loss = model.run_train( - input_ids=input_ids, - attention_mask=attention_mask, - labels=labels, - ) - total_steps += 1 - if config.sequence_parallel_enabled: - allreduce_sequence_parallel_gradients(optimizer) - if args.use_zero1_optimizer == 0: - global_norm = clip_grad_norm(model.parameters(), 1.0) - else: - # Zero optimizer will take care of grad clipping - global_norm = None - optimizer.step() - optimizer.zero_grad() - lr_scheduler.step() - xm.mark_step() - if should_print: - end = time.time() - iteration_time = end - start - tps = throughput.get_throughput() - print( - f"step {total_steps} step_time {iteration_time}s throughput {tps} seq/s loss {loss.detach().cpu().item()} grad 
norm {global_norm.item() if global_norm is not None else None}" - ) - if writer is not None: - current_lr = lr_scheduler.get_lr()[0] - writer.add_scalar("loss", loss.item(), total_steps) - if global_norm is not None: - writer.add_scalar( - "global_norm", global_norm.item(), total_steps - ) - writer.add_scalar("lr", current_lr, total_steps) - writer.add_scalar("iteration_time", iteration_time, total_steps) - writer.add_scalar("throughput", tps, total_steps) - writer.add_scalar( - "input_ids", - torch.sum(input_ids.detach().cpu()).item(), - total_steps, - ) - # Saving checkpoints - if (args.checkpoint_freq > 0) and (total_steps % args.checkpoint_freq == 0): - save_checkpoint(args, model, optimizer, lr_scheduler, batch_idx, total_steps) - if total_steps >= args.max_steps: - break - - if total_steps >= args.max_steps: - break - epoch += 1 - - print("Training finished successfully") - -def _mp_fn(index, args): - train_llama(args) - xm.rendezvous("_mp_fn finished") - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--num_microbatches", type=int, default=8, help="num_microbatches") - parser.add_argument("--tensor_parallel_size", type=int, default=8, help="tensor_parallel_size") - parser.add_argument("--num_layer", type=int, default=-1, help="override model number of layers") - parser.add_argument("--hidden_size", type=int, default=-1, help="override model model hidden size") - parser.add_argument("--train_batch_size", type=int, default=16, help="batch size") - parser.add_argument("--pipeline_parallel_size", type=int, default=1, help="PP size") - parser.add_argument("--seq_len", type=int, default=4096, help="PP size") - parser.add_argument("--training_dir", type=str, default=None) - parser.add_argument("--training_config", type=str, default=None) - parser.add_argument("--trace_file_path", type=str, default=None) - parser.add_argument("--tb_dir", type=str, default="") - parser.add_argument("--max_steps", type=int, default=100, help="max steps") - parser.add_argument("--checkpoint_freq", type=int, default=100000, help="save checkpoint freq") - parser.add_argument("--checkpoint_dir", type=str, default=None) - parser.add_argument("--loading_step", type=int, default=-1, help="load from step, -1 means no load") - parser.add_argument("--num_kept_checkpoint", type=int, default=-1, help="number of checkpoints kept, old checkpoint will get deleted") - parser.add_argument("--save_load_xser", type=int, default=1, help="save/load with xla serialization") - parser.add_argument("--pretrained_weight_dir", type=str, default=None, help="Load dir of pretrained weight") - - # optimization - opt_grp = parser.add_argument_group(title="optimization", description="arguments for optimization") - opt_grp.add_argument("--weight_decay", default=0.01, type=float, help="weight decay") - opt_grp.add_argument("--beta1", default=0.9, type=float, help="beta1 parameter for Adam optimizer") - opt_grp.add_argument("--beta2", default=0.95, type=float, help="beta2 parameter for Adam optimizer") - opt_grp.add_argument("--use_fp32_optimizer", default=0, type=int, help="use_fp32_optimizer") - opt_grp.add_argument("--use_zero1_optimizer", default=0, type=int, help="use_zero1_optimizer") - opt_grp.add_argument("--seed", default=1234, type=int, help="random seed") - opt_grp.add_argument("--use_amp", default=0, type=int, help="use amp data") - opt_grp.add_argument("--use_deferred_init", default=0, type=int, help="use torchdistx deferred initialization") - opt_grp.add_argument("--use_meta_device_init", 
default=0, type=int, help="use meta device initialization") - opt_grp.add_argument("--use_selective_checkpoint", default=0, type=int, help="enable selective activation checkpointing") - opt_grp.add_argument("--use_sequence_parallel", default=1, type=int, help="enable sequence parallelism") - - # learning rate - lr_grp = parser.add_argument_group(title="lr", description="arguments for learning rate schedule") - lr_grp.add_argument("--lr", type=float, default=None, help="Initial learning rate.") - lr_grp.add_argument("--warmup_steps",type=int,default=None,help="number of warmup_steps") - lr_grp.add_argument("--constant_steps",type=int,default=None,help="number of warmup_steps") - lr_grp.add_argument("--min_lr",type=float,default=None,help="Minumum value for learning rate. The scheduler" "clip values below this threshold.") - - args, _ = parser.parse_known_args() - # Workaround for NaNs seen with transformers version >= 4.21.0 - # https://github.com/aws-neuron/aws-neuron-sdk/issues/593 - if os.environ.get("XLA_USE_BF16") or os.environ.get("XLA_DOWNCAST_BF16") or args.use_amp > 0: - modeling_utils.get_parameter_dtype = lambda x: torch.bfloat16 - - if os.environ.get("WORLD_SIZE"): - dist.init_process_group("xla") - _mp_fn(0, args) - else: - xmp.spawn(_mp_fn, args=(args,)) \ No newline at end of file diff --git a/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/training_utils.py b/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/training_utils.py deleted file mode 100644 index 0b85004..0000000 --- a/torch-neuronx/training/llama2/tp_pp_llama2_70b_hf_pretrain/training_utils.py +++ /dev/null @@ -1,84 +0,0 @@ -from transformers import default_data_collator -from torch.utils.data.dataloader import DataLoader -import datasets - -from torch.utils.data import DistributedSampler -from transformers import set_seed - -try: - from lr import CosineAnnealing -except ImportError: - CosineAnnealing=None - -def get_learning_rate_scheduler(optimizer, args, last_epoch=-1): - lr_scheduler = CosineAnnealing(optimizer, max_steps=args.max_steps, min_lr=args.min_lr, warmup_steps=args.warmup_steps, constant_steps=args.constant_steps, last_epoch=last_epoch) - return lr_scheduler - -def get_param_groups_by_weight_decay(model): - """Get param groups.""" - if hasattr(model, "local_named_parameters"): - # Zero1 use the first param in opt to decide the device - param_optimizer = list(model.local_named_parameters()) - else: - param_optimizer = list(model.named_parameters()) - no_decay = ["bias", "LayerNorm"] # gamma/beta are in LayerNorm.weight - - optimizer_grouped_parameters = [ - { - "params": [ - p for n, p in param_optimizer if not any(nd in n for nd in no_decay) - ], - "weight_decay": 0.01, - }, - { - "params": [ - p for n, p in param_optimizer if any(nd in n for nd in no_decay) - ], - "weight_decay": 0.0, - }, - ] - return optimizer_grouped_parameters - -def create_llama_pretraining_dataset( - data_dir, mini_batch_size, seed, dp_size, dp_rank, -): - #Workaround because python functions are not picklable - class WorkerInitObj(object): - def __init__(self, seed): - self.seed = seed - - def __call__(self, id): - set_seed(self.seed) - worker_init = WorkerInitObj(seed) - train_data = datasets.load_from_disk(data_dir) - train_sampler = DistributedSampler( - train_data, - num_replicas=dp_size, - rank=dp_rank, - shuffle=False, - drop_last=True, - ) - train_dataloader = DataLoader( - train_data, - collate_fn=default_data_collator, - sampler=train_sampler, - batch_size=mini_batch_size, - num_workers=0, - 
worker_init_fn=worker_init, - drop_last=True, - pin_memory=True, - ) - return train_dataloader - -def create_partition(num_hidden_layers, pipeline_parallel_size): - """ - Evenly split the transformer layers between the PP ranks - """ - assert num_hidden_layers % pipeline_parallel_size == 0 - num_layer_per_partition = num_hidden_layers // pipeline_parallel_size - pipeline_cuts = [] - current_cut = num_layer_per_partition - 1 - for i in range(pipeline_parallel_size-1): - pipeline_cuts.append(f"model.layers.{current_cut}") - current_cut += num_layer_per_partition - return pipeline_cuts \ No newline at end of file diff --git a/torch-neuronx/training/llama2/tp_zero1_llama2_7b_hf_pretrain/config.json b/torch-neuronx/training/llama2/tp_zero1_llama2_7b_hf_pretrain/config.json deleted file mode 100644 index 892c4db..0000000 --- a/torch-neuronx/training/llama2/tp_zero1_llama2_7b_hf_pretrain/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 11008, - "max_position_embeddings": 2048, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 32, - "pad_token_id": 0, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "tie_word_embeddings": false, - "torch_dtype": "float16", - "transformers_version": "4.31.0", - "use_cache": true, - "vocab_size": 32000, - "sequence_parallel_enabled": false, - "selective_checkpoint_enabled": false, - "move_model_to_device":true - } - \ No newline at end of file diff --git a/torch-neuronx/training/llama2/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.py b/torch-neuronx/training/llama2/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.py deleted file mode 100644 index a94b6e8..0000000 --- a/torch-neuronx/training/llama2/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.py +++ /dev/null @@ -1,775 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Modifications Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
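The create_partition helper removed above derives the pipeline cut points purely from the layer count and the pipeline-parallel degree. As a standalone illustration (the 80-layer / PP=8 figures are only an example matching a Llama-2-70B-style config, not taken from this diff), the same logic yields one cut per stage boundary:

    def create_partition(num_hidden_layers, pipeline_parallel_size):
        # Evenly split the transformer layers between the PP ranks.
        assert num_hidden_layers % pipeline_parallel_size == 0
        num_layer_per_partition = num_hidden_layers // pipeline_parallel_size
        pipeline_cuts = []
        current_cut = num_layer_per_partition - 1
        for _ in range(pipeline_parallel_size - 1):
            pipeline_cuts.append(f"model.layers.{current_cut}")
            current_cut += num_layer_per_partition
        return pipeline_cuts

    print(create_partition(80, 8))
    # ['model.layers.9', 'model.layers.19', 'model.layers.29', 'model.layers.39',
    #  'model.layers.49', 'model.layers.59', 'model.layers.69']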
- -import os -import math -import torch -import sys -import time -import argparse -import json -import queue -from typing import Any, Dict, List -from datetime import datetime, timezone -from collections import namedtuple -import torch_xla -import torch_xla.core.xla_model as xm -from torch.utils.data.dataloader import DataLoader -from torch.utils.data import DistributedSampler -import torch_xla.distributed.parallel_loader as pl -import torch.distributed as dist -import torch_xla.distributed.xla_multiprocessing as xmp -import torch_xla.distributed.xla_backend -import numpy as np -from transformers import ( - AdamW, - default_data_collator, - set_seed, - LlamaConfig, -) -from transformers.optimization import get_linear_schedule_with_warmup - -import copy -from torch.utils.tensorboard import SummaryWriter -import inspect -import requests -from neuronx_distributed.parallel_layers import parallel_state, layers, grads, checkpointing -from neuronx_distributed.utils.model_utils import move_model_to_device -from neuronx_distributed.parallel_layers.grads import bucket_allreduce_gradients -from neuronx_distributed.parallel_layers.utils import is_pjrt_device -import datasets - -from neuronx_distributed.optimizer import NeuronZero1Optimizer -from adamw_fp32_optim_params import AdamW_FP32OptimParams -from modeling_llama_nxd import LlamaForCausalLM - -# For PT autocast. -torch.cuda.is_bf16_supported = lambda: True - -# Workaround for NaNs seen with transformers version >= 4.21.0 -# https://github.com/aws-neuron/aws-neuron-sdk/issues/593 -import transformers.modeling_utils as modeling_utils - -if os.environ.get("XLA_USE_BF16") or os.environ.get("XLA_DOWNCAST_BF16"): - modeling_utils.get_parameter_dtype = lambda x: torch.bfloat16 - -datetime_str = str(datetime.now()) -results = { - "inference_success": 1 -} - - -Metric = namedtuple("Metric", ["name", "value", "units", "additional_data"]) - - -class TrainingMetrics: - """ - This class is used for logging metrics to a json file. One can provide a - dictionary of metrics that needs to be stored, and it wpuld get - written to the file. - Arguments: - json_file: File used for logging. If no file exists, new file would be created. - """ - def __init__(self, json_file): - self.json_file = json_file - - def read_modify_write_file(self, data, key: str = "metrics") -> None: - """ - data (dict of training parameters or list of metrics): Data to update in the file. - key (str): the dictionary key under which data is to be recorded - """ - result_dict = {} - print(f"Writing data to the provided results file: {self.json_file}") - if os.path.exists(self.json_file): - with open(self.json_file) as json_file: - result_dict = json.loads(json_file.read()) or result_dict - print(f"Updating with {key} data: {data}") - if result_dict: - try: - # handle internal named entity if present - results = result_dict[next(iter(result_dict))] - except Exception: - results = result_dict - current = results.get(key) - if not current: - results[key] = data - else: - if isinstance(current, list): - current.extend(data) - elif isinstance(current, dict): - current.update(data) - else: - result_dict["results"] = {key: data} - with open(self.json_file, "w") as json_file: - json.dump(result_dict, json_file) - - def store_metrics(self, metrics: List[Metric]) -> None: - """ - Writes collected metrics to the file. 
- """ - data = [ - { - "MetricName": metric.name, - "MeasuredValue": metric.value, - "Units": metric.units, - "Timestamp": datetime.now(timezone.utc).isoformat(), - "AdditionalData": metric.additional_data, - } - for metric in metrics - ] - self.update(data=data, key="metrics") - - def store_parameters(self, parameters: Dict[str, Any]) -> None: - """ - Writes specified model and configuration parameters to the file. - """ - self.update(data=parameters, key="parameters") - - def update(self, **kwargs: Any) -> None: - """ - Write specified data to the output file. - """ - self.read_modify_write_file(**kwargs) - - -class Throughput: - """ - Used to calculate the throughput over a moving window. It records the step time - between two calls and uses that time to calculate the throughput. - """ - def __init__( - self, batch_size, world_size, grad_accum_usteps, moving_avg_window_size=10, logging_interval=1 - ): - self.seqs_per_iteration = batch_size * world_size * grad_accum_usteps*logging_interval - self.moving_avg_window_size = math.ceil(moving_avg_window_size/logging_interval) - self.moving_avg_window = queue.Queue() - self.window_time = 0 - self.start_time = time.time() - - def get_throughput(self): - step_time = time.time() - self.start_time - self.start_time += step_time - self.window_time += step_time - self.moving_avg_window.put(step_time) - window_size = self.moving_avg_window.qsize() - if window_size > self.moving_avg_window_size: - self.window_time -= self.moving_avg_window.get() - window_size -= 1 - throughput = window_size * self.seqs_per_iteration / self.window_time - return throughput - - -class Logger: - def __init__(self, args, world_size, model_dtype): - xla = "torch_xla" in sys.modules - self.throughputs = [] - dtype_short = model_dtype.replace("torch.", "") - self.tb = SummaryWriter( - os.path.join( - args.output_dir, - f"neuron_tblogs_{time.strftime('%m%d%y_%H%M')}" - f"_{dtype_short}" - f"_w{world_size}" - f"_lr{args.lr}" - f"_bs{args.batch_size}" - f"_acc{args.grad_accum_usteps}" - f"_warmup{args.warmup_steps}" - f"_max{args.max_steps}" - f"_xla{xla}" - f"_{self.get_instance_type()}", - ) - ) - self.tb.add_text( - "script", "```\n" + inspect.getsource(sys.modules[__name__]) + "\n```", 0 - ) - self.golden_steploss = [] - golden = "golden_steploss.txt" - if os.path.exists(golden): - with open(golden, "r") as f: - self.golden_steploss = [float(i) for i in f] - print( - f"Read {len(self.golden_steploss)} golden step loss values from {golden}" - ) - - def get_instance_type(self): - try: - token = requests.put( - "http://169.254.169.254/latest/api/token", - headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"}, - ) - data = requests.get( - "http://169.254.169.254/latest/meta-data/instance-type", - headers={"X-aws-ec2-metadata-token": token.text}, - ) - return data.text - except: - return os.environ.get("HOSTNAME", "unknown") - - def log(self, epoch, step, step_loss, learning_rate, throughput, grad_norm=None): - time_now = time.asctime() - grad_norm_msg = f"grad-norm : {grad_norm}" if grad_norm else "" - print( - f"LOG {time_now} - ({epoch}, {step}) step_loss : {step_loss:.4f} " - f"learning_rate : {learning_rate:.2e} throughput : {throughput:.2f} " - f"{grad_norm_msg}", - flush=True, - ) - self.tb.add_scalar("step loss", step_loss, step) - self.tb.add_scalar("learning rate", learning_rate, step) - self.tb.add_scalar("throughput", throughput, step) - if grad_norm: - self.tb.add_scalar("grad-norm", grad_norm, step) - self.throughputs.append(throughput) - if not 
os.environ.get("NEURON_EXTRACT_GRAPHS_ONLY", None): - step_0start = step - 1 - if step_0start < len(self.golden_steploss) and step_0start >= 0: - np.testing.assert_allclose( - step_loss, self.golden_steploss[step_0start], rtol=2.3e-1 - ) - - -# Workaround because python functions are not picklable -class WorkerInitObj(object): - def __init__(self, seed): - self.seed = seed - - def __call__(self, id): - set_seed(self.seed) - -def create_pretraining_dataset( - data_dir, mini_batch_size, worker_init -): - train_data = datasets.load_from_disk(os.path.expanduser(data_dir)) - train_sampler = DistributedSampler( - train_data, - num_replicas=parallel_state.get_data_parallel_size(), - rank=parallel_state.get_data_parallel_rank(), - shuffle=False, - drop_last=True, - ) - train_dataloader = DataLoader( - train_data, - collate_fn=default_data_collator, - sampler=train_sampler, - batch_size=mini_batch_size, - num_workers=0, - worker_init_fn=worker_init, - drop_last=True, - pin_memory=True, - ) - return train_dataloader - -def get_model(flags): - model_path, seq_len = flags.model_path, flags.seq_len - config = LlamaConfig.from_pretrained(model_path) - config.use_cache = False - config.max_position_embeddings = max(config.max_position_embeddings, seq_len) - if flags.num_layers > 0: - config.num_hidden_layers = flags.num_layers - if flags.sequence_parallel_enabled: - config.sequence_parallel_enabled = True - if flags.selective_checkpoint_enabled: - config.selective_checkpoint_enabled = True - xm.master_print(config) - model = LlamaForCausalLM(config) - xm.master_print(model) - return model - -def get_dtype(model) -> str: - """ - Reference: https://pytorch.org/xla/release/1.12/index.html#xla-tensors-and-bfloat16 - """ - if "XLA_USE_BF16" in os.environ: - return "torch.bfloat16" - if "XLA_DOWNCAST_BF16" in os.environ: - if "torch.float" in str(model.dtype): - return "torch.bfloat16" - if "torch.double" in str(model.dtype): - return "torch.float32" - return str(model.dtype) - -def allreduce_sequence_parallel_gradients(optimizer): - """ All-reduce layernorm parameters across model parallel nodes when sequence parallelism is used. - Modified from megatron-lm: - https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 - """ - from neuronx_distributed.parallel_layers.mappings import reduce_from_tensor_model_parallel_region - grads = [] - for param_group in optimizer.__getstate__()['param_groups']: - for group, params in param_group.items(): - if group == 'params': - for p in params: - if isinstance(p, torch.Tensor) and p.grad is not None: - sequence_parallel_param = getattr(p, 'sequence_parallel_enabled', False) - if sequence_parallel_param: - grads.append(p.grad.data) - for grad in grads: - # sum v.s. average: sum - reduce_from_tensor_model_parallel_region(grad) - -def train_llama(flags): - # Initialize the model parallelism with the tp degree - parallel_state.initialize_model_parallel(tensor_model_parallel_size=flags.tensor_parallel_size) - world_size = parallel_state.get_data_parallel_size() - is_root = xm.is_master_ordinal(local=False) - extract_graphs_only = os.environ.get("NEURON_EXTRACT_GRAPHS_ONLY", None) - set_seed(flags.seed) - worker_init = WorkerInitObj(flags.seed) - device = xm.xla_device() - - model = get_model(flags) - # Moving the model to device using NxD's API. 
This takes care of preserving the - # tp/sp attributes - move_model_to_device(model, device) - model.train() - - model_dtype = get_dtype(model) - running_loss = torch.zeros(1, dtype=torch.double).to(device) - - param_optimizer = list(model.named_parameters()) - no_decay = ["bias", "LayerNorm"] # gamma/beta are in LayerNorm.weight - - optimizer_grouped_parameters = [ - { - "params": [ - p for n, p in param_optimizer if not any(nd in n for nd in no_decay) - ], - "weight_decay": 0.01, - }, - { - "params": [ - p for n, p in param_optimizer if any(nd in n for nd in no_decay) - ], - "weight_decay": 0.0, - }, - ] - - if flags.use_mix_precision: - optimizer_cls = AdamW_FP32OptimParams - else: - optimizer_cls = AdamW - - if flags.use_zero_1: - optimizer = NeuronZero1Optimizer( - optimizer_grouped_parameters, - optimizer_cls, - lr=flags.lr, - pin_layout=False, - sharding_groups=parallel_state.get_data_parallel_group(as_list=True), - grad_norm_groups=parallel_state.get_tensor_model_parallel_group(as_list=True), - ) - else: - optimizer = optimizer_cls(optimizer_grouped_parameters, flags.lr) - optimizer.zero_grad() - - if is_root: - if not os.path.exists(flags.output_dir): - os.makedirs(flags.output_dir, exist_ok=True) - if not extract_graphs_only: - logger = Logger(flags, world_size, model_dtype) - metric_writer = TrainingMetrics(flags.metrics_file) - throughput = Throughput( - flags.batch_size, world_size, flags.grad_accum_usteps, logging_interval=args.logging_interval - ) - print("--------TRAINING CONFIG----------") - print(flags) - print("--------MODEL CONFIG----------") - print(model.config) - print("---------------------------------") - metric_writer.store_parameters( - { - "Model": model.name_or_path, - "Model configuration": str(model.config), - "World size": xm.xrt_world_size(), - "Data parallel degree": world_size, - "Batch size": flags.batch_size, - "Total steps": flags.steps_this_run, - "Seed": flags.seed, - "Optimizer": str(optimizer), - "Data type": model_dtype, - "Gradient accumulation microsteps": flags.grad_accum_usteps, - "Warmup steps": flags.warmup_steps, - "Dataset": os.path.basename(os.path.normpath(flags.data_dir)), - "Environment variables": { - variable: value - for variable, value in os.environ.items() - if variable.startswith("NEURON") or variable.startswith("XLA") - }, - } - ) - - def train_loop_fn( - model, optimizer, train_loader, epoch, global_step, training_ustep, running_loss, use_zero_1 - ): - for _, data in enumerate(train_loader): - training_ustep += 1 - input_ids = data["input_ids"] - attention_mask = data["attention_mask"] - labels = data["labels"] - outputs = model( - input_ids=input_ids, - attention_mask=attention_mask, - labels=labels, - ) - loss = outputs.loss / flags.grad_accum_usteps - loss.backward() - running_loss += loss.detach() - - if training_ustep % flags.grad_accum_usteps == 0: - xm.mark_step() - # loss averaging - running_loss_div = running_loss / world_size - # Collecting loss across all data-parallel ranks - running_loss_reduced = xm.all_reduce( - xm.REDUCE_SUM, - running_loss_div, - groups=parallel_state.get_data_parallel_group(as_list=True), - ) - running_loss_reduced_detached = running_loss_reduced.detach() - running_loss.zero_() - - # For sequence-parallel, we have to explicitly all-reduce the layernorm - # gradients. - allreduce_sequence_parallel_gradients(optimizer) - if not use_zero_1: - # all-reduce and then clip. Order matters. 
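A minimal single-process sketch of the ordering that comment describes, with the collective simulated by averaging in-memory replica gradients (names, shapes, and values are hypothetical; no XLA involved): gradients are synchronized across data-parallel replicas first, the synchronized gradients are clipped to the max norm, and only then does the optimizer step.

    import torch

    def sync_clip_step(params, replica_grads, optimizer, max_grad_norm=1.0):
        # replica_grads: one list of gradient tensors per simulated data-parallel replica
        for i, p in enumerate(params):
            p.grad = torch.stack([g[i] for g in replica_grads]).mean(dim=0)  # stand-in for the all-reduce
        torch.nn.utils.clip_grad_norm_(params, max_grad_norm)  # clip only after grads are synchronized
        optimizer.step()
        optimizer.zero_grad()

    params = [torch.nn.Parameter(torch.zeros(4))]
    optimizer = torch.optim.SGD(params, lr=0.1)
    sync_clip_step(params, [[torch.full((4,), 3.0)], [torch.full((4,), 5.0)]], optimizer)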
- if parallel_state.get_data_parallel_size() > 1: - bucket_allreduce_gradients(xm._fetch_gradients(optimizer)) - max_grad_norm = 1.0 - grads.clip_grad_norm( - model.parameters(), max_grad_norm - ) # Gradient clipping is not in AdamW anymore - optimizer.step() - total_norm_detach = 0.0 - with torch.no_grad(): - total_norm = torch.zeros(1, device=device) - if flags.print_grad_norm and is_root: - for p in model.parameters(): - param_norm_sq = torch.square(p.grad).sum() - total_norm += param_norm_sq - total_norm = torch.sqrt(total_norm) - total_norm_detach = total_norm.detach() - - optimizer.zero_grad() - scheduler.step() - global_step += 1 - - def _print_logs(running_loss_reduced_detached, total_norm): - if is_root and not extract_graphs_only: - total_norm_cpu = None - if flags.print_grad_norm: - total_norm_cpu = total_norm.cpu().item() - # NOTE: The running_loss is the loss of the global_step - logger.log( - epoch, - global_step, - running_loss_reduced_detached.cpu().item(), - optimizer.param_groups[0]["lr"], - throughput.get_throughput(), - total_norm_cpu, - ) - - if global_step % flags.logging_interval == 0: - # Printing the loss inside the step closure. This won't block - # the tracing for next step. Also, we are logging every N steps. - # This is done to reduce the overhead of copying tensors to CPU. - # Tensor copy is expensive since it prevents the next step to start. - xm.add_step_closure( - _print_logs, (running_loss_reduced_detached, total_norm_detach) - ) - if global_step >= flags.steps_this_run: - # NOTE: Prevent runtime "Call to recv failed : Broken pipe" issue - xm.mark_step() - break - - return ( - global_step, - training_ustep, - running_loss, - running_loss_reduced_detached.cpu().item(), - ) - - scheduler_state_dict = None - - if flags.resume_ckpt: - state_dict = checkpointing.load(flags.output_dir, model) - optimizer.load_state_dict(state_dict["optimizer"]) - global_step = state_dict["global_step"] - epoch = state_dict["epoch"] - scheduler_state_dict = state_dict["scheduler"] - else: - global_step = 0 - epoch = 0 - - train_start = time.time() - training_ustep = 0 - scheduler = get_linear_schedule_with_warmup( - optimizer, - num_warmup_steps=flags.warmup_steps, - num_training_steps=flags.max_steps, - last_epoch=epoch if scheduler_state_dict else -1, - ) - - if scheduler_state_dict: - scheduler.load_state_dict(scheduler_state_dict) - - assert os.path.exists( - os.path.expanduser(flags.data_dir) - ), "ERROR: Data directory {} doesn't exist!".format(flags.data_dir) - - mini_batch_size = flags.batch_size - train_dataloader = create_pretraining_dataset( - flags.data_dir, mini_batch_size, worker_init - ) - # We wrap the dataloader with MpDeviceLoader. This dataloader should take - # care of copying the tensors to device and also inserting the mark_step at - # iteration end. 
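For intuition about the get_linear_schedule_with_warmup call above, here is a rough sketch of the learning-rate multiplier such a schedule applies (the warmup and total step counts below are placeholders; the actual schedule comes from the transformers helper):

    def linear_warmup_then_decay(step, warmup_steps=2000, total_steps=10000):
        # Multiplier applied to the base LR: linear ramp-up, then linear decay to zero.
        if step < warmup_steps:
            return step / max(1, warmup_steps)
        return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))

    for s in (0, 500, 2000, 6000, 10000):
        print(s, round(linear_warmup_then_decay(s), 3))  # 0.0, 0.25, 1.0, 0.5, 0.0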
- train_device_loader = pl.MpDeviceLoader(train_dataloader, device) - - while True: - xm.master_print( - "Epoch {} begin {}".format(epoch, time.asctime()), - flush=True, - ) - - global_step, training_ustep, running_loss, final_loss = train_loop_fn( - model, - optimizer, - train_device_loader, - epoch, - global_step, - training_ustep, - running_loss, - flags.use_zero_1, - ) - - if is_root and not extract_graphs_only: - final_time = time.time() - time_diff = final_time - train_start - print( - "Epoch {} step {} end {} loss {} perf {} seq/sec (at train microstep {} time {} from beginning time {})".format( - epoch, - global_step, - time.asctime(), - final_loss, - logger.throughputs[-1], - training_ustep, - final_time, - train_start, - ), - flush=True, - ) - additional_data = { - "Epoch": epoch, - "Global step": global_step, - "Microstep": training_ustep, - } - metric_data = [ - Metric("Loss", final_loss, "", additional_data), - Metric( - "Throughput", logger.throughputs[-1], "seq/s", additional_data - ), - ] - metric_writer.store_metrics(metric_data) - - if global_step >= flags.steps_this_run: - if is_root and not extract_graphs_only: - # record aggregate & final statistics in the metrics file - additional_data = { - "Epoch": epoch, - "Global step": global_step, - "Microstep": training_ustep, - } - average_throughput = round( - sum(logger.throughputs) / len(logger.throughputs), 4 - ) - metric_data = [ - Metric("Final loss", final_loss, "", additional_data), - Metric( - "Time to train", - round(time_diff / 60, 4), - "minutes", - additional_data, - ), - Metric( - "Average throughput", - average_throughput, - "seq/s", - additional_data, - ), - Metric( - "Peak throughput", - max(logger.throughputs), - "seq/s", - additional_data, - ), - ] - metric_writer.store_metrics(metric_data) - # TODO may incur HOST OOM - state_dict = { - "model": model.state_dict(), - "global_step": global_step, - "epoch": epoch, - "scheduler": scheduler.state_dict() - } - # Note: We are not saving the optimizer using the checkpoint.save API. - # This is because the Zero1 Optimizer is sharded across data-parallel ranks - # and hence all DP ranks need to save. checkpoint.save API only saves from - # DP rank=0 - checkpointing.save(state_dict, flags.output_dir) - optimizer.save_sharded_state_dict(flags.output_dir) - return - - epoch += 1 - - -def _mp_fn(index, flags): - torch.set_default_tensor_type("torch.FloatTensor") - train_llama(flags) - xm.rendezvous("_mp_fn finished") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_path", - type=str, - help="Model weight and config path.", - ) - parser.add_argument( - "--data_dir", - type=str, - help="Pre-tokenized dataset directory.", - ) - parser.add_argument( - "--output_dir", - type=str, - default="./output", - help="Directory for checkpoints and logs.", - ) - parser.add_argument( - "--metrics_file", - type=str, - default="results.json", - help="training metrics results file", - ) - parser.add_argument("--batch_size", type=int, default=8, help="Worker batch size.") - parser.add_argument( - "--max_steps", - type=int, - help="Maximum total accumulation-steps to run.", - ) - parser.add_argument( - "--steps_this_run", - type=int, - help="Exit early at steps and not go to max_steps. -1 to mean no early exit.", - ) - parser.add_argument( - "--seed", - type=int, - default=12349, - help="Random seed. 
Worker seed is this value + worker rank.", - ) - parser.add_argument("--lr", type=float, default=4e-4, help="Learning rate.") - parser.add_argument( - "--warmup_steps", - type=int, - default=2000, - help="Number of warmup accumulation-steps for learning rate .", - ) - parser.add_argument( - "--grad_accum_usteps", - type=int, - default=64, - help="Gradient accumulation micro-steps (an accumulation-step has micro-steps.", - ) - parser.add_argument( - "--print_grad_norm", - default=False, - action="store_true", - help="Whether to print grad norm", - ) - parser.add_argument( - "--resume_ckpt", - action="store_true", - help="Resume from checkpoint at resume_step." - ) - parser.add_argument( - "--tensor_parallel_size", - default=2, - type=int, - help="Tensor parallel size" - ) - parser.add_argument( - "--seq_len", - default=2048, - type=int, - help="Sequence length" - ) - parser.add_argument( - "--use_mix_precision", action="store_true", help="Use mix precision." - ) - parser.add_argument( - "--use_zero_1", action="store_true", help="Use ZeRO-1." - ) - parser.add_argument( - "--num_layers", - type=int, - default=-1, - help="Override number of layers for this LLaMA model", - ) - parser.add_argument( - "--sequence_parallel_enabled", - default=False, - action="store_true", - help="Enable sequence parallel", - ) - parser.add_argument( - "--selective_checkpoint_enabled", - default=False, - action="store_true", - help="Enable selective checkpoint", - ) - parser.add_argument( - "--logging_interval", - default=1, - type=int, - help="Enable selective checkpoint", - ) - - args = parser.parse_args(sys.argv[1:]) - - if args.steps_this_run < 0: - args.steps_this_run = args.max_steps - - os.environ["NEURON_RT_STOCHASTIC_ROUNDING_EN"] = "1" - if args.use_mix_precision: - os.environ["XLA_DOWNCAST_BF16"]="1" - else: - os.environ["XLA_USE_BF16"]="1" - - - # WORLD_SIZE is set by torchrun - if os.environ.get("WORLD_SIZE"): - if is_pjrt_device(): - import torch_xla.experimental.pjrt_backend - dist.init_process_group("xla", init_method="pjrt://") - else: - dist.init_process_group("xla") - _mp_fn(0, args) - else: - xmp.spawn(_mp_fn, args=(args,)) diff --git a/torch-neuronx/training/llama2/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.sh b/torch-neuronx/training/llama2/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.sh deleted file mode 100644 index 9eebf6f..0000000 --- a/torch-neuronx/training/llama2/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.sh +++ /dev/null @@ -1,128 +0,0 @@ -#!/bin/bash - -############################################# -# User defined parameters and env vars - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) - -export NEURON_CC_FLAGS="--model-type transformer --distribution-strategy=llm-training --cache_dir=~/neuron_compile_cache/" -export NEURON_FUSE_SOFTMAX=1 - -# Async Runtime -export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=3 - -# HOST OOM -export MALLOC_ARENA_MAX=64 - -# TP degree -TP_DEGREE=8 -# 0: bf16; 1: mixed precision -USE_MIX_PRECISION=1 -# 0: use pure DP; 1: use ZeRO-1 -USE_ZERO_1=1 -# global batch size -GBS=1024 -# micro batch size -MBS=1 -# number of steps to run -TOTAL_STEPS=10000 -# warmup steps -WARMUP_STEPS=100 -# learning rate -LR=3.0e-4 -# model path -MODEL_PATH=$SCRIPT_DIR -# data path -DATA_PATH="~/examples_datasets/wikicorpus_llama2_7B_tokenized_4k" -# sequence length -SEQ_LEN=4096 - -############################################# - -export NUM_NEURONCORES=32 -NODE_ID=0 -WORLD_SIZE=1 
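The DP and ACC_STEPS values that the launch script computes a few lines below with shell arithmetic can be sanity-checked by hand; in Python, using this script's defaults on a single 32-core node:

    neuron_cores_per_node = 32   # NEURON_RT_NUM_CORES
    world_size = 1               # single node (no SLURM)
    tp_degree = 8                # TP_DEGREE
    global_batch_size = 1024     # GBS
    micro_batch_size = 1         # MBS

    dp = neuron_cores_per_node * world_size // tp_degree       # 4 data-parallel replicas
    acc_steps = global_batch_size // micro_batch_size // dp    # 256 gradient-accumulation micro-steps
    print(dp, acc_steps)  # 4 256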
-DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES" -if [ ! -z "$SLURM_NTASKS" ]; then - WORLD_SIZE=$SLURM_NTASKS - NODE_ID=$SLURM_NODEID - MASTER_ADDRESS=(`scontrol show hostnames $SLURM_JOB_NODELIST`) - DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE --node_rank $NODE_ID --master_addr $MASTER_ADDRESS --master_port 44000" - if [ $NODE_ID -eq 0 ]; then - echo "WORLD_SIZE=$WORLD_SIZE" - echo "NODE_ID=$NODE_ID" - echo "MASTER_ADDRESS=$MASTER_ADDRESS" - echo "DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS" - fi - export FI_EFA_USE_DEVICE_RDMA=1 - export FI_PROVIDER=efa -fi - -echo "WORLD_SIZE=$WORLD_SIZE" -echo "NODE_ID=$NODE_ID" -echo "MASTER_ADDRESS=$MASTER_ADDRESS" - -sudo sysctl -w net.ipv4.ip_local_reserved_ports=44000,48620 - -export NEURON_RT_NUM_CORES=32 -export NUM_NEURONCORES=$NEURON_RT_NUM_CORES -export TPU_NUM_DEVICES=$NEURON_RT_NUM_CORES -export TPU_CHIPS_PER_HOST_BOUNDS=$NEURON_RT_NUM_CORES -export NEURON_RT_ROOT_COMM_ID=localhost:48620 - -############################################# - -EXTRA_ARGS=" " -if [ $USE_MIX_PRECISION -gt 0 ]; then - EXTRA_ARGS+=" --use_mix_precision" -fi -if [ $USE_ZERO_1 -gt 0 ]; then - EXTRA_ARGS+=" --use_zero_1" -fi - -DP=$(($NEURON_RT_NUM_CORES * $WORLD_SIZE / $TP_DEGREE)) -ACC_STEPS=$(($GBS / $MBS / $DP)) - - -if [ $NEURON_EXTRACT_GRAPHS_ONLY -gt 0 ]; then - STEPS_THIS_RUN=2 - OUTPUT_LOG=log_compile-$NODE_ID.log -else - STEPS_THIS_RUN=-1 - OUTPUT_LOG=log_exe-$NODE_ID.log -fi - -echo TP_DEGREE=$TP_DEGREE -echo USE_MIX_PRECISION=$USE_MIX_PRECISION -echo USE_ZERO_1=$USE_ZERO_1 -echo GBS=$GBS -echo MBS=$MBS -echo TOTAL_STEPS=$TOTAL_STEPS -echo WARMUP_STEPS=$WARMUP_STEPS -echo LR=$LR -echo MODEL_PATH=$MODEL_PATH -echo DATA_PATH=$DATA_PATH -echo SEQ_LEN=$SEQ_LEN - -echo EXTRA_ARGS=$EXTRA_ARGS -echo DP=$DP -echo ACC_STEPS=$ACC_STEPS -echo STEPS_THIS_RUN=$STEPS_THIS_RUN -echo OUTPUT_LOG=$OUTPUT_LOG - -torchrun $DISTRIBUTED_ARGS \ - tp_zero1_llama2_7b_hf_pretrain.py \ - --model_path $MODEL_PATH \ - --data_dir $DATA_PATH \ - --tensor_parallel_size $TP_DEGREE \ - --batch_size $MBS \ - --steps_this_run $STEPS_THIS_RUN\ - --max_steps $TOTAL_STEPS \ - --warmup_steps $WARMUP_STEPS \ - --lr $LR \ - --grad_accum_usteps $ACC_STEPS \ - --seq_len $SEQ_LEN \ - --sequence_parallel_enabled \ - --selective_checkpoint_enabled \ - --logging_interval 10 \ - $EXTRA_ARGS |& tee $OUTPUT_LOG diff --git a/torch-neuronx/training/resnet50/resnet50.ipynb b/torch-neuronx/training/resnet50/resnet50.ipynb index f14d059..0f55379 100644 --- a/torch-neuronx/training/resnet50/resnet50.ipynb +++ b/torch-neuronx/training/resnet50/resnet50.ipynb @@ -5,7 +5,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ResNet50 training - Pytorch [Beta] \n", + "# ResNet50 training - Pytorch [Beta PyTorch 2.1] \n", "This notebook shows how to fine-tune a pretrained ResNet50 Pytorch model with AWS Trainium (trn1 instances) using NeuronSDK.\n", "The original implementation is provided by torchvision.\n", "\n", @@ -41,7 +41,7 @@ "outputs": [], "source": [ "#Install Neuron Compiler and Neuron/XLA packages\n", - "%pip install -U \"torchvision==0.14.1\" \"tensorboard==2.6\"\n", + "%pip install -U \"tensorboard\" \"timm\" torchvision==0.16.*\n", "# use --force-reinstall if you're facing some issues while loading the modules\n", "# now restart the kernel again" ] @@ -125,7 +125,6 @@ " --batch_size {batch_size}\n", " --pretrained\n", " --lr {learning_rate}\n", - " --do_eval\n", " --drop_last\n", " \"\"\".replace('\\n', '')\n", "\n", diff --git 
a/torch-neuronx/training/stable_diffusion/requirements.txt b/torch-neuronx/training/stable_diffusion/requirements.txt new file mode 100644 index 0000000..319e10e --- /dev/null +++ b/torch-neuronx/training/stable_diffusion/requirements.txt @@ -0,0 +1,5 @@ +torchvision +diffusers==0.19.3 # Intentionally pin to 0.19.3. More recent versions have problems on Neuron. +transformers==4.31.0 +datasets==2.14.2 +fsspec==2023.9.2 \ No newline at end of file diff --git a/torch-neuronx/training/stable_diffusion/run.py b/torch-neuronx/training/stable_diffusion/run.py index 8441c77..9791e07 100644 --- a/torch-neuronx/training/stable_diffusion/run.py +++ b/torch-neuronx/training/stable_diffusion/run.py @@ -14,16 +14,15 @@ def parse_args(): prog='neuron-sd-training-test-wrapper', description='Test wrapper for Neuron Stable Diffusion training recipe') - parser.add_argument('--model', choices=['2.1', '1.5'], help='Which model to train') - parser.add_argument('--resolution', choices=[512, 768], type=int, help='Which resolution of model to train') - parser.add_argument('--batch_size', type=int, help='What per-device microbatch size to use') + parser.add_argument('--model', choices=['2.1', '1.5'], default='2.1', help='Which model to train') + parser.add_argument('--resolution', choices=[512], default=512, type=int, help='Which resolution of model to train') + parser.add_argument('--batch_size', type=int, default=2, help='What per-device microbatch size to use') parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help='How many gradient accumulation steps to do (1 for no gradient accumulation)') - parser.add_argument('--epochs', type=int, default=2000, help='How many epochs to train for') - parser.add_argument('--unroll_vae', action='store_true', help='Whether to unroll the VAE inference step (in batch dimension)') - parser.add_argument('--mark_step_after_vae', action='store_true', help='Whether to mark step after the VAE inference step') + parser.add_argument('--epochs', type=int, default=6, help='How many epochs to train for') # For saving checkpoints - parser.add_argument("--checkpointing_steps", type=int, default=None, + # Save every 750 steps ~= 1 epoch (at batch2) by default + parser.add_argument("--checkpointing_steps", type=int, default=750, help=( "Save a checkpoint of the training state every X training steps. These checkpoints are only suitable for resuming" " training using `--resume_from_checkpoint`."), @@ -33,7 +32,7 @@ def parse_args(): ) # Used to save a copy of the trained model for inference - parser.add_argument("--save_model_epochs", type=int, default=None, + parser.add_argument("--save_model_epochs", type=int, default=1, help=( "Save a copy of the trained model every X epochs in a format that can be loaded using HuggingFace's from_pretrained method." )) @@ -52,11 +51,11 @@ def parse_args(): parser.add_argument('--neuron_rt_stochastic_rounding_seed', type=int, default=0, help="The setting for the NEURON_RT_STOCHASTIC_ROUNDING_SEED, which controls the seed for stochastic rounding.") # Path to dir containing the training and inference scripts - parser.add_argument('--training_script_path', type=str, default=None) + parser.add_argument('--training_script_path', type=str, default="./sd_training_neuron.py", help="Path to the training script (sd_training_neuron.py)") args = parser.parse_args() - assert args.training_script_path is not None, "Need to pass the dir containing the training and inference scripts!" 
+ assert args.training_script_path is not None, "Need to pass the path of the training script via --training_script_path (path to sd_training_neuron.py)" # Build the test name that will get used by the ModuleTester out of the args we parsed test_name = f"sd_{args.model}_training-{args.resolution}-batch{args.batch_size}-AdamW-{WORLD_SIZE}w-zero1_optimizer-grad_checkpointing" @@ -83,8 +82,6 @@ def parse_args(): os.environ.pop("XLA_IR_DEBUG", None) os.environ.pop("XLA_HLO_DEBUG", None) - unroll_vae = "--unroll_vae" if args.unroll_vae else "" - mark_step_after_vae = "--mark_step_after_vae" if args.mark_step_after_vae else "" gradient_accumulation_steps = f"--gradient_accumulation_steps {args.gradient_accumulation_steps}" if args.gradient_accumulation_steps is not None else "" save_model_epochs = f"--save_model_epochs {args.save_model_epochs}" if args.save_model_epochs is not None else "" checkpointing_steps = f"--checkpointing_steps {args.checkpointing_steps}" if args.checkpointing_steps is not None else "" @@ -93,7 +90,7 @@ def parse_args(): resume_checkpoint_step = f"--resume_checkpoint_step {args.resume_checkpoint_step}" if args.resume_checkpoint_step is not None else "" # Only need to run for 1 epoch for NPC to do its thing - run_command = f"torchrun --nproc_per_node={WORLD_SIZE} {args.training_script_path} --model {args.model} --resolution {args.resolution} {unroll_vae} {mark_step_after_vae} {gradient_accumulation_steps} --batch_size {args.batch_size} {save_model_epochs} {checkpointing_steps} {max_num_checkpoints} {resume_from_checkpoint} {resume_checkpoint_step}" + run_command = f"torchrun --nproc_per_node={WORLD_SIZE} {args.training_script_path} --model {args.model} --resolution {args.resolution} {gradient_accumulation_steps} --batch_size {args.batch_size} {save_model_epochs} {checkpointing_steps} {max_num_checkpoints} {resume_from_checkpoint} {resume_checkpoint_step}" # We use 10 parallel jobs because we expect up to 9 graphs: 8 without grad accum enabled, 9 with it enabled neuron_parallel_compile_command = "neuron_parallel_compile --num_parallel 10 " + run_command + " --epochs 1" diff --git a/torch-neuronx/training/stable_diffusion/sd_training_neuron.py b/torch-neuronx/training/stable_diffusion/sd_training_neuron.py index 37801d0..3d7f9f3 100644 --- a/torch-neuronx/training/stable_diffusion/sd_training_neuron.py +++ b/torch-neuronx/training/stable_diffusion/sd_training_neuron.py @@ -335,7 +335,7 @@ def train(args): # Download the dataset xm.master_print('Downloading dataset') # TODO: make this a parameter of the script - dataset_name = "lambdalabs/pokemon-blip-captions" + dataset_name = "m1guelpf/nouns" dataset = load_dataset(dataset_name) args.dataset_name = dataset_name @@ -473,37 +473,31 @@ def collate_fn(examples): xm.master_print(f"*** Running epoch {epoch} step {step} (cumulative step {cumulative_train_step})") start_time = time.perf_counter_ns() + # Convert input image to latent space and add noise with torch.no_grad(): - if args.unroll_vae: - vae_inputs_batched = batch['pixel_values'] - vae_inputs_unbatched = torch.split(vae_inputs_batched, 1, dim=0) - - vae_outputs = [] - for vae_input in vae_inputs_unbatched: - these_latents = vae.encode(vae_input).latent_dist.sample() - these_latents = these_latents * 0.18215 - these_latents = these_latents.float() # Cast latents to bf16 (under XLA_DOWNCAST_BF16) - - vae_outputs.append(these_latents) - latents = torch.cat(vae_outputs, dim=0) - - del vae_inputs_batched - del vae_inputs_unbatched - del vae_input - del vae_outputs - del 
these_latents - - else: - # Convert input image to latent space and add noise - latents = vae.encode(batch['pixel_values']).latent_dist.sample() - latents = latents * 0.18215 - latents = latents.float() # Cast latents to bf16 (under XLA_DOWNCAST_BF16) + vae_inputs_batched = batch['pixel_values'] + vae_inputs_unbatched = torch.split(vae_inputs_batched, 1, dim=0) + + vae_outputs = [] + # Intentionally unroll the VAE execution here. Compiler produces poor QoR for the VAE at batch > 1 + for vae_input in vae_inputs_unbatched: + these_latents = vae.encode(vae_input).latent_dist.sample() + these_latents = these_latents * 0.18215 + these_latents = these_latents.float() # Cast latents to bf16 (under XLA_DOWNCAST_BF16) + + vae_outputs.append(these_latents) + latents = torch.cat(vae_outputs, dim=0) + del vae_inputs_batched + del vae_inputs_unbatched + del vae_input + del vae_outputs + del these_latents gc.collect() - if args.mark_step_after_vae: - xm.mark_step() + # mark_step here to separate VAE into its own graph. Results in better compiler QoR. + xm.mark_step() with torch.no_grad(): noise = torch.randn(latents.size(), dtype=latents.dtype, layout=latents.layout, device='cpu') @@ -566,6 +560,11 @@ def collate_fn(examples): before_batch_load_time = time.perf_counter_ns() + # Only need a handful of training steps for graph extraction. Cut it off so we don't take forever when + # using a large dataset. + if os.environ.get("NEURON_EXTRACT_GRAPHS_ONLY", None) and cumulative_train_step > 5: + break + if args.save_model_epochs is not None and epoch % args.save_model_epochs == 0 and not os.environ.get("NEURON_EXTRACT_GRAPHS_ONLY", None): save_pipeline(args.results_dir + f"-EPOCH_{epoch}", args.model_id, unet, vae, text_encoder) @@ -574,6 +573,11 @@ def collate_fn(examples): xm.master_print(f" Given {step + 1} many steps, e2e per iteration is {(end_epoch_time - start_epoch_time) / (step + 1) / (10 ** 6)} ms") xm.master_print(f"!!! Finished epoch {epoch}") + # Only need a handful of training steps for graph extraction. Cut it off so we don't take forever when + # using a large dataset. 
+ if os.environ.get("NEURON_EXTRACT_GRAPHS_ONLY", None) and cumulative_train_step > 5: + break + # Save the trained model for use in inference xm.rendezvous('finish-training') if xm.is_master_ordinal() and not os.environ.get("NEURON_EXTRACT_GRAPHS_ONLY", None): @@ -611,8 +615,6 @@ def parse_args(): parser.add_argument('--batch_size', type=int, help='What per-device microbatch size to use') parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help='How many gradient accumulation steps to do (1 for no gradient accumulation)') parser.add_argument('--epochs', type=int, default=2000, help='How many epochs to train for') - parser.add_argument('--unroll_vae', action='store_true', help='Whether to unroll the VAE inference step (in batch dimension)') - parser.add_argument('--mark_step_after_vae', action='store_true', help='Whether to mark step after the VAE inference step') # Arguments for checkpointing parser.add_argument("--checkpointing_steps", type=int, default=None, diff --git a/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/modeling_gpt_neox_nxd.py b/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/modeling_gpt_neox_nxd.py deleted file mode 100644 index c2bd587..0000000 --- a/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/modeling_gpt_neox_nxd.py +++ /dev/null @@ -1,533 +0,0 @@ -""" NxD GPTNeoX model """ - -from typing import Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from transformers.utils import logging - -from neuronx_distributed.parallel_layers.layers import ParallelEmbedding, ColumnParallelLinear, RowParallelLinear -from neuronx_distributed.parallel_layers.loss_functions import parallel_cross_entropy -from neuronx_distributed.parallel_layers.parallel_state import get_tensor_model_parallel_size -import neuronx_distributed.parallel_layers.utils as neuronx_dist_utils -from neuronx_distributed.parallel_layers import move_model_to_device, mappings, layer_norm -import torch_xla.core.xla_model as xm - -from transformers.models.gpt_neox.modeling_gpt_neox import ( - GPTNeoXAttention, - GPTNeoXMLP, - GPTNeoXLayer, - GPTNeoXModel, - GPTNeoXPreTrainedModel, - GPTNeoXForCausalLM, - RotaryEmbedding, - apply_rotary_pos_emb, -) - -from utils import scatter_to_sequence_parallel_region - -from functools import partial -def _init_normal(std, w): - return nn.init.normal_(w, mean=0.0, std=std) - -logger = logging.get_logger(__name__) - - -class GPTNeoXAttentionNxD(GPTNeoXAttention): - def __init__(self, config): - nn.Module.__init__(self) - self.num_attention_heads = config.num_attention_heads - self.hidden_size = config.hidden_size - self.head_size = self.hidden_size // self.num_attention_heads - self.rotary_ndims = int(self.head_size * config.rotary_pct) - max_positions = config.max_position_embeddings - self.register_buffer( - "bias", - torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view( - 1, 1, max_positions, max_positions - ), - ) - self.register_buffer("masked_bias", torch.tensor(-1e9)) - self.rotary_emb = RotaryEmbedding( - self.rotary_ndims, config.max_position_embeddings, base=config.rotary_emb_base - ) - self.norm_factor = torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype()) - - # NxD code change: Replace the Linear with 
ColumnParallelLinear and RowParallelLinear - self.config = config - self.num_attention_heads = neuronx_dist_utils.divide(config.num_attention_heads, get_tensor_model_parallel_size()) - init_method = partial(_init_normal, config.initializer_range) - self.query_key_value = ColumnParallelLinear( - config.hidden_size, - 3 * config.hidden_size, - stride=3, - gather_output=False, - init_method=init_method, - sequence_parallel_enabled=self.config.sequence_parallel_enabled, - ) - self.dense = RowParallelLinear( - config.hidden_size, - config.hidden_size, - input_is_parallel=True, - init_method=init_method, - sequence_parallel_enabled=self.config.sequence_parallel_enabled, - ) - with torch.no_grad(): - self.query_key_value.bias.data.zero_() - self.dense.bias.data.zero_() - move_model_to_device(self, xm.xla_device()) - - def forward( - self, - hidden_states, - attention_mask, - head_mask=None, - layer_past=None, - use_cache=False, - output_attentions=False, - ): - has_layer_past = layer_past is not None - - # Compute QKV - # Attention heads [batch, seq_len, hidden_size] - # --> [batch, seq_len, (np * 3 * head_size)] - qkv = self.query_key_value(hidden_states) - - # [batch, seq_len, (num_heads * 3 * head_size)] - # --> [batch, seq_len, num_heads, 3 * head_size] - new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size) - qkv = qkv.view(*new_qkv_shape) - - # NxD code change: sequence parallel uses seq_len as the 0-th dim - if self.config.sequence_parallel_enabled: - # [seq_len, batch, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size] - query = qkv[..., : self.head_size].permute(1, 2, 0, 3) - key = qkv[..., self.head_size : 2 * self.head_size].permute(1, 2, 0, 3) - value = qkv[..., 2 * self.head_size :].permute(1, 2, 0, 3) - else: - # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size] - query = qkv[..., : self.head_size].permute(0, 2, 1, 3) - key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3) - value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3) - - # Compute rotary embeddings on rotary_ndims - query_rot = query[..., : self.rotary_ndims] - query_pass = query[..., self.rotary_ndims :] - key_rot = key[..., : self.rotary_ndims] - key_pass = key[..., self.rotary_ndims :] - - # Compute token offset for rotary embeddings (when decoding) - seq_len = key.shape[-2] - offset = 0 - if has_layer_past: - offset = layer_past[0].shape[-2] - seq_len += offset - cos, sin = self.rotary_emb(value, seq_len=seq_len) - query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, offset=offset) - query = torch.cat((query, query_pass), dim=-1) - key = torch.cat((key, key_pass), dim=-1) - - # Cache QKV values - if has_layer_past: - past_key = layer_past[0] - past_value = layer_past[1] - key = torch.cat((past_key, key), dim=-2) - value = torch.cat((past_value, value), dim=-2) - present = (key, value) if use_cache else None - - # Compute attention - attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) - - # Reshape outputs - # NxD code change: sequence parallel uses seq_len as the 0-th dim - if self.config.sequence_parallel_enabled: - # tensor [bs, num_attention_heads, seq_len, attn_head_size] - attn_output = attn_output.permute(2, 0, 1, 3).contiguous() - # -> [seq_len, bs, num_attention_heads, attn_head_size] - # -> [seq_len, bs hidden_size] - else: - # tensor [bs, num_attention_heads, seq_len, attn_head_size] - attn_output = 
attn_output.permute(0, 2, 1, 3).contiguous() - # -> [bs, seq_len, num_attention_heads, attn_head_size] - # -> [bs, seq_len, hidden_size] - attn_output = attn_output.view(attn_output.size(0), attn_output.size(1), self.num_attention_heads * self.head_size) - - attn_output = self.dense(attn_output) - - outputs = (attn_output, present) - if output_attentions: - outputs += (attn_weights,) - - return outputs - - def _attn(self, query, key, value, attention_mask=None, head_mask=None): - # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size] - # compute causal mask from causal mask buffer - batch_size, num_attention_heads, query_length, attn_head_size = query.size() - key_length = key.size(-2) - - query = query.view(batch_size * num_attention_heads, query_length, attn_head_size) - key = key.view(batch_size * num_attention_heads, key_length, attn_head_size) - attn_scores = torch.zeros( - batch_size * num_attention_heads, - query_length, - key_length, - dtype=query.dtype, - device=key.device, - ) - attn_scores = torch.baddbmm( - attn_scores, - query, - key.transpose(1, 2), - beta=1.0, - alpha=(torch.tensor(1.0, dtype=self.norm_factor.dtype, device=self.norm_factor.device) / self.norm_factor), - ) - attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length) - - # NxD code change: creating causal_mask on-the-fly to trigger the Neuron compiler optimization - causal_mask = torch.triu(torch.ones((1, 1, query_length, key_length), device='xla'), diagonal=1).bool() - attn_scores = attn_scores.masked_fill_(causal_mask, -10000.0) - - attn_weights = nn.functional.softmax(attn_scores, dim=-1) - attn_weights = attn_weights.to(value.dtype) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - return attn_output, attn_weights - - -class GPTNeoXMLPNxD(GPTNeoXMLP): - def __init__(self, config): - nn.Module.__init__(self) - self.act = ACT2FN[config.hidden_act] - - # NxD code change: Replace the Linear with ColumnParallelLinear and RowParallelLinear - self.config = config - init_method = partial(_init_normal, config.initializer_range) - self.dense_h_to_4h = ColumnParallelLinear( - config.hidden_size, - config.intermediate_size, - gather_output=False, - init_method=init_method, - sequence_parallel_enabled=self.config.sequence_parallel_enabled, - ) - self.dense_4h_to_h = RowParallelLinear( - config.intermediate_size, - config.hidden_size, - input_is_parallel=True, - init_method=init_method, - sequence_parallel_enabled=self.config.sequence_parallel_enabled, - ) - with torch.no_grad(): - self.dense_h_to_4h.bias.data.zero_() - self.dense_4h_to_h.bias.data.zero_() - move_model_to_device(self, xm.xla_device()) - - -class GPTNeoXLayerNxD(GPTNeoXLayer): - def __init__(self, config): - nn.Module.__init__(self) - self.use_parallel_residual = config.use_parallel_residual - - # NxD code change: Replace the nn LayerNorm with nxd LayerNorm to use sequence parallel - self.input_layernorm = layer_norm.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, sequence_parallel_enabled=config.sequence_parallel_enabled) - self.input_layernorm.bias.data.zero_() - self.input_layernorm.weight.data.fill_(1.0) - self.post_attention_layernorm = layer_norm.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, sequence_parallel_enabled=config.sequence_parallel_enabled) - self.post_attention_layernorm.bias.data.zero_() - self.post_attention_layernorm.weight.data.fill_(1.0) - - self.attention = 
GPTNeoXAttentionNxD(config) - self.mlp = GPTNeoXMLPNxD(config) - - -class GPTNeoXModelNxD(GPTNeoXModel): - def __init__(self, config): - GPTNeoXPreTrainedModel.__init__(self, config) - self.config = config - - # NxD code change: Replace the Embedding with ParallelEmbedding - init_method = partial(_init_normal, config.initializer_range) - self.embed_in = ParallelEmbedding( - config.vocab_size, - config.hidden_size, - init_method=init_method, - ) - - self.layers = nn.ModuleList([GPTNeoXLayerNxD(config) for _ in range(config.num_hidden_layers)]) - - # Replace the nn LayerNorm with nxd LayerNorm to use sequence parallel - self.final_layer_norm = layer_norm.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, sequence_parallel_enabled=config.sequence_parallel_enabled) - self.final_layer_norm.bias.data.zero_() - self.final_layer_norm.weight.data.fill_(1.0) - - self.gradient_checkpointing = False - - # Initialize weights and apply final processing - self.post_init() - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - r""" - past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - use_cache = use_cache if use_cache is not None else self.config.use_cache - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - batch_size, seq_length = input_shape - - if past_key_values is None: - past_key_values = tuple([None] * self.config.num_hidden_layers) - - # Attention mask. - if attention_mask is not None: - assert batch_size > 0, "batch_size has to be defined and > 0" - attention_mask = attention_mask.view(batch_size, -1) - # We create a 3D attention mask from a 2D tensor mask. 
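A small self-contained sketch of the additive-mask conversion these comments describe (a toy batch of one 4-token sequence with one padded position; float32 is used only for the illustration):

    import torch

    attention_mask = torch.tensor([[1, 1, 1, 0]])                    # [batch, seq]: 1 = attend, 0 = masked
    extended = attention_mask[:, None, None, :].to(torch.float32)    # [batch, 1, 1, seq] for broadcasting
    extended = (1.0 - extended) * torch.finfo(torch.float32).min     # 0 where attended, ~-inf where masked
    print(extended)
    # tensor([[[[ 0.0000e+00,  0.0000e+00,  0.0000e+00, -3.4028e+38]]]])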
- # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - attention_mask = attention_mask[:, None, None, :] - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - if inputs_embeds is None: - inputs_embeds = self.embed_in(input_ids) - - hidden_states = inputs_embeds - - # NxD code change: sequence parallel uses seq_len as the 0-th dim - if self.config.sequence_parallel_enabled: - hidden_states = hidden_states.transpose(0, 1).contiguous() - hidden_states = scatter_to_sequence_parallel_region(hidden_states) - - presents = () if use_cache else None - all_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - for i, (layer, layer_past) in enumerate(zip(self.layers, past_key_values)): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for layer_past - return module(*inputs, use_cache, None, output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer), - hidden_states, - attention_mask, - head_mask[i], - ) - else: - outputs = layer( - hidden_states, - attention_mask=attention_mask, - head_mask=head_mask[i], - layer_past=layer_past, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - if output_attentions: - all_attentions = all_attentions + (outputs[2 if use_cache else 1],) - - hidden_states = self.final_layer_norm(hidden_states) - - # NxD code change: sequence parallel uses seq_len as the 0-th dim - if self.config.sequence_parallel_enabled: - hidden_states = mappings.gather_from_sequence_parallel_region(hidden_states, to_model_parallel=False) - hidden_states = hidden_states.transpose(0, 1).contiguous() - - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_attentions, - ) - - -class GPTNeoXForCausalLMNxD(GPTNeoXForCausalLM): - - def __init__(self, config): - GPTNeoXPreTrainedModel.__init__(self, config) - - self.gpt_neox = GPTNeoXModelNxD(config) - - # NxD code change: Replace the Linear with ColumnParallelLinear - init_method = partial(_init_normal, config.initializer_range) - self.embed_out = ColumnParallelLinear( - config.hidden_size, - config.vocab_size, - bias=False, - gather_output=False, - init_method=init_method, - ) - - # Initialize weights and apply final processing - self.post_init() - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional tensors are - only required when the model is used as a decoder in a Sequence to Sequence model. - - Contains pre-computed hidden-states (key and values in the self-attention blocks that can be used (see - `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
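The create_custom_forward / torch.utils.checkpoint pattern used in the model forward above trades memory for recompute: activations of the wrapped layer are not stored, and the layer is re-run during the backward pass. A toy, XLA-free sketch of the same pattern (the module and sizes are arbitrary):

    import torch
    from torch.utils.checkpoint import checkpoint

    layer = torch.nn.Linear(16, 16)

    def create_custom_forward(module):
        def custom_forward(*inputs):
            return module(*inputs)
        return custom_forward

    x = torch.randn(4, 16, requires_grad=True)
    y = checkpoint(create_custom_forward(layer), x)  # layer is recomputed during backward
    y.sum().backward()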
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in - `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are - ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, GPTNeoXForCausalLM, GPTNeoXConfig - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") - >>> config = GPTNeoXConfig.from_pretrained("EleutherAI/gpt-neox-20b") - >>> config.is_decoder = True - >>> model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", config=config) - - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.logits - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.gpt_neox( - input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - lm_logits = self.embed_out(hidden_states) - - lm_loss = None - if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shift_logits = lm_logits[:, :-1, :].contiguous() - labels = labels[:, 1:].contiguous() - - # NxD code change: Replace the CrossEntropyLoss with parallel_cross_entropy - loss_fct = parallel_cross_entropy - - lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1)) - - # NxD code change: parallel_cross_entropy requires to take an averge - lm_loss = torch.mean(lm_loss) - - if not return_dict: - output = (lm_logits,) + outputs[1:] - return ((lm_loss,) + output) if lm_loss is not None else output - - return CausalLMOutputWithPast( - loss=lm_loss, - logits=lm_logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain.py b/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain.py deleted file mode 100644 index e24ceea..0000000 --- a/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain.py +++ /dev/null @@ -1,656 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Modifications Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
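Because the `embed_out` projection above is a `ColumnParallelLinear` built with `gather_output=False`, the logits it produces stay sharded along the vocabulary dimension, which is why the deleted sample swaps `CrossEntropyLoss` for `parallel_cross_entropy` and then takes an explicit mean. The sketch below shows only the shift-by-one next-token arrangement those lines rely on; plain `torch.nn.functional.cross_entropy` stands in for the vocabulary-parallel loss, and every size is invented for illustration.

```python
# Minimal sketch of the next-token loss arrangement used in the forward() above.
# F.cross_entropy stands in for parallel_cross_entropy, which would receive
# vocab-sharded logits; the shapes and the shift-by-one are the same.
import torch
import torch.nn.functional as F

batch, seq, vocab = 2, 8, 50432                    # illustrative sizes only
lm_logits = torch.randn(batch, seq, vocab)
labels = torch.randint(0, vocab, (batch, seq))

shift_logits = lm_logits[:, :-1, :].contiguous()   # predict token t+1 from token t
shift_labels = labels[:, 1:].contiguous()

loss = F.cross_entropy(
    shift_logits.view(-1, shift_logits.size(-1)),  # [batch*(seq-1), vocab]
    shift_labels.view(-1),                         # [batch*(seq-1)]
)
print(loss.item())
```

In the sharded case each rank holds only its `vocab_size / tp_degree` logit columns, and `parallel_cross_entropy` handles the cross-rank bookkeeping, returning per-token losses that the deleted code averages with `torch.mean`.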
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import torch -import sys -import time -import argparse -import json -import queue -from typing import Any, Dict, List -from datetime import datetime, timezone -from collections import namedtuple -import torch_xla -import torch_xla.core.xla_model as xm -from torch.utils.data.dataloader import DataLoader -from torch.utils.data import DistributedSampler -import torch_xla.distributed.parallel_loader as pl -import torch.distributed as dist -import torch_xla.distributed.xla_multiprocessing as xmp -import torch_xla.distributed.xla_backend -import numpy as np -from transformers import ( - default_data_collator, - set_seed, - modeling_utils, - GPTNeoXConfig, -) -from transformers.optimization import get_linear_schedule_with_warmup - -from torch.utils.tensorboard import SummaryWriter -import inspect -import requests -from neuronx_distributed.parallel_layers import parallel_state, checkpointing, move_model_to_device -import datasets - -from modeling_gpt_neox_nxd import GPTNeoXForCausalLMNxD -from neuronx_distributed.optimizer import NeuronZero1Optimizer -from adamw_fp32_optim_params import AdamW_FP32OptimParams - -datetime_str = str(datetime.now()) -results = { - "inference_success": 1 -} - -Metric = namedtuple("Metric", ["name", "value", "units", "additional_data"]) - - -class TrainingMetrics: - def __init__(self, json_file): - self.json_file = json_file - - def read_modify_write_file(self, data, key: str = "metrics") -> None: - """ - data (dict of training parameters or list of metrics): Data to update in the file. - key (str): the dictionary key under which data is to be recorded - """ - result_dict = {} - print(f"Writing data to the provided results file: {self.json_file}") - if os.path.exists(self.json_file): - with open(self.json_file) as json_file: - result_dict = json.loads(json_file.read()) or result_dict - print(f"Updating with {key} data: {data}") - if result_dict: - try: - # handle internal named entity if present - results = result_dict[next(iter(result_dict))] - except Exception: - results = result_dict - current = results.get(key) - if not current: - results[key] = data - else: - if isinstance(current, list): - current.extend(data) - elif isinstance(current, dict): - current.update(data) - else: - result_dict["results"] = {key: data} - with open(self.json_file, "w") as json_file: - json.dump(result_dict, json_file) - - def store_metrics(self, metrics: List[Metric]) -> None: - """ - Writes collected metrics to the file. - """ - data = [ - { - "MetricName": metric.name, - "MeasuredValue": metric.value, - "Units": metric.units, - "Timestamp": datetime.now(timezone.utc).isoformat(), - "AdditionalData": metric.additional_data, - } - for metric in metrics - ] - self.update(data=data, key="metrics") - - def store_parameters(self, parameters: Dict[str, Any]) -> None: - """ - Writes specified model and configuration parameters to the file. - """ - self.update(data=parameters, key="parameters") - - def update(self, **kwargs: Any) -> None: - """ - Write specified data to the output file. 
- """ - self.read_modify_write_file(**kwargs) - - -class Throughput: - def __init__( - self, batch_size, world_size, grad_accum_usteps, moving_avg_window_size=10 - ): - self.seqs_per_iteration = batch_size * world_size * grad_accum_usteps - self.moving_avg_window_size = moving_avg_window_size - self.moving_avg_window = queue.Queue() - self.window_time = 0 - self.start_time = time.time() - - def get_throughput(self): - step_time = time.time() - self.start_time - self.start_time += step_time - self.window_time += step_time - self.moving_avg_window.put(step_time) - window_size = self.moving_avg_window.qsize() - if window_size > self.moving_avg_window_size: - self.window_time -= self.moving_avg_window.get() - window_size -= 1 - throughput = window_size * self.seqs_per_iteration / self.window_time - return throughput - - -class Logger: - def __init__(self, args, world_size, model_dtype): - xla = "torch_xla" in sys.modules - self.throughputs = [] - dtype_short = model_dtype.replace("torch.", "") - self.tb = SummaryWriter( - os.path.join( - args.output_dir, - f"neuron_tblogs_{time.strftime('%m%d%y_%H%M')}" - f"_{dtype_short}" - f"_w{world_size}" - f"_lr{args.lr}" - f"_bs{args.batch_size}" - f"_acc{args.grad_accum_usteps}" - f"_warmup{args.warmup_steps}" - f"_max{args.max_steps}" - f"_xla{xla}" - f"_{self.get_instance_type()}", - ) - ) - self.tb.add_text( - "script", "```\n" + inspect.getsource(sys.modules[__name__]) + "\n```", 0 - ) - self.golden_steploss = [] - golden = "golden_steploss.txt" - if os.path.exists(golden): - with open(golden, "r") as f: - self.golden_steploss = [float(i) for i in f] - print( - f"Read {len(self.golden_steploss)} golden step loss values from {golden}" - ) - - def get_instance_type(self): - try: - token = requests.put( - "http://169.254.169.254/latest/api/token", - headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"}, - ) - data = requests.get( - "http://169.254.169.254/latest/meta-data/instance-type", - headers={"X-aws-ec2-metadata-token": token.text}, - ) - return data.text - except: - return os.environ.get("HOSTNAME", "unknown") - - def log(self, epoch, step, step_loss, learning_rate, throughput, grad_norm=None): - time_now = time.asctime() - grad_norm_msg = f"grad-norm : {grad_norm}" if grad_norm else "" - print( - f"LOG {time_now} - ({epoch}, {step}) step_loss : {step_loss:.4f} " - f"learning_rate : {learning_rate:.2e} throughput : {throughput:.2f} " - f"{grad_norm_msg}", - flush=True, - ) - self.tb.add_scalar("step loss", step_loss, step) - self.tb.add_scalar("learning rate", learning_rate, step) - self.tb.add_scalar("throughput", throughput, step) - if grad_norm: - self.tb.add_scalar("grad-norm", grad_norm, step) - self.throughputs.append(throughput) - if not os.environ.get("NEURON_EXTRACT_GRAPHS_ONLY", None): - step_0start = step - 1 - if step_0start < len(self.golden_steploss) and step_0start >= 0: - np.testing.assert_allclose( - step_loss, self.golden_steploss[step_0start], rtol=2.3e-1 - ) - - -# Workaround because python functions are not picklable -class WorkerInitObj(object): - def __init__(self, seed): - self.seed = seed - - def __call__(self, id): - set_seed(self.seed) - -def create_pretraining_dataset( - data_dir, mini_batch_size, worker_init -): - train_data = datasets.load_from_disk(os.path.expanduser(data_dir)) - train_sampler = DistributedSampler( - train_data, - num_replicas=parallel_state.get_data_parallel_size(), - rank=parallel_state.get_data_parallel_rank(), - shuffle=False, - drop_last=True, - ) - train_dataloader = DataLoader( - 
train_data, - collate_fn=default_data_collator, - sampler=train_sampler, - batch_size=mini_batch_size, - num_workers=0, - worker_init_fn=worker_init, - drop_last=True, - pin_memory=True, - ) - return train_dataloader - -def get_model(): - model_name = "EleutherAI/gpt-neox-20b" - config = GPTNeoXConfig.from_pretrained(model_name) - config.use_cache = False - config.sequence_parallel_enabled = True - xm.master_print(config) - model = GPTNeoXForCausalLMNxD(config) - xm.master_print(model) - model.gradient_checkpointing_enable() - return model - -def get_dtype(model) -> str: - """ - Reference: https://pytorch.org/xla/release/1.12/index.html#xla-tensors-and-bfloat16 - """ - if "XLA_USE_BF16" in os.environ: - return "torch.bfloat16" - if "XLA_DOWNCAST_BF16" in os.environ: - if "torch.float" in str(model.dtype): - return "torch.bfloat16" - if "torch.double" in str(model.dtype): - return "torch.float32" - return str(model.dtype) - -def allreduce_sequence_parallel_gradients(optimizer): - """ All-reduce layernorm parameters across model parallel nodes when sequence parallelism is used. - Modified from megatron-lm: - https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 - """ - from neuronx_distributed.parallel_layers.mappings import reduce_from_tensor_model_parallel_region - grads = [] - for param_group in optimizer.__getstate__()['param_groups']: - for group, params in param_group.items(): - if group == 'params': - for p in params: - if isinstance(p, torch.Tensor) and p.grad is not None: - sequence_parallel_param = getattr(p, 'sequence_parallel_enabled', False) - if sequence_parallel_param: - grads.append(p.grad.data) - for grad in grads: - reduce_from_tensor_model_parallel_region(grad) - -def train_gpt_neox(flags): - parallel_state.initialize_model_parallel(tensor_model_parallel_size=flags.tensor_parallel_size) - world_size = parallel_state.get_data_parallel_size() - is_root = xm.is_master_ordinal(local=False) - extract_graphs_only = os.environ.get("NEURON_EXTRACT_GRAPHS_ONLY", None) - set_seed(flags.seed) - worker_init = WorkerInitObj(flags.seed) - device = xm.xla_device() - - model = get_model() - move_model_to_device(model, device) - model.train() - - model_dtype = get_dtype(model) - running_loss = torch.zeros(1, dtype=torch.double).to(device) - - param_optimizer = list(model.named_parameters()) - no_decay = ["bias", "LayerNorm"] # gamma/beta are in LayerNorm.weight - - optimizer_grouped_parameters = [ - { - "params": [ - p for n, p in param_optimizer if not any(nd in n for nd in no_decay) - ], - "weight_decay": 0.01, - }, - { - "params": [ - p for n, p in param_optimizer if any(nd in n for nd in no_decay) - ], - "weight_decay": 0.0, - }, - ] - - optimizer = NeuronZero1Optimizer( - optimizer_grouped_parameters, - AdamW_FP32OptimParams, - lr=flags.lr, - pin_layout=False, - sharding_groups=parallel_state.get_data_parallel_group(as_list=True), - grad_norm_groups=parallel_state.get_tensor_model_parallel_group(as_list=True), - ) - optimizer.zero_grad() - - if is_root: - if not os.path.exists(flags.output_dir): - os.makedirs(flags.output_dir, exist_ok=True) - if not extract_graphs_only: - logger = Logger(flags, world_size, model_dtype) - metric_writer = TrainingMetrics(flags.metrics_file) - throughput = Throughput( - flags.batch_size, world_size, flags.grad_accum_usteps - ) - print("--------TRAINING CONFIG----------") - print(flags) - print("--------MODEL CONFIG----------") - print(model.config) - 
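The `allreduce_sequence_parallel_gradients` helper defined above exists because sequence parallelism hands each tensor-parallel rank only a slice of the sequence, so the gradient each rank computes for replicated parameters such as the LayerNorm weights is a partial sum. The toy snippet below illustrates that reasoning on a single process with plain PyTorch and invented shapes; in the deleted script, the all-reduce over the tensor-parallel group performs the equivalent summation.

```python
# Why layer-norm grads need an extra reduction under sequence parallelism:
# each rank back-propagates through only part of the sequence, so its grad
# for a replicated parameter is a partial sum. Single-process illustration,
# no NeuronX Distributed calls involved.
import torch

torch.manual_seed(0)
ln = torch.nn.LayerNorm(4)
x = torch.randn(6, 4)                      # a full "sequence" of 6 tokens

# Full-sequence gradient for the LayerNorm weight.
ln.weight.grad = None
ln(x).sum().backward()
full_grad = ln.weight.grad.clone()

# "Rank 0" and "rank 1" each see half the tokens.
partials = []
for chunk in x.chunk(2, dim=0):
    ln.weight.grad = None
    ln(chunk).sum().backward()
    partials.append(ln.weight.grad.clone())

# The all-reduce in the training script plays the role of this sum.
print(torch.allclose(full_grad, partials[0] + partials[1]))  # True
```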
print("---------------------------------") - metric_writer.store_parameters( - { - "Model": model.name_or_path, - "Model configuration": str(model.config), - "World size": xm.xrt_world_size(), - "Data parallel degree": world_size, - "Batch size": flags.batch_size, - "Total steps": flags.steps_this_run, - "Seed": flags.seed, - "Optimizer": str(optimizer), - "Data type": model_dtype, - "Gradient accumulation microsteps": flags.grad_accum_usteps, - "Warmup steps": flags.warmup_steps, - "Dataset": os.path.basename(os.path.normpath(flags.data_dir)), - "Environment variables": { - variable: value - for variable, value in os.environ.items() - if variable.startswith("NEURON") or variable.startswith("XLA") - }, - } - ) - - def train_loop_fn( - model, optimizer, train_loader, epoch, global_step, training_ustep, running_loss - ): - for _, data in enumerate(train_loader): - training_ustep += 1 - input_ids = data["input_ids"] - attention_mask = data["attention_mask"] - labels = data["labels"] - outputs = model( - input_ids=input_ids, - attention_mask=attention_mask, - labels=labels, - ) - loss = outputs.loss / flags.grad_accum_usteps - loss.backward() - running_loss += loss.detach() - - if training_ustep % flags.grad_accum_usteps == 0: - xm.mark_step() - # loss averaging - running_loss_div = running_loss / world_size - running_loss_reduced = xm.all_reduce( - xm.REDUCE_SUM, - running_loss_div, - groups=parallel_state.get_data_parallel_group(as_list=True), - ) - running_loss_reduced_detached = running_loss_reduced.detach() - running_loss.zero_() - - # sequence parallel allreduce - allreduce_sequence_parallel_gradients(optimizer) - - optimizer.step() - - with torch.no_grad(): - total_norm = torch.zeros(1, device=device) - if flags.print_grad_norm and is_root: - for p in model.parameters(): - param_norm_sq = torch.square(p.grad).sum() - total_norm += param_norm_sq - total_norm = torch.sqrt(total_norm) - - optimizer.zero_grad() - scheduler.step() - global_step += 1 - - def _print_logs(running_loss_reduced_detached, total_norm): - if is_root and not extract_graphs_only: - total_norm_cpu = None - if flags.print_grad_norm: - total_norm_cpu = total_norm.cpu().item() - # NOTE: The running_loss is the loss of the global_step - logger.log( - epoch, - global_step, - running_loss_reduced_detached.cpu().item(), - optimizer.param_groups[0]["lr"], - throughput.get_throughput(), - total_norm_cpu, - ) - - xm.add_step_closure( - _print_logs, (running_loss_reduced_detached, total_norm.detach()) - ) - if global_step >= flags.steps_this_run: - # NOTE: Prevent runtime "Call to recv failed : Broken pipe" issue - xm.mark_step() - break - - return ( - global_step, - training_ustep, - running_loss, - running_loss_reduced_detached.cpu().item(), - ) - - scheduler_state_dict = None - - if flags.resume_ckpt: - state_dict = checkpointing.load(flags.output_dir, model) - global_step = state_dict["global_step"] - epoch = state_dict["epoch"] - scheduler_state_dict = state_dict["scheduler"] - optimizer.load_sharded_state_dict(flags.output_dir) - else: - global_step = 0 - epoch = 0 - - train_start = time.time() - training_ustep = 0 - scheduler = get_linear_schedule_with_warmup( - optimizer, - num_warmup_steps=flags.warmup_steps, - num_training_steps=flags.max_steps, - last_epoch=epoch if scheduler_state_dict else -1, - ) - - if scheduler_state_dict: - scheduler.load_state_dict(scheduler_state_dict) - - assert os.path.exists( - os.path.expanduser(flags.data_dir) - ), "ERROR: Data directory {} doesn't exist!".format(flags.data_dir) - - 
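In `train_loop_fn` above, every micro-batch loss is divided by `flags.grad_accum_usteps` before `backward()`, and the accumulated `running_loss` is divided by the data-parallel world size and SUM-all-reduced, so the logged value is the mean loss over the effective global batch. The single-process check below, with invented numbers and no XLA, shows why the per-micro-batch scaling makes the accumulated gradient match the gradient of the window-mean loss.

```python
# Sketch of the gradient-accumulation arithmetic in train_loop_fn: scaling each
# micro-batch loss by 1/grad_accum_usteps before backward() accumulates the
# gradient of the mean loss over the accumulation window.
import torch

grad_accum_usteps = 4
w = torch.nn.Parameter(torch.tensor(2.0))
data = torch.arange(1.0, 1.0 + grad_accum_usteps)   # four toy "micro-batches"

# Accumulated path: one backward per micro-batch with the scaled loss.
for x in data:
    loss = (w * x).pow(2) / grad_accum_usteps
    loss.backward()
accumulated = w.grad.clone()

# Reference path: a single backward on the mean loss over the window.
w.grad = None
(w * data).pow(2).mean().backward()
print(torch.allclose(accumulated, w.grad))  # True
```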
mini_batch_size = flags.batch_size - train_dataloader = create_pretraining_dataset( - flags.data_dir, mini_batch_size, worker_init - ) - train_device_loader = pl.MpDeviceLoader(train_dataloader, device) - - while True: - xm.master_print( - "Epoch {} begin {}".format(epoch, time.asctime()), - flush=True, - ) - - global_step, training_ustep, running_loss, final_loss = train_loop_fn( - model, - optimizer, - train_device_loader, - epoch, - global_step, - training_ustep, - running_loss, - ) - - if is_root and not extract_graphs_only: - final_time = time.time() - time_diff = final_time - train_start - print( - "Epoch {} step {} end {} loss {} perf {} seq/sec (at train microstep {} time {} from beginning time {})".format( - epoch, - global_step, - time.asctime(), - final_loss, - logger.throughputs[-1], - training_ustep, - final_time, - train_start, - ), - flush=True, - ) - additional_data = { - "Epoch": epoch, - "Global step": global_step, - "Microstep": training_ustep, - } - metric_data = [ - Metric("Loss", final_loss, "", additional_data), - Metric( - "Throughput", logger.throughputs[-1], "seq/s", additional_data - ), - ] - metric_writer.store_metrics(metric_data) - - if global_step >= flags.steps_this_run: - if is_root and not extract_graphs_only: - # record aggregate & final statistics in the metrics file - additional_data = { - "Epoch": epoch, - "Global step": global_step, - "Microstep": training_ustep, - } - average_throughput = round( - sum(logger.throughputs) / len(logger.throughputs), 4 - ) - metric_data = [ - Metric("Final loss", final_loss, "", additional_data), - Metric( - "Time to train", - round(time_diff / 60, 4), - "minutes", - additional_data, - ), - Metric( - "Average throughput", - average_throughput, - "seq/s", - additional_data, - ), - Metric( - "Peak throughput", - max(logger.throughputs), - "seq/s", - additional_data, - ), - ] - metric_writer.store_metrics(metric_data) - state_dict = { - "model": model.state_dict(), - "global_step": global_step, - "epoch": epoch, - "scheduler": scheduler.state_dict() - } - checkpointing.save(state_dict, flags.output_dir, down_cast_bf16=True) - optimizer.save_sharded_state_dict(flags.output_dir) - return - - epoch += 1 - - -def _mp_fn(index, flags): - torch.set_default_tensor_type("torch.FloatTensor") - train_gpt_neox(flags) - xm.rendezvous("_mp_fn finished") - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--data_dir", - type=str, - help="Pre-tokenized dataset directory.", - ) - parser.add_argument( - "--output_dir", - type=str, - default="./output", - help="Directory for checkpoints and logs.", - ) - parser.add_argument( - "--metrics_file", - type=str, - default="results.json", - help="training metrics results file", - ) - parser.add_argument("--batch_size", type=int, default=1, help="Worker batch size.") - parser.add_argument( - "--max_steps", - type=int, - help="Maximum total accumulation-steps to run.", - ) - parser.add_argument( - "--steps_this_run", - type=int, - help="Exit early at steps and not go to max_steps. -1 to mean no early exit.", - ) - parser.add_argument( - "--seed", - type=int, - default=12349, - help="Random seed. 
Worker seed is this value + worker rank.", - ) - parser.add_argument("--lr", type=float, help="Learning rate.") - parser.add_argument( - "--warmup_steps", - type=int, - help="Number of warmup accumulation-steps for learning rate .", - ) - parser.add_argument( - "--grad_accum_usteps", - type=int, - help="Gradient accumulation micro-steps (an accumulation-step has micro-steps.", - ) - parser.add_argument( - "--print_grad_norm", - default=False, - action="store_true", - help="Whether to print grad norm", - ) - parser.add_argument( - "--resume_ckpt", - action="store_true", - help="Resume from checkpoint at resume_step." - ) - parser.add_argument( - "--tensor_parallel_size", - default=8, - type=int, - help="Tensor parallel size" - ) - - args = parser.parse_args(sys.argv[1:]) - - if args.steps_this_run < 0: - args.steps_this_run = args.max_steps - - # Workaround for NaNs seen with transformers version >= 4.21.0 - # https://github.com/aws-neuron/aws-neuron-sdk/issues/593 - modeling_utils.get_parameter_dtype = lambda x: torch.bfloat16 - - # WORLD_SIZE is set by torchrun - if os.environ.get("WORLD_SIZE"): - dist.init_process_group("xla") - _mp_fn(0, args) - else: - xmp.spawn(_mp_fn, args=(args,)) diff --git a/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain.sh b/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain.sh deleted file mode 100644 index a5c7af3..0000000 --- a/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash - -############################################# -# User defined parameters and env vars - -export NEURON_CC_FLAGS="--model-type=transformer -O1 --enable-saturate-infinity --cache_dir=~/neuron_compile_cache/" -export NEURON_FUSE_SOFTMAX=1 - -export XLA_DOWNCAST_BF16=1 -export NEURON_RT_STOCHASTIC_ROUNDING_EN=1 - -export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=3 - -# TP degree -TP_DEGREE=32 -# global batch size -GBS=256 -# micro batch size -MBS=1 -# number of steps to run -TOTAL_STEPS=10000 -# warmup steps -WARMUP_STEPS=100 -# learning rate -LR=0.97e-5 -# data path -DATA_PATH="~/examples_datasets/wikicorpus_gpt_neox_tokenized_2k" - -############################################# - -export NUM_NEURONCORES=32 -NODE_ID=0 -WORLD_SIZE=1 -DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES" -if [ ! 
-z "$SLURM_NTASKS" ]; then - WORLD_SIZE=$SLURM_NTASKS - NODE_ID=$SLURM_NODEID - MASTER_ADDRESS=(`scontrol show hostnames $SLURM_JOB_NODELIST`) - DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE --node_rank $NODE_ID --master_addr $MASTER_ADDRESS --master_port 44000" - if [ $NODE_ID -eq 0 ]; then - echo "WORLD_SIZE=$WORLD_SIZE" - echo "NODE_ID=$NODE_ID" - echo "MASTER_ADDRESS=$MASTER_ADDRESS" - echo "DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS" - fi - export FI_EFA_USE_DEVICE_RDMA=1 - export FI_PROVIDER=efa -fi - -############################################# - -DP=$(($NUM_NEURONCORES * $WORLD_SIZE / $TP_DEGREE)) -ACC_STEPS=$(($GBS / $MBS / $DP)) - -if [ $NEURON_EXTRACT_GRAPHS_ONLY -gt 0 ]; then - STEPS_THIS_RUN=6 - ACC_STEPS=6 - OUTPUT_LOG=log_compile-$NODE_ID.log -else - STEPS_THIS_RUN=-1 - OUTPUT_LOG=log_exe-$NODE_ID.log -fi - -if [ $NODE_ID -eq 0 ]; then - echo TP_DEGREE=$TP_DEGREE - echo GBS=$GBS - echo MBS=$MBS - echo TOTAL_STEPS=$TOTAL_STEPS - echo WARMUP_STEPS=$WARMUP_STEPS - echo LR=$LR - echo DATA_PATH=$DATA_PATH - - echo DP=$DP - echo ACC_STEPS=$ACC_STEPS - echo STEPS_THIS_RUN=$STEPS_THIS_RUN - echo OUTPUT_LOG=$OUTPUT_LOG -fi - -torchrun $DISTRIBUTED_ARGS \ - tp_dp_gpt_neox_20b_hf_pretrain.py \ - --data_dir $DATA_PATH \ - --tensor_parallel_size $TP_DEGREE \ - --batch_size $MBS \ - --steps_this_run $STEPS_THIS_RUN \ - --max_steps $TOTAL_STEPS \ - --warmup_steps $WARMUP_STEPS \ - --lr $LR \ - --grad_accum_usteps $ACC_STEPS |& tee $OUTPUT_LOG diff --git a/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/utils.py b/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/utils.py deleted file mode 100644 index 929d984..0000000 --- a/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_20b_hf_pretrain/utils.py +++ /dev/null @@ -1,21 +0,0 @@ -import torch -from neuronx_distributed.parallel_layers import mappings - -class _ScatterToSequenceParallelRegion(torch.autograd.Function): - """Split the input and keep only the corresponding chunk to the rank.""" - @staticmethod - def symbolic(graph, input_): - return mappings._split_along_first_dim(input_) - - @staticmethod - def forward(ctx, input_): - return mappings._split_along_first_dim(input_) - - @staticmethod - def backward(ctx, grad_output): - return mappings._gather_along_first_dim(grad_output) - - -# Note: This function is going to be upstreamed to Neuronx-Distributed in the upcoming release. -def scatter_to_sequence_parallel_region(input_): - return _ScatterToSequenceParallelRegion.apply(input_) \ No newline at end of file diff --git a/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain.py b/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain.py deleted file mode 100644 index 5cefb95..0000000 --- a/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain.py +++ /dev/null @@ -1,770 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Modifications Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
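A worked example of the batch-size arithmetic in the deleted launch script `tp_dp_gpt_neox_20b_hf_pretrain.sh` above: the data-parallel degree is `NUM_NEURONCORES * WORLD_SIZE / TP_DEGREE` and the gradient-accumulation steps are `GBS / MBS / DP`, so the global batch of 256 sequences is preserved as nodes are added. The node counts in this sketch are hypothetical; only the script's own constants are reused.

```python
# Worked example of the launch-script batch bookkeeping (node counts invented).
TP_DEGREE, GBS, MBS, NUM_NEURONCORES = 32, 256, 1, 32

for world_size in (1, 8):                          # one node vs. an assumed 8-node SLURM job
    dp = NUM_NEURONCORES * world_size // TP_DEGREE
    acc_steps = GBS // MBS // dp
    print(f"nodes={world_size}: DP={dp}, grad_accum_usteps={acc_steps}, "
          f"global batch={MBS * dp * acc_steps}")
# nodes=1: DP=1, grad_accum_usteps=256, global batch=256
# nodes=8: DP=8, grad_accum_usteps=32,  global batch=256
```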
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import torch -import sys -import time -import argparse -import json -import queue -from typing import Any, Dict, List -from datetime import datetime, timezone -from collections import namedtuple -import torch_xla -import torch_xla.core.xla_model as xm -from torch.utils.data.dataloader import DataLoader -from torch.utils.data import DistributedSampler -import torch_xla.distributed.parallel_loader as pl -import torch.distributed as dist -import torch_xla.distributed.xla_multiprocessing as xmp -import torch_xla.distributed.xla_backend -import numpy as np -from transformers import ( - AutoConfig, - AutoModelForCausalLM, - default_data_collator, - set_seed, - modeling_utils, -) -from transformers.optimization import get_linear_schedule_with_warmup - -from torch.utils.tensorboard import SummaryWriter -import inspect -import requests -from neuronx_distributed.parallel_layers import parallel_state, checkpointing, move_model_to_device -import datasets -import math - -from transformers.models.gpt_neox import modeling_gpt_neox -from neuronx_distributed.parallel_layers.layers import ParallelEmbedding, ColumnParallelLinear, RowParallelLinear -from neuronx_distributed.parallel_layers.parallel_state import get_tensor_model_parallel_size -import neuronx_distributed.parallel_layers.utils as neuronx_dist_utils -from neuronx_distributed.optimizer import NeuronZero1Optimizer -from adamw_fp32_optim_params import AdamW_FP32OptimParams - -datetime_str = str(datetime.now()) -results = { - "inference_success": 1 -} - -Metric = namedtuple("Metric", ["name", "value", "units", "additional_data"]) - - -class TrainingMetrics: - def __init__(self, json_file): - self.json_file = json_file - - def read_modify_write_file(self, data, key: str = "metrics") -> None: - """ - data (dict of training parameters or list of metrics): Data to update in the file. - key (str): the dictionary key under which data is to be recorded - """ - result_dict = {} - print(f"Writing data to the provided results file: {self.json_file}") - if os.path.exists(self.json_file): - with open(self.json_file) as json_file: - result_dict = json.loads(json_file.read()) or result_dict - print(f"Updating with {key} data: {data}") - if result_dict: - try: - # handle internal named entity if present - results = result_dict[next(iter(result_dict))] - except Exception: - results = result_dict - current = results.get(key) - if not current: - results[key] = data - else: - if isinstance(current, list): - current.extend(data) - elif isinstance(current, dict): - current.update(data) - else: - result_dict["results"] = {key: data} - with open(self.json_file, "w") as json_file: - json.dump(result_dict, json_file) - - def store_metrics(self, metrics: List[Metric]) -> None: - """ - Writes collected metrics to the file. 
- """ - data = [ - { - "MetricName": metric.name, - "MeasuredValue": metric.value, - "Units": metric.units, - "Timestamp": datetime.now(timezone.utc).isoformat(), - "AdditionalData": metric.additional_data, - } - for metric in metrics - ] - self.update(data=data, key="metrics") - - def store_parameters(self, parameters: Dict[str, Any]) -> None: - """ - Writes specified model and configuration parameters to the file. - """ - self.update(data=parameters, key="parameters") - - def update(self, **kwargs: Any) -> None: - """ - Write specified data to the output file. - """ - self.read_modify_write_file(**kwargs) - - -class Throughput: - def __init__( - self, batch_size, world_size, grad_accum_usteps, moving_avg_window_size=10 - ): - self.seqs_per_iteration = batch_size * world_size * grad_accum_usteps - self.moving_avg_window_size = moving_avg_window_size - self.moving_avg_window = queue.Queue() - self.window_time = 0 - self.start_time = time.time() - - def get_throughput(self): - step_time = time.time() - self.start_time - self.start_time += step_time - self.window_time += step_time - self.moving_avg_window.put(step_time) - window_size = self.moving_avg_window.qsize() - if window_size > self.moving_avg_window_size: - self.window_time -= self.moving_avg_window.get() - window_size -= 1 - throughput = window_size * self.seqs_per_iteration / self.window_time - return throughput - - -class Logger: - def __init__(self, args, world_size, model_dtype): - xla = "torch_xla" in sys.modules - self.throughputs = [] - dtype_short = model_dtype.replace("torch.", "") - self.tb = SummaryWriter( - os.path.join( - args.output_dir, - f"neuron_tblogs_{time.strftime('%m%d%y_%H%M')}" - f"_{dtype_short}" - f"_w{world_size}" - f"_lr{args.lr}" - f"_bs{args.batch_size}" - f"_acc{args.grad_accum_usteps}" - f"_warmup{args.warmup_steps}" - f"_max{args.max_steps}" - f"_xla{xla}" - f"_{self.get_instance_type()}", - ) - ) - self.tb.add_text( - "script", "```\n" + inspect.getsource(sys.modules[__name__]) + "\n```", 0 - ) - self.golden_steploss = [] - golden = "golden_steploss.txt" - if os.path.exists(golden): - with open(golden, "r") as f: - self.golden_steploss = [float(i) for i in f] - print( - f"Read {len(self.golden_steploss)} golden step loss values from {golden}" - ) - - def get_instance_type(self): - try: - token = requests.put( - "http://169.254.169.254/latest/api/token", - headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"}, - ) - data = requests.get( - "http://169.254.169.254/latest/meta-data/instance-type", - headers={"X-aws-ec2-metadata-token": token.text}, - ) - return data.text - except: - return os.environ.get("HOSTNAME", "unknown") - - def log(self, epoch, step, step_loss, learning_rate, throughput, grad_norm=None): - time_now = time.asctime() - grad_norm_msg = f"grad-norm : {grad_norm}" if grad_norm else "" - print( - f"LOG {time_now} - ({epoch}, {step}) step_loss : {step_loss:.4f} " - f"learning_rate : {learning_rate:.2e} throughput : {throughput:.2f} " - f"{grad_norm_msg}", - flush=True, - ) - self.tb.add_scalar("step loss", step_loss, step) - self.tb.add_scalar("learning rate", learning_rate, step) - self.tb.add_scalar("throughput", throughput, step) - if grad_norm: - self.tb.add_scalar("grad-norm", grad_norm, step) - self.throughputs.append(throughput) - if not os.environ.get("NEURON_EXTRACT_GRAPHS_ONLY", None): - step_0start = step - 1 - if step_0start < len(self.golden_steploss) and step_0start >= 0: - np.testing.assert_allclose( - step_loss, self.golden_steploss[step_0start], rtol=2.3e-1 - ) - - 
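The `Logger` above doubles as the sample's regression guard: when a `golden_steploss.txt` file (one float per line) sits next to the script, each reported step loss is compared against the recorded value with `np.testing.assert_allclose` at a relative tolerance of 2.3e-1. A minimal stand-alone sketch of that check, with invented loss values:

```python
# Stand-alone sketch of the golden step-loss comparison performed in Logger.log
# above; the loss values here are made up for illustration.
import numpy as np

golden_text = "10.91\n9.87\n9.02\n"            # contents of golden_steploss.txt
golden = [float(line) for line in golden_text.splitlines()]

observed = [10.95, 9.80, 9.10]                 # step losses from a new run
for step0, (loss, ref) in enumerate(zip(observed, golden)):
    np.testing.assert_allclose(loss, ref, rtol=2.3e-1)
    print(f"step {step0 + 1}: {loss} is within 23% of golden {ref}")
```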
-# Workaround because python functions are not picklable -class WorkerInitObj(object): - def __init__(self, seed): - self.seed = seed - - def __call__(self, id): - set_seed(self.seed) - -def create_pretraining_dataset( - data_dir, mini_batch_size, worker_init -): - train_data = datasets.load_from_disk(os.path.expanduser(data_dir)) - train_sampler = DistributedSampler( - train_data, - num_replicas=parallel_state.get_data_parallel_size(), - rank=parallel_state.get_data_parallel_rank(), - shuffle=False, - drop_last=True, - ) - train_dataloader = DataLoader( - train_data, - collate_fn=default_data_collator, - sampler=train_sampler, - batch_size=mini_batch_size, - num_workers=0, - worker_init_fn=worker_init, - drop_last=True, - pin_memory=True, - ) - return train_dataloader - -def get_model(): - class GPTNeoXAttention(modeling_gpt_neox.GPTNeoXAttention): - def __init__(self, config): - super().__init__(config) - self.num_attention_heads = neuronx_dist_utils.divide(config.num_attention_heads, get_tensor_model_parallel_size()) - self.query_key_value = ColumnParallelLinear( - config.hidden_size, - 3 * config.hidden_size, - stride=3, - gather_output=False, - ) - self.dense = RowParallelLinear( - config.hidden_size, - config.hidden_size, - input_is_parallel=True, - ) - self.query_key_value.weight.data.normal_(mean=0.0, std=config.initializer_range) - self.dense.weight.data.normal_(mean=0.0, std=config.initializer_range) - with torch.no_grad(): - self.query_key_value.bias.data.zero_() - self.dense.bias.data.zero_() - - def _attn(self, query, key, value, attention_mask=None, head_mask=None): - # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size] - # compute causal mask from causal mask buffer - batch_size, num_attention_heads, query_length, attn_head_size = query.size() - key_length = key.size(-2) - - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool() - - query = query.view(batch_size * num_attention_heads, query_length, attn_head_size) - key = key.view(batch_size * num_attention_heads, key_length, attn_head_size) - attn_scores = torch.zeros( - batch_size * num_attention_heads, - query_length, - key_length, - dtype=query.dtype, - device=key.device, - ) - attn_scores = torch.baddbmm( - attn_scores, - query, - key.transpose(1, 2), - beta=1.0, - alpha=(torch.tensor(1.0, dtype=self.norm_factor.dtype, device=self.norm_factor.device) / self.norm_factor), - ) - attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length) - - # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
- # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - # Use a negative number for mask_value instead of dtype.min for a compiler walk-around - mask_value = torch.tensor(-10000.0, dtype=attn_scores.dtype).to(attn_scores.device) - attn_scores = torch.where(causal_mask, attn_scores, mask_value) - - if attention_mask is not None: - # Apply the attention mask - attn_scores = attn_scores + attention_mask - - attn_weights = torch.nn.functional.softmax(attn_scores, dim=-1) - attn_weights = attn_weights.to(value.dtype) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - return attn_output, attn_weights - - class GPTNeoXMLP(modeling_gpt_neox.GPTNeoXMLP): - def __init__(self, config): - super().__init__(config) - self.dense_h_to_4h = ColumnParallelLinear( - config.hidden_size, - config.intermediate_size, - gather_output=False, - ) - self.dense_4h_to_h = RowParallelLinear( - config.intermediate_size, - config.hidden_size, - input_is_parallel=True, - ) - self.dense_h_to_4h.weight.data.normal_(mean=0.0, std=config.initializer_range) - self.dense_4h_to_h.weight.data.normal_(mean=0.0, std=config.initializer_range) - with torch.no_grad(): - self.dense_h_to_4h.bias.data.zero_() - self.dense_4h_to_h.bias.data.zero_() - - def get_sharded_data(data, dim): - tp_rank = parallel_state.get_tensor_model_parallel_rank() - per_partition_size = data.shape[dim] // parallel_state.get_tensor_model_parallel_size() - if dim == 0: - return data[ - per_partition_size * tp_rank : per_partition_size * (tp_rank + 1) - ].clone() - elif dim == 1: - return data[ - :, per_partition_size * tp_rank : per_partition_size * (tp_rank + 1) - ].clone() - else: - raise Exception( - f"Partiton value of 0,1 are supported, found {dim}." 
- ) - - model_name = "EleutherAI/pythia-6.9b" - config = AutoConfig.from_pretrained(model_name) - config.use_cache = False - xm.master_print(config) - model = AutoModelForCausalLM.from_config(config) - model.gradient_checkpointing_enable() - - for layer in model.gpt_neox.layers: - orig_attn = layer.attention - layer.attention = GPTNeoXAttention(config) - layer.attention.query_key_value.weight.data = get_sharded_data(orig_attn.query_key_value.weight.data, 0) - layer.attention.dense.weight.data = get_sharded_data(orig_attn.dense.weight.data, 1) - del orig_attn - - orig_mlp = layer.mlp - layer.mlp = GPTNeoXMLP(config) - layer.mlp.dense_h_to_4h.weight.data = get_sharded_data(orig_mlp.dense_h_to_4h.weight.data, 0) - layer.mlp.dense_4h_to_h.weight.data = get_sharded_data(orig_mlp.dense_4h_to_h.weight.data, 1) - del orig_mlp - - orig_embed_in = model.gpt_neox.embed_in - model.gpt_neox.embed_in = ParallelEmbedding(config.vocab_size, config.hidden_size,) - model.gpt_neox.embed_in.weight.data = get_sharded_data(orig_embed_in.weight.data, 0) - del orig_embed_in - - xm.master_print(model) - return model - -def get_and_move_model_sequential(device, num_workers_per_step=11): - local_rank = xm.get_local_ordinal() - local_world_size = neuronx_dist_utils.get_local_world_size() - for worker in range(math.ceil(local_world_size / num_workers_per_step)): - if local_rank // num_workers_per_step == worker: - model = get_model() - move_model_to_device(model, device) - xm.rendezvous("get_and_move_model_sequential" + str(worker)) - return model - -def get_dtype(model) -> str: - """ - Reference: https://pytorch.org/xla/release/1.12/index.html#xla-tensors-and-bfloat16 - """ - if "XLA_USE_BF16" in os.environ: - return "torch.bfloat16" - if "XLA_DOWNCAST_BF16" in os.environ: - if "torch.float" in str(model.dtype): - return "torch.bfloat16" - if "torch.double" in str(model.dtype): - return "torch.float32" - return str(model.dtype) - -def train_gpt_neox(flags): - parallel_state.initialize_model_parallel(tensor_model_parallel_size=flags.tensor_parallel_size) - world_size = parallel_state.get_data_parallel_size() - is_root = xm.is_master_ordinal(local=False) - extract_graphs_only = os.environ.get("NEURON_EXTRACT_GRAPHS_ONLY", None) - set_seed(flags.seed) - worker_init = WorkerInitObj(flags.seed) - device = xm.xla_device() - - model = get_and_move_model_sequential(device) - model.train() - - model_dtype = get_dtype(model) - running_loss = torch.zeros(1, dtype=torch.double).to(device) - - param_optimizer = list(model.named_parameters()) - no_decay = ["bias", "LayerNorm"] # gamma/beta are in LayerNorm.weight - - optimizer_grouped_parameters = [ - { - "params": [ - p for n, p in param_optimizer if not any(nd in n for nd in no_decay) - ], - "weight_decay": 0.01, - }, - { - "params": [ - p for n, p in param_optimizer if any(nd in n for nd in no_decay) - ], - "weight_decay": 0.0, - }, - ] - - optimizer = NeuronZero1Optimizer( - optimizer_grouped_parameters, - AdamW_FP32OptimParams, - lr=flags.lr, - pin_layout=False, - sharding_groups=parallel_state.get_data_parallel_group(as_list=True), - grad_norm_groups=parallel_state.get_tensor_model_parallel_group(as_list=True), - ) - optimizer.zero_grad() - - if is_root: - if not os.path.exists(flags.output_dir): - os.makedirs(flags.output_dir, exist_ok=True) - if not extract_graphs_only: - logger = Logger(flags, world_size, model_dtype) - metric_writer = TrainingMetrics(flags.metrics_file) - throughput = Throughput( - flags.batch_size, world_size, flags.grad_accum_usteps - ) - 
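`get_sharded_data` above carves each pretrained weight into the slice owned by the current tensor-parallel rank: dimension 0 for the column-parallel layers (`query_key_value`, `dense_h_to_4h`, the embedding) and dimension 1 for the row-parallel layers (`dense`, `dense_4h_to_h`). The sketch below reproduces that slicing on one process; `tp_rank` and `tp_size` are hard-coded stand-ins for the values the script reads from `parallel_state`.

```python
# Stand-alone sketch of the per-rank weight slicing done by get_sharded_data.
import torch

def shard(data: torch.Tensor, dim: int, tp_rank: int, tp_size: int) -> torch.Tensor:
    per_partition = data.shape[dim] // tp_size
    lo, hi = per_partition * tp_rank, per_partition * (tp_rank + 1)
    if dim == 0:       # column-parallel: split output features across ranks
        return data[lo:hi].clone()
    if dim == 1:       # row-parallel: split input features across ranks
        return data[:, lo:hi].clone()
    raise ValueError(f"only dim 0 or 1 is supported, got {dim}")

full = torch.arange(32.0).reshape(8, 4)                # toy [out, in] weight
print(shard(full, dim=0, tp_rank=1, tp_size=4).shape)  # torch.Size([2, 4])
print(shard(full, dim=1, tp_rank=3, tp_size=4).shape)  # torch.Size([8, 1])
```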
print("--------TRAINING CONFIG----------") - print(flags) - print("--------MODEL CONFIG----------") - print(model.config) - print("---------------------------------") - metric_writer.store_parameters( - { - "Model": model.name_or_path, - "Model configuration": str(model.config), - "World size": xm.xrt_world_size(), - "Data parallel degree": world_size, - "Batch size": flags.batch_size, - "Total steps": flags.steps_this_run, - "Seed": flags.seed, - "Optimizer": str(optimizer), - "Data type": model_dtype, - "Gradient accumulation microsteps": flags.grad_accum_usteps, - "Warmup steps": flags.warmup_steps, - "Dataset": os.path.basename(os.path.normpath(flags.data_dir)), - "Environment variables": { - variable: value - for variable, value in os.environ.items() - if variable.startswith("NEURON") or variable.startswith("XLA") - }, - } - ) - - def train_loop_fn( - model, optimizer, train_loader, epoch, global_step, training_ustep, running_loss - ): - max_grad_norm = 1.0 - - for _, data in enumerate(train_loader): - training_ustep += 1 - input_ids = data["input_ids"] - attention_mask = data["attention_mask"] - labels = data["labels"] - outputs = model( - input_ids=input_ids, - attention_mask=attention_mask, - labels=labels, - ) - loss = outputs.loss / flags.grad_accum_usteps - loss.backward() - running_loss += loss.detach() - - if training_ustep % flags.grad_accum_usteps == 0: - xm.mark_step() - # loss averaging - running_loss_div = running_loss / world_size - running_loss_reduced = xm.all_reduce( - xm.REDUCE_SUM, - running_loss_div, - groups=parallel_state.get_data_parallel_group(as_list=True), - ) - running_loss_reduced_detached = running_loss_reduced.detach() - running_loss.zero_() - optimizer.step() - - with torch.no_grad(): - total_norm = torch.zeros(1, device=device) - if flags.print_grad_norm and is_root: - for p in model.parameters(): - param_norm_sq = torch.square(p.grad).sum() - total_norm += param_norm_sq - total_norm = torch.sqrt(total_norm) - - optimizer.zero_grad() - scheduler.step() - global_step += 1 - - def _print_logs(running_loss_reduced_detached, total_norm): - if is_root and not extract_graphs_only: - total_norm_cpu = None - if flags.print_grad_norm: - total_norm_cpu = total_norm.cpu().item() - # NOTE: The running_loss is the loss of the global_step - logger.log( - epoch, - global_step, - running_loss_reduced_detached.cpu().item(), - optimizer.param_groups[0]["lr"], - throughput.get_throughput(), - total_norm_cpu, - ) - - xm.add_step_closure( - _print_logs, (running_loss_reduced_detached, total_norm.detach()) - ) - if global_step >= flags.steps_this_run: - # NOTE: Prevent runtime "Call to recv failed : Broken pipe" issue - xm.mark_step() - break - - return ( - global_step, - training_ustep, - running_loss, - running_loss_reduced_detached.cpu().item(), - ) - - scheduler_state_dict = None - - if flags.resume_ckpt: - state_dict = checkpointing.load(flags.output_dir, model) - global_step = state_dict["global_step"] - epoch = state_dict["epoch"] - scheduler_state_dict = state_dict["scheduler"] - optimizer.load_sharded_state_dict(flags.output_dir) - else: - global_step = 0 - epoch = 0 - - train_start = time.time() - training_ustep = 0 - scheduler = get_linear_schedule_with_warmup( - optimizer, - num_warmup_steps=flags.warmup_steps, - num_training_steps=flags.max_steps, - last_epoch=epoch if scheduler_state_dict else -1, - ) - - if scheduler_state_dict: - scheduler.load_state_dict(scheduler_state_dict) - - assert os.path.exists( - os.path.expanduser(flags.data_dir) - ), "ERROR: Data 
directory {} doesn't exist!".format(flags.data_dir) - - mini_batch_size = flags.batch_size - train_dataloader = create_pretraining_dataset( - flags.data_dir, mini_batch_size, worker_init - ) - train_device_loader = pl.MpDeviceLoader(train_dataloader, device) - - while True: - xm.master_print( - "Epoch {} begin {}".format(epoch, time.asctime()), - flush=True, - ) - - global_step, training_ustep, running_loss, final_loss = train_loop_fn( - model, - optimizer, - train_device_loader, - epoch, - global_step, - training_ustep, - running_loss, - ) - - if is_root and not extract_graphs_only: - final_time = time.time() - time_diff = final_time - train_start - print( - "Epoch {} step {} end {} loss {} perf {} seq/sec (at train microstep {} time {} from beginning time {})".format( - epoch, - global_step, - time.asctime(), - final_loss, - logger.throughputs[-1], - training_ustep, - final_time, - train_start, - ), - flush=True, - ) - additional_data = { - "Epoch": epoch, - "Global step": global_step, - "Microstep": training_ustep, - } - metric_data = [ - Metric("Loss", final_loss, "", additional_data), - Metric( - "Throughput", logger.throughputs[-1], "seq/s", additional_data - ), - ] - metric_writer.store_metrics(metric_data) - - if global_step >= flags.steps_this_run: - if is_root and not extract_graphs_only: - # record aggregate & final statistics in the metrics file - additional_data = { - "Epoch": epoch, - "Global step": global_step, - "Microstep": training_ustep, - } - average_throughput = round( - sum(logger.throughputs) / len(logger.throughputs), 4 - ) - metric_data = [ - Metric("Final loss", final_loss, "", additional_data), - Metric( - "Time to train", - round(time_diff / 60, 4), - "minutes", - additional_data, - ), - Metric( - "Average throughput", - average_throughput, - "seq/s", - additional_data, - ), - Metric( - "Peak throughput", - max(logger.throughputs), - "seq/s", - additional_data, - ), - ] - metric_writer.store_metrics(metric_data) - state_dict = { - "model": model.state_dict(), - "global_step": global_step, - "epoch": epoch, - "scheduler": scheduler.state_dict() - } - checkpointing.save(state_dict, flags.output_dir, down_cast_bf16=True) - optimizer.save_sharded_state_dict(flags.output_dir) - return - - epoch += 1 - - -def _mp_fn(index, flags): - torch.set_default_tensor_type("torch.FloatTensor") - train_gpt_neox(flags) - xm.rendezvous("_mp_fn finished") - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--data_dir", - type=str, - help="Pre-tokenized dataset directory.", - ) - parser.add_argument( - "--output_dir", - type=str, - default="./output", - help="Directory for checkpoints and logs.", - ) - parser.add_argument( - "--metrics_file", - type=str, - default="results.json", - help="training metrics results file", - ) - parser.add_argument("--batch_size", type=int, default=1, help="Worker batch size.") - parser.add_argument( - "--max_steps", - type=int, - help="Maximum total accumulation-steps to run.", - ) - parser.add_argument( - "--steps_this_run", - type=int, - help="Exit early at steps and not go to max_steps. -1 to mean no early exit.", - ) - parser.add_argument( - "--seed", - type=int, - default=12349, - help="Random seed. 
Worker seed is this value + worker rank.", - ) - parser.add_argument("--lr", type=float, help="Learning rate.") - parser.add_argument( - "--warmup_steps", - type=int, - help="Number of warmup accumulation-steps for learning rate .", - ) - parser.add_argument( - "--grad_accum_usteps", - type=int, - help="Gradient accumulation micro-steps (an accumulation-step has micro-steps.", - ) - parser.add_argument( - "--print_grad_norm", - default=False, - action="store_true", - help="Whether to print grad norm", - ) - parser.add_argument( - "--resume_ckpt", - action="store_true", - help="Resume from checkpoint at resume_step." - ) - parser.add_argument( - "--tensor_parallel_size", - default=8, - type=int, - help="Tensor parallel size" - ) - - args = parser.parse_args(sys.argv[1:]) - - if args.steps_this_run < 0: - args.steps_this_run = args.max_steps - - # Workaround for NaNs seen with transformers version >= 4.21.0 - # https://github.com/aws-neuron/aws-neuron-sdk/issues/593 - modeling_utils.get_parameter_dtype = lambda x: torch.bfloat16 - - # WORLD_SIZE is set by torchrun - if os.environ.get("WORLD_SIZE"): - dist.init_process_group("xla") - _mp_fn(0, args) - else: - xmp.spawn(_mp_fn, args=(args,)) diff --git a/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain.sh b/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain.sh deleted file mode 100644 index d166a3d..0000000 --- a/torch-neuronx/training/tp_dp_gpt_neox_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain/tp_dp_gpt_neox_6.9b_hf_pretrain.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash - -############################################# -# User defined parameters and env vars - -export NEURON_CC_FLAGS="--model-type=transformer -O1 --enable-saturate-infinity --cache_dir=~/neuron_compile_cache/" -export NEURON_FUSE_SOFTMAX=1 - -export XLA_DOWNCAST_BF16=1 -export NEURON_RT_STOCHASTIC_ROUNDING_EN=1 - -export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=3 - -# TP degree -TP_DEGREE=8 -# global batch size -GBS=256 -# micro batch size -MBS=1 -# number of steps to run -TOTAL_STEPS=1550 -# warmup steps -WARMUP_STEPS=15 -# learning rate -LR=1.2e-4 -# data path -DATA_PATH="~/examples_datasets/wikicorpus_gpt_neox_tokenized_2k" - -############################################# - -export NUM_NEURONCORES=32 -NODE_ID=0 -WORLD_SIZE=1 -DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES" -if [ ! -z "$SLURM_NTASKS" ]; then - WORLD_SIZE=$SLURM_NTASKS - NODE_ID=$SLURM_NODEID - MASTER_ADDRESS=(`scontrol show hostnames $SLURM_JOB_NODELIST`) - DISTRIBUTED_ARGS="--nproc_per_node $NUM_NEURONCORES --nnodes $WORLD_SIZE --node_rank $NODE_ID --master_addr $MASTER_ADDRESS --master_port 44000" - if [ $NODE_ID -eq 0 ]; then - echo "WORLD_SIZE=$WORLD_SIZE" - echo "NODE_ID=$NODE_ID" - echo "MASTER_ADDRESS=$MASTER_ADDRESS" - echo "DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS" - fi - export FI_EFA_USE_DEVICE_RDMA=1 - export FI_PROVIDER=efa -fi - -############################################# - -DP=$(($NUM_NEURONCORES * $WORLD_SIZE / $TP_DEGREE)) -ACC_STEPS=$(($GBS / $MBS / $DP)) - -if [ ! 
-z "$NEURON_EXTRACT_GRAPHS_ONLY" ]; then - STEPS_THIS_RUN=6 - OUTPUT_LOG=log_compile-$NODE_ID.log -else - STEPS_THIS_RUN=-1 - OUTPUT_LOG=log_exe-$NODE_ID.log -fi - -if [ $NODE_ID -eq 0 ]; then - echo TP_DEGREE=$TP_DEGREE - echo GBS=$GBS - echo MBS=$MBS - echo TOTAL_STEPS=$TOTAL_STEPS - echo WARMUP_STEPS=$WARMUP_STEPS - echo LR=$LR - echo DATA_PATH=$DATA_PATH - - echo DP=$DP - echo ACC_STEPS=$ACC_STEPS - echo STEPS_THIS_RUN=$STEPS_THIS_RUN - echo OUTPUT_LOG=$OUTPUT_LOG -fi - -torchrun $DISTRIBUTED_ARGS \ - tp_dp_gpt_neox_6.9b_hf_pretrain.py \ - --data_dir $DATA_PATH \ - --tensor_parallel_size $TP_DEGREE \ - --batch_size $MBS \ - --steps_this_run $STEPS_THIS_RUN \ - --max_steps $TOTAL_STEPS \ - --warmup_steps $WARMUP_STEPS \ - --lr $LR \ - --grad_accum_usteps $ACC_STEPS |& tee $OUTPUT_LOG diff --git a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/adamw_fp32_optim_params.py b/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/adamw_fp32_optim_params.py deleted file mode 100644 index 8f6d544..0000000 --- a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/adamw_fp32_optim_params.py +++ /dev/null @@ -1,148 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Modifications Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# AdamW adapted from HuggingFace, with high-precision optimizer states for BF16/FP32 training. -# source: https://github.com/huggingface/transformers/blob/main/src/transformers/optimization.py#L358 - -import os -import math -import warnings -from functools import partial -from typing import Callable, Iterable, Optional, Tuple, Union - -import torch -from torch import nn -from torch.optim import Optimizer -from transformers.utils import logging -from transformers.utils.versions import require_version - -class AdamW_FP32OptimParams(Optimizer): - """ - Implements Adam algorithm with weight decay fix as introduced in [Decoupled Weight Decay - Regularization](https://arxiv.org/abs/1711.05101), designed with high-precision optimizer states. - Parameters: - params (`Iterable[nn.parameter.Parameter]`): - Iterable of parameters to optimize or dictionaries defining parameter groups. - lr (`float`, *optional*, defaults to 1e-3): - The learning rate to use. - betas (`Tuple[float,float]`, *optional*, defaults to (0.9, 0.999)): - Adam's betas parameters (b1, b2). - eps (`float`, *optional*, defaults to 1e-6): - Adam's epsilon for numerical stability. - weight_decay (`float`, *optional*, defaults to 0): - Decoupled weight decay to apply. - correct_bias (`bool`, *optional*, defaults to `True`): - Whether or not to correct bias in Adam (for instance, in Bert TF repository they use `False`). - no_deprecation_warning (`bool`, *optional*, defaults to `False`): - A flag used to disable the deprecation warning (set to `True` to disable the warning). 
- """ - - def __init__( - self, - params: Iterable[nn.parameter.Parameter], - lr: float = 1e-3, - betas: Tuple[float, float] = (0.9, 0.999), - eps: float = 1e-6, - weight_decay: float = 0.0, - correct_bias: bool = True, - no_deprecation_warning: bool = True, - differentiable: bool = False, - ): - if not no_deprecation_warning: - warnings.warn( - "This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch" - " implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this" - " warning", - FutureWarning, - ) - require_version("torch>=1.5.0") # add_ with alpha - if lr < 0.0: - raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0") - if not 0.0 <= betas[0] < 1.0: - raise ValueError(f"Invalid beta parameter: {betas[0]} - should be in [0.0, 1.0)") - if not 0.0 <= betas[1] < 1.0: - raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)") - if not 0.0 <= eps: - raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0") - defaults = {"lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay, "correct_bias": correct_bias} - self.upcast_optim_states = os.environ.get('XLA_DOWNCAST_BF16', '0') == '1' - super().__init__(params, defaults) - - def step(self, closure: Callable = None): - """ - Performs a single optimization step. - Arguments: - closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group["params"]: - if p.grad is None: - continue - # Upcast grad to fp64 so that XLA_DOWNCAST_BF16=1 keeps grad operations in fp32 - if self.upcast_optim_states: - grad = p.grad.data.double() - else: - grad = p.grad.data - - if grad.is_sparse: - raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") - - state = self.state[p] - - # State initialization - if len(state) == 0: - state["step"] = 0 - # Use fp64 for exp_avg_* so that XLA_DOWNCAST_BF16=1 keeps them in fp32 - # Exponential moving average of gradient values - state["exp_avg"] = torch.zeros_like(grad) - # Exponential moving average of squared gradient values - state["exp_avg_sq"] = torch.zeros_like(grad) - - exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] - beta1, beta2 = group["betas"] - - state["step"] += 1 - - # Decay the first and second moment running average coefficient - # In-place operations to update the averages at the same time - exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1)) - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) - denom = exp_avg_sq.sqrt().add_(group["eps"]) - - step_size = group["lr"] - if group["correct_bias"]: # No bias correction for Bert - bias_correction1 = 1.0 - beta1 ** state["step"] - bias_correction2 = 1.0 - beta2 ** state["step"] - step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 - - p.data.addcdiv_(exp_avg, denom, value=-step_size) - - # Just adding the square of the weights to the loss function is *not* - # the correct way of using L2 regularization/weight decay with Adam, - # since that will interact with the m and v parameters in strange ways. - # - # Instead we want to decay the weights in a manner that doesn't interact - # with the m/v parameters. This is equivalent to adding the square - # of the weights to the loss with plain (non-momentum) SGD. 
- # Add weight decay at the end (fixed version) - if group["weight_decay"] > 0.0: - p.data.add_(p.data, alpha=(-group["lr"] * group["weight_decay"])) - - return loss diff --git a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/config.json b/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/config.json deleted file mode 100644 index 50125d8..0000000 --- a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/config.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 11008, - "max_position_embeddings": 2048, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 32, - "pad_token_id": 0, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "tie_word_embeddings": false, - "torch_dtype": "float16", - "transformers_version": "4.31.0", - "use_cache": true, - "vocab_size": 32000, - "sequence_parallel_enabled": false, - "selective_checkpoint_enabled": false - } - \ No newline at end of file diff --git a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/get_dataset.py b/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/get_dataset.py deleted file mode 100644 index b2e4937..0000000 --- a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/get_dataset.py +++ /dev/null @@ -1,67 +0,0 @@ -from datasets import load_dataset -from transformers import AutoTokenizer -from itertools import chain -import os - -dataset_name = "wikicorpus" -dataset_config_name = "raw_en" -save_path = "~/examples_datasets/wikicorpus_llama2_7B_tokenized_4k" -tokenizer_path = "~/examples/tp_zero1_llama2_7b_hf_pretrain" - -save_path = os.path.expanduser(save_path) -tokenizer_path = os.path.expanduser(tokenizer_path) -if not os.path.exists(save_path): - os.makedirs(save_path) - -block_size = 4096 - -raw_datasets = load_dataset(dataset_name, dataset_config_name) - -tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - -column_names = raw_datasets["train"].column_names -text_column_name = "text" if "text" in column_names else column_names[0] - -def tokenize_function(examples): - return tokenizer(examples[text_column_name]) - -tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - remove_columns=column_names, - load_from_cache_file=True, - desc="Running tokenizer on dataset", -) - -if block_size > tokenizer.model_max_length: - print("block_size > tokenizer.model_max_length") -block_size = min(block_size, tokenizer.model_max_length) - -# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. -def group_texts(examples): - # Concatenate all texts. - concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. - # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. 
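    # For example (hypothetical token ids), with block_size = 4:
    #   {"input_ids": [[1, 2, 3], [4, 5, 6, 7, 8, 9]]} is concatenated to [1..9],
    #   total_length is truncated to 8, yielding chunks [[1, 2, 3, 4], [5, 6, 7, 8]]
    #   and dropping the remainder token 9; "labels" is then set to a copy of "input_ids".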
- result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - result["labels"] = result["input_ids"].copy() - return result - -lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - load_from_cache_file=True, - desc=f"Grouping texts in chunks of {block_size}", -) - -train_dataset = lm_datasets["train"] -print(len(train_dataset)) - -train_dataset.save_to_disk(save_path) - diff --git a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/modeling_llama2_nxd.py b/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/modeling_llama2_nxd.py deleted file mode 100644 index 7d6757f..0000000 --- a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/modeling_llama2_nxd.py +++ /dev/null @@ -1,647 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch LLaMA model.""" -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings -from transformers.models.llama.configuration_llama import LlamaConfig - -from neuronx_distributed.parallel_layers.layers import ParallelEmbedding, ColumnParallelLinear, RowParallelLinear -from neuronx_distributed.parallel_layers.loss_functions import parallel_cross_entropy -from neuronx_distributed.parallel_layers.parallel_state import get_tensor_model_parallel_size, get_tensor_model_parallel_rank -import neuronx_distributed.parallel_layers.utils as neuronx_dist_utils -from neuronx_distributed.utils.model_utils import move_model_to_device -from neuronx_distributed.parallel_layers import mappings -import torch_xla.core.xla_model as xm - -from transformers.models.llama.modeling_llama import ( - LlamaRMSNorm, - LlamaRotaryEmbedding, - LlamaLinearScalingRotaryEmbedding, - LlamaMLP, - LlamaAttention, - LlamaDecoderLayer, - LlamaPreTrainedModel, - LlamaModel, - LlamaForCausalLM, - LlamaForSequenceClassification, - rotate_half, - apply_rotary_pos_emb, - repeat_kv, - LLAMA_START_DOCSTRING, - LLAMA_INPUTS_DOCSTRING, -) - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "LlamaConfig" - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def 
_make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - """ - Make causal mask used for bi-directional self-attention. - """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -class LlamaRMSNormNxD(LlamaRMSNorm): - def __init__(self, hidden_size, eps=1e-6, sequence_parallel_enabled=False): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__(hidden_size, eps=eps) - setattr(self.weight, "sequence_parallel_enabled", sequence_parallel_enabled) - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - - hidden_states = hidden_states.to(torch.double) - - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class LlamaMLPNxD(LlamaMLP): - def __init__(self, config): - nn.Module.__init__(self) - self.config = config - self.pretraining_tp = config.pretraining_tp - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.act_fn = ACT2FN[config.hidden_act] - - self.gate_up_proj = ColumnParallelLinear( - self.hidden_size, - 2 * self.intermediate_size, - stride=2, - bias=False, - gather_output=False, - sequence_parallel_enabled=self.config.sequence_parallel_enabled, - ) - self.down_proj = RowParallelLinear( - self.intermediate_size, - self.hidden_size, - bias=False, - input_is_parallel=True, - sequence_parallel_enabled=self.config.sequence_parallel_enabled, - ) - self.split_size = self.intermediate_size // get_tensor_model_parallel_size() - move_model_to_device(self, xm.xla_device()) - - def forward(self, x): - if self.pretraining_tp > 1: - slice = self.intermediate_size // self.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.pretraining_tp)] - down_proj = sum(down_proj) - else: - gate_proj, up_proj = self.gate_up_proj(x).split(self.split_size, dim=2) - down_proj = 
self.down_proj(self.act_fn(gate_proj) * up_proj) - - return down_proj - - -class LlamaAttentionNxD(LlamaAttention): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: LlamaConfig): - nn.Module.__init__(self) - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.pretraining_tp = config.pretraining_tp - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." - ) - self._init_rope() - - if self.num_heads == self.num_key_value_heads: - self.qkv_proj = ColumnParallelLinear( - self.hidden_size, - 3 * self.num_heads * self.head_dim, - stride=3, - bias=False, - gather_output=False, - sequence_parallel_enabled=self.config.sequence_parallel_enabled, - ) - self.split_size = self.num_heads * self.head_dim // get_tensor_model_parallel_size() - else: - self.q_proj = ColumnParallelLinear( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - gather_output=False, - sequence_parallel_enabled=self.config.sequence_parallel_enabled, - ) - self.k_proj = ColumnParallelLinear( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - sequence_parallel_enabled=self.config.sequence_parallel_enabled, - ) - self.v_proj = ColumnParallelLinear( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - sequence_parallel_enabled=self.config.sequence_parallel_enabled, - ) - self.o_proj = RowParallelLinear( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - sequence_parallel_enabled=self.config.sequence_parallel_enabled, - ) - self.num_heads = neuronx_dist_utils.divide(config.num_attention_heads, get_tensor_model_parallel_size()) - self.num_key_value_heads = neuronx_dist_utils.divide(config.num_key_value_heads, get_tensor_model_parallel_size()) - move_model_to_device(self, xm.xla_device()) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - if self.config.sequence_parallel_enabled: - q_len, bsz, _ = hidden_states.size() - q_len = q_len * get_tensor_model_parallel_size() - - if self.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.pretraining_tp - query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.pretraining_tp, dim=0) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = 
[F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - if self.num_heads == self.num_key_value_heads: - qkv_states = self.qkv_proj(hidden_states) - query_states, key_states, value_states = qkv_states.split(self.split_size, dim=2) - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - if self.config.sequence_parallel_enabled: - query_states = query_states.view(q_len, bsz, self.num_heads, self.head_dim).permute(1, 2, 0, 3) - key_states = key_states.view(q_len, bsz, self.num_key_value_heads, self.head_dim).permute(1, 2, 0, 3) - value_states = value_states.view(q_len, bsz, self.num_key_value_heads, self.head_dim).permute(1, 2, 0, 3) - else: - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - def core_attn(query_states, key_states, value_states): - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - causal_mask = torch.triu(torch.ones((1, 1, q_len, kv_seq_len), device='xla'), diagonal=1).bool() - attn_weights = attn_weights.masked_fill_(causal_mask, -10000.0) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.double).to(query_states.dtype) - - attn_output = torch.matmul(attn_weights, value_states) - return attn_output - - if self.config.selective_checkpoint_enabled: - attn_output = torch.utils.checkpoint.checkpoint(core_attn, query_states, key_states, value_states) - else: - attn_output = core_attn(query_states, key_states, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - if self.config.sequence_parallel_enabled: - attn_output = attn_output.permute(2, 0, 1, 3) - attn_output = attn_output.reshape(q_len, bsz, self.hidden_size // get_tensor_model_parallel_size()) - else: - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size // get_tensor_model_parallel_size()) - - - if self.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // 
self.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaDecoderLayerNxD(LlamaDecoderLayer): - def __init__(self, config: LlamaConfig): - nn.Module.__init__(self) - self.hidden_size = config.hidden_size - self.self_attn = LlamaAttentionNxD(config=config) - self.mlp = LlamaMLPNxD(config) - self.input_layernorm = LlamaRMSNormNxD(config.hidden_size, eps=config.rms_norm_eps, sequence_parallel_enabled=config.sequence_parallel_enabled) - self.post_attention_layernorm = LlamaRMSNormNxD(config.hidden_size, eps=config.rms_norm_eps, sequence_parallel_enabled=config.sequence_parallel_enabled) - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaModelNxD(LlamaModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - LlamaPreTrainedModel.__init__(self, config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = ParallelEmbedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([LlamaDecoderLayerNxD(config) for _ in range(config.num_hidden_layers)]) - self.norm = LlamaRMSNormNxD(config.hidden_size, eps=config.rms_norm_eps, sequence_parallel_enabled=config.sequence_parallel_enabled) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - pass - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif 
input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - # if attention_mask is None: - # attention_mask = torch.ones( - # (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device - # ) - # attention_mask = self._prepare_decoder_attention_mask( - # attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - # ) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - if self.config.sequence_parallel_enabled: - hidden_states = hidden_states.transpose(0, 1).contiguous() - hidden_states = mappings.scatter_to_sequence_parallel_region(hidden_states) - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - if self.config.sequence_parallel_enabled: - hidden_states = mappings.gather_from_sequence_parallel_region(hidden_states, to_model_parallel=False) - hidden_states = hidden_states.transpose(0, 1).contiguous() - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - 
hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaForCausalLMNxD(LlamaForCausalLM): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - LlamaPreTrainedModel.__init__(self, config) - self.model = LlamaModelNxD(config) - self.pretraining_tp = config.pretraining_tp - self.vocab_size = config.vocab_size - - self.lm_head = ColumnParallelLinear( - config.hidden_size, - config.vocab_size, - bias=False, - gather_output=False - ) - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
- ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - - logits = logits.double() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = parallel_cross_entropy - shift_logits = shift_logits.view(-1, shift_logits.size(-1)) - - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - loss = torch.mean(loss) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - diff --git a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/requirements.txt b/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/requirements.txt deleted file mode 100644 index d05b079..0000000 --- a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -transformers==4.31.0 -regex -tensorboard -datasets -sentencepiece diff --git a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.py b/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.py deleted file mode 100644 index 69578c3..0000000 --- a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.py +++ /dev/null @@ -1,733 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Modifications Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
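The causal-LM forward above shifts logits and labels by one position so that position t predicts token t+1. A minimal sketch of that shift, using plain F.cross_entropy as a stand-in for the parallel_cross_entropy it calls, with hypothetical shapes:

import torch
import torch.nn.functional as F

batch, seq, vocab = 2, 8, 32000
logits = torch.randn(batch, seq, vocab)
labels = torch.randint(0, vocab, (batch, seq))

# Drop the last logit and the first label so position t is scored against token t+1.
shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab)
shift_labels = labels[..., 1:].contiguous().view(-1)

loss = F.cross_entropy(shift_logits, shift_labels)  # stand-in for parallel_cross_entropy
print(loss)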
- -import os -import torch -import sys -import time -import argparse -import json -import queue -from typing import Any, Dict, List -from datetime import datetime, timezone -from collections import namedtuple -import torch_xla -import torch_xla.core.xla_model as xm -from torch.utils.data.dataloader import DataLoader -from torch.utils.data import DistributedSampler -import torch_xla.distributed.parallel_loader as pl -import torch.distributed as dist -import torch_xla.distributed.xla_multiprocessing as xmp -import torch_xla.distributed.xla_backend -import numpy as np -from transformers import ( - AdamW, - default_data_collator, - set_seed, - LlamaConfig, -) -from transformers.optimization import get_linear_schedule_with_warmup - -import copy -from torch.utils.tensorboard import SummaryWriter -import inspect -import requests -from neuronx_distributed.parallel_layers import parallel_state, layers, grads, checkpointing -from neuronx_distributed.utils.model_utils import move_model_to_device -from neuronx_distributed.parallel_layers.grads import bucket_allreduce_gradients -import datasets - -from neuronx_distributed.optimizer import NeuronZero1Optimizer -from adamw_fp32_optim_params import AdamW_FP32OptimParams -from modeling_llama2_nxd import LlamaForCausalLMNxD - -# For PT autocast. -torch.cuda.is_bf16_supported = lambda: True - -# Workaround for NaNs seen with transformers version >= 4.21.0 -# https://github.com/aws-neuron/aws-neuron-sdk/issues/593 -import transformers.modeling_utils as modeling_utils - -if os.environ.get("XLA_USE_BF16") or os.environ.get("XLA_DOWNCAST_BF16"): - modeling_utils.get_parameter_dtype = lambda x: torch.bfloat16 - -datetime_str = str(datetime.now()) -results = { - "inference_success": 1 -} - - -Metric = namedtuple("Metric", ["name", "value", "units", "additional_data"]) - - -class TrainingMetrics: - def __init__(self, json_file): - self.json_file = json_file - - def read_modify_write_file(self, data, key: str = "metrics") -> None: - """ - data (dict of training parameters or list of metrics): Data to update in the file. - key (str): the dictionary key under which data is to be recorded - """ - result_dict = {} - print(f"Writing data to the provided results file: {self.json_file}") - if os.path.exists(self.json_file): - with open(self.json_file) as json_file: - result_dict = json.loads(json_file.read()) or result_dict - print(f"Updating with {key} data: {data}") - if result_dict: - try: - # handle internal named entity if present - results = result_dict[next(iter(result_dict))] - except Exception: - results = result_dict - current = results.get(key) - if not current: - results[key] = data - else: - if isinstance(current, list): - current.extend(data) - elif isinstance(current, dict): - current.update(data) - else: - result_dict["results"] = {key: data} - with open(self.json_file, "w") as json_file: - json.dump(result_dict, json_file) - - def store_metrics(self, metrics: List[Metric]) -> None: - """ - Writes collected metrics to the file. - """ - data = [ - { - "MetricName": metric.name, - "MeasuredValue": metric.value, - "Units": metric.units, - "Timestamp": datetime.now(timezone.utc).isoformat(), - "AdditionalData": metric.additional_data, - } - for metric in metrics - ] - self.update(data=data, key="metrics") - - def store_parameters(self, parameters: Dict[str, Any]) -> None: - """ - Writes specified model and configuration parameters to the file. 
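        For example (hypothetical values), ``TrainingMetrics("results.json").store_parameters(
        {"Batch size": 1, "World size": 32})`` merges the dict under the ``"parameters"`` key
        of the JSON results file via ``read_modify_write_file`` above.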
- """ - self.update(data=parameters, key="parameters") - - def update(self, **kwargs: Any) -> None: - """ - Write specified data to the output file. - """ - self.read_modify_write_file(**kwargs) - - -class Throughput: - def __init__( - self, batch_size, world_size, grad_accum_usteps, moving_avg_window_size=10 - ): - self.seqs_per_iteration = batch_size * world_size * grad_accum_usteps - self.moving_avg_window_size = moving_avg_window_size - self.moving_avg_window = queue.Queue() - self.window_time = 0 - self.start_time = time.time() - - def get_throughput(self): - step_time = time.time() - self.start_time - self.start_time += step_time - self.window_time += step_time - self.moving_avg_window.put(step_time) - window_size = self.moving_avg_window.qsize() - if window_size > self.moving_avg_window_size: - self.window_time -= self.moving_avg_window.get() - window_size -= 1 - throughput = window_size * self.seqs_per_iteration / self.window_time - return throughput - - -class Logger: - def __init__(self, args, world_size, model_dtype): - xla = "torch_xla" in sys.modules - self.throughputs = [] - dtype_short = model_dtype.replace("torch.", "") - self.tb = SummaryWriter( - os.path.join( - args.output_dir, - f"neuron_tblogs_{time.strftime('%m%d%y_%H%M')}" - f"_{dtype_short}" - f"_w{world_size}" - f"_lr{args.lr}" - f"_bs{args.batch_size}" - f"_acc{args.grad_accum_usteps}" - f"_warmup{args.warmup_steps}" - f"_max{args.max_steps}" - f"_xla{xla}" - f"_{self.get_instance_type()}", - ) - ) - self.tb.add_text( - "script", "```\n" + inspect.getsource(sys.modules[__name__]) + "\n```", 0 - ) - self.golden_steploss = [] - golden = "golden_steploss.txt" - if os.path.exists(golden): - with open(golden, "r") as f: - self.golden_steploss = [float(i) for i in f] - print( - f"Read {len(self.golden_steploss)} golden step loss values from {golden}" - ) - - def get_instance_type(self): - try: - token = requests.put( - "http://169.254.169.254/latest/api/token", - headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"}, - ) - data = requests.get( - "http://169.254.169.254/latest/meta-data/instance-type", - headers={"X-aws-ec2-metadata-token": token.text}, - ) - return data.text - except: - return os.environ.get("HOSTNAME", "unknown") - - def log(self, epoch, step, step_loss, learning_rate, throughput, grad_norm=None): - time_now = time.asctime() - grad_norm_msg = f"grad-norm : {grad_norm}" if grad_norm else "" - print( - f"LOG {time_now} - ({epoch}, {step}) step_loss : {step_loss:.4f} " - f"learning_rate : {learning_rate:.2e} throughput : {throughput:.2f} " - f"{grad_norm_msg}", - flush=True, - ) - self.tb.add_scalar("step loss", step_loss, step) - self.tb.add_scalar("learning rate", learning_rate, step) - self.tb.add_scalar("throughput", throughput, step) - if grad_norm: - self.tb.add_scalar("grad-norm", grad_norm, step) - self.throughputs.append(throughput) - if not os.environ.get("NEURON_EXTRACT_GRAPHS_ONLY", None): - step_0start = step - 1 - if step_0start < len(self.golden_steploss) and step_0start >= 0: - np.testing.assert_allclose( - step_loss, self.golden_steploss[step_0start], rtol=2.3e-1 - ) - - -# Workaround because python functions are not picklable -class WorkerInitObj(object): - def __init__(self, seed): - self.seed = seed - - def __call__(self, id): - set_seed(self.seed) - -def create_pretraining_dataset( - data_dir, mini_batch_size, worker_init -): - train_data = datasets.load_from_disk(os.path.expanduser(data_dir)) - train_sampler = DistributedSampler( - train_data, - 
num_replicas=parallel_state.get_data_parallel_size(), - rank=parallel_state.get_data_parallel_rank(), - shuffle=False, - drop_last=True, - ) - train_dataloader = DataLoader( - train_data, - collate_fn=default_data_collator, - sampler=train_sampler, - batch_size=mini_batch_size, - num_workers=0, - worker_init_fn=worker_init, - drop_last=True, - pin_memory=True, - ) - return train_dataloader - -def get_model(flags): - model_path, seq_len = flags.model_path, flags.seq_len - config = LlamaConfig.from_pretrained(model_path) - config.use_cache = False - config.max_position_embeddings = max(config.max_position_embeddings, seq_len) - if flags.num_layers > 0: - config.num_hidden_layers = flags.num_layers - if flags.sequence_parallel_enabled: - config.sequence_parallel_enabled = True - if flags.selective_checkpoint_enabled: - config.selective_checkpoint_enabled = True - xm.master_print(config) - model = LlamaForCausalLMNxD(config) - xm.master_print(model) - return model - -def get_dtype(model) -> str: - """ - Reference: https://pytorch.org/xla/release/1.12/index.html#xla-tensors-and-bfloat16 - """ - if "XLA_USE_BF16" in os.environ: - return "torch.bfloat16" - if "XLA_DOWNCAST_BF16" in os.environ: - if "torch.float" in str(model.dtype): - return "torch.bfloat16" - if "torch.double" in str(model.dtype): - return "torch.float32" - return str(model.dtype) - -def allreduce_sequence_parallel_gradients(optimizer): - """ All-reduce layernorm parameters across model parallel nodes when sequence parallelism is used. - Modified from megatron-lm: - https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 - """ - from neuronx_distributed.parallel_layers.mappings import reduce_from_tensor_model_parallel_region - grads = [] - for param_group in optimizer.__getstate__()['param_groups']: - for group, params in param_group.items(): - if group == 'params': - for p in params: - if isinstance(p, torch.Tensor) and p.grad is not None: - sequence_parallel_param = getattr(p, 'sequence_parallel_enabled', False) - if sequence_parallel_param: - grads.append(p.grad.data) - for grad in grads: - # sum v.s. 
average: sum - reduce_from_tensor_model_parallel_region(grad) - -def train_llama(flags): - parallel_state.initialize_model_parallel(tensor_model_parallel_size=flags.tensor_parallel_size) - world_size = parallel_state.get_data_parallel_size() - is_root = xm.is_master_ordinal(local=False) - extract_graphs_only = os.environ.get("NEURON_EXTRACT_GRAPHS_ONLY", None) - set_seed(flags.seed) - worker_init = WorkerInitObj(flags.seed) - device = xm.xla_device() - - model = get_model(flags) - move_model_to_device(model, device) - model.train() - - model_dtype = get_dtype(model) - running_loss = torch.zeros(1, dtype=torch.double).to(device) - - param_optimizer = list(model.named_parameters()) - no_decay = ["bias", "LayerNorm"] # gamma/beta are in LayerNorm.weight - - optimizer_grouped_parameters = [ - { - "params": [ - p for n, p in param_optimizer if not any(nd in n for nd in no_decay) - ], - "weight_decay": 0.01, - }, - { - "params": [ - p for n, p in param_optimizer if any(nd in n for nd in no_decay) - ], - "weight_decay": 0.0, - }, - ] - - if flags.use_mix_precision: - optimizer_cls = AdamW_FP32OptimParams - else: - optimizer_cls = AdamW - - if flags.use_zero_1: - optimizer = NeuronZero1Optimizer( - optimizer_grouped_parameters, - optimizer_cls, - lr=flags.lr, - pin_layout=False, - sharding_groups=parallel_state.get_data_parallel_group(as_list=True), - grad_norm_groups=parallel_state.get_tensor_model_parallel_group(as_list=True), - ) - else: - optimizer = optimizer_cls(optimizer_grouped_parameters, flags.lr) - optimizer.zero_grad() - - if is_root: - if not os.path.exists(flags.output_dir): - os.makedirs(flags.output_dir, exist_ok=True) - if not extract_graphs_only: - logger = Logger(flags, world_size, model_dtype) - metric_writer = TrainingMetrics(flags.metrics_file) - throughput = Throughput( - flags.batch_size, world_size, flags.grad_accum_usteps - ) - print("--------TRAINING CONFIG----------") - print(flags) - print("--------MODEL CONFIG----------") - print(model.config) - print("---------------------------------") - metric_writer.store_parameters( - { - "Model": model.name_or_path, - "Model configuration": str(model.config), - "World size": xm.xrt_world_size(), - "Data parallel degree": world_size, - "Batch size": flags.batch_size, - "Total steps": flags.steps_this_run, - "Seed": flags.seed, - "Optimizer": str(optimizer), - "Data type": model_dtype, - "Gradient accumulation microsteps": flags.grad_accum_usteps, - "Warmup steps": flags.warmup_steps, - "Dataset": os.path.basename(os.path.normpath(flags.data_dir)), - "Environment variables": { - variable: value - for variable, value in os.environ.items() - if variable.startswith("NEURON") or variable.startswith("XLA") - }, - } - ) - - def train_loop_fn( - model, optimizer, train_loader, epoch, global_step, training_ustep, running_loss, use_zero_1 - ): - for _, data in enumerate(train_loader): - training_ustep += 1 - input_ids = data["input_ids"] - attention_mask = data["attention_mask"] - labels = data["labels"] - outputs = model( - input_ids=input_ids, - attention_mask=attention_mask, - labels=labels, - ) - loss = outputs.loss / flags.grad_accum_usteps - loss.backward() - running_loss += loss.detach() - - if training_ustep % flags.grad_accum_usteps == 0: - xm.mark_step() - # loss averaging - running_loss_div = running_loss / world_size - running_loss_reduced = xm.all_reduce( - xm.REDUCE_SUM, - running_loss_div, - groups=parallel_state.get_data_parallel_group(as_list=True), - ) - running_loss_reduced_detached = running_loss_reduced.detach() - 
running_loss.zero_() - - allreduce_sequence_parallel_gradients(optimizer) - if not use_zero_1: - # all-reduce and then clip. Order matters. - if parallel_state.get_data_parallel_size() > 1: - bucket_allreduce_gradients(xm._fetch_gradients(optimizer)) - max_grad_norm = 1.0 - grads.clip_grad_norm( - model.parameters(), max_grad_norm - ) # Gradient clipping is not in AdamW anymore - optimizer.step() - - with torch.no_grad(): - total_norm = torch.zeros(1, device=device) - if flags.print_grad_norm and is_root: - for p in model.parameters(): - param_norm_sq = torch.square(p.grad).sum() - total_norm += param_norm_sq - total_norm = torch.sqrt(total_norm) - - optimizer.zero_grad() - scheduler.step() - global_step += 1 - - def _print_logs(running_loss_reduced_detached, total_norm): - if is_root and not extract_graphs_only: - total_norm_cpu = None - if flags.print_grad_norm: - total_norm_cpu = total_norm.cpu().item() - # NOTE: The running_loss is the loss of the global_step - logger.log( - epoch, - global_step, - running_loss_reduced_detached.cpu().item(), - optimizer.param_groups[0]["lr"], - throughput.get_throughput(), - total_norm_cpu, - ) - - xm.add_step_closure( - _print_logs, (running_loss_reduced_detached, total_norm.detach()) - ) - if global_step >= flags.steps_this_run: - # NOTE: Prevent runtime "Call to recv failed : Broken pipe" issue - xm.mark_step() - break - - return ( - global_step, - training_ustep, - running_loss, - running_loss_reduced_detached.cpu().item(), - ) - - scheduler_state_dict = None - - if flags.resume_ckpt: - state_dict = checkpointing.load(flags.output_dir, model) - optimizer.load_state_dict(state_dict["optimizer"]) - global_step = state_dict["global_step"] - epoch = state_dict["epoch"] - scheduler_state_dict = state_dict["scheduler"] - else: - global_step = 0 - epoch = 0 - - train_start = time.time() - training_ustep = 0 - scheduler = get_linear_schedule_with_warmup( - optimizer, - num_warmup_steps=flags.warmup_steps, - num_training_steps=flags.max_steps, - last_epoch=epoch if scheduler_state_dict else -1, - ) - - if scheduler_state_dict: - scheduler.load_state_dict(scheduler_state_dict) - - assert os.path.exists( - os.path.expanduser(flags.data_dir) - ), "ERROR: Data directory {} doesn't exist!".format(flags.data_dir) - - mini_batch_size = flags.batch_size - train_dataloader = create_pretraining_dataset( - flags.data_dir, mini_batch_size, worker_init - ) - train_device_loader = pl.MpDeviceLoader(train_dataloader, device) - - while True: - xm.master_print( - "Epoch {} begin {}".format(epoch, time.asctime()), - flush=True, - ) - - global_step, training_ustep, running_loss, final_loss = train_loop_fn( - model, - optimizer, - train_device_loader, - epoch, - global_step, - training_ustep, - running_loss, - flags.use_zero_1, - ) - - if is_root and not extract_graphs_only: - final_time = time.time() - time_diff = final_time - train_start - print( - "Epoch {} step {} end {} loss {} perf {} seq/sec (at train microstep {} time {} from beginning time {})".format( - epoch, - global_step, - time.asctime(), - final_loss, - logger.throughputs[-1], - training_ustep, - final_time, - train_start, - ), - flush=True, - ) - additional_data = { - "Epoch": epoch, - "Global step": global_step, - "Microstep": training_ustep, - } - metric_data = [ - Metric("Loss", final_loss, "", additional_data), - Metric( - "Throughput", logger.throughputs[-1], "seq/s", additional_data - ), - ] - metric_writer.store_metrics(metric_data) - - if global_step >= flags.steps_this_run: - if is_root and not 
extract_graphs_only: - # record aggregate & final statistics in the metrics file - additional_data = { - "Epoch": epoch, - "Global step": global_step, - "Microstep": training_ustep, - } - average_throughput = round( - sum(logger.throughputs) / len(logger.throughputs), 4 - ) - metric_data = [ - Metric("Final loss", final_loss, "", additional_data), - Metric( - "Time to train", - round(time_diff / 60, 4), - "minutes", - additional_data, - ), - Metric( - "Average throughput", - average_throughput, - "seq/s", - additional_data, - ), - Metric( - "Peak throughput", - max(logger.throughputs), - "seq/s", - additional_data, - ), - ] - metric_writer.store_metrics(metric_data) - # TODO may incur HOST OOM - state_dict = { - "model": model.state_dict(), - "global_step": global_step, - "epoch": epoch, - "scheduler": scheduler.state_dict() - } - checkpointing.save(state_dict, flags.output_dir) - optimizer.save_sharded_state_dict(flags.output_dir) - return - - epoch += 1 - - -def _mp_fn(index, flags): - torch.set_default_tensor_type("torch.FloatTensor") - train_llama(flags) - xm.rendezvous("_mp_fn finished") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_path", - type=str, - help="Model weight and config path.", - ) - parser.add_argument( - "--data_dir", - type=str, - help="Pre-tokenized dataset directory.", - ) - parser.add_argument( - "--output_dir", - type=str, - default="./output", - help="Directory for checkpoints and logs.", - ) - parser.add_argument( - "--metrics_file", - type=str, - default="results.json", - help="training metrics results file", - ) - parser.add_argument("--batch_size", type=int, default=8, help="Worker batch size.") - parser.add_argument( - "--max_steps", - type=int, - help="Maximum total accumulation-steps to run.", - ) - parser.add_argument( - "--steps_this_run", - type=int, - help="Exit early at steps and not go to max_steps. -1 to mean no early exit.", - ) - parser.add_argument( - "--seed", - type=int, - default=12349, - help="Random seed. Worker seed is this value + worker rank.", - ) - parser.add_argument("--lr", type=float, default=4e-4, help="Learning rate.") - parser.add_argument( - "--warmup_steps", - type=int, - default=2000, - help="Number of warmup accumulation-steps for learning rate .", - ) - parser.add_argument( - "--grad_accum_usteps", - type=int, - default=64, - help="Gradient accumulation micro-steps (an accumulation-step has micro-steps.", - ) - parser.add_argument( - "--print_grad_norm", - default=False, - action="store_true", - help="Whether to print grad norm", - ) - parser.add_argument( - "--resume_ckpt", - action="store_true", - help="Resume from checkpoint at resume_step." - ) - parser.add_argument( - "--tensor_parallel_size", - default=2, - type=int, - help="Tensor parallel size" - ) - parser.add_argument( - "--seq_len", - default=2048, - type=int, - help="Sequence length" - ) - parser.add_argument( - "--use_mix_precision", action="store_true", help="Use mix precision." - ) - parser.add_argument( - "--use_zero_1", action="store_true", help="Use ZeRO-1." 
- ) - parser.add_argument( - "--num_layers", - type=int, - default=-1, - help="Override number of layers for this LLaMA model", - ) - parser.add_argument( - "--sequence_parallel_enabled", - default=False, - action="store_true", - help="Enable sequence parallel", - ) - parser.add_argument( - "--selective_checkpoint_enabled", - default=False, - action="store_true", - help="Enable selective checkpoint", - ) - - args = parser.parse_args(sys.argv[1:]) - - if args.steps_this_run < 0: - args.steps_this_run = args.max_steps - - os.environ["NEURON_RT_STOCHASTIC_ROUNDING_EN"] = "1" - if args.use_mix_precision: - os.environ["XLA_DOWNCAST_BF16"]="1" - else: - os.environ["XLA_USE_BF16"]="1" - - - # WORLD_SIZE is set by torchrun - if os.environ.get("WORLD_SIZE"): - dist.init_process_group("xla") - _mp_fn(0, args) - else: - xmp.spawn(_mp_fn, args=(args,)) diff --git a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.sh b/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.sh deleted file mode 100644 index a5aaecc..0000000 --- a/torch-neuronx/training/tp_zero1_llama2_7b_hf_pretrain/tp_zero1_llama2_7b_hf_pretrain.sh +++ /dev/null @@ -1,128 +0,0 @@ -#!/bin/bash - -############################################# -# User defined parameters and env vars - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) - -export NEURON_CC_FLAGS="--model-type=transformer --enable-experimental-O1 --enable-saturate-infinity" -export NEURON_FUSE_SOFTMAX=1 - -# Async Runtime -export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=3 - -# HOST OOM -export MALLOC_ARENA_MAX=64 - -# TP degree -TP_DEGREE=8 -# 0: bf16; 1: mixed precision -USE_MIX_PRECISION=1 -# 0: use pure DP; 1: use ZeRO-1 -USE_ZERO_1=1 -# global batch size -GBS=256 -# micro batch size -MBS=1 -# number of steps to run -TOTAL_STEPS=10000 -# warmup steps -WARMUP_STEPS=2000 -# learning rate -LR=2.0e-5 -# model path -MODEL_PATH=$SCRIPT_DIR -# data path -DATA_PATH="~/examples_datasets/wikicorpus_llama2_7B_tokenized_4k" -# sequence length -SEQ_LEN=4096 - -############################################# - -export NEURON_NUM_DEVICES=32 -NODE_ID=0 -WORLD_SIZE=1 -DISTRIBUTED_ARGS="--nproc_per_node $NEURON_NUM_DEVICES" -if [ ! 
-z "$SLURM_NTASKS" ]; then - WORLD_SIZE=$SLURM_NTASKS - NODE_ID=$SLURM_NODEID - MASTER_ADDRESS=(`scontrol show hostnames $SLURM_JOB_NODELIST`) - DISTRIBUTED_ARGS="--nproc_per_node $NEURON_NUM_DEVICES --nnodes $WORLD_SIZE --node_rank $NODE_ID --master_addr $MASTER_ADDRESS --master_port 44000" - if [ $NODE_ID -eq 0 ]; then - echo "WORLD_SIZE=$WORLD_SIZE" - echo "NODE_ID=$NODE_ID" - echo "MASTER_ADDRESS=$MASTER_ADDRESS" - echo "DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS" - fi - export FI_EFA_USE_DEVICE_RDMA=1 - export FI_PROVIDER=efa -fi - -echo "WORLD_SIZE=$WORLD_SIZE" -echo "NODE_ID=$NODE_ID" -echo "MASTER_ADDRESS=$MASTER_ADDRESS" - -sudo sysctl -w net.ipv4.ip_local_reserved_ports=44000,48620 - -export NEURON_RT_NUM_CORES=32 -export NEURON_NUM_DEVICES=$NEURON_RT_NUM_CORES -export TPU_NUM_DEVICES=$NEURON_RT_NUM_CORES -export TPU_CHIPS_PER_HOST_BOUNDS=$NEURON_RT_NUM_CORES -export NEURON_RT_ROOT_COMM_ID=localhost:48620 - -############################################# - -EXTRA_ARGS=" " -if [ $USE_MIX_PRECISION -gt 0 ]; then - EXTRA_ARGS+=" --use_mix_precision" -fi -if [ $USE_ZERO_1 -gt 0 ]; then - EXTRA_ARGS+=" --use_zero_1" -fi - -DP=$(($NEURON_RT_NUM_CORES * $WORLD_SIZE / $TP_DEGREE)) -ACC_STEPS=$(($GBS / $MBS / $DP)) - - -if [ $NEURON_EXTRACT_GRAPHS_ONLY -gt 0 ]; then - STEPS_THIS_RUN=2 - OUTPUT_LOG=log_compile-$NODE_ID.log -else - STEPS_THIS_RUN=-1 - OUTPUT_LOG=log_exe-$NODE_ID.log -fi - -echo TP_DEGREE=$TP_DEGREE -echo USE_MIX_PRECISION=$USE_MIX_PRECISION -echo USE_ZERO_1=$USE_ZERO_1 -echo GBS=$GBS -echo MBS=$MBS -echo TOTAL_STEPS=$TOTAL_STEPS -echo WARMUP_STEPS=$WARMUP_STEPS -echo LR=$LR -echo MODEL_PATH=$MODEL_PATH -echo DATA_PATH=$DATA_PATH -echo SEQ_LEN=$SEQ_LEN - -echo EXTRA_ARGS=$EXTRA_ARGS -echo DP=$DP -echo ACC_STEPS=$ACC_STEPS -echo STEPS_THIS_RUN=$STEPS_THIS_RUN -echo OUTPUT_LOG=$OUTPUT_LOG - -torchrun $DISTRIBUTED_ARGS \ - tp_zero1_llama2_7b_hf_pretrain.py \ - --model_path $MODEL_PATH \ - --data_dir $DATA_PATH \ - --tensor_parallel_size $TP_DEGREE \ - --batch_size $MBS \ - --steps_this_run $STEPS_THIS_RUN\ - --max_steps $TOTAL_STEPS \ - --warmup_steps $WARMUP_STEPS \ - --lr $LR \ - --grad_accum_usteps $ACC_STEPS \ - --seq_len $SEQ_LEN \ - --sequence_parallel_enabled \ - --selective_checkpoint_enabled \ - $EXTRA_ARGS |& tee $OUTPUT_LOG - diff --git a/torch-neuronx/training/unet_image_segmentation/unet.ipynb b/torch-neuronx/training/unet_image_segmentation/unet.ipynb index d191ea6..4261c5b 100644 --- a/torch-neuronx/training/unet_image_segmentation/unet.ipynb +++ b/torch-neuronx/training/unet_image_segmentation/unet.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# UNET training - Pytorch\n", + "# UNET training - Pytorch 2.1\n", "This notebook shows how to fine-tune a pretrained UNET PyTorch model with AWS Trainium (trn1 instances) using NeuronSDK.\\\n", "The model implementation is provided by milesial/Pytorch-UNet. 
\n", "\n", @@ -35,7 +35,7 @@ "outputs": [], "source": [ "#Install Neuron Compiler and Neuron/XLA packages\n", - "%pip install -U \"torchvision==0.14.1\" \"tensorboard==2.6\" \"protobuf<4\" \"timm\"\n", + "%pip install -U \"timm\" \"tensorboard\" torchvision==0.16.*\n", "%pip install -U \"Pillow\" \"glob2\" \"scikit-learn\" \n", "# use --force-reinstall if you're facing some issues while loading the modules\n", "# now restart the kernel again" @@ -125,7 +125,6 @@ " --image_dim {image_dim} \\\n", " --num_epochs 2 \\\n", " --batch_size {batch_size} \\\n", - " --do_eval \\\n", " --drop_last \\\n", " --data_dir {dataset_path} \\\n", " --lr {learning_rate}\"\"\"\n", diff --git a/torch-neuronx/training/zero1_gpt2/run_clm.sh b/torch-neuronx/training/zero1_gpt2/run_clm.sh index c426165..964737c 100644 --- a/torch-neuronx/training/zero1_gpt2/run_clm.sh +++ b/torch-neuronx/training/zero1_gpt2/run_clm.sh @@ -41,13 +41,13 @@ export NEURON_RT_STOCHASTIC_ROUNDING_EN=1 if [[ "BF16" == $TRAINING_PRECISION ]]; then echo "USING BF16 ONLY" export XLA_USE_BF16=1 - export NEURON_CC_FLAGS="--retry_failed_compilation --distribution-strategy FSDP --model-type transformer" + export NEURON_CC_FLAGS="--retry_failed_compilation --distribution-strategy llm-training --model-type transformer" elif [[ "MIXED" == $TRAINING_PRECISION ]]; then echo "USING MIXED PRECISION BF16 and FP32" - export NEURON_CC_FLAGS="--retry_failed_compilation --distribution-strategy FSDP --enable-mixed-precision-accumulation --model-type transformer --enable-experimental-spmd --internal-ccop-bucketing --internal-ccop-bucketing-allgather-size-in-bytes 62481600 --internal-ccop-bucketing-reducescatter-size-in-bytes 62481600 --internal-ccop-bucketing-allreduce-size-in-bytes 1 --tensorizer-options=\'--no-enable-tritium-loopfusion\'" + export NEURON_CC_FLAGS="--retry_failed_compilation --enable-mixed-precision-accumulation --distribution-strategy llm-training --model-type transformer" else echo "USING FP32 as default" - export NEURON_CC_FLAGS="--retry_failed_compilation --distribution-strategy FSDP --model-type transformer" + export NEURON_CC_FLAGS="--retry_failed_compilation --distribution-strategy llm-training --model-type transformer" fi NEURON_CC_FLAGS+=" --cache_dir=$HOME/neuron_cache/gpt_1p5B/`hostname`" diff --git a/torch-neuronx/training/zero1_gpt2/run_clm_no_trainer.py b/torch-neuronx/training/zero1_gpt2/run_clm_no_trainer.py index 56a1af0..094678f 100644 --- a/torch-neuronx/training/zero1_gpt2/run_clm_no_trainer.py +++ b/torch-neuronx/training/zero1_gpt2/run_clm_no_trainer.py @@ -81,6 +81,22 @@ from torch_xla.distributed.zero_redundancy_optimizer import ZeroRedundancyOptimizer from neuron_utils import * from accelerate.utils.imports import is_tpu_available + +# Work around `Check failed: tensor_data`` error in torch-neuronx 2.1 when using torch.utils.data.DataLoader with shuffle=True +import copy +import torch_xla.core.xla_model as xm +def mesh_reduce(tag, data, reduce_fn): + xm.rendezvous(tag) + xdatain = copy.deepcopy(data) + xdatain = xdatain.to("xla") + xdata = xm.all_gather(xdatain, pin_layout=False) + cpu_xdata = xdata.detach().to("cpu") + cpu_xdata_split = torch.split(cpu_xdata, xdatain.shape[0]) + xldata = [x for x in cpu_xdata_split] + xm.mark_step() + return reduce_fn(xldata) +xm.mesh_reduce = mesh_reduce + # we need to use the torch_xla checkpoint. 
Otherwise the some checkpointing patterns will be eliminated by the compiler common expression elimination torch.utils.checkpoint.checkpoint = torch_xla.utils.checkpoint.checkpoint @@ -534,7 +550,7 @@ def group_texts(examples): # DataLoaders creation: train_dataloader = DataLoader( - train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size + train_dataset, shuffle=(os.environ.get("NEURON_EXTRACT_GRAPHS_ONLY", None) == None), collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size ) eval_dataloader = DataLoader( eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size @@ -653,7 +669,6 @@ def group_texts(examples): xm.mark_step() optimizer_step_done_at_least_once=0 - torch_neuronx.xla_impl.ops.set_unload_prior_neuron_models_mode(True) running_loss = torch.zeros(1, ).to(device) for epoch in range(starting_epoch, args.num_train_epochs): model.train() @@ -672,7 +687,6 @@ def group_texts(examples): if optimizer_step_done_at_least_once < 2: optimizer_step_done_at_least_once+=1 if optimizer_step_done_at_least_once==2: - torch_neuronx.xla_impl.ops.set_unload_prior_neuron_models_mode(False) time.sleep(1) xm.rendezvous("Init Complete") diff --git a/torch-neuronx/transformers-neuronx/inference/codellama-13b-16k-sampling.ipynb b/torch-neuronx/transformers-neuronx/inference/codellama-13b-16k-sampling.ipynb new file mode 100644 index 0000000..79475f4 --- /dev/null +++ b/torch-neuronx/transformers-neuronx/inference/codellama-13b-16k-sampling.ipynb @@ -0,0 +1,303 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "59913016-f89e-4a0e-9afe-b3a06e9112d5", + "metadata": {}, + "source": [ + "# Run Hugging Face `codellama/CodeLlama-13b-hf` autoregressive sampling on Inf2 & Trn1" + ] + }, + { + "cell_type": "markdown", + "id": "f8454655-ec27-45e3-8da7-f82b744321ee", + "metadata": {}, + "source": [ + "In this example we compile and deploy the Hugging Face [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) model for tensor parallel inference on Neuron using the `transformers-neuronx` package. We use a sequence length of 16k.\n", + "\n", + "The example has the following main sections:\n", + "1. Set up the Jupyter Notebook\n", + "1. Install dependencies\n", + "1. Download the model\n", + "1. Construct the model|\n", + "1. Split the model `state_dict` into multiple files\n", + "1. Perform autoregressive sampling using tensor parallelism\n", + "\n", + "This Jupyter Notebook can be run on an Inf2 instance (`inf2.48xlarge`) or Trn1 instance (`trn1.32xlarge`)." + ] + }, + { + "cell_type": "markdown", + "id": "af2b7693-2950-41fc-a038-17cba44bf003", + "metadata": {}, + "source": [ + "## Set up the Jupyter Notebook" + ] + }, + { + "cell_type": "markdown", + "id": "c47ef383-0dea-4423-8c38-29c73927fd78", + "metadata": {}, + "source": [ + "The following steps set up Jupyter Notebook and launch this tutorial:\n", + "1. Clone the [AWS Neuron Samples](https://github.com/aws-neuron/aws-neuron-samples) repo to your instance using\n", + "```\n", + "git clone https://github.com/aws-neuron/aws-neuron-samples.git\n", + "```\n", + "2. Navigate to the `transformers-neuronx` inference samples folder\n", + "```\n", + "cd aws-neuron-samples/torch-neuronx/transformers-neuronx/inference\n", + "```\n", + "3. 
Follow the instructions in [Jupyter Notebook QuickStart](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/notebook/setup-jupyter-notebook-steps-troubleshooting.html) to run Jupyter Notebook on your instance.\n", + "4. Locate this tutorial in your Jupyter Notebook session (`codellama-13b-16k-sampling.ipynb`) and launch it. Follow the rest of the instructions in this tutorial. " + ] + }, + { + "cell_type": "markdown", + "id": "a727963e-8178-4d2a-a5cd-a4f2bf00197e", + "metadata": {}, + "source": [ + "## Install Dependencies\n", + "This tutorial requires the following pip packages:\n", + "\n", + " - `torch-neuronx`\n", + " - `neuronx-cc`\n", + " - `sentencepiece`\n", + " - `transformers`\n", + " - `transformers-neuronx`\n", + "\n", + "\n", + "Most of these packages will be installed when configuring your environment using the [torch-neuronx inference setup guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/torch-neuronx.html#setup-torch-neuronx). The additional dependencies must be installed here:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c4899b2-39b2-4309-b7df-48fe74b56eb2", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install transformers-neuronx sentencepiece" + ] + }, + { + "cell_type": "markdown", + "id": "14400e26-2058-44b0-b680-b1cee57203aa", + "metadata": {}, + "source": [ + "## Download and construct the model" + ] + }, + { + "cell_type": "markdown", + "id": "5e233a69-5658-4180-8f6c-91f377a01001", + "metadata": {}, + "source": [ + "We download and construct the model checkpoint using the `from_pretrained` function simply using the Huggingface model name.\n" + ] + }, + { + "cell_type": "markdown", + "id": "b06fb496-f8c6-4222-b5ad-39b3e0bc0e22", + "metadata": {}, + "source": [ + "## Construct the model" + ] + }, + { + "cell_type": "markdown", + "id": "c2669028-8cf1-49f6-8e93-f6ceb39588fd", + "metadata": {}, + "source": [ + "After downloading the model and converting it to the Hugging Face format we construct the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ea21739-a65e-4a5c-9a10-7f963a99a72a", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import LlamaForCausalLM\n", + "\n", + "model = LlamaForCausalLM.from_pretrained('codellama/CodeLlama-13b-hf', low_cpu_mem_usage=True)" + ] + }, + { + "cell_type": "markdown", + "id": "92b76098-172a-472a-a126-f0ef7606c77f", + "metadata": {}, + "source": [ + "## Split the model state_dict into multiple files" + ] + }, + { + "cell_type": "markdown", + "id": "34ef7157-da52-4a9a-9839-6394682d39ca", + "metadata": {}, + "source": [ + "For the sake of reducing host memory usage, it is recommended to save the model `state_dict` as\n", + "multiple files, as opposed to one monolithic file given by `torch.save`. This \"split-format\"\n", + "`state_dict` can be created using the `save_pretrained_split` function. With this checkpoint format,\n", + "the Neuron model loader can load parameters to the Neuron device high-bandwidth memory (HBM) directly\n", + "by keeping at most one layer of model parameters in the CPU main memory." 
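To make the effect of the split-format checkpoint concrete, here is a minimal sketch for inspecting the per-layer shard files. It assumes the `./CodeLlama-13b-split` directory produced by `save_pretrained_split` in the next cell; the exact file layout can vary between `transformers-neuronx` versions.

```
import os

# Minimal sketch: list the shard files in a split-format checkpoint directory.
# Each file holds roughly one layer's parameters, which is what allows the Neuron
# loader to stream weights into device HBM without materializing the full state_dict.
split_dir = './CodeLlama-13b-split'   # created by save_pretrained_split in the next cell
if os.path.isdir(split_dir):
    for root, _, files in os.walk(split_dir):
        for name in sorted(files):
            path = os.path.join(root, name)
            print(f"{path}: {os.path.getsize(path) / 1e6:.1f} MB")
```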
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9226f998-08a2-4c42-a8b7-75fe3626c587", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers_neuronx.module import save_pretrained_split\n", + "\n", + "save_pretrained_split(model, './CodeLlama-13b-split')" + ] + }, + { + "cell_type": "markdown", + "id": "0e1ededb-e0d6-4c1d-aac8-bc3d29bd6ebe", + "metadata": {}, + "source": [ + "## Perform autoregressive sampling using tensor parallelism" + ] + }, + { + "cell_type": "markdown", + "id": "f1a87b9f-2948-4db9-946f-b618533f03a7", + "metadata": {}, + "source": [ + "Now we have all of the necessary files for running `codellama/CodeLlama-13b-hf` autoregressive sampling. \n", + "\n", + "The memory required to host any model can be computed with:\n", + "```\n", + "total memory = bytes per parameter * number of parameters\n", + "```\n", + "When using `float16` casted weights for a 13 billion parameter model, this works out to `2 * 13B` or ~26GB of weights. Each NeuronCore has 16GB of memory which means that a 26GB model cannot fit on a single NeuronCore. In reality, the total space required is often greater than just the number of parameters due to caching attention layer projections (KV caching). This caching mechanism grows memory allocations linearly with sequence length and batch size.\n", + "\n", + "To get very large language models to fit on Inf2 & Trn1, tensor parallelism is used to split weights, data, and compute across multiple NeuronCores. The number of NeuronCores that the weights are split across can be controlled by setting the `tp_degree` parameter. This parallelism degree must be chosen to ensure that the memory usage per NeuronCore will be less than the physical 16GB limit. When configuring tensor parallelism, the memory per NeuronCore can be computed with:\n", + "\n", + "```\n", + "memory per core = (bytes per parameter * number of parameters) / tp_degree\n", + "```\n", + "\n", + "This can be used to compute the minimum instance sizing by ensuring that the value selected for `tp_degree` results in less than 16GB allocated per NeuronCore.\n", + "\n", + "Note that increasing the `tp_degree` beyond the minimum requirement almost always results in a faster model. Increasing the tensor parallelism degree improves memory bandwidth which improves model performance. To optimize performance it's recommended to use the highest tensor parallelism degree that is supported by the instance. In this sample we use tensor parallelism degree 32 to optimize performance on `trn1.32xlarge`, but this should be changed to 24 if you are using a `inf2.48xlarge`. \n", + "\n", + "We will use the Neuron `LlamaForSampling` class to implement tensor parallelism for the CodeLlama model. We supply the n_positions as 16384 and context length estimates to precompile various possible prompt lengths. Tensor parallelism is enabled through the argument `tp_degree=32`. We enable `float16` casting with the `amp='f16'` flag. The model computational graph is compiled by `neuronx-cc` for optimized inference on Neuron." 
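As a quick check on the sizing guidance above, here is a small, illustrative calculation of the per-NeuronCore weight memory for a 13-billion-parameter model in `float16` at the two tensor parallelism degrees mentioned (32 on `trn1.32xlarge`, 24 on `inf2.48xlarge`). It deliberately ignores the KV cache, which adds memory that grows with sequence length and batch size.

```
# Illustrative sizing math only; the KV cache and runtime overheads are not included.
BYTES_PER_PARAM = 2        # float16
NUM_PARAMS = 13e9          # CodeLlama-13b
HBM_PER_CORE_GB = 16       # memory available per NeuronCore

for tp_degree in (24, 32):
    weights_gb = BYTES_PER_PARAM * NUM_PARAMS / tp_degree / 1e9
    fits = "fits" if weights_gb < HBM_PER_CORE_GB else "does not fit"
    print(f"tp_degree={tp_degree}: ~{weights_gb:.2f} GB of weights per core ({fits} in {HBM_PER_CORE_GB} GB)")
```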
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc93079d-7d17-48fd-bf9d-7176bc061a47", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import torch\n", + "from transformers import AutoTokenizer\n", + "from transformers_neuronx import LlamaForSampling\n", + "import requests\n", + "import os\n", + "# Compiler flag -O1 is a workaround for “Too many instructions after unroll” in SDK 2.14\n", + "# It can also help with faster compilation\n", + "os.environ['NEURON_CC_FLAGS'] = '-O1'\n", + "\n", + "# we supply a list of context_length_estimate that allows us to get low context encoding latency even for \n", + "# larger prompts than 8192 (largest context_length_estimate by default is 1/2 of n_positions input)\n", + "context_length_estimate = [64, 128, 256, 512, 1024, 2048, 4096, 8192, 12288, 16384]\n", + "\n", + "# load codellama/CodeLlama-13b-hf to the NeuronCores with 32-way tensor parallelism and run compilation\n", + "# we pass n_positions as 16384 so that the model supports 16k sequence length, and pass the context_length_estimate\n", + "# list generated above.\n", + "neuron_model = LlamaForSampling.from_pretrained('./CodeLlama-13b-split', n_positions=16384, \\\n", + " context_length_estimate=context_length_estimate, \\\n", + " batch_size=1, tp_degree=32, amp='f16')\n", + "neuron_model.to_neuron()\n", + "\n", + "# construct a tokenizer and encode prompt text (prompt is loaded from a library and appended with instruction to write a function)\n", + "tokenizer = AutoTokenizer.from_pretrained('codellama/CodeLlama-13b-hf')\n", + "prompt = requests.get(\"https://raw.githubusercontent.com/kedartatwawadi/stanford_compression_library/e2fca703ac812331a277644ecc4ae5cfef160ab3/scl/compressors/lz77_sliding_window.py\").text\n", + "prompt += \"\\n\\n# Function to load binary data from user-provided file and compress it with LZ77 and write output to file\\n\"\n", + "input_ids = tokenizer.encode(prompt, return_tensors=\"pt\") \n", + "num_input_tokens = len(input_ids[0]) # over 11k tokens\n", + "print(f\"num_input_tokens: {num_input_tokens}\")\n", + "\n", + "# run inference with top-k sampling\n", + "with torch.inference_mode():\n", + " start = time.time()\n", + " generated_sequences = neuron_model.sample(input_ids, sequence_length=16384, top_k=50)\n", + " elapsed = time.time() - start\n", + "\n", + "# display the new generated tokens\n", + "generated_sequences = [tokenizer.decode(seq[num_input_tokens:]) for seq in generated_sequences]\n", + "print(f'generated sequence {generated_sequences[0]} in {elapsed} seconds')" + ] + }, + { + "cell_type": "markdown", + "id": "94ac4991-7606-4c2f-90af-230998b0de20", + "metadata": {}, + "source": [ + "## Save and load the compiled model" + ] + }, + { + "cell_type": "markdown", + "id": "be6a4ba9-40fd-4544-81ab-9fd249f22e4d", + "metadata": {}, + "source": [ + "The ```save``` and ```load``` functions can be used to save and load compiled model artifacts respectively. Loading compiled model artifacts from a provided directory will avoid model recompilation." 
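Because the compiled artifacts are ordinary files, they can also be packaged and moved to another Neuron instance of the same type to avoid recompiling there. The sketch below assumes the `./neuron_artifacts` directory created by the next cell; the archive name and transfer mechanism are arbitrary choices for illustration.

```
import shutil

# Package the compiled artifacts so they can be copied to another Neuron instance.
# (Assumes ./neuron_artifacts exists, i.e. neuron_model.save() has already run.)
archive = shutil.make_archive('codellama_13b_neuron_artifacts', 'gztar', './neuron_artifacts')
print(f"Created {archive}; extract it on the target instance before calling neuron_model.load().")
```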
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07176c29-b30b-4d16-8291-3bd0142dc42d", + "metadata": {}, + "outputs": [], + "source": [ + "neuron_model.save('./neuron_artifacts') # can be copied and used on a different neuron instance\n", + "del neuron_model\n", + "neuron_model = LlamaForSampling.from_pretrained('./CodeLlama-13b-split', n_positions=16384, \\\n", + " context_length_estimate=context_length_estimate, \\\n", + " batch_size=1, tp_degree=32, amp='f16')\n", + "neuron_model.load('neuron_artifacts') # Load the compiled Neuron artifacts\n", + "neuron_model.to_neuron() # will skip compile\n", + "\n", + "with torch.inference_mode():\n", + " start = time.time()\n", + " generated_sequences = neuron_model.sample(input_ids, sequence_length=16384, top_k=50)\n", + " elapsed = time.time() - start\n", + "\n", + "generated_sequences = [tokenizer.decode(seq[num_input_tokens:]) for seq in generated_sequences]\n", + "print(f'generated sequence {generated_sequences[0]} in {elapsed} seconds')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (torch-neuronx)", + "language": "python", + "name": "aws_neuron_venv_pytorch" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/torch-neuronx/transformers-neuronx/inference/llama-70b-sampling.ipynb b/torch-neuronx/transformers-neuronx/inference/llama-70b-sampling.ipynb index 3ec5ab9..11ae18a 100644 --- a/torch-neuronx/transformers-neuronx/inference/llama-70b-sampling.ipynb +++ b/torch-neuronx/transformers-neuronx/inference/llama-70b-sampling.ipynb @@ -19,8 +19,6 @@ "1. Set up the Jupyter Notebook\n", "1. Install dependencies\n", "1. Download the model\n", - "1. Construct the model|\n", - "1. Split the model `state_dict` into multiple files\n", "1. Perform autoregressive sampling using tensor parallelism\n", "\n", "This Jupyter Notebook should be run on an Inf2 instance (`inf2.48xlarge`). To run on a larger Trn1 instance (`trn1.32xlarge`) will require changing the `tp_degree` specified in compilation section." @@ -100,7 +98,7 @@ "After gaining access to the model checkpoints, you should be able to use the already converted checkpoints. Otherwise, if you are converting your own model, feel free to use the [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py). 
The script can be called with the following (example) command:\n", "```\n", "python src/transformers/models/llama/convert_llama_weights_to_hf.py \\\n", - " --input_dir /path/to/downloaded/llama/weights --model_size 70Bf --output_dir ./Llama-2-70b\n", + " --input_dir /path/to/downloaded/llama/weights --model_size 70Bf --output_dir ./Llama-2-70b --safe_serialization\n", "```\n", "\n", "Note: For the purposes of this sample we assume you have saved the Llama-2-70b model in a directory called `Llama-2-70b` with the following format:\n", @@ -108,22 +106,22 @@ "Llama-2-70b/\n", "├── config.json\n", "├── generation_config.json\n", - "├── pytorch_model-00001-of-00015.bin\n", - "├── pytorch_model-00002-of-00015.bin\n", - "├── pytorch_model-00003-of-00015.bin\n", - "├── pytorch_model-00004-of-00015.bin\n", - "├── pytorch_model-00005-of-00015.bin\n", - "├── pytorch_model-00006-of-00015.bin\n", - "├── pytorch_model-00007-of-00015.bin\n", - "├── pytorch_model-00008-of-00015.bin\n", - "├── pytorch_model-00009-of-00015.bin\n", - "├── pytorch_model-00010-of-00015.bin\n", - "├── pytorch_model-00011-of-00015.bin\n", - "├── pytorch_model-00012-of-00015.bin\n", - "├── pytorch_model-00013-of-00015.bin\n", - "├── pytorch_model-00014-of-00015.bin\n", - "├── pytorch_model-00015-of-00015.bin\n", - "├── pytorch_model.bin.index.json\n", + "├── model-00001-of-00015.safetensors\n", + "├── model-00002-of-00015.safetensors\n", + "├── model-00003-of-00015.safetensors\n", + "├── model-00004-of-00015.safetensors\n", + "├── model-00005-of-00015.safetensors\n", + "├── model-00006-of-00015.safetensors\n", + "├── model-00007-of-00015.safetensors\n", + "├── model-00008-of-00015.safetensors\n", + "├── model-00009-of-00015.safetensors\n", + "├── model-00010-of-00015.safetensors\n", + "├── model-00011-of-00015.safetensors\n", + "├── model-00012-of-00015.safetensors\n", + "├── model-00013-of-00015.safetensors\n", + "├── model-00014-of-00015.safetensors\n", + "├── model-00015-of-00015.safetensors\n", + "├── model.safetensors.index.json\n", "├── special_tokens_map.json\n", "├── tokenizer.json\n", "├── tokenizer.model\n", @@ -131,67 +129,6 @@ "```" ] }, - { - "cell_type": "markdown", - "id": "b06fb496-f8c6-4222-b5ad-39b3e0bc0e22", - "metadata": {}, - "source": [ - "## Construct the model" - ] - }, - { - "cell_type": "markdown", - "id": "c2669028-8cf1-49f6-8e93-f6ceb39588fd", - "metadata": {}, - "source": [ - "After downloading the model and converting it to the Hugging Face format we construct the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ea21739-a65e-4a5c-9a10-7f963a99a72a", - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import LlamaForCausalLM\n", - "\n", - "model = LlamaForCausalLM.from_pretrained('Llama-2-70b')" - ] - }, - { - "cell_type": "markdown", - "id": "92b76098-172a-472a-a126-f0ef7606c77f", - "metadata": {}, - "source": [ - "## Split the model state_dict into multiple files" - ] - }, - { - "cell_type": "markdown", - "id": "34ef7157-da52-4a9a-9839-6394682d39ca", - "metadata": {}, - "source": [ - "For the sake of reducing host memory usage, it is recommended to save the model `state_dict` as\n", - "multiple files, as opposed to one monolithic file given by `torch.save`. This \"split-format\"\n", - "`state_dict` can be created using the `save_pretrained_split` function. 
With this checkpoint format,\n", - "the Neuron model loader can load parameters to the Neuron device high-bandwidth memory (HBM) directly\n", - "by keeping at most one layer of model parameters in the CPU main memory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9226f998-08a2-4c42-a8b7-75fe3626c587", - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "from transformers_neuronx.module import save_pretrained_split\n", - "\n", - "save_pretrained_split(model, './Llama-2-70b-split')" - ] - }, { "cell_type": "markdown", "id": "0e1ededb-e0d6-4c1d-aac8-bc3d29bd6ebe", @@ -239,13 +176,13 @@ "import time\n", "import torch\n", "from transformers import AutoTokenizer\n", - "from transformers_neuronx.llama.model import LlamaForSampling\n", + "from transformers_neuronx import LlamaForSampling\n", "\n", "os.environ['NEURON_CC_FLAGS'] = '--enable-mixed-precision-accumulation'\n", "\n", "# Load meta-llama/Llama-2-70b to the NeuronCores with 8-way tensor parallelism and run compilation\n", "neuron_model = LlamaForSampling.from_pretrained(\n", - " './Llama-2-70b-split', # Should reference the split checkpoint produced by \"save_pretrained_split\"\n", + " 'Llama-2-70b', # The reference to the safetensors checkpoint folder\n", " batch_size=1, # Batch size must be determined prior to inference time.\n", " tp_degree=24, # Controls the number of NeuronCores to execute on. Change to 32 for trn1.32xlarge\n", " amp='f16', # This automatically casts the weights to the specified dtype.\n", diff --git a/torch-neuronx/transformers-neuronx/inference/meta-llama-2-13b-sampling.ipynb b/torch-neuronx/transformers-neuronx/inference/meta-llama-2-13b-sampling.ipynb index 41f60d9..b7a45e7 100644 --- a/torch-neuronx/transformers-neuronx/inference/meta-llama-2-13b-sampling.ipynb +++ b/torch-neuronx/transformers-neuronx/inference/meta-llama-2-13b-sampling.ipynb @@ -19,8 +19,6 @@ "1. Set up the Jupyter Notebook\n", "1. Install dependencies\n", "1. Download the model\n", - "1. Construct the model|\n", - "1. Split the model `state_dict` into multiple files\n", "1. Perform autoregressive sampling using tensor parallelism\n", "\n", "This Jupyter Notebook can be run on an Inf2 instance (`inf2.48xlarge`) or Trn1 instance (`trn1.32xlarge`)." @@ -100,7 +98,7 @@ "After gaining access to the model checkpoints, you should be able to use the already converted checkpoints. Otherwise, if you are converting your own model, feel free to use the [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py). 
The script can be called with the following (example) command:\n", "```\n", "python src/transformers/models/llama/convert_llama_weights_to_hf.py \\\n", - " --input_dir /path/to/downloaded/llama/weights --model_size 13Bf --output_dir ./Llama-2-13b\n", + " --input_dir /path/to/downloaded/llama/weights --model_size 13Bf --output_dir ./Llama-2-13b --safe_serialization\n", "```\n", "\n", "Note: For the purposes of this sample we assume you have saved the Llama-2-13b model in a directory called `Llama-2-13b` with the following format:\n", @@ -108,10 +106,10 @@ "Llama-2-13b/\n", "├── config.json\n", "├── generation_config.json\n", - "├── pytorch_model-00001-of-00003.bin\n", - "├── pytorch_model-00002-of-00003.bin\n", - "├── pytorch_model-00003-of-00003.bin\n", - "├── pytorch_model.bin.index.json\n", + "├── model-00001-of-00003.safetensors\n", + "├── model-00002-of-00003.safetensors\n", + "├── model-00003-of-00003.safetensors\n", + "├── model.safetensors.index.json\n", "├── special_tokens_map.json\n", "├── tokenizer.json\n", "├── tokenizer.model\n", @@ -119,67 +117,6 @@ "```" ] }, - { - "cell_type": "markdown", - "id": "b06fb496-f8c6-4222-b5ad-39b3e0bc0e22", - "metadata": {}, - "source": [ - "## Construct the model" - ] - }, - { - "cell_type": "markdown", - "id": "c2669028-8cf1-49f6-8e93-f6ceb39588fd", - "metadata": {}, - "source": [ - "After downloading the model and converting it to the Hugging Face format we construct the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ea21739-a65e-4a5c-9a10-7f963a99a72a", - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import LlamaForCausalLM\n", - "\n", - "model = LlamaForCausalLM.from_pretrained('Llama-2-13b')" - ] - }, - { - "cell_type": "markdown", - "id": "92b76098-172a-472a-a126-f0ef7606c77f", - "metadata": {}, - "source": [ - "## Split the model state_dict into multiple files" - ] - }, - { - "cell_type": "markdown", - "id": "34ef7157-da52-4a9a-9839-6394682d39ca", - "metadata": {}, - "source": [ - "For the sake of reducing host memory usage, it is recommended to save the model `state_dict` as\n", - "multiple files, as opposed to one monolithic file given by `torch.save`. This \"split-format\"\n", - "`state_dict` can be created using the `save_pretrained_split` function. With this checkpoint format,\n", - "the Neuron model loader can load parameters to the Neuron device high-bandwidth memory (HBM) directly\n", - "by keeping at most one layer of model parameters in the CPU main memory." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9226f998-08a2-4c42-a8b7-75fe3626c587", - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "from transformers_neuronx.module import save_pretrained_split\n", - "\n", - "save_pretrained_split(model, './Llama-2-13b-split')" - ] - }, { "cell_type": "markdown", "id": "0e1ededb-e0d6-4c1d-aac8-bc3d29bd6ebe", @@ -224,14 +161,10 @@ "import time\n", "import torch\n", "from transformers import AutoTokenizer\n", - "from transformers_neuronx.llama.model import LlamaForSampling\n", - "\n", - "import os\n", - "# Compiler flag -O1 is a workaround for “Too many instructions after unroll” in SDK 2.14\n", - "# os.environ['NEURON_CC_FLAGS'] = '-O1'\n", + "from transformers_neuronx import LlamaForSampling\n", "\n", "# load meta-llama/Llama-2-13b to the NeuronCores with 24-way tensor parallelism and run compilation\n", - "neuron_model = LlamaForSampling.from_pretrained('./Llama-2-13b-split', batch_size=1, tp_degree=24, amp='f16')\n", + "neuron_model = LlamaForSampling.from_pretrained('Llama-2-13b', batch_size=1, tp_degree=24, amp='f16')\n", "neuron_model.to_neuron()\n", "\n", "# construct a tokenizer and encode prompt text\n", @@ -274,7 +207,7 @@ "source": [ "neuron_model.save('./neuron_artifacts') # can be copied and used on a different neuron instance\n", "del neuron_model\n", - "neuron_model = LlamaForSampling.from_pretrained('./Llama-2-13b-split', batch_size=1, tp_degree=24, amp='f16')\n", + "neuron_model = LlamaForSampling.from_pretrained('Llama-2-13b', batch_size=1, tp_degree=24, amp='f16')\n", "neuron_model.load('neuron_artifacts') # Load the compiled Neuron artifacts\n", "neuron_model.to_neuron() # will skip compile\n", "\n", @@ -289,9 +222,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (torch-neuronx)", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "aws_neuron_venv_pytorch" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -303,7 +236,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.8.18" } }, "nbformat": 4, diff --git a/torch-neuronx/transformers-neuronx/inference/mistralai-Mistral-7b-Instruct-v0.2.ipynb b/torch-neuronx/transformers-neuronx/inference/mistralai-Mistral-7b-Instruct-v0.2.ipynb new file mode 100644 index 0000000..b46332a --- /dev/null +++ b/torch-neuronx/transformers-neuronx/inference/mistralai-Mistral-7b-Instruct-v0.2.ipynb @@ -0,0 +1,170 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b73d8dd8-921b-4939-b2d9-2cc814b71384", + "metadata": {}, + "source": [ + "# Run Hugging Face mistralai/Mistral-7B-Instruct-v0.2 autoregressive sampling on Inf2 & Trn1" + ] + }, + { + "cell_type": "markdown", + "id": "2c36be3e-473b-457b-b994-0f5dc7fe4e62", + "metadata": {}, + "source": [ + "In this example we compile and deploy the Hugging Face [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) model for tensor parallel inference on Neuron using the `transformers-neuronx` package.\n", + "\n", + "The example has the following main sections:\n", + "1. Set up the Jupyter Notebook\n", + "1. Install dependencies\n", + "1. Load the model\n", + "1. Perform autoregressive sampling using tensor parallelism\n", + "\n", + "This Jupyter Notebook should be run on an Inf2 instance (`inf2.48xlarge`). To run on a larger Trn1 instance (`trn1.32xlarge`) will require changing the `tp_degree` specified in compilation section." 
+ ] + }, + { + "cell_type": "markdown", + "id": "6827638f-2c02-4c22-8b47-d827f7f7ae44", + "metadata": {}, + "source": [ + "## Set up the Jupyter Notebook" + ] + }, + { + "cell_type": "markdown", + "id": "12e16db5-98ee-446b-aac3-e928642b355e", + "metadata": {}, + "source": [ + "The following steps set up Jupyter Notebook and launch this tutorial:\n", + "1. Clone the [AWS Neuron Samples](https://github.com/aws-neuron/aws-neuron-samples) repo to your instance using\n", + "```\n", + "git clone https://github.com/aws-neuron/aws-neuron-samples.git\n", + "```\n", + "2. Navigate to the `transformers-neuronx` inference samples folder\n", + "```\n", + "cd aws-neuron-samples/torch-neuronx/transformers-neuronx/inference\n", + "```\n", + "3. Follow the instructions in [Jupyter Notebook QuickStart](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/notebook/setup-jupyter-notebook-steps-troubleshooting.html) to run Jupyter Notebook on your instance.\n", + "4. Locate this tutorial in your Jupyter Notebook session (`mistralai-Mistral-7b-Instruct-v0.2.ipynb`) and launch it. Follow the rest of the instructions in this tutorial. " + ] + }, + { + "cell_type": "markdown", + "id": "1e4fa60a-de2e-4587-a13b-6369b62a56a9", + "metadata": {}, + "source": [ + "## Install Dependencies\n", + "This tutorial requires the following pip packages:\n", + "\n", + " - `torch-neuronx`\n", + " - `neuronx-cc`\n", + " - `sentencepiece`\n", + " - `transformers`\n", + " - `transformers-neuronx`\n", + "\n", + "\n", + "Most of these packages will be installed when configuring your environment using the [torch-neuronx inference setup guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/torch-neuronx.html#setup-torch-neuronx). The additional dependencies must be installed here:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d4f3e6d-6e3d-45bc-a970-b9001a88fecb", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install transformers-neuronx sentencepiece -U" + ] + }, + { + "cell_type": "markdown", + "id": "58ad0970-6456-4313-a469-9d9dcd721c5c", + "metadata": {}, + "source": [ + "# Load the model\n", + "\n", + "The memory required to host any model can be computed with:\n", + "```\n", + "total memory = bytes per parameter * number of parameters\n", + "```\n", + "When using `float16` casted weights for a 7 billion parameter model, this works out to `2 * 7B` or ~14GB of weights. In theory, this means it is possible to fit this model on a single NeuronCore (16GB capacity). In this example, we will show splitting the compute across 8 NeuronCores.\n", + "\n", + "Increasing the `tp_degree` beyond the minimum requirement for a model almost always results in a faster model. Increasing the tensor parallelism degree increases both available compute power and memory bandwidth which improve model performance. To minimize model latency, it is recommended to use the highest tensor parallelism degree that is supported by the instance.\n", + "\n", + "In the following code, we will use the `NeuronAutoModelForCausalLM` class to automatically load a checkpoint directly from the huggingface hub. The default model config supports sampling up to sequence length 2048. Tensor parallelism is enabled through the argument `tp_degree=8`. We enable `bfloat16` casting with the `amp='bf16'` flag. The model computational graph is compiled by `neuronx-cc` for optimized inference on Neuron. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fc75623-cf79-4003-ab79-e314902b593a", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers_neuronx import NeuronAutoModelForCausalLM\n", + "\n", + "name = 'mistralai/Mistral-7B-Instruct-v0.2'\n", + "\n", + "model = NeuronAutoModelForCausalLM.from_pretrained(\n", + " name, # The reference to the huggingface model\n", + " tp_degree=8, # The Number of NeuronCores to shard the model across. Using 8 means 3 replicas can be used on a inf2.48xlarge\n", + " amp='bf16', # Ensure the model weights/compute are bfloat16 for faster compute\n", + ")\n", + "model.to_neuron()" + ] + }, + { + "cell_type": "markdown", + "id": "15969ced-cb05-47db-87bc-7a8d19131573", + "metadata": {}, + "source": [ + "# Perform autoregressive sampling using tensor parallelism\n", + "\n", + "In this code we demonstrate using the model to answer prompts and stream the output results token-by-token as they are produced. Here we use Top-K sampling to select tokens." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60a6db21-5e3a-42b6-aadd-af9ddfb73592", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from transformers import AutoTokenizer, TextStreamer\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(name)\n", + "streamer = TextStreamer(tokenizer)\n", + "\n", + "prompt = \"[INST] What is your favourite condiment? [/INST]\"\n", + "input_ids = tokenizer.encode(prompt, return_tensors=\"pt\")\n", + "\n", + "with torch.inference_mode():\n", + " generated_sequences = model.sample(input_ids, sequence_length=2048, top_k=50, streamer=streamer)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/torch-neuronx/transformers-neuronx/inference/mixtral-8x7b-sampling.ipynb b/torch-neuronx/transformers-neuronx/inference/mixtral-8x7b-sampling.ipynb new file mode 100644 index 0000000..71ca9b4 --- /dev/null +++ b/torch-neuronx/transformers-neuronx/inference/mixtral-8x7b-sampling.ipynb @@ -0,0 +1,172 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "59913016-f89e-4a0e-9afe-b3a06e9112d5", + "metadata": {}, + "source": [ + "# Run Hugging Face `mistralai/Mixtral-8x7B-v0.1` autoregressive sampling on Inf2 & Trn1" + ] + }, + { + "cell_type": "markdown", + "id": "f8454655-ec27-45e3-8da7-f82b744321ee", + "metadata": {}, + "source": [ + "In this example, we compile and deploy the Hugging Face [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) model for tensor parallel inference on AWS Neuron devices using the `transformers-neuronx` package.\n", + "\n", + "The example has the following main sections:\n", + "1. Set up the Jupyter Notebook\n", + "1. Install dependencies\n", + "1. Perform autoregressive sampling\n", + "\n", + "This Jupyter Notebook can be run on an Inf2 instance (`inf2.48xlarge`) or Trn1 instance (`trn1.32xlarge`)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "af2b7693-2950-41fc-a038-17cba44bf003", + "metadata": {}, + "source": [ + "## Set up the Jupyter Notebook" + ] + }, + { + "cell_type": "markdown", + "id": "c47ef383-0dea-4423-8c38-29c73927fd78", + "metadata": {}, + "source": [ + "The following steps set up Jupyter Notebook and launch this tutorial:\n", + "1. Clone the [AWS Neuron Samples](https://github.com/aws-neuron/aws-neuron-samples) repo to your instance using\n", + "```\n", + "git clone https://github.com/aws-neuron/aws-neuron-samples.git\n", + "```\n", + "2. Navigate to the `transformers-neuronx` inference samples folder\n", + "```\n", + "cd aws-neuron-samples/torch-neuronx/transformers-neuronx/inference\n", + "```\n", + "3. Follow the instructions in [Jupyter Notebook QuickStart](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/notebook/setup-jupyter-notebook-steps-troubleshooting.html) to run Jupyter Notebook on your instance.\n", + "4. Locate this tutorial in your Jupyter Notebook session (`mixtral-8x7b-sampling.ipynb`) and launch it. Follow the rest instructions in this tutorial. " + ] + }, + { + "cell_type": "markdown", + "id": "a727963e-8178-4d2a-a5cd-a4f2bf00197e", + "metadata": {}, + "source": [ + "## Install Dependencies\n", + "This tutorial requires the following pip packages:\n", + "\n", + " - `torch-neuronx`\n", + " - `neuronx-cc`\n", + " - `sentencepiece`\n", + " - `transformers`\n", + " - `transformers-neuronx`\n", + "\n", + "\n", + "Most of these packages will be installed when configuring your environment using the [torch-neuronx inference setup guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/torch-neuronx.html#setup-torch-neuronx). The additional dependencies must be installed here:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c4899b2-39b2-4309-b7df-48fe74b56eb2", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install transformers-neuronx" + ] + }, + { + "cell_type": "markdown", + "id": "0e1ededb-e0d6-4c1d-aac8-bc3d29bd6ebe", + "metadata": {}, + "source": [ + "## Perform autoregressive sampling" + ] + }, + { + "cell_type": "markdown", + "id": "f1a87b9f-2948-4db9-946f-b618533f03a7", + "metadata": {}, + "source": [ + "Before running autoregressive sampling, we first consider the model memory footprint and tensor parallelism (TP) degree to be used. Due to the model size and mixture-of-expert (MoE) implementation in `transformers-neuronx`, the supported TP degrees are {8, 16, 32}. Detail analysis is described as follows.\n", + "\n", + "The memory required to host a model can be computed as:\n", + "```\n", + "total memory = bytes per parameter * number of parameters\n", + "```\n", + "The `mistralai/Mixtral-8x7B-v0.1` model consists of 46.7 billion parameters. With `float16` casted weights, we need 93.4GB to store the model weights. In reality, the total space required is often greater than just the model parameters due to caching attention layer projections (KV caching). This caching mechanism grows memory allocations linearly with sequence length and batch size. The exact calculation can be found from the [AWS Neuron documentation page](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/appnotes/transformers-neuronx/generative-llm-inference-with-neuron.html).\n", + "\n", + "To get very large language models to fit on Inf2 & Trn1, tensor parallelism is used to split weights, data, and compute across multiple NeuronCores, each equipped with 16GB high-bandwidth memory (HBM). 
For this model, we need at least 6 NeuronCores. \n", + "\n", + "The `mistralai/Mixtral-8x7B-v0.1` model adopts the MoE architecture with 8 experts in total. `transformers-neuronx` in Neuron SDK 2.18 employs expert parallelism for MoE architecture, splitting the 8 experts across multiple NeuronCores. Note that increasing the TP degree beyond the minimum requirement almost always improves the model performance as more compute and memory bandwidth are available. To get better performance, it's recommended to use higher TP degree, for example, 32 for `trn1.32xlarge`. Note that we don't support TP degree of 24 on `inf2.48xlarge` for this model and the max TP degree that can be used on `inf2.48xlarge` is 16. If using TP degree 8 to run this model, users can use [int8 weight storage] (https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/transformers-neuronx-developer-guide.html) to reduce the model memory footprint.\n", + "\n", + "Starting from Neuron SDK 2.18, `transformers-neuronx` supports directly loading Hugging Face models in safetensor format and save_pretrained_split will be deprecated. In the following, we use the `MixtralForSampling` class in `transformers-neuronx` to create the model with model checkpoint loaded from Hugging Face. We enable tensor parallelism with the argument `tp_degree=16` and the use of data type `float16` with the argument `amp='f16'`. We set the max sequence length with `n_positions=1024`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dff1b319", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import time\n", + "import torch\n", + "from transformers import AutoTokenizer\n", + "from transformers_neuronx.mixtral.model import MixtralForSampling\n", + "\n", + "# set the directory for storing compiled model files\n", + "os.environ['NEURON_COMPILE_CACHE_URL'] = f'./neuron_cache'\n", + "\n", + "# load mistralai/Mixtral-8x7B-v0.1 to the NeuronCores with 16-way tensor parallelism\n", + "neuron_model = MixtralForSampling.from_pretrained(\n", + " 'mistralai/Mixtral-8x7B-v0.1',\n", + " batch_size=1,\n", + " tp_degree=16,\n", + " n_positions=1024,\n", + " amp='f16')\n", + "\n", + "# compile model\n", + "neuron_model.to_neuron()\n", + "\n", + "# construct a tokenizer and encode prompt text\n", + "tokenizer = AutoTokenizer.from_pretrained('mistralai/Mixtral-8x7B-v0.1')\n", + "prompt = \"Hello, I'm a language model,\"\n", + "input_ids = tokenizer.encode(prompt, return_tensors=\"pt\")\n", + "\n", + "# run inference with top-k sampling\n", + "with torch.inference_mode():\n", + " start = time.time()\n", + " generated_sequences = neuron_model.sample(input_ids, sequence_length=512, top_k=1) # sequence_length <= n_positions\n", + " elapsed = time.time() - start\n", + "\n", + "generated_sequences = [tokenizer.decode(seq) for seq in generated_sequences]\n", + "print(f'generated sequences {generated_sequences} in {elapsed} seconds')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/torch-neuronx/transformers-neuronx/inference/speculative_sampling.ipynb 
b/torch-neuronx/transformers-neuronx/inference/speculative_sampling.ipynb new file mode 100644 index 0000000..759dc62 --- /dev/null +++ b/torch-neuronx/transformers-neuronx/inference/speculative_sampling.ipynb @@ -0,0 +1,361 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run speculative sampling on Meta Llama models" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In speculative sampling, we use a smaller draft model to speculate future tokens. These are then sent to the larger target model, which accepts/rejects these tokens. \n", + "\n", + "For a more detailed understanding, please refer to the original paper by DeepMind titled [\"Accelerating Large Language Model Decoding with Speculative Sampling\"](https://arxiv.org/abs/2302.01318)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example we perform speculative sampling using the Hugging Face [\"meta-llama/Llama-2-70b\"](https://huggingface.co/meta-llama/Llama-2-70b) model and Hugging Face [\"meta-llama/Llama-2-7b\"](https://huggingface.co/meta-llama/Llama-2-7b).\n", + "Here, the 70b model is considered the target model and the 7b model is considered the draft model.\n", + "\n", + "The example has the following main sections:\n", + "\n", + "1. Set up the Jupyter Notebook\n", + "2. Install dependencies\n", + "3. Download and construct the model\n", + "4. Split the model `state_dict` into multiple files\n", + "5. Perform speculative sampling\n", + "\n", + "This Jupyter Notebook should be run on a Trn1 instance (`trn1.32xlarge`). To run on an Inf2 instance (`inf2.48xlarge`) will require changing the `tp_degree` specified in the compilation section." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up the Jupyter Notebook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following steps set up Jupyter Notebook and launch this tutorial:\n", + "\n", + "1. Clone the [\"AWS Neuron Samples\"](https://github.com/aws-neuron/aws-neuron-samples) repo to your instance using\n", + "```\n", + "git clone https://github.com/aws-neuron/aws-neuron-samples.git\n", + "```\n", + "2. Navigate to the `transformers-neuronx` inference samples folder\n", + "```\n", + " cd aws-neuron-samples/torch-neuronx/transformers-neuronx/inference\n", + "```\n", + "3. Follow the instructions in [\"Jupyter Notebook Quickstart\"](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/notebook/setup-jupyter-notebook-steps-troubleshooting.html) to run Jupyter Notebook on your instance.\n", + "\n", + "4. Locate this tutorial in your Jupyter Notebook session (`speculative_sampling.ipynb`) and launch it. Follow the rest of the instructions in this tutorial.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install Dependencies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial requires the following pip packages:\n", + "\n", + "- `torch-neuronx`\n", + "- `neuronx-cc`\n", + "- `sentencepiece`\n", + "- `transformers`\n", + "- `transformers-neuronx`\n", + "\n", + "Most of these packages will be installed when configuring your environment using the [\"torch-neuronx inference setup guide\"](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/torch-neuronx.html#setup-torch-neuronx).
The additional dependencies must be installed here:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install transformers-neuronx sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download the model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use of the Llama 2 model is governed by the Meta license and must be downloaded and converted to the standard Hugging Face format prior to running this sample.\n", + "\n", + "Follow the steps described in [\"meta-llama/Llama-2-70b\"](https://huggingface.co/meta-llama/Llama-2-70b) and [\"meta-llama/Llama-2-7b\"](https://huggingface.co/meta-llama/Llama-2-7b) to get access to the Llama 2 models from Meta and download the weights and tokenizer.\n", + "\n", + "After gaining access to the model checkpoints, you should be able to use the already converted checkpoints. Otherwise, if you are converting your own model, feel free to use the [\"conversion script\"](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py). The script can be called with the following (example) command:\n", + "\n", + "```\n", + "python src/transformers/models/llama/convert_llama_weights_to_hf.py \\\n", + " --input_dir /path/to/downloaded/llama/weights --model_size 70Bf --output_dir ./Llama-2-70b\n", + " ```\n", + "\n", + "Note: For the purposes of this sample we assume you have saved the Llama-2-70b model and the Llama-2-7b model in separate directories called `Llama-2-70b` and `Llama-2-7b` with the following formats:" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "Llama-2-70b/\n", + "├── config.json\n", + "├── generation_config.json\n", + "├── pytorch_model-00001-of-00015.bin\n", + "├── pytorch_model-00002-of-00015.bin\n", + "├── pytorch_model-00003-of-00015.bin\n", + "├── pytorch_model-00004-of-00015.bin\n", + "├── pytorch_model-00005-of-00015.bin\n", + "├── pytorch_model-00006-of-00015.bin\n", + "├── pytorch_model-00007-of-00015.bin\n", + "├── pytorch_model-00008-of-00015.bin\n", + "├── pytorch_model-00009-of-00015.bin\n", + "├── pytorch_model-00010-of-00015.bin\n", + "├── pytorch_model-00011-of-00015.bin\n", + "├── pytorch_model-00012-of-00015.bin\n", + "├── pytorch_model-00013-of-00015.bin\n", + "├── pytorch_model-00014-of-00015.bin\n", + "├── pytorch_model-00015-of-00015.bin\n", + "├── pytorch_model.bin.index.json\n", + "├── special_tokens_map.json\n", + "├── tokenizer.json\n", + "├── tokenizer.model\n", + "└── tokenizer_config.json" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "Llama-2-7b/\n", + "├── config.json\n", + "├── generation_config.json\n", + "├── pytorch_model-00001-of-00015.bin\n", + "├── pytorch_model-00002-of-00015.bin\n", + "├── pytorch_model-00003-of-00015.bin\n", + "├── pytorch_model-00004-of-00015.bin\n", + "├── pytorch_model-00005-of-00015.bin\n", + "├── pytorch_model-00006-of-00015.bin\n", + "├── pytorch_model-00007-of-00015.bin\n", + "├── pytorch_model-00008-of-00015.bin\n", + "├── pytorch_model-00009-of-00015.bin\n", + "├── pytorch_model-00010-of-00015.bin\n", + "├── pytorch_model-00011-of-00015.bin\n", + "├── pytorch_model-00012-of-00015.bin\n", + "├── pytorch_model-00013-of-00015.bin\n", + "├── pytorch_model-00014-of-00015.bin\n", + "├── pytorch_model-00015-of-00015.bin\n", + "├── pytorch_model.bin.index.json\n", + "├── special_tokens_map.json\n", + "├── tokenizer.json\n", + "├── tokenizer.model\n", + 
"└── tokenizer_config.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Construct the model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We download and construct the draft and target models using the Hugging Face `from_pretrained` method.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import LlamaForCausalLM\n", + "\n", + "draft_model = LlamaForCausalLM.from_pretrained('Llama-2-7b')\n", + "target_model = LlamaForCausalLM.from_pretrained('Llama-2-70b')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Split the model state_dict into multiple files" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the sake of reducing host memory usage, it is recommended to save the model `state_dict` as multiple files, as opposed to one monolithic file given by `torch.save`. This \"split-format\" `state_dict` can be created using the `save_pretrained_split` function. With this checkpoint format, the Neuron model loader can load parameters to the Neuron device high-bandwidth memory (HBM) directly by keeping at most one layer of model parameters in the CPU main memory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import os\n", + "import re\n", + "import json\n", + "from transformers_neuronx.module import save_pretrained_split\n", + "\n", + "save_pretrained_split(draft_model, './Llama-2-7b-split')\n", + "save_pretrained_split(target_model, './Llama-2-70b-split')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Perform speculative sampling" + ] + }, + { + "cell_type": "markdown", + "id": "0401a3e7", + "metadata": {}, + "source": [ + "We now load and compile the draft model and the target model.\n", + "We use the Neuron `LlamaForSampling` class to load both models. Without extra configuration, autoregressive sampling is used as default.\n", + "\n", + "Since we need to perform regular autoregressive sampling in the draft model, we load and compile it using the default options.\n", + "For the target model, we need to explicitly enable speculative decoding by calling the function enable_speculative_decoder(k) and this will let the model compiled for computing a window of k tokens at a time.\n", + "\n", + "Note that when loading the models, we must use the same `tp_degree`. Attempting to use a different value for the draft/target model will result in a load failure." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdb21762", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import torch\n", + "from transformers import AutoTokenizer\n", + "from transformers_neuronx.llama.model import LlamaForSampling\n", + "\n", + "print(\"\\nStarting to compile Draft Model....\")\n", + "# Load draft model\n", + "draft_neuron_model = LlamaForSampling.from_pretrained('./Llama-2-7b-split', n_positions=128, batch_size=1, tp_degree=32, amp='f32')\n", + "# compile to neuron \n", + "draft_neuron_model.to_neuron()\n", + "print(\"\\nCompleted compilation of Draft Model\")\n", + "\n", + "print(\"\\nStarting to compile Target Model....\")\n", + "# Load target model\n", + "target_neuron_model = LlamaForSampling.from_pretrained('./Llama-2-70b-split', n_positions=128, batch_size=1, tp_degree=32, amp='f32')\n", + "# Enable speculative decoder\n", + "target_neuron_model.enable_speculative_decoder(7)\n", + "# compile to neuron \n", + "target_neuron_model.to_neuron()\n", + "print(\"\\nCompleted compilation of Target Model\")" + ] + }, + { + "cell_type": "markdown", + "id": "44ca0c3f", + "metadata": {}, + "source": [ + "Next, we initialize the tokenizer and the text prompt. \n", + "\n", + "We then initialize the `SpeculativeGenerator` class and pass the draft model, target model and speculation length as arguments. We can use this to call the `sample()` function and get the final sampled tokens after using the tokenizer to decode them. \n", + "\n", + "Comparing the response generation time between speculative sampling and autoregressive sampling, we see that speculative sampling is faster than autoregressive sampling." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14457e2f", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers_neuronx.speculation import SpeculativeGenerator, DraftModelForSpeculation, DefaultTokenAcceptor\n", + "import sentencepiece\n", + "from transformers import LlamaTokenizer\n", + "\n", + "# Initialize tokenizer and text prompt\n", + "tokenizer = LlamaTokenizer.from_pretrained(\"Llama-2-70b\")\n", + "prompt = \"Hello, I'm a generative AI language model.\"\n", + "input_ids = tokenizer(prompt, return_tensors=\"pt\").input_ids\n", + "\n", + "# create SpeculativeGenerator\n", + "spec_gen = SpeculativeGenerator(draft_neuron_model, target_neuron_model, 7)\n", + "\n", + "# call speculative sampling on given input\n", + "start_spec_timer = time.time()\n", + "\n", + "print(\"Starting to call Speculative Sampling..\")\n", + "response = spec_gen.sample(\n", + " input_ids=input_ids,\n", + " sequence_length=50,\n", + ")\n", + "end_spec_timer = time.time()\n", + "\n", + "generated_text = tokenizer.decode(response[0])\n", + "print(f\"\\nDecoded tokens: {generated_text}\")\n", + "\n", + "print(f\"\\nSpeculative sampling response generation took {end_spec_timer - start_spec_timer} seconds\")\n", + "\n", + "start_auto_r_timer = time.time()\n", + "autor_response = target_neuron_model.sample(input_ids=input_ids, sequence_length=50)\n", + "end_auto_r_timer = time.time()\n", + "\n", + "print(f\"\\nAutoregressive sampling response generation took {end_auto_r_timer - start_auto_r_timer} seconds\")\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter":
"python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}