From 65aaaec2499e36c04602ca74a1d7d14ba371a699 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Thu, 5 Sep 2024 19:41:20 +0000 Subject: [PATCH 01/23] init --- .gitignore | 2 ++ Dockerfile | 4 ++++ open_instruct/dpo_tune.py | 8 +++++++- open_instruct/finetune.py | 8 +++++++- open_instruct/utils.py | 38 ++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 +- 6 files changed, 59 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 55d2caedc..5952c206c 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,8 @@ rejection_sampling/shards1 token_length.png *.tfevents.* +oe-eval-internal/ + results models wandb diff --git a/Dockerfile b/Dockerfile index dd6b95a97..7579a02fc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -90,6 +90,10 @@ RUN pip install --upgrade pip "setuptools<70.0.0" wheel RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 RUN pip install packaging RUN pip install flash-attn==2.6.3 --no-build-isolation +# for newest olmo's, move to requirements when ai2-olmo supports torch 2.4 +# core is a dependency of ai2-olmo +RUN pip install ai2-olmo-core==0.1.0 +RUN pip install ai2-olmo>=0.5.0 --no-deps RUN pip install -r requirements.txt # NLTK download diff --git a/open_instruct/dpo_tune.py b/open_instruct/dpo_tune.py index 9c5cf1714..74bb522fd 100644 --- a/open_instruct/dpo_tune.py +++ b/open_instruct/dpo_tune.py @@ -17,13 +17,13 @@ DPO tuning script. Adapted from our finetuning script. """ +import json import logging import math import os import random import subprocess import time -import json from copy import deepcopy from dataclasses import dataclass, field from datetime import timedelta @@ -65,6 +65,7 @@ from open_instruct.model_utils import push_folder_to_hub, save_with_accelerate from open_instruct.utils import ( ArgumentParserPlus, + check_hf_olmo_availability, clean_last_n_checkpoints, get_datasets, get_last_checkpoint_path, @@ -499,6 +500,11 @@ def prepare_deepspeed(accelerator, model): def main(args: FlatArguments): + # try to import OLMo for automodel + if check_hf_olmo_availability(): + # allows AutoModel... to work with not in transformers olmo models + import hf_olmo # noqa + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index e8d69c08e..48ba726e1 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -14,13 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import logging import math import os import random import subprocess import time -import json from dataclasses import dataclass, field from datetime import timedelta from functools import partial @@ -55,6 +55,7 @@ from open_instruct.model_utils import push_folder_to_hub, save_with_accelerate from open_instruct.utils import ( ArgumentParserPlus, + check_hf_olmo_availability, clean_last_n_checkpoints, get_datasets, get_last_checkpoint_path, @@ -448,6 +449,11 @@ def _concat_messages(messages): def main(args: FlatArguments): + # try to import OLMo for automodel + if check_hf_olmo_availability(): + # allows AutoModel... to work with not in transformers olmo models + import hf_olmo # noqa + # Initialize the accelerator. 
We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/open_instruct/utils.py b/open_instruct/utils.py index 34fc96dfe..0a95b3f28 100644 --- a/open_instruct/utils.py +++ b/open_instruct/utils.py @@ -14,6 +14,7 @@ import dataclasses import functools +import importlib import json import logging import os @@ -51,6 +52,43 @@ """ +# ---------------------------------------------------------------------------- +# Import utilities +def check_hf_olmo_availability(return_version: bool = True) -> Union[dict, bool]: + pkg_name = "hf_olmo" + + # Check if the package spec exists + package_exists = importlib.util.find_spec(pkg_name) is not None + package_version = "N/A" + + if package_exists: + try: + # Primary method to get the package version + package_version = importlib.metadata.version(pkg_name) + except importlib.metadata.PackageNotFoundError: + # Fallback method + try: + package = importlib.import_module(pkg_name) + package_version = getattr(package, "__version__", "N/A") + if package_version == "N/A": + package_exists = False + except ImportError: + package_exists = False + + logger.debug(f"Detected {pkg_name} version: {package_version}") + + if return_version: + return { + "available": package_exists, + "version": package_version, + "python_version": sys.version, + "os": os.name, + "platform": sys.platform, + } + else: + return package_exists + + # ---------------------------------------------------------------------------- # Dataset utilities def is_openai_format(messages: Any) -> bool: diff --git a/requirements.txt b/requirements.txt index 39a3073d6..9b9ff095b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -44,4 +44,4 @@ isort autoflake pytest hf_transfer -beaker-py +beaker-py \ No newline at end of file From 7da49a9b5a8f1491267603f2371700d4aca5c2f7 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Thu, 5 Sep 2024 20:35:18 +0000 Subject: [PATCH 02/23] up --- Dockerfile | 2 +- configs/train_configs/sft/olmo_7b_0924.yaml | 22 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 configs/train_configs/sft/olmo_7b_0924.yaml diff --git a/Dockerfile b/Dockerfile index 7579a02fc..87181fb2f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -92,7 +92,7 @@ RUN pip install packaging RUN pip install flash-attn==2.6.3 --no-build-isolation # for newest olmo's, move to requirements when ai2-olmo supports torch 2.4 # core is a dependency of ai2-olmo -RUN pip install ai2-olmo-core==0.1.0 +RUN pip install ai2-olmo-core==0.1.0 omegaconf RUN pip install ai2-olmo>=0.5.0 --no-deps RUN pip install -r requirements.txt diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml new file mode 100644 index 000000000..d76b2e560 --- /dev/null +++ b/configs/train_configs/sft/olmo_7b_0924.yaml @@ -0,0 +1,22 @@ +model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +model_revision: main +use_flash_attn: true +tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +use_slow_tokenizer: false # olmo models only use fast tokenizers +dataset_name: allenai/tulu-v2-sft-mixture-olmo-2048 +max_seq_length: 2048 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 16 +learning_rate: 2.0e-06 +lr_scheduler_type: linear +warmup_ratio: 
0.03
+weight_decay: 0.0
+num_train_epochs: 3
+output_dir: output/olmo_instruct/
+with_tracking: true
+report_to:
+    - wandb
+logging_steps: 1
+checkpointing_steps: epoch
+add_bos: true
\ No newline at end of file

From d929a895aa80a04ad4254989e1051a5b36c76e44 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Mon, 9 Sep 2024 17:23:18 +0000
Subject: [PATCH 03/23] branch dockerfile

---
 .github/workflows/push-image-olmo.yml |  82 ++++++++++++++++++
 .github/workflows/push-image.yml      |   2 -
 Dockerfile                            |   4 +-
 Dockerfile.olmo                       | 115 ++++++++++++++++++++++++++
 requirements-olmo.txt                 |  48 +++++++++++
 5 files changed, 247 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/push-image-olmo.yml
 create mode 100644 Dockerfile.olmo
 create mode 100644 requirements-olmo.txt

diff --git a/.github/workflows/push-image-olmo.yml b/.github/workflows/push-image-olmo.yml
new file mode 100644
index 000000000..8fdafc49f
--- /dev/null
+++ b/.github/workflows/push-image-olmo.yml
@@ -0,0 +1,82 @@
+# This is an example workflow file.
#
# When you add a new image, copy this file and then replace all mentions of "hello-world" with
# the name of your new image.
#
# Read through the rest of the comments in this file to figure out how it works, and what else
# you need to change.
name: build_open_instruct_olmo

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

on:
  push:
    # Run this workflow anytime a push updates one of the files in the image's directory
    # (other than the README), and anytime there's a new release tag for this image.
    paths:
      - 'open_instruct/**'
      - '!open_instruct/README.md'
      - 'requirements-olmo.txt'
      - 'Dockerfile.olmo'
      - '.github/workflows/push-image-olmo.yml'
      # Note: add the .olmo Dockerfile + requirements paths when enabling auto builds for them
    branches: [main]
  # pull_request: # note, comment this out for running on every push
  #   # Also run on PRs that update the files in the image's directory (other than README).
  #   branches: [main]
  #   paths:
  #     - 'open_instruct/**'
  #     - '!open_instruct/README.md'
  #     - 'requirements-olmo.txt'
  #     - 'Dockerfile.olmo'
  workflow_dispatch: # This allows us to manually trigger a build through the GitHub UI.

env:
  DOCKER_BUILDKIT: "1"

jobs:
  build:
    name: open_instruct
    runs-on: ubuntu-latest
    timeout-minutes: 60
    if: (github.event_name != 'workflow_run') || (github.event.workflow_run.conclusion == 'success')
    steps:
      - uses: actions/checkout@v3
        with:
          repository: allenai/oe-eval-internal
          path: './oe-eval-internal'
          ssh-key: ${{ secrets.OE_EVAL_GIT_CLONE_ACCESS_PRIVATE_SSH_DEPLOY_KEY }}

      - name: Setup environment
        uses: ./.github/actions/setup
        with:
          beaker_token: ${{ secrets.BEAKER_TOKEN }}
          # ghcr_token: ${{ secrets.GHCR_TOKEN }}
          # ghcr_user: ${{ secrets.GHCR_USER }}

      # large images can fail to build when the runner disk fills up, so free space first
      - name: Delete huge unnecessary tools folder
        run: rm -rf /opt/hostedtoolcache /usr/share/dotnet "$AGENT_TOOLSDIRECTORY"

      - name: Build image
        run: |
          docker build \
            --build-arg BUILDKIT_INLINE_CACHE=1 \
            --build-arg CUDA=12.1.0 --build-arg \
            TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 \
            --build-arg REQUIRE=requirements.txt \
            -f Dockerfile.olmo . 
\ + -t open_instruct_olmo + + - name: Check image + run: | + docker run --rm open_instruct_olmo + - name: Push image + # if: github.event_name != 'pull_request' + uses: ./.github/actions/push + with: + image: open_instruct_olmo # this is the tag of the image we just built in the previous step + beaker: open_instruct_olmo_auto # this is the name of the image on Beaker + latest: true # this flag says we should also push this as the 'latest' version to GHCR diff --git a/.github/workflows/push-image.yml b/.github/workflows/push-image.yml index 40e205fb3..4e29f4d28 100644 --- a/.github/workflows/push-image.yml +++ b/.github/workflows/push-image.yml @@ -44,8 +44,6 @@ jobs: timeout-minutes: 60 if: (github.event_name != 'workflow_run') || (github.event.workflow_run.conclusion == 'success') steps: - - uses: actions/checkout@v3 - - uses: actions/checkout@v3 with: repository: allenai/oe-eval-internal diff --git a/Dockerfile b/Dockerfile index 87181fb2f..9f4e1fb00 100644 --- a/Dockerfile +++ b/Dockerfile @@ -92,8 +92,8 @@ RUN pip install packaging RUN pip install flash-attn==2.6.3 --no-build-isolation # for newest olmo's, move to requirements when ai2-olmo supports torch 2.4 # core is a dependency of ai2-olmo -RUN pip install ai2-olmo-core==0.1.0 omegaconf -RUN pip install ai2-olmo>=0.5.0 --no-deps +# RUN pip install ai2-olmo-core==0.1.0 omegaconf +# RUN pip install ai2-olmo>=0.5.0 --no-deps RUN pip install -r requirements.txt # NLTK download diff --git a/Dockerfile.olmo b/Dockerfile.olmo new file mode 100644 index 000000000..38f8a1db4 --- /dev/null +++ b/Dockerfile.olmo @@ -0,0 +1,115 @@ +ARG CUDA +ARG DIST +ARG TARGET +FROM --platform=linux/amd64 nvidia/cuda:${CUDA}-${TARGET}-${DIST} + +ARG DEBIAN_FRONTEND="noninteractive" +ENV TZ="America/Los_Angeles" + +# Install base tools. +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + jq \ + language-pack-en \ + make \ + sudo \ + unzip \ + vim \ + wget \ + parallel \ + iputils-ping \ + tmux + +ARG BEAKER_VERSION +RUN curl --silent \ + --connect-timeout 5 \ + --max-time 10 \ + --retry 5 \ + --retry-delay 0 \ + --retry-max-time 40 \ + --output beaker.tar.gz \ + "https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \ + && tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \ + && rm beaker.tar.gz + +# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure) +# puts the right NVIDIA things in the right place (that THOR requires). +ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute + +# Install conda. We give anyone in the users group the ability to run +# conda commands and install packages in the base (default) environment. +# Things installed into the default environment won't persist, but we prefer +# convenience in this case and try to make sure the user is aware of this +# with a message that's printed when the session starts. 
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh \ + && echo "32d73e1bc33fda089d7cd9ef4c1be542616bd8e437d1f77afeeaf7afdb019787 Miniconda3-py310_23.1.0-1-Linux-x86_64.sh" \ + | sha256sum --check \ + && bash Miniconda3-py310_23.1.0-1-Linux-x86_64.sh -b -p /opt/miniconda3 \ + && rm Miniconda3-py310_23.1.0-1-Linux-x86_64.sh + +ENV PATH=/opt/miniconda3/bin:/opt/miniconda3/condabin:$PATH +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH + +# Install a few additional utilities via pip +RUN /opt/miniconda3/bin/pip install --no-cache-dir \ + gpustat \ + jupyter \ + beaker-gantry \ + oocmap + +# Ensure users can modify their container environment. +RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +# Make the base image friendlier for interactive workloads. This makes things like the man command +# work. +RUN yes | unminimize + +# Install MLNX OFED user-space drivers +# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile +ENV MOFED_VER 5.8-1.1.2.1 +ENV OS_VER ubuntu20.04 +ENV PLATFORM x86_64 +RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ + tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ + MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \ + rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \ + rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz + +# The -l flag makes bash act as a login shell and load /etc/profile, etc. +ENTRYPOINT ["bash", "-l"] + +WORKDIR /stage/ + +# TODO When updating flash-attn or torch in the future, make sure to update the version in the requirements.txt file. +ENV HF_HUB_ENABLE_HF_TRANSFER=1 +COPY requirements.txt . +RUN pip install --upgrade pip "setuptools<70.0.0" wheel +# TODO, unpin setuptools when this issue in flash attention is resolved +RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 +RUN pip install packaging +RUN pip install flash-attn==2.5.9 --no-build-isolation +# for newest olmo's, move to requirements when ai2-olmo supports torch 2.4 +# core is a dependency of ai2-olmo +# RUN pip install ai2-olmo-core==0.1.0 omegaconf +# RUN pip install ai2-olmo>=0.5.0 --no-deps +RUN pip install -r requirements-olmo.txt + +# NLTK download +RUN python -m nltk.downloader punkt +COPY open_instruct open_instruct +COPY oe-eval-internal oe-eval-internal + +# install the package in editable mode +COPY pyproject.toml . +RUN pip install -e . 
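# Note: because ai2-olmo is installed with --no-deps above, a sanity check along
# these lines (hypothetical, not part of the image) can confirm at build time that
# importing hf_olmo still registers OLMo with transformers' Auto classes:
# RUN python -c "import hf_olmo; from transformers import AutoConfig; print('hf_olmo OK')"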
+COPY .git/ ./.git/ +COPY eval eval +COPY configs configs +COPY scripts scripts +COPY mason.py mason.py +RUN chmod +x scripts/* + +# for interactive session +RUN chmod -R 777 /stage/ diff --git a/requirements-olmo.txt b/requirements-olmo.txt new file mode 100644 index 000000000..611ebda53 --- /dev/null +++ b/requirements-olmo.txt @@ -0,0 +1,48 @@ +# TODO When updating flash-attn or torch in the future, make sure to update the version in the Dockerfile +torch==2.4.0 +ai2-olmo-core==0.1.0 +ai2-olmo>=0.5.0 +scipy +packaging +sentencepiece +datasets +deepspeed==0.14.4 +accelerate==0.31.0 +peft>=0.11.1 +bitsandbytes>=0.41.1 +evaluate>=0.4.0 +tokenizers==0.19.1 +protobuf +transformers==4.43.4 +openai>=1.0.0 +tiktoken +rouge_score +tensorboard +wandb +gradio>=3.50.2 +termcolor +jsonlines +unidic-lite +einops +flash-attn==2.5.8 # should really only be in dockerfile. Local env often doesn't have GPUs +fire +alpaca-eval==0.6.2 +# for human eval web app +flask +openpyxl +# for ifeval +nltk==3.8.1 +langdetect +immutabledict +# for math evaluations +antlr4-python3-runtime==4.11.0 +mpmath==1.3.0 +sympy==1.12.0 +# for linting +black +flake8 +isort +autoflake +pytest +hf_transfer +beaker-py \ No newline at end of file From 6ef706c10b9d6e9656db34d544cd57b5a40cb274 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Mon, 9 Sep 2024 17:57:40 +0000 Subject: [PATCH 04/23] update --- Dockerfile.olmo | 8 ++++---- requirements-olmo.txt | 4 +--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Dockerfile.olmo b/Dockerfile.olmo index 38f8a1db4..6259756ae 100644 --- a/Dockerfile.olmo +++ b/Dockerfile.olmo @@ -84,16 +84,16 @@ WORKDIR /stage/ # TODO When updating flash-attn or torch in the future, make sure to update the version in the requirements.txt file. ENV HF_HUB_ENABLE_HF_TRANSFER=1 -COPY requirements.txt . +COPY requirements-olmo.txt . RUN pip install --upgrade pip "setuptools<70.0.0" wheel # TODO, unpin setuptools when this issue in flash attention is resolved RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 RUN pip install packaging -RUN pip install flash-attn==2.5.9 --no-build-isolation +RUN pip install flash-attn==2.5.9.post1 --no-build-isolation # for newest olmo's, move to requirements when ai2-olmo supports torch 2.4 # core is a dependency of ai2-olmo -# RUN pip install ai2-olmo-core==0.1.0 omegaconf -# RUN pip install ai2-olmo>=0.5.0 --no-deps +RUN pip install ai2-olmo-core==0.1.0 omegaconf +RUN pip install ai2-olmo>=0.5.0 --no-deps RUN pip install -r requirements-olmo.txt # NLTK download diff --git a/requirements-olmo.txt b/requirements-olmo.txt index 611ebda53..93367a9e0 100644 --- a/requirements-olmo.txt +++ b/requirements-olmo.txt @@ -1,7 +1,5 @@ # TODO When updating flash-attn or torch in the future, make sure to update the version in the Dockerfile torch==2.4.0 -ai2-olmo-core==0.1.0 -ai2-olmo>=0.5.0 scipy packaging sentencepiece @@ -24,7 +22,7 @@ termcolor jsonlines unidic-lite einops -flash-attn==2.5.8 # should really only be in dockerfile. Local env often doesn't have GPUs +flash-attn==2.5.9.post1 # should really only be in dockerfile. 
Local env often doesn't have GPUs
fire
alpaca-eval==0.6.2
# for human eval web app
flask
openpyxl

From a4797fb6eda9a1ad6fe8f9d643995e7cdfbfe8b5 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Mon, 9 Sep 2024 20:55:35 +0000
Subject: [PATCH 05/23] debugging and minor fixes

---
 .github/workflows/push-image-olmo.yml       | 1 -
 .github/workflows/push-image.yml            | 1 -
 README.md                                   | 2 +-
 configs/train_configs/sft/olmo_7b_0924.yaml | 2 +-
 open_instruct/dpo_tune.py                   | 3 ++-
 open_instruct/finetune.py                   | 3 ++-
 open_instruct/utils.py                      | 2 --
 requirements-olmo.txt                       | 2 +-
 8 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/push-image-olmo.yml b/.github/workflows/push-image-olmo.yml
index 8fdafc49f..28a8c3467 100644
--- a/.github/workflows/push-image-olmo.yml
+++ b/.github/workflows/push-image-olmo.yml
@@ -66,7 +66,6 @@ jobs:
           --build-arg BUILDKIT_INLINE_CACHE=1 \
           --build-arg CUDA=12.1.0 --build-arg \
           TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 \
-          --build-arg REQUIRE=requirements.txt \
           -f Dockerfile.olmo . \
           -t open_instruct_olmo

diff --git a/.github/workflows/push-image.yml b/.github/workflows/push-image.yml
index 4e29f4d28..f5a35fdc0 100644
--- a/.github/workflows/push-image.yml
+++ b/.github/workflows/push-image.yml
@@ -67,7 +67,6 @@ jobs:
           --build-arg BUILDKIT_INLINE_CACHE=1 \
           --build-arg CUDA=12.1.0 --build-arg \
           TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 \
-          --build-arg REQUIRE=requirements.txt . \
           -t open_instruct

diff --git a/README.md b/README.md
index e0ba44914..701358cd0 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@ pip install -r weight-diff-requirements.txt
 For a second installation strategy, if you'd like to *run experiments within a Docker environment*, you can create one using:
 ```bash
-docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 --build-arg REQUIRE=requirements.txt . -t open_instruct
+docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 . -t open_instruct
 # if you are internally at AI2, you can create an image like this:
 beaker image create open_instruct -n open_instruct -w ai2/$(whoami)

diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml
index d76b2e560..317e106d9 100644
--- a/configs/train_configs/sft/olmo_7b_0924.yaml
+++ b/configs/train_configs/sft/olmo_7b_0924.yaml
@@ -1,6 +1,6 @@
 model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
 model_revision: main
-use_flash_attn: true
+use_flash_attn: false
 tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
 use_slow_tokenizer: false # olmo models only use fast tokenizers
 dataset_name: allenai/tulu-v2-sft-mixture-olmo-2048
 max_seq_length: 2048

diff --git a/open_instruct/dpo_tune.py b/open_instruct/dpo_tune.py
index 74bb522fd..13b7c2f44 100644
--- a/open_instruct/dpo_tune.py
+++ b/open_instruct/dpo_tune.py
@@ -504,6 +504,7 @@ def main(args: FlatArguments):
     if check_hf_olmo_availability():
         # allows AutoModel... to work with not in transformers olmo models
         import hf_olmo  # noqa
+        from hf_olmo import OLMoTokenizerFast

     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
@@ -678,7 +679,7 @@ def load_model():
         0,
         1,
     ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." 
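The hunk continues below by widening this tokenizer branch to cover OLMo's fast tokenizer. A minimal sketch of what the branch does, assuming a GPT-NeoX-style checkpoint such as `EleutherAI/pythia-70m` (illustrative, not the training script itself):

```python
# Sketch: GPT-NeoX/OLMo fast tokenizers can ship without a BOS token,
# so the scripts backfill it from EOS before adding a pad token.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
if tokenizer.bos_token is None:
    tokenizer.bos_token = tokenizer.eos_token  # mirrors the fix-up in the diff
num_added = tokenizer.add_special_tokens({"pad_token": "<pad>"})
assert num_added in (0, 1)  # same invariant the surrounding assertion enforces
```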
- elif isinstance(tokenizer, GPTNeoXTokenizerFast): + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or isinstance(tokenizer, OLMoTokenizerFast): # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index 48ba726e1..211c43887 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -453,6 +453,7 @@ def main(args: FlatArguments): if check_hf_olmo_availability(): # allows AutoModel... to work with not in transformers olmo models import hf_olmo # noqa + from hf_olmo import OLMoTokenizerFast # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers @@ -649,7 +650,7 @@ def main(args: FlatArguments): 0, 1, ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." - elif isinstance(tokenizer, GPTNeoXTokenizerFast): + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or isinstance(tokenizer, OLMoTokenizerFast): # noqa # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token diff --git a/open_instruct/utils.py b/open_instruct/utils.py index 0a95b3f28..08046e17e 100644 --- a/open_instruct/utils.py +++ b/open_instruct/utils.py @@ -75,8 +75,6 @@ def check_hf_olmo_availability(return_version: bool = True) -> Union[dict, bool] except ImportError: package_exists = False - logger.debug(f"Detected {pkg_name} version: {package_version}") - if return_version: return { "available": package_exists, diff --git a/requirements-olmo.txt b/requirements-olmo.txt index 93367a9e0..1ec51fb2f 100644 --- a/requirements-olmo.txt +++ b/requirements-olmo.txt @@ -33,7 +33,7 @@ nltk==3.8.1 langdetect immutabledict # for math evaluations -antlr4-python3-runtime==4.11.0 +antlr4-python3-runtime==4.9.2 mpmath==1.3.0 sympy==1.12.0 # for linting From 56d6d8644c8e66e3586350ba963394b6e89a6def Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Mon, 9 Sep 2024 21:32:33 +0000 Subject: [PATCH 06/23] nit and style --- open_instruct/dpo_tune.py | 4 +++- open_instruct/finetune.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/open_instruct/dpo_tune.py b/open_instruct/dpo_tune.py index 13b7c2f44..cd0676648 100644 --- a/open_instruct/dpo_tune.py +++ b/open_instruct/dpo_tune.py @@ -679,7 +679,9 @@ def load_model(): 0, 1, ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." - elif isinstance(tokenizer, GPTNeoXTokenizerFast) or isinstance(tokenizer, OLMoTokenizerFast): + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or ( + check_hf_olmo_availability() and isinstance(tokenizer, OLMoTokenizerFast) + ): # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index 211c43887..4e917ff77 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -650,7 +650,7 @@ def main(args: FlatArguments): 0, 1, ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." 
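Both training scripts gate the optional import behind `check_hf_olmo_availability()`. One wrinkle worth noting: at this point in the series the helper's `return_version` flag still defaults to `True`, so a bare call returns an always-truthy dict; the later "fixes" commit flips the default to `False`, which is what makes boolean guards like these behave as intended. A sketch of the two calling conventions (illustrative):

```python
# Sketch: consuming the availability helper from open_instruct.utils.
from open_instruct.utils import check_hf_olmo_availability

if check_hf_olmo_availability(return_version=False):  # plain bool
    import hf_olmo  # noqa: F401  # registers OLMo with the Auto classes

info = check_hf_olmo_availability(return_version=True)  # diagnostic dict
print(info["available"], info["version"])
```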
- elif isinstance(tokenizer, GPTNeoXTokenizerFast) or isinstance(tokenizer, OLMoTokenizerFast): # noqa + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or isinstance(tokenizer, OLMoTokenizerFast): # noqa # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token From 50500ea5f6f995b18e791242649cb6cb14044319 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Mon, 9 Sep 2024 22:13:51 +0000 Subject: [PATCH 07/23] fixes --- open_instruct/finetune.py | 4 ++-- open_instruct/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index 4e917ff77..da6398f94 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -650,7 +650,7 @@ def main(args: FlatArguments): 0, 1, ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." - elif isinstance(tokenizer, GPTNeoXTokenizerFast) or isinstance(tokenizer, OLMoTokenizerFast): # noqa + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or (check_hf_olmo_availability() and isinstance(tokenizer, OLMoTokenizerFast)): # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token @@ -1015,7 +1015,7 @@ def main(args: FlatArguments): if is_beaker_job() and accelerator.is_main_process: # dpo script only supports these two options right now for datasets if args.dataset_mixer: - dataset_list = args.dataset_mixer.keys() + dataset_list = list(args.dataset_mixer.keys()) elif args.dataset_mixer_list: dataset_list = args.dataset_mixer_list[::2] # even indices elif args.dataset_name: diff --git a/open_instruct/utils.py b/open_instruct/utils.py index 08046e17e..c45e5873d 100644 --- a/open_instruct/utils.py +++ b/open_instruct/utils.py @@ -54,7 +54,7 @@ # ---------------------------------------------------------------------------- # Import utilities -def check_hf_olmo_availability(return_version: bool = True) -> Union[dict, bool]: +def check_hf_olmo_availability(return_version: bool = False) -> Union[dict, bool]: pkg_name = "hf_olmo" # Check if the package spec exists From 5eb61ccb38da34d43f40d02c97dcc6741e047c06 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Mon, 9 Sep 2024 23:16:00 +0000 Subject: [PATCH 08/23] add weka mounting --- configs/train_configs/sft/olmo_7b_0924.yaml | 10 ++++++---- scripts/submit_finetune_job.py | 12 +++++++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml index 317e106d9..74bd932c2 100644 --- a/configs/train_configs/sft/olmo_7b_0924.yaml +++ b/configs/train_configs/sft/olmo_7b_0924.yaml @@ -1,10 +1,12 @@ -model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +model_name_or_path: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf model_revision: main use_flash_attn: false -tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +tokenizer_name: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf use_slow_tokenizer: false # olmo models only use fast tokenizers -dataset_name: 
allenai/tulu-v2-sft-mixture-olmo-2048 -max_seq_length: 2048 +dataset_name: allenai/llama-3-tulu-v3.3-mix-preview +max_seq_length: 4096 preprocessing_num_workers: 128 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs gradient_accumulation_steps: 16 diff --git a/scripts/submit_finetune_job.py b/scripts/submit_finetune_job.py index ef4e87138..37df9509a 100644 --- a/scripts/submit_finetune_job.py +++ b/scripts/submit_finetune_job.py @@ -25,6 +25,8 @@ def main(): parser.add_argument("--num_nodes", type=int, default=1, help="Number of nodes to use") parser.add_argument("--image", type=str, default="nathanl/open_instruct_auto", help="Beaker image to use.") parser.add_argument("--workspace", type=str, default="ai2/tulu-2-improvements", help="Beaker workspace to use.") + parser.add_argument("--mount_on_weka", type=str, default=None, help="Mount a Weka directory to the job") + parser.add_argument("--weka_mount_path", type=str, default="/models", help="Path to mount the Weka directory") # allow unknown args from CLI, use this to modify loaded config in bash scripts for sweeping # Note, can only override args in --config passed (not default FlatArguments class in open_instruct/utils.py) @@ -166,7 +168,7 @@ def parse_args(args): d['tasks'][0]['arguments'][0] = new_arguments # name and description - exp_name = f"open_instruct_finetune_{model_name}_{now}" + exp_name = f"open_instruct_finetune_{model_name}_{now}"[:128] d['description'] = exp_name d['tasks'][0]['name'] = exp_name @@ -220,6 +222,14 @@ def parse_args(args): d['tasks'][0]['envVars'].append({ 'name': 'WANDB_API_KEY', 'secret': f"{beaker_whoami}_WANDB_API_KEY" }) + + # Weka setting + if args.mount_on_weka: + if d['tasks'][0].get('datasets') is None: + d['tasks'][0]['datasets'] = [] + d['tasks'][0]['datasets'].append({ + 'mountPath': f"{args.weka_mount_path}", 'source': {'weka': f"{args.mount_on_weka}"} + }) # optionally, print to debug config print(d) From b1344102cd0738e0579d6623c32793cb3dd02f4f Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Tue, 10 Sep 2024 00:46:57 +0000 Subject: [PATCH 09/23] up --- configs/train_configs/sft/olmo_7b_0924.yaml | 8 +++---- .../train_configs/sft/olmo_7b_0924_fw2.yaml | 24 +++++++++++++++++++ scripts/submit_finetune_job.py | 4 ++-- 3 files changed, 30 insertions(+), 6 deletions(-) create mode 100644 configs/train_configs/sft/olmo_7b_0924_fw2.yaml diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml index 74bd932c2..91d72a1ec 100644 --- a/configs/train_configs/sft/olmo_7b_0924.yaml +++ b/configs/train_configs/sft/olmo_7b_0924.yaml @@ -1,21 +1,21 @@ # model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan -model_name_or_path: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf +model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf model_revision: main use_flash_attn: false # tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan -tokenizer_name: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf +tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf use_slow_tokenizer: false # olmo models only use fast tokenizers dataset_name: allenai/llama-3-tulu-v3.3-mix-preview max_seq_length: 4096 
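# note (illustrative arithmetic): effective batch size = per_device_train_batch_size
# x data-parallel GPUs x gradient_accumulation_steps, so 1 x 8 x 16 = 128 sequences
# per update on a single 8-GPU node; the multi-node configs below use
# gradient_accumulation_steps: 4 so that 1 x 32 x 4 = 128 stays constant on 4 nodes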
preprocessing_num_workers: 128 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 16 +gradient_accumulation_steps: 8 learning_rate: 2.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 weight_decay: 0.0 num_train_epochs: 3 -output_dir: output/olmo_instruct/ +output_dir: /output/olmo_instruct/ with_tracking: true report_to: - wandb diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2.yaml new file mode 100644 index 000000000..ff376aebf --- /dev/null +++ b/configs/train_configs/sft/olmo_7b_0924_fw2.yaml @@ -0,0 +1,24 @@ +# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf +model_revision: main +use_flash_attn: false +# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf +use_slow_tokenizer: false # olmo models only use fast tokenizers +dataset_name: allenai/llama-3-tulu-v3.3-mix-preview +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 4 # designed for 4 nodes +learning_rate: 2.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 3 +output_dir: /output/olmo_instruct/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +add_bos: true \ No newline at end of file diff --git a/scripts/submit_finetune_job.py b/scripts/submit_finetune_job.py index 37df9509a..9991003f8 100644 --- a/scripts/submit_finetune_job.py +++ b/scripts/submit_finetune_job.py @@ -26,7 +26,7 @@ def main(): parser.add_argument("--image", type=str, default="nathanl/open_instruct_auto", help="Beaker image to use.") parser.add_argument("--workspace", type=str, default="ai2/tulu-2-improvements", help="Beaker workspace to use.") parser.add_argument("--mount_on_weka", type=str, default=None, help="Mount a Weka directory to the job") - parser.add_argument("--weka_mount_path", type=str, default="/models", help="Path to mount the Weka directory") + parser.add_argument("--weka_mount_path", type=str, default="/adapt-data", help="Path to mount the Weka directory") # allow unknown args from CLI, use this to modify loaded config in bash scripts for sweeping # Note, can only override args in --config passed (not default FlatArguments class in open_instruct/utils.py) @@ -173,7 +173,7 @@ def parse_args(args): d['tasks'][0]['name'] = exp_name # add cluster-specific env vars - if args.cluster == "ai2/jupiter-cirrascale-2": + if args.cluster == "ai2/jupiter-cirrascale-2" and args.num_nodes > 1: d['tasks'][0]['envVars'] += [ { "name": "NCCL_SOCKET_IFNAME", From d007da01bd5d75489fdf92d8d73c9c792eea3dc0 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Tue, 10 Sep 2024 16:07:33 +0000 Subject: [PATCH 10/23] add hardcode flash_attn --- open_instruct/finetune.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index da6398f94..f0aa3cab2 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -616,6 +616,7 @@ def main(args: FlatArguments): trust_remote_code=args.trust_remote_code, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" if args.use_flash_attn else 
"eager", + flash_attention=True if args.use_flash_attn else False, # TODO remove with ai2-olmo > 0.5.0 revision=args.model_revision, token=os.getenv("HF_TOKEN", None), ) @@ -628,6 +629,7 @@ def main(args: FlatArguments): low_cpu_mem_usage=args.low_cpu_mem_usage, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", + flash_attention=True if args.use_flash_attn else False, # TODO remove with ai2-olmo > 0.5.0 revision=args.model_revision, token=os.getenv("HF_TOKEN", None), ) From 5d82ea2bd5293ab19c7c8346aba20031c01f5b6e Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Tue, 10 Sep 2024 16:28:16 +0000 Subject: [PATCH 11/23] tweaks --- open_instruct/finetune.py | 40 +++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index f0aa3cab2..b2e9ed283 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -453,7 +453,7 @@ def main(args: FlatArguments): if check_hf_olmo_availability(): # allows AutoModel... to work with not in transformers olmo models import hf_olmo # noqa - from hf_olmo import OLMoTokenizerFast + from hf_olmo import OLMoTokenizerFast, OLMoConfig # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers @@ -616,23 +616,35 @@ def main(args: FlatArguments): trust_remote_code=args.trust_remote_code, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", - flash_attention=True if args.use_flash_attn else False, # TODO remove with ai2-olmo > 0.5.0 revision=args.model_revision, token=os.getenv("HF_TOKEN", None), ) else: - model = AutoModelForCausalLM.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - trust_remote_code=args.trust_remote_code, - low_cpu_mem_usage=args.low_cpu_mem_usage, - torch_dtype=torch.bfloat16, - attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", - flash_attention=True if args.use_flash_attn else False, # TODO remove with ai2-olmo > 0.5.0 - revision=args.model_revision, - token=os.getenv("HF_TOKEN", None), - ) + if (check_hf_olmo_availability() and isinstance(config, OLMoConfig)): + # handles flash_attn in config. 
TODO remove on ai2-olmo > 0.5.0 + config.flash_attention = args.use_flash_attn + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + trust_remote_code=args.trust_remote_code, + low_cpu_mem_usage=args.low_cpu_mem_usage, + torch_dtype=torch.bfloat16, + revision=args.model_revision, + token=os.getenv("HF_TOKEN", None), + ) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + trust_remote_code=args.trust_remote_code, + low_cpu_mem_usage=args.low_cpu_mem_usage, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", + revision=args.model_revision, + token=os.getenv("HF_TOKEN", None), + ) else: logger.info("Training new model from scratch") model = AutoModelForCausalLM.from_config(config) From 11397bf76e67d718da72ecd77856722315db338e Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Tue, 10 Sep 2024 17:07:05 +0000 Subject: [PATCH 12/23] making it work nicely --- configs/train_configs/sft/olmo_7b_0924.yaml | 2 +- .../train_configs/sft/olmo_7b_0924_fw2.yaml | 2 +- .../sft/olmo_7b_0924_fw2_permissive.yaml | 31 +++++++++++++++++++ open_instruct/finetune.py | 9 ++++-- open_instruct/utils.py | 3 +- 5 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml index 91d72a1ec..b08a6ac2a 100644 --- a/configs/train_configs/sft/olmo_7b_0924.yaml +++ b/configs/train_configs/sft/olmo_7b_0924.yaml @@ -1,7 +1,7 @@ # model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf model_revision: main -use_flash_attn: false +use_flash_attn: true # tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf use_slow_tokenizer: false # olmo models only use fast tokenizers diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2.yaml index ff376aebf..3a40cd915 100644 --- a/configs/train_configs/sft/olmo_7b_0924_fw2.yaml +++ b/configs/train_configs/sft/olmo_7b_0924_fw2.yaml @@ -1,7 +1,7 @@ # model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf model_revision: main -use_flash_attn: false +use_flash_attn: true # tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf use_slow_tokenizer: false # olmo models only use fast tokenizers diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml new file mode 100644 index 000000000..72d3637b6 --- /dev/null +++ b/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml @@ -0,0 +1,31 @@ +# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +model_name_or_path: 
/adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf +model_revision: main +use_flash_attn: true +# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf +use_slow_tokenizer: false # olmo models only use fast tokenizers +dataset_mixer: + ai2-adapt-dev/metamath-qa-reformat: 1.0 # MIT License + nvidia/Daring-Anteater: 1.0 # CC BY 4.0, + natolambert/tulu-v2-sft-mixture-flan: 1.0 # FLAN Apache 2.0 + natolambert/tulu-v2-sft-mixture-cot: 1.0 # FLAN Apache 2.0 + Open-Orca/OpenOrca: .02 # MIT + allenai/openassistant-guanaco-reformatted: 1.0 # Apache 2.0 + ai2-adapt-dev/codefeedback-single-turn-reformat-magicoder: 1.0 # MIT MagiCoder section of CodeFeedback +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 4 # designed for 4 nodes +learning_rate: 2.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 3 +output_dir: /output/olmo_instruct/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +add_bos: true \ No newline at end of file diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index b2e9ed283..5842e6811 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -453,7 +453,7 @@ def main(args: FlatArguments): if check_hf_olmo_availability(): # allows AutoModel... to work with not in transformers olmo models import hf_olmo # noqa - from hf_olmo import OLMoTokenizerFast, OLMoConfig + from hf_olmo import OLMoConfig, OLMoTokenizerFast # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers @@ -620,7 +620,8 @@ def main(args: FlatArguments): token=os.getenv("HF_TOKEN", None), ) else: - if (check_hf_olmo_availability() and isinstance(config, OLMoConfig)): + if check_hf_olmo_availability() and isinstance(config, OLMoConfig): + logger.info("Temporary loading for recent OLMo Models") # handles flash_attn in config. TODO remove on ai2-olmo > 0.5.0 config.flash_attention = args.use_flash_attn model = AutoModelForCausalLM.from_pretrained( @@ -664,7 +665,9 @@ def main(args: FlatArguments): 0, 1, ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." 
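The hunk below only reformats this guard across multiple lines; its correctness rests on `and` short-circuiting, since `OLMoTokenizerFast` exists only when `hf_olmo` is importable. A standalone sketch of the pattern (illustrative; `is_neox_or_olmo_tokenizer` is a hypothetical wrapper around the real helper in `open_instruct.utils`):

```python
# Sketch: isinstance() against a class that may not be importable.
from transformers import GPTNeoXTokenizerFast

from open_instruct.utils import check_hf_olmo_availability


def is_neox_or_olmo_tokenizer(tokenizer) -> bool:
    if isinstance(tokenizer, GPTNeoXTokenizerFast):
        return True
    if check_hf_olmo_availability():
        from hf_olmo import OLMoTokenizerFast  # only reached when installed
        return isinstance(tokenizer, OLMoTokenizerFast)
    return False
```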
- elif isinstance(tokenizer, GPTNeoXTokenizerFast) or (check_hf_olmo_availability() and isinstance(tokenizer, OLMoTokenizerFast)): + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or ( + check_hf_olmo_availability() and isinstance(tokenizer, OLMoTokenizerFast) + ): # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token diff --git a/open_instruct/utils.py b/open_instruct/utils.py index c45e5873d..b7fd1ae1a 100644 --- a/open_instruct/utils.py +++ b/open_instruct/utils.py @@ -70,10 +70,9 @@ def check_hf_olmo_availability(return_version: bool = False) -> Union[dict, bool try: package = importlib.import_module(pkg_name) package_version = getattr(package, "__version__", "N/A") - if package_version == "N/A": - package_exists = False except ImportError: package_exists = False + package_version = "N/A" if return_version: return { From ee5c7d22cce9aebfc46f5ab35d7eaa3f47e5d944 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Wed, 11 Sep 2024 22:13:59 +0000 Subject: [PATCH 13/23] clean --- Dockerfile.olmo | 3 +- .../default_finetune_offloading.yaml | 69 +++++++++++++++++++ configs/train_configs/sft/olmo_7b_0924.yaml | 10 ++- .../train_configs/sft/olmo_7b_0924_fw2.yaml | 24 ------- .../sft/olmo_7b_0924_fw2_permissive.yaml | 13 ++-- open_instruct/finetune.py | 52 +++++++------- 6 files changed, 110 insertions(+), 61 deletions(-) create mode 100644 configs/beaker_configs/default_finetune_offloading.yaml delete mode 100644 configs/train_configs/sft/olmo_7b_0924_fw2.yaml diff --git a/Dockerfile.olmo b/Dockerfile.olmo index 6259756ae..1f8da727d 100644 --- a/Dockerfile.olmo +++ b/Dockerfile.olmo @@ -93,7 +93,8 @@ RUN pip install flash-attn==2.5.9.post1 --no-build-isolation # for newest olmo's, move to requirements when ai2-olmo supports torch 2.4 # core is a dependency of ai2-olmo RUN pip install ai2-olmo-core==0.1.0 omegaconf -RUN pip install ai2-olmo>=0.5.0 --no-deps +# RUN pip install ai2-olmo>=0.5.0 --no-deps +RUN pip install git+https://github.com/allenai/OLMo.git@shanea/hf-olmo-gradient-checkpointing --no-deps RUN pip install -r requirements-olmo.txt # NLTK download diff --git a/configs/beaker_configs/default_finetune_offloading.yaml b/configs/beaker_configs/default_finetune_offloading.yaml new file mode 100644 index 000000000..4722b4e13 --- /dev/null +++ b/configs/beaker_configs/default_finetune_offloading.yaml @@ -0,0 +1,69 @@ +version: v2 +description: open-instruct-finetune +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: ['PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 1 + --num_processes 4 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_offloading_accelerate.conf + open_instruct/finetune.py + --model_name_or_path /hf_llama_models + --use_flash_attn + --tokenizer_name /hf_llama_models + --max_seq_length 2048 + --preprocessing_num_workers 16 + --per_device_train_batch_size 2 + --gradient_accumulation_steps 16 + --learning_rate 2e-5 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. 
+ --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: HF_TOKEN + # datasets: # example for how to include datasets in mounting + # - mountPath: /data + # source: + # beaker: Yizhongw03/processed_open_instruct_data + # - mountPath: /mmlu + # source: + # beaker: Yizhongw03/mmlu + # - mountPath: /hf_llama_models + # source: + # beaker: Yizhongw03/hf_llama_model_7B + result: + path: /output + resources: + gpuCount: 4 + context: + cluster: ai2/allennlp-cirrascale + priority: high + preemptible: false \ No newline at end of file diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml index b08a6ac2a..7f62e1dca 100644 --- a/configs/train_configs/sft/olmo_7b_0924.yaml +++ b/configs/train_configs/sft/olmo_7b_0924.yaml @@ -1,15 +1,13 @@ -# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan -model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf +model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan model_revision: main use_flash_attn: true -# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan -tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf +tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan use_slow_tokenizer: false # olmo models only use fast tokenizers dataset_name: allenai/llama-3-tulu-v3.3-mix-preview max_seq_length: 4096 preprocessing_num_workers: 128 -per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 8 +per_device_train_batch_size: 1 +gradient_accumulation_steps: 8 # should run with this set to 16 for 1 node only learning_rate: 2.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2.yaml deleted file mode 100644 index 3a40cd915..000000000 --- a/configs/train_configs/sft/olmo_7b_0924_fw2.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan -model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf -model_revision: main -use_flash_attn: true -# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan -tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf -use_slow_tokenizer: false # olmo models only use fast tokenizers -dataset_name: allenai/llama-3-tulu-v3.3-mix-preview -max_seq_length: 4096 -preprocessing_num_workers: 128 -per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 4 # designed for 4 nodes -learning_rate: 2.0e-06 -lr_scheduler_type: linear -warmup_ratio: 0.03 -weight_decay: 0.0 -num_train_epochs: 3 -output_dir: /output/olmo_instruct/ -with_tracking: true -report_to: - - wandb -logging_steps: 1 
-checkpointing_steps: epoch -add_bos: true \ No newline at end of file diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml index 72d3637b6..4539f713a 100644 --- a/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml +++ b/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml @@ -7,22 +7,27 @@ tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from use_slow_tokenizer: false # olmo models only use fast tokenizers dataset_mixer: ai2-adapt-dev/metamath-qa-reformat: 1.0 # MIT License - nvidia/Daring-Anteater: 1.0 # CC BY 4.0, natolambert/tulu-v2-sft-mixture-flan: 1.0 # FLAN Apache 2.0 natolambert/tulu-v2-sft-mixture-cot: 1.0 # FLAN Apache 2.0 - Open-Orca/OpenOrca: .02 # MIT allenai/openassistant-guanaco-reformatted: 1.0 # Apache 2.0 ai2-adapt-dev/codefeedback-single-turn-reformat-magicoder: 1.0 # MIT MagiCoder section of CodeFeedback + ai2-adapt-dev/aya_dataset-reformat: 1.0 # Apache 2.0 + ai2-adapt-dev/SlimOrca-reformat: 0.25 # MIT License + ai2-adapt-dev/Daring-Anteater-reformat: 1.0 # CC BY 4.0 + ai2-adapt-dev/WebInstructSub-reformat-apache: 0.1 # Apache 2.0 + ai2-adapt-dev/Table-GPT-All-train: 0.5 # MIT max_seq_length: 4096 preprocessing_num_workers: 128 -per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +per_device_train_batch_size: 1 gradient_accumulation_steps: 4 # designed for 4 nodes +# gradient_accumulation_steps: 16 # designed for 1 nodes +gradient_checkpointing: true learning_rate: 2.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 weight_decay: 0.0 num_train_epochs: 3 -output_dir: /output/olmo_instruct/ +output_dir: /output/ with_tracking: true report_to: - wandb diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index 5842e6811..f771790f8 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -620,32 +620,32 @@ def main(args: FlatArguments): token=os.getenv("HF_TOKEN", None), ) else: - if check_hf_olmo_availability() and isinstance(config, OLMoConfig): - logger.info("Temporary loading for recent OLMo Models") - # handles flash_attn in config. TODO remove on ai2-olmo > 0.5.0 - config.flash_attention = args.use_flash_attn - model = AutoModelForCausalLM.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - trust_remote_code=args.trust_remote_code, - low_cpu_mem_usage=args.low_cpu_mem_usage, - torch_dtype=torch.bfloat16, - revision=args.model_revision, - token=os.getenv("HF_TOKEN", None), - ) - else: - model = AutoModelForCausalLM.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - trust_remote_code=args.trust_remote_code, - low_cpu_mem_usage=args.low_cpu_mem_usage, - torch_dtype=torch.bfloat16, - attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", - revision=args.model_revision, - token=os.getenv("HF_TOKEN", None), - ) + # if check_hf_olmo_availability() and isinstance(config, OLMoConfig): + # logger.info("Temporary loading for recent OLMo Models") + # # handles flash_attn in config. 
TODO remove on ai2-olmo > 0.5.0 + # config.flash_attention = args.use_flash_attn + # model = AutoModelForCausalLM.from_pretrained( + # args.model_name_or_path, + # from_tf=bool(".ckpt" in args.model_name_or_path), + # config=config, + # trust_remote_code=args.trust_remote_code, + # low_cpu_mem_usage=args.low_cpu_mem_usage, + # torch_dtype=torch.bfloat16, + # revision=args.model_revision, + # token=os.getenv("HF_TOKEN", None), + # ) + # else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + trust_remote_code=args.trust_remote_code, + low_cpu_mem_usage=args.low_cpu_mem_usage, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", + revision=args.model_revision, + token=os.getenv("HF_TOKEN", None), + ) else: logger.info("Training new model from scratch") model = AutoModelForCausalLM.from_config(config) From fd16aeef6e97de44b938b424db5ccab460b53233 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Wed, 11 Sep 2024 22:14:44 +0000 Subject: [PATCH 14/23] clean --- open_instruct/finetune.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index f771790f8..488d4e551 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -453,7 +453,7 @@ def main(args: FlatArguments): if check_hf_olmo_availability(): # allows AutoModel... to work with not in transformers olmo models import hf_olmo # noqa - from hf_olmo import OLMoConfig, OLMoTokenizerFast + from hf_olmo import OLMoTokenizerFast # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers @@ -620,21 +620,6 @@ def main(args: FlatArguments): token=os.getenv("HF_TOKEN", None), ) else: - # if check_hf_olmo_availability() and isinstance(config, OLMoConfig): - # logger.info("Temporary loading for recent OLMo Models") - # # handles flash_attn in config. 
From fa447f8e8c3e6ef0e6cfe6958398e6187054d203 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Wed, 11 Sep 2024 22:16:51 +0000
Subject: [PATCH 15/23] clean

---
 Dockerfile | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 9f4e1fb00..dd6b95a97 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -90,10 +90,6 @@ RUN pip install --upgrade pip "setuptools<70.0.0" wheel
 RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
 RUN pip install packaging
 RUN pip install flash-attn==2.6.3 --no-build-isolation
-# for newest olmo's, move to requirements when ai2-olmo supports torch 2.4
-# core is a dependency of ai2-olmo
-# RUN pip install ai2-olmo-core==0.1.0 omegaconf
-# RUN pip install ai2-olmo>=0.5.0 --no-deps
 RUN pip install -r requirements.txt

 # NLTK download

From e02e98547eb0f929837002c7191e9ee29f581117 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Wed, 11 Sep 2024 22:26:17 +0000
Subject: [PATCH 16/23] up

---
 Dockerfile.olmo | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile.olmo b/Dockerfile.olmo
index 1f8da727d..040abad5c 100644
--- a/Dockerfile.olmo
+++ b/Dockerfile.olmo
@@ -94,6 +94,7 @@ RUN pip install flash-attn==2.5.9.post1 --no-build-isolation
 # core is a dependency of ai2-olmo
 RUN pip install ai2-olmo-core==0.1.0 omegaconf
 # RUN pip install ai2-olmo>=0.5.0 --no-deps
+# TODO: update once https://github.com/allenai/OLMo/pull/719 is merged, then pin the next release
 RUN pip install git+https://github.com/allenai/OLMo.git@shanea/hf-olmo-gradient-checkpointing --no-deps
 RUN pip install -r requirements-olmo.txt

From a0a32bf0b4f73d3756b767d38c947dbf9a422cdb Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Wed, 11 Sep 2024 23:15:32 +0000
Subject: [PATCH 17/23] no longer install from branch

---
 Dockerfile.olmo | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.olmo b/Dockerfile.olmo
index 040abad5c..10e2d9f12 100644
--- a/Dockerfile.olmo
+++ b/Dockerfile.olmo
@@ -95,7 +95,7 @@ RUN pip install flash-attn==2.5.9.post1 --no-build-isolation
 RUN pip install ai2-olmo-core==0.1.0 omegaconf
 # RUN pip install ai2-olmo>=0.5.0 --no-deps
 # TODO: update once https://github.com/allenai/OLMo/pull/719 is merged, then pin the next release
-RUN pip install git+https://github.com/allenai/OLMo.git@shanea/hf-olmo-gradient-checkpointing --no-deps
+RUN pip install git+https://github.com/allenai/OLMo.git@47f8f5abb40eb100c6623be12e1648c841b2ab99 --no-deps
 RUN pip install -r requirements-olmo.txt

 # NLTK download
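Pinning the OLMo install to a commit SHA with `--no-deps` keeps the image reproducible, but it is easy for the pins to drift out of sync with the rest of the environment. A small, hypothetical startup check mirroring the pins in Dockerfile.olmo (this helper is not part of the repo):

```python
# Hypothetical sanity check that pinned packages are present at runtime.
from importlib.metadata import PackageNotFoundError, version

def assert_installed(pkg, expected=None):
    try:
        found = version(pkg)
    except PackageNotFoundError as err:
        raise RuntimeError(f"{pkg} is not installed") from err
    if expected is not None and found != expected:
        raise RuntimeError(f"{pkg}=={found}, expected {expected}")

assert_installed("ai2-olmo-core", "0.1.0")
assert_installed("ai2-olmo")  # installed from a pinned git SHA; just check presence
```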
From 503e61e7d4d3b0eabd29c2015ac1fbe491281333 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Mon, 16 Sep 2024 19:49:46 +0000
Subject: [PATCH 18/23] fixes

---
 configs/train_configs/sft/olmo_7b_0924.yaml |  2 +-
 .../sft/olmo_7b_0924_fw2_tulu_v3.4.yaml     | 26 +++++++++++++++++++
 open_instruct/mix_data.py                   |  6 +++--
 3 files changed, 31 insertions(+), 3 deletions(-)
 create mode 100644 configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml

diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml
index 7f62e1dca..e8264bd0a 100644
--- a/configs/train_configs/sft/olmo_7b_0924.yaml
+++ b/configs/train_configs/sft/olmo_7b_0924.yaml
@@ -13,7 +13,7 @@ lr_scheduler_type: linear
 warmup_ratio: 0.03
 weight_decay: 0.0
 num_train_epochs: 3
-output_dir: /output/olmo_instruct/
+output_dir: /output/
 with_tracking: true
 report_to:
   - wandb
diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml
new file mode 100644
index 000000000..3efd900c6
--- /dev/null
+++ b/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml
@@ -0,0 +1,26 @@
+# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
+model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf
+model_revision: main
+use_flash_attn: true
+# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
+tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf
+use_slow_tokenizer: false # olmo models only use fast tokenizers
+dataset_name: allenai/tulu-v3.4-mix-preview
+max_seq_length: 4096
+preprocessing_num_workers: 128
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 4 # designed for 4 nodes
+# gradient_accumulation_steps: 16 # designed for 1 node
+gradient_checkpointing: true
+learning_rate: 2.0e-06
+lr_scheduler_type: linear
+warmup_ratio: 0.03
+weight_decay: 0.0
+num_train_epochs: 3
+output_dir: /output/
+with_tracking: true
+report_to:
+  - wandb
+logging_steps: 1
+checkpointing_steps: epoch
+add_bos: true
\ No newline at end of file
diff --git a/open_instruct/mix_data.py b/open_instruct/mix_data.py
index 6e7260c8f..c34bf0274 100644
--- a/open_instruct/mix_data.py
+++ b/open_instruct/mix_data.py
@@ -15,11 +15,13 @@
 # limitations under the License.

 # script for mixing and saving data
-from .utils import ArgumentParserPlus, FlatArguments, get_datasets
+from open_instruct.utils import ArgumentParserPlus, get_datasets
+from open_instruct.finetune import FlatArguments

 # Run as module for local imports, e.g.:
-# python -m open_instruct.mix_data configs/train_configs/sft/default.yaml --dataset_mix_dir=output/tmp/
+# python open_instruct/mix_data.py configs/train_configs/sft/tulu3_8b_preview_mix_v3.4.yaml --dataset_mix_dir=output/tmp/
 # can pass --save_to_hub=allenai/tulu-v3.1-mix-preview-4096-OLMoE
+# note that '=' is needed with our argparser


 def main():
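The commented `gradient_accumulation_steps` variants in the new config trade node count against accumulation so the global batch size stays fixed. Assuming 8 GPUs per node, as the earlier config comments note, the arithmetic checks out:

```python
# Global batch = per-device batch * accumulation steps * total GPUs;
# assumes 8 GPUs per node, per the comments in these configs.
def global_batch_size(per_device, accum, nodes, gpus_per_node=8):
    return per_device * accum * nodes * gpus_per_node

assert global_batch_size(1, 4, nodes=4) == 128   # 4-node setting
assert global_batch_size(1, 16, nodes=1) == 128  # 1-node setting, same global batch
```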
From 982726435268af12ade94d8713a39e747e0a413b Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Tue, 24 Sep 2024 00:52:04 +0000
Subject: [PATCH 19/23] dpo config

---
 configs/train_configs/dpo/olmo_7b_0924.yaml | 29 +++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 configs/train_configs/dpo/olmo_7b_0924.yaml

diff --git a/configs/train_configs/dpo/olmo_7b_0924.yaml b/configs/train_configs/dpo/olmo_7b_0924.yaml
new file mode 100644
index 000000000..3028bfca8
--- /dev/null
+++ b/configs/train_configs/dpo/olmo_7b_0924.yaml
@@ -0,0 +1,29 @@
+model_name_or_path: /model
+model_revision: main
+use_flash_attn: true
+gradient_checkpointing: true
+dataset_mixer:
+  allenai/ultrafeedback_binarized_cleaned_train: 1.0
+  ai2-adapt-dev/DaringAnteater-prefs-RM-filter: 1.0
+  ai2-adapt-dev/WildChat-prefs-280824: 1.0
+  allenai/tulu-3-hardcoded-preferences: 1.0
+tokenizer_name: /model
+use_slow_tokenizer: true
+max_seq_length: 2048
+preprocessing_num_workers: 16
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 16 # designed for 8 GPUs, so batch size 128
+learning_rate: 5.0e-7
+lr_scheduler_type: linear
+warmup_ratio: 0.1
+weight_decay: 0.0
+num_train_epochs: 1
+output_dir: /output
+with_tracking: true
+report_to:
+  - wandb
+logging_steps: 1
+use_lora: false
+dpo_loss_type: dpo_norm
+dpo_beta: 5
+checkpointing_steps: 1000
\ No newline at end of file
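`dpo_loss_type: dpo_norm` with `dpo_beta: 5` selects a normalized variant of the DPO objective. A minimal sketch of the standard DPO loss this is built on; reading `dpo_norm` as "length-normalized log-probs" is my assumption, not something this patch states:

```python
# Minimal DPO loss sketch; inputs are the summed (or, for the assumed "norm"
# variant, length-normalized) log-probs of each response under policy and reference.
import torch
import torch.nn.functional as F

def dpo_loss(pi_chosen, pi_rejected, ref_chosen, ref_rejected, beta=5.0):
    chosen_reward = beta * (pi_chosen - ref_chosen)
    rejected_reward = beta * (pi_rejected - ref_rejected)
    # push the implicit reward of the chosen response above the rejected one
    return -F.logsigmoid(chosen_reward - rejected_reward).mean()

loss = dpo_loss(torch.tensor([-1.0]), torch.tensor([-2.0]),
                torch.tensor([-1.1]), torch.tensor([-1.9]))
```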
From b175717e14ea64506485f7261cc1d02d921d5c89 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Tue, 24 Sep 2024 16:30:10 +0000
Subject: [PATCH 20/23] temp olmo changes

---
 Dockerfile.olmo                                           | 4 ++++
 configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml | 4 +++-
 scripts/eval/oe-eval.sh                                   | 5 ++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.olmo b/Dockerfile.olmo
index 10e2d9f12..40ef4377a 100644
--- a/Dockerfile.olmo
+++ b/Dockerfile.olmo
@@ -98,6 +98,10 @@ RUN pip install ai2-olmo-core==0.1.0 omegaconf
 RUN pip install git+https://github.com/allenai/OLMo.git@47f8f5abb40eb100c6623be12e1648c841b2ab99 --no-deps
 RUN pip install -r requirements-olmo.txt

+RUN pip install git+https://github.com/AkshitaB/vllm.git
+RUN pip install vllm-flash-attn
+
+
 # NLTK download
 RUN python -m nltk.downloader punkt
 COPY open_instruct open_instruct
diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml
index 3efd900c6..491fc4502 100644
--- a/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml
+++ b/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml
@@ -9,7 +9,8 @@ dataset_name: allenai/tulu-v3.4-mix-preview
 max_seq_length: 4096
 preprocessing_num_workers: 128
 per_device_train_batch_size: 1
-gradient_accumulation_steps: 4 # designed for 4 nodes
+# gradient_accumulation_steps: 4 # designed for 4 nodes
+gradient_accumulation_steps: 8 # designed for 2 nodes
 # gradient_accumulation_steps: 16 # designed for 1 node
 gradient_checkpointing: true
 learning_rate: 2.0e-06
@@ -19,6 +20,7 @@ weight_decay: 0.0
 num_train_epochs: 3
 output_dir: /output/
 with_tracking: true
+reduce_loss: mean
 report_to:
   - wandb
 logging_steps: 1
diff --git a/scripts/eval/oe-eval.sh b/scripts/eval/oe-eval.sh
index e250faaef..3cb3a68e1 100755
--- a/scripts/eval/oe-eval.sh
+++ b/scripts/eval/oe-eval.sh
@@ -92,7 +92,10 @@ TASKS=(
     "alpaca_eval_v2::tulu"
     "truthfulqa::tulu"
 )
-MODEL_TYPE="--model-type vllm"
+# For models without vLLM support (experimental architectures), comment out
+# the vLLM model type below and set GPU_COUNT_OTHER to 1; also consider
+# lowering the batch size (a vLLM arg), maybe to 5, since vLLM handles batching differently
+# MODEL_TYPE="--model-type vllm"
 BATCH_SIZE_VLLM=10000
 BATCH_SIZE_OTHER=1
 GPU_COUNT=1
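The `reduce_loss: mean` flag added above chooses how token losses are aggregated: "mean" averages over non-padding tokens, while a "sum"-style reduction weights long sequences more heavily. A hedged sketch of that distinction, under the usual transformers labeling convention; this is not the repo's exact training loop:

```python
# Sketch of mean vs. sum loss reduction over non-padding tokens;
# labels == -100 marks tokens to ignore, as in transformers.
import torch
import torch.nn.functional as F

def masked_lm_loss(logits, labels, reduce_loss="mean"):
    per_token = F.cross_entropy(
        logits.view(-1, logits.size(-1)), labels.view(-1),
        ignore_index=-100, reduction="none",
    )
    mask = (labels.view(-1) != -100).float()
    total = (per_token * mask).sum()
    return total / mask.sum() if reduce_loss == "mean" else total
```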
From 281e28eb245648dc9f3dc4b38c664d1f0f06f2f7 Mon Sep 17 00:00:00 2001
From: nouhadziri
Date: Mon, 4 Nov 2024 15:05:34 -0500
Subject: [PATCH 21/23] add olmo training

---
 .../sft/{ => olmo}/olmo_7b_0924.yaml        |  0
 .../sft/olmo/olmo_7b_0924_v3.9_safety.yaml  | 24 +++++++++++++++++++
 open_instruct/olmo/scripts/sft/olmo_test.sh |  7 ++++++
 3 files changed, 31 insertions(+)
 rename configs/train_configs/sft/{ => olmo}/olmo_7b_0924.yaml (100%)
 create mode 100644 configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml
 create mode 100644 open_instruct/olmo/scripts/sft/olmo_test.sh

diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo/olmo_7b_0924.yaml
similarity index 100%
rename from configs/train_configs/sft/olmo_7b_0924.yaml
rename to configs/train_configs/sft/olmo/olmo_7b_0924.yaml
diff --git a/configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml b/configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml
new file mode 100644
index 000000000..49f926e9f
--- /dev/null
+++ b/configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml
@@ -0,0 +1,24 @@
+model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
+model_revision: main
+use_flash_attn: true
+tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
+use_slow_tokenizer: false # olmo models only use fast tokenizers
+dataset_mixer:
+  ai2-adapt-dev/synthetic-cot-wildguarmixtrain: 86759
+  ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_unused: 209574 # all
+max_seq_length: 4096
+preprocessing_num_workers: 128
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8 # should run with this set to 16 for 1 node only
+learning_rate: 2.0e-06
+lr_scheduler_type: linear
+warmup_ratio: 0.03
+weight_decay: 0.0
+num_train_epochs: 3
+output_dir: /output/
+with_tracking: true
+report_to:
+  - wandb
+logging_steps: 1
+checkpointing_steps: epoch
+add_bos: true
\ No newline at end of file
diff --git a/open_instruct/olmo/scripts/sft/olmo_test.sh b/open_instruct/olmo/scripts/sft/olmo_test.sh
new file mode 100644
index 000000000..f8ff7b63e
--- /dev/null
+++ b/open_instruct/olmo/scripts/sft/olmo_test.sh
@@ -0,0 +1,7 @@
+python scripts/submit_finetune_job.py \
+    --default_beaker_config configs/beaker_configs/default_finetune_offloading.yaml \
+    --config configs/train_configs/sft/olmo_7b_0924_v3.9_safety.yaml \
+    --cluster ai2/jupiter-cirrascale-2 \
+    --priority high \
+    --exp_name nd-SFT-olmo_7b_0924_v3.9_safety \
+    --num_gpus 8
\ No newline at end of file

From e8ddd567fb4f16d69341499fe81328a5d1dc6df8 Mon Sep 17 00:00:00 2001
From: nouhadziri
Date: Mon, 4 Nov 2024 15:15:03 -0500
Subject: [PATCH 22/23] fix dir in config

---
 open_instruct/olmo/scripts/sft/olmo_test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/open_instruct/olmo/scripts/sft/olmo_test.sh b/open_instruct/olmo/scripts/sft/olmo_test.sh
index f8ff7b63e..73c4b8153 100644
--- a/open_instruct/olmo/scripts/sft/olmo_test.sh
+++ b/open_instruct/olmo/scripts/sft/olmo_test.sh
@@ -1,6 +1,6 @@
 python scripts/submit_finetune_job.py \
     --default_beaker_config configs/beaker_configs/default_finetune_offloading.yaml \
-    --config configs/train_configs/sft/olmo_7b_0924_v3.9_safety.yaml \
+    --config configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml \
     --cluster ai2/jupiter-cirrascale-2 \
     --priority high \
     --exp_name nd-SFT-olmo_7b_0924_v3.9_safety \

From abd25bf81a5194b28cbd56ee3af8a7ffe7a5d46b Mon Sep 17 00:00:00 2001
From: nouhadziri
Date: Mon, 4 Nov 2024 16:04:59 -0500
Subject: [PATCH 23/23] rollback my changes

---
 .../sft/olmo/olmo_7b_0924_v3.9_safety.yaml  | 24 -------------------
 .../sft/{olmo => }/olmo_7b_0924.yaml        |  0
 open_instruct/olmo/scripts/sft/olmo_test.sh |  7 ------
 3 files changed, 31 deletions(-)
 delete mode 100644 configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml
 rename configs/train_configs/sft/{olmo => }/olmo_7b_0924.yaml (100%)
 delete mode 100644 open_instruct/olmo/scripts/sft/olmo_test.sh

diff --git a/configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml b/configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml
deleted file mode 100644
index 49f926e9f..000000000
--- a/configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
-model_revision: main
-use_flash_attn: true
-tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
-use_slow_tokenizer: false # olmo models only use fast tokenizers
-dataset_mixer:
-  ai2-adapt-dev/synthetic-cot-wildguarmixtrain: 86759
-  ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_unused: 209574 # all
-max_seq_length: 4096
-preprocessing_num_workers: 128
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8 # should run with this set to 16 for 1 node only
-learning_rate: 2.0e-06
-lr_scheduler_type: linear
-warmup_ratio: 0.03
-weight_decay: 0.0
-num_train_epochs: 3
-output_dir: /output/
-with_tracking: true
-report_to:
-  - wandb
-logging_steps: 1
-checkpointing_steps: epoch
-add_bos: true
\ No newline at end of file
diff --git a/configs/train_configs/sft/olmo/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml
similarity index 100%
rename from configs/train_configs/sft/olmo/olmo_7b_0924.yaml
rename to configs/train_configs/sft/olmo_7b_0924.yaml
diff --git a/open_instruct/olmo/scripts/sft/olmo_test.sh b/open_instruct/olmo/scripts/sft/olmo_test.sh
deleted file mode 100644
index 73c4b8153..000000000
--- a/open_instruct/olmo/scripts/sft/olmo_test.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-python scripts/submit_finetune_job.py \
-    --default_beaker_config configs/beaker_configs/default_finetune_offloading.yaml \
-    --config configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml \
-    --cluster ai2/jupiter-cirrascale-2 \
-    --priority high \
-    --exp_name nd-SFT-olmo_7b_0924_v3.9_safety \
-    --num_gpus 8
\ No newline at end of file